Hi, I'm hoping someone can help me out with a web crawling question. I'm using the Process Documents from Web and Extract Information operators, and the XPath queries return the information I need, which I can export to CSV. The issue is that only the first record is output, but I need all forum data from the 10 URLs to be output. I've seen a few posts where people use loop examples and the Cut Document operator, but I can't seem to get this working right. Has anyone come up with a novel approach to doing this?
Thanks
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition (process version 5.2.008).
  Layout: a root "Process" operator wraps a "Process Documents from Web"
  crawler; the crawler's inner sub-process runs "Extract Information (2)"
  on each document and hands the document back to the crawler, whose
  "example set" output is wired to the process result port.
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="415" width="748">
      <!--
        Crawler settings: the start page is given by "url"; the single
        crawling rule follows links whose URL matches .+Forum/ryan.+ ;
        max_pages is 10, max_depth is 100, delay is 500 and max_threads is 10.
        NOTE(review): units of "delay" and "max_page_size" are not stated in
        this file; confirm against the Web Mining extension documentation.
      -->
      <operator activated="true"
                class="web:process_web"
                compatibility="5.2.003"
                expanded="true"
                height="60"
                name="Process Documents from Web"
                width="90"
                x="45"
                y="30">
        <parameter key="url" value="http://www.airlinequality.com/Forum/ryan.htm"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+Forum/ryan.+"/>
        </list>
        <parameter key="max_pages" value="10"/>
        <parameter key="max_depth" value="100"/>
        <parameter key="delay" value="500"/>
        <parameter key="max_threads" value="10"/>
        <parameter key="max_page_size" value="5000"/>
        <parameter key="user_agent" value="Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)"/>
        <process expanded="true" height="385" width="762">
          <!--
            Eight XPath queries are defined, keyed review, rating, recommended,
            value_for_money, reviewed_by, seat_comfort, staff_service and
            catering. The four table-based queries differ only in the row
            index (tr[3] to tr[6]) of the table whose width starts with '193'.
            NOTE(review): every query is absolute (starts with //) and is run
            against the whole page, which presumably explains why only one
            value per page is produced; splitting each page into per-review
            segments before extraction may be required. Confirm.
            NOTE(review): the h: prefix is used although the "namespaces" list
            is empty; presumably the operator binds it to XHTML by default.
            Confirm.
            The list key "string_machting_queries" is spelled as the tool
            expects it in this file; do not correct it.
          -->
          <operator activated="true"
                    class="text:extract_information"
                    compatibility="5.2.004"
                    expanded="true"
                    height="60"
                    name="Extract Information (2)"
                    width="90"
                    x="45"
                    y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="review" value="//h:p[@class='text2']/text()"/>
              <parameter key="rating" value="//h:p[@class='text25' and contains(., 'Rating')]/text()"/>
              <parameter key="recommended" value="//h:img[contains(@src,'_rvw.gif')]/@src"/>
              <parameter key="value_for_money" value="//h:table[starts-with(@width,'193')]//h:tr[3]//h:td[2]//h:img/@src"/>
              <parameter key="reviewed_by" value="//h:td[@class='airport']/h:h9/text()"/>
              <parameter key="seat_comfort" value="//h:table[starts-with(@width,'193')]//h:tr[4]//h:td[2]//h:img/@src"/>
              <parameter key="staff_service" value="//h:table[starts-with(@width,'193')]//h:tr[5]//h:td[2]//h:img/@src"/>
              <parameter key="catering" value="//h:table[starts-with(@width,'193')]//h:tr[6]//h:td[2]//h:img/@src"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <!-- Wiring: incoming document, then Extract Information (2), then sub-process output "document 1". -->
          <connect from_port="document" to_op="Extract Information (2)" to_port="document"/>
          <connect from_op="Extract Information (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- The crawler's "example set" port is delivered as the process result "result 1". -->
      <connect from_op="Process Documents from Web" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
[/code]