This is my 2nd scraping project. I'm really new and just trying to learn this stuff.
I'm crawling with the Process Documents from Web operator and using Extract Information as a sub process. I'm querying 8 attributes with xpath querys. Last I use Write Excel to output the data into a spreadsheet.
However my xpath query as it's written will only retrun the first TR row. Below is the site I am crawling for reference:
http://www.mixedmartialarts.com/f/1BC00DA3949506AC/BJ-Penn/I'm trying to pull the professional record table. With my first successful scraping project I pulled the much simpler Bio table at the top of the page.
This is a sample of the type of xpath query I'm using.
//h:table[@class='data_table']/h:tr[@class='data_row']/h:td[2]/text()
That works fine for pulling the specified data from the first row with the defined class.
My question: is there any way I can increment the tr from 1 through the end of the table which would be a variable number of rows on different pages? I've been stuck here for about a day, I've done some searches and haven't found what I'm looking for. I've also tried a hand full of operators unsuccessfully.
Here is my code:
<process version="5.2.008">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
<parameter key="logfile" value="C:\Users\Jeremy\Documents\Rapidminer Repository\logfile"/>
<parameter key="resultfile" value="C:\Users\Jeremy\Documents\Rapidminer Repository\resultfile"/>
<process expanded="true" height="620" width="435">
<operator activated="true" class="web:process_web" compatibility="5.2.001" expanded="true" height="60" name="Process Documents from Web" width="90" x="45" y="30">
<parameter key="url" value="http://www.mixedmartialarts.com/f/1BC00DA3949506AC/BJ-Penn/"/>
<list key="crawling_rules">
<parameter key="follow_link_with_matching_url" value="http://www\.mixedmartialarts\.com/f/.*"/>
</list>
<parameter key="max_pages" value="6"/>
<parameter key="max_depth" value="4"/>
<parameter key="domain" value="server"/>
<parameter key="delay" value="5000"/>
<parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/20120403211507 Firefox/14.0.1"/>
<parameter key="parallelize_process_webpage" value="true"/>
<process expanded="true" height="620" width="435">
<operator activated="true" class="text:extract_information" compatibility="5.2.004" expanded="true" height="60" name="Extract Information (2)" width="90" x="179" y="30">
<parameter key="query_type" value="XPath"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries"/>
<list key="regular_region_queries"/>
<list key="xpath_queries">
<parameter key="Fight Date" value="//h:table[@class='data_table']/h:tr[@class='data_row']/h:td[2]/text()"/>
<parameter key="Result" value="//h:table[@class='data_table']/h:tr[@class='data_row']/h:td[3]/text()"/>
<parameter key="Opponent" value="//h:table[@class='data_table']/h:tr[@class='data_row']/h:td[4]/h:a/text()"/>
<parameter key="Event" value="//h:table[@class='data_table']/h:tr[@class='data_row']/h:td[5]/h:a/text()"/>
<parameter key="Method" value="//h:table[@class='data_table']/h:tr[@class='data_row']/h:td[6]/text()"/>
<parameter key="Round" value="//h:table[@class='data_table']/h:tr[@class='data_row']/h:td[7]/text()"/>
<parameter key="Official Time" value="//h:table[@class='data_table']/h:tr[@class='data_row']/h:td[8]/text()"/>
<parameter key="Verified" value="//h:table[@class='data_table']/h:tr[@class='data_row']/h:td[1]/h:img/@border | //h:table[@class='data_table']/h:tr[@class='data_row']/h:td[1]/text()"/>
</list>
<list key="namespaces"/>
<list key="index_queries"/>
</operator>
<connect from_port="document" to_op="Extract Information (2)" to_port="document"/>
<connect from_op="Extract Information (2)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="write_excel" compatibility="5.2.008" expanded="true" height="76" name="Write Excel" width="90" x="246" y="30">
<parameter key="excel_file" value="C:\Users\Jeremy\Documents\Rapidminer Repository\Results\record.xls"/>
</operator>
<connect from_op="Process Documents from Web" from_port="example set" to_op="Write Excel" to_port="input"/>
<connect from_op="Write Excel" from_port="through" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="18"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="108"/>
</process>
</operator>
</process>