I'm trying to learn screen scraping, and I'm now on my third project. For my first two projects I picked a site that has two very different data tables: a small bio table, and a large professional-record table with multiple rows and different data types. Now I'm trying to tie both together into one process that scrapes both tables. The site is:
http://www.mixedmartialarts.com/f/1BC00DA3949506AC/BJ-Penn/
Here's what I've done so far. I start with a Process Documents from Web operator whose subprocess begins with a Multiply operator. Multiply feeds its output to two branches: an Extract Information operator that scrapes the Bio table, and a Cut Document operator whose subprocess contains an Extract Information operator that pulls the Professional Record table.
All of those functions seem to work fine.
Where I'm hitting a snag is that the Process Documents from Web operator has only one example set output. As a result, my two tables get mashed together into a mess. I'd like to pull two separate tables from each of the crawled pages and output them to Excel, Access, or a repository.
I've tried to divide the data after the Process Documents operator and before exporting, but I can't seem to find the correct operator.
Here's my code so far. Any help or suggestions would be appreciated.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition.
  Outline, as wired by the connect elements below:
    1. "Process Documents from Web" starts at the configured url and runs its
       subprocess on each page document.
    2. Inside that subprocess, "Multiply (2)" hands the page document to two
       branches:
         output 1 goes to "Extract Information (3)" (bio fields),
         output 2 goes to "Cut Document (2)" (one segment per record row,
         each passed through "Extract Information (4)").
    3. Both branches are connected to the subprocess sinks "document 1" and
       "document 2"; the single "example set" port of the web operator is
       connected to "Write Excel".
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <parameter key="logfile" value="C:\Users\Jeremy\Documents\Rapidminer Repository\logfile"/>
    <parameter key="resultfile" value="C:\Users\Jeremy\Documents\Rapidminer Repository\resultfile"/>
    <process expanded="true" height="620" width="300">

      <!-- Web crawl: start url, link-following rule, and page/depth/delay settings. -->
      <operator activated="true" class="web:process_web" compatibility="5.2.001" expanded="true" height="60" name="Process Documents from Web" width="90" x="45" y="30">
        <parameter key="url" value="http://www.mixedmartialarts.com/f/1BC00DA3949506AC/BJ-Penn/"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value="http://www\.mixedmartialarts\.com/f/.*"/>
        </list>
        <parameter key="max_pages" value="6"/>
        <parameter key="max_depth" value="4"/>
        <parameter key="domain" value="server"/>
        <parameter key="delay" value="5000"/>
        <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; rv:12.0) Gecko/20120403211507 Firefox/14.0.1"/>
        <parameter key="parallelize_process_webpage" value="true"/>
        <process expanded="true" height="620" width="480">

          <!-- Duplicates the incoming page document so both branches can read it. -->
          <operator activated="true" class="multiply" compatibility="5.2.008" expanded="true" height="94" name="Multiply (2)" width="90" x="45" y="30"/>

          <!--
            Record branch: the "Table Row" XPath selects tr elements with class
            data_row or data_row_alt inside tables with class data_table.
            Each resulting segment is handed to the nested subprocess.
          -->
          <operator activated="true" class="text:cut_document" compatibility="5.2.004" expanded="true" height="60" name="Cut Document (2)" width="90" x="246" y="120">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="Table Row" value="//h:table[@class='data_table']/h:tr[@class='data_row'] | //h:table[@class='data_table']/h:tr[@class='data_row_alt']"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <process expanded="true" height="638" width="521">

              <!-- Per-row fields, addressed by td position within the row segment. -->
              <operator activated="true" class="text:extract_information" compatibility="5.2.004" expanded="true" height="60" name="Extract Information (4)" width="90" x="215" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="Fight_Date" value="//h:td[2]/text()"/>
                  <parameter key="Result" value="//h:td[3]/text()"/>
                  <parameter key="Opponent" value="//h:td[4]/h:a/text()"/>
                  <parameter key="Event" value="//h:td[5]/h:a/text()"/>
                  <parameter key="Method" value="//h:td[6]/text()"/>
                  <parameter key="Round" value="//h:td[7]/text()"/>
                  <parameter key="Official_Time" value="//h:td[8]/text()"/>
                  <parameter key="Verified" value="//h:td[1]/h:img/@border | //h:td[1]/text() "/>
                  <parameter key="Title" value="//h:td[9]/h:img/@alt"/>
                  <parameter key="Opponent_URL" value="//h:td[4]/h:a/@href"/>
                  <parameter key="Event_URL" value="//h:td[5]/h:a/@href"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>

              <connect from_port="segment" to_op="Extract Information (4)" to_port="document"/>
              <connect from_op="Extract Information (4)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>

          <!--
            Bio branch: "Fighter" reads the h1 inside div.Resume; every other
            query finds an element whose text contains the label and takes the
            text of the last td under that element's parent.
          -->
          <operator activated="true" class="text:extract_information" compatibility="5.2.004" expanded="true" height="60" name="Extract Information (3)" width="90" x="246" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="Fighter" value="//h:div[@class='Resume']/h:h1/text()"/>
              <parameter key="Pro_Record" value="//h:*[contains(.,'Pro Record:')]/../h:td[last()]/text()"/>
              <parameter key="Team" value="//h:*[contains(.,'Team:')]/../h:td[last()]/text()"/>
              <parameter key="Age" value="//h:*[contains(.,'Age:')]/../h:td[last()]/text()"/>
              <parameter key="Sex" value="//h:*[contains(.,'Sex:')]/../h:td[last()]/text()"/>
              <parameter key="Height" value="//h:*[contains(.,'Height:')]/../h:td[last()]/text()"/>
              <parameter key="Weight" value="//h:*[contains(.,'Weight:')]/../h:td[last()]/text()"/>
              <parameter key="Out_of" value="//h:*[contains(.,'Out of:')]/../h:td[last()]/text()"/>
              <parameter key="From" value="//h:*[contains(.,'Born:')]/../h:td[last()]/text()"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>

          <!-- Wiring: page document to Multiply, then bio to sink 1 and record rows to sink 2. -->
          <connect from_port="document" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="Extract Information (3)" to_port="document"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="Cut Document (2)" to_port="document"/>
          <connect from_op="Cut Document (2)" from_port="documents" to_port="document 2"/>
          <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
          <portSpacing port="sink_document 3" spacing="0"/>
        </process>
      </operator>

      <!--
        Export step. NOTE(review): no excel_file parameter appears on this
        operator in the listing; confirm the target file is configured
        before running.
      -->
      <operator activated="true" class="write_excel" compatibility="5.2.008" expanded="true" height="76" name="Write Excel" width="90" x="180" y="30"/>

      <!-- The single example set from the crawl is written out and passed on as result 1. -->
      <connect from_op="Process Documents from Web" from_port="example set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="18"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="288"/>
    </process>
  </operator>
</process>