Get Pages with Pagination
Legacy User
New Altair Community Member
Hi everyone,
I am desperately trying to crawl a list of Web Sites that all have a different number of Pages.
I read the tutorial on http://www.simafore.com/blog/bid/112223/Text-mining-How-to-fine-tune-job-searches-using-web-crawling-2-of-4 and I can now store and process multiple Pages of one URL.
In the next step I want to process a list of URLs using the Get Pages Operator but I can't get it to also process the pages of these sites?
I know this is probably hard to understand, so here is an example ;-)
I want to extract Customer Reviews from yelp.com. For Example: http://www.yelp.com/biz/hertz-san-francisco-9
This site has 5 Pages with 40 reviews each. Using a Loop Operator I am capable of extracting all these Reviews. So far so good. ;-)
But how can I crawl multiple URLs with multiple pages each? For Example :
http://www.yelp.com/biz/hertz-san-francisco-9
http://www.yelp.com/biz/hertz-philadelphia
As you will see, I already tried to work with macros collecting the number of pages for each URL but I am missing something.
Any help would be greatly appreciated
Thank you ;D
I am desperately trying to crawl a list of Web Sites that all have a different number of Pages.
I read the tutorial on http://www.simafore.com/blog/bid/112223/Text-mining-How-to-fine-tune-job-searches-using-web-crawling-2-of-4 and I can now store and process multiple Pages of one URL.
In the next step I want to process a list of URLs using the Get Pages Operator but I can't get it to also process the pages of these sites?
I know this is probably hard to understand, so here is an example ;-)
I want to extract Customer Reviews from yelp.com. For Example: http://www.yelp.com/biz/hertz-san-francisco-9
This site has 5 Pages with 40 reviews each. Using a Loop Operator I am capable of extracting all these Reviews. So far so good. ;-)
But how can I crawl multiple URLs with multiple pages each? For Example :
http://www.yelp.com/biz/hertz-san-francisco-9
http://www.yelp.com/biz/hertz-philadelphia
As you will see, I already tried to work with macros collecting the number of pages for each URL but I am missing something.
Any help would be greatly appreciated
Thank you ;D
Tagged:
0
Answers
-
Here is my code for extracting reviews from ONE URL with 5 pages:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<!--
  Process context: no input or output bindings, plus three process-level macros.
    pagePos : starts at 0; substituted into the Get Page URL as the "start" offset
              and advanced by 40 in the Generate Macro operator.
    maxPage : starts at 0; not referenced anywhere else in the visible process.
    URL     : placeholder value "x"; not referenced anywhere else in the visible process.
-->
<context>
  <input/>
  <output/>
  <macros>
    <macro>
      <key>pagePos</key>
      <value>0</value>
    </macro>
    <macro>
      <key>maxPage</key>
      <value>0</value>
    </macro>
    <macro>
      <key>URL</key>
      <value>x</value>
    </macro>
  </macros>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<parameter key="notification_email" value="dominikcichon@gmx.de"/>
<parameter key="process_duration_for_mail" value="20"/>
<process expanded="true">
<operator activated="true" class="loop" compatibility="5.3.015" expanded="true" height="94" name="Loop" width="90" x="112" y="30">
<parameter key="iterations" value="5"/>
<parameter key="limit_time" value="true"/>
<parameter key="timeout" value="60"/>
<process expanded="true">
<!-- Fetch one page of the business listing; the "start" query value is taken from the pagePos macro. -->
<operator activated="true" class="web:get_webpage" compatibility="5.3.001" expanded="true"
          height="60" name="Get Page (3)" width="90" x="112" y="390">
  <parameter key="url" value="http://www.yelp.com/biz/hertz-san-francisco-9?start=%{pagePos}"/>
  <parameter key="random_user_agent" value="true"/>
  <list key="query_parameters"/>
  <list key="request_properties"/>
</operator>
<!-- Advance pagePos by 40 (the post states each page holds 40 reviews) for the next loop iteration. -->
<operator activated="true" class="generate_macro" compatibility="5.3.015" expanded="true"
          height="76" name="Generate Macro" width="90" x="45" y="30">
  <list key="function_descriptions">
    <parameter key="pagePos" value="%{pagePos} + 40"/>
  </list>
</operator>
<!-- Pass-through Log operator with an empty log list. -->
<operator activated="true" class="log" compatibility="5.3.015" expanded="true"
          height="76" name="Log" width="90" x="179" y="30">
  <list key="log"/>
</operator>
<!-- Copy the fetched page so that three separate extraction branches can each consume it. -->
<operator activated="true" class="multiply" compatibility="5.3.015" expanded="true"
          height="112" name="Multiply" width="90" x="246" y="390"/>
<!-- Review-text branch: cut the page into one segment per review paragraph (XPath query REVIEW). -->
<operator activated="true" class="text:cut_document" compatibility="5.3.002" expanded="true"
          height="60" name="Cut Document (2)" width="90" x="648" y="390">
  <parameter key="query_type" value="XPath"/>
  <list key="string_machting_queries"/>
  <list key="regular_expression_queries"/>
  <list key="regular_region_queries"/>
  <list key="xpath_queries">
    <parameter key="REVIEW" value="//h:p[@class='review_comment ieSucks']"/>
  </list>
  <list key="namespaces"/>
  <parameter key="ignore_CDATA" value="false"/>
  <list key="index_queries"/>
  <process expanded="true">
    <!-- Each segment is run through Extract Content before being returned. -->
    <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.001" expanded="true" name="Extract Content"/>
    <connect from_port="segment" to_op="Extract Content" to_port="document"/>
    <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
    <portSpacing port="source_segment" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Turn the segments into an example set; word vectors are disabled and the text is kept. -->
<operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true"
          height="94" name="Process Documents (4)" width="90" x="849" y="390">
  <parameter key="create_word_vector" value="false"/>
  <parameter key="add_meta_information" value="false"/>
  <parameter key="keep_text" value="true"/>
  <process expanded="true">
    <!-- Documents are forwarded unchanged. -->
    <connect from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Generate an id for each row; the downstream Join operators list no explicit key attributes. -->
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true"
          height="76" name="Generate ID" width="90" x="983" y="390"/>
<!-- Rename the generic "text" attribute to REVIEW_TEXT. -->
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true"
          height="76" name="Rename" width="90" x="1117" y="390">
  <parameter key="old_name" value="text"/>
  <parameter key="new_name" value="REVIEW_TEXT"/>
  <list key="rename_additional_attributes"/>
</operator>
<!-- Rating branch: select the title attribute of each review's rating icon (XPath query RATING). -->
<operator activated="true" class="text:cut_document" compatibility="5.3.002" expanded="true"
          height="60" name="Cut Document (3)" width="90" x="648" y="480">
  <parameter key="query_type" value="XPath"/>
  <list key="string_machting_queries"/>
  <list key="regular_expression_queries"/>
  <list key="regular_region_queries"/>
  <list key="xpath_queries">
    <parameter key="RATING" value="//h:div[@itemprop='reviewRating']/h:div[@class='rating-very-large']/h:i/@title[contains(.,'rating')]"/>
  </list>
  <list key="namespaces"/>
  <parameter key="ignore_CDATA" value="false"/>
  <list key="index_queries"/>
  <process expanded="true">
    <!-- Segments are forwarded unchanged. -->
    <connect from_port="segment" to_port="document 1"/>
    <portSpacing port="source_segment" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Turn the rating segments into an example set; word vectors are disabled and the text is kept. -->
<operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true"
          height="94" name="Process Documents (2)" width="90" x="849" y="525">
  <parameter key="create_word_vector" value="false"/>
  <parameter key="add_meta_information" value="false"/>
  <parameter key="keep_text" value="true"/>
  <process expanded="true">
    <connect from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Generate an id for each rating row. -->
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true"
          height="76" name="Generate ID (2)" width="90" x="983" y="525"/>
<!-- Rename the generic "text" attribute to RATING_TEXT. -->
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true"
          height="76" name="Rename (2)" width="90" x="1117" y="525">
  <parameter key="old_name" value="text"/>
  <parameter key="new_name" value="RATING_TEXT"/>
  <list key="rename_additional_attributes"/>
</operator>
<!-- Derive STARS from RATING_TEXT via cut(RATING_TEXT,0,3); presumably the leading characters of the title, e.g. "4.0" (confirm against the page markup). -->
<operator activated="true" class="generate_attributes" compatibility="5.3.015" expanded="true"
          height="76" name="Generate Attributes" width="90" x="1251" y="525">
  <list key="function_descriptions">
    <parameter key="STARS" value="cut(RATING_TEXT,0,3)"/>
  </list>
</operator>
<!-- Keep only id and STARS (special attributes included) for the join. -->
<operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true"
          height="76" name="Select Attributes" width="90" x="1385" y="525">
  <parameter key="attribute_filter_type" value="subset"/>
  <parameter key="attributes" value="|id|STARS"/>
  <parameter key="include_special_attributes" value="true"/>
</operator>
<!-- Review-date branch: select the content attribute of each datePublished meta tag (XPath query REVIEW_DATE). -->
<operator activated="true" class="text:cut_document" compatibility="5.3.002" expanded="true"
          height="60" name="Cut Document (5)" width="90" x="648" y="300">
  <parameter key="query_type" value="XPath"/>
  <list key="string_machting_queries"/>
  <list key="regular_expression_queries"/>
  <list key="regular_region_queries"/>
  <list key="xpath_queries">
    <parameter key="REVIEW_DATE" value="//h:meta[@itemprop='datePublished']/@content"/>
  </list>
  <list key="namespaces"/>
  <parameter key="ignore_CDATA" value="false"/>
  <list key="index_queries"/>
  <process expanded="true">
    <!-- Extract Content (3) is deactivated and unconnected; segments are forwarded unchanged. -->
    <operator activated="false" class="web:extract_html_text_content" compatibility="5.3.001" expanded="true" name="Extract Content (3)"/>
    <connect from_port="segment" to_port="document 1"/>
    <portSpacing port="source_segment" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Turn the date segments into an example set; word vectors are disabled and the text is kept. -->
<operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true"
          height="94" name="Process Documents (3)" width="90" x="849" y="255">
  <parameter key="create_word_vector" value="false"/>
  <parameter key="add_meta_information" value="false"/>
  <parameter key="keep_text" value="true"/>
  <process expanded="true">
    <connect from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Generate an id for each date row. -->
<operator activated="true" class="generate_id" compatibility="5.3.015" expanded="true"
          height="76" name="Generate ID (3)" width="90" x="983" y="255"/>
<!-- Rename the generic "text" attribute to REVIEW_DATE. -->
<operator activated="true" class="rename" compatibility="5.3.015" expanded="true"
          height="76" name="Rename (3)" width="90" x="1117" y="255">
  <parameter key="old_name" value="text"/>
  <parameter key="new_name" value="REVIEW_DATE"/>
  <list key="rename_additional_attributes"/>
</operator>
<!-- Set Role on REVIEW_TEXT with no target role given and no additional roles. -->
<operator activated="true" class="set_role" compatibility="5.3.015" expanded="true"
          height="76" name="Set Role (2)" width="90" x="1251" y="390">
  <parameter key="attribute_name" value="REVIEW_TEXT"/>
  <list key="set_additional_roles"/>
</operator>
<!-- Right join of the review dates (left input) with the review texts (right input); no explicit key attributes. -->
<operator activated="true" class="join" compatibility="5.3.015" expanded="true"
          height="76" name="Join (2)" width="90" x="1653" y="345">
  <parameter key="join_type" value="right"/>
  <list key="key_attributes"/>
</operator>
<!-- Right join of the date+text result (left input) with the id/STARS set (right input); no explicit key attributes. -->
<operator activated="true" class="join" compatibility="5.3.015" expanded="true"
          height="76" name="Join" width="90" x="1854" y="480">
  <parameter key="join_type" value="right"/>
  <list key="key_attributes"/>
</operator>
<!-- Guess the value type of the single attribute STARS, which is later compared numerically. -->
<operator activated="true" class="guess_types" compatibility="5.3.015" expanded="true"
          height="76" name="Guess Types (2)" width="90" x="1985" y="480">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="STARS"/>
</operator>
<!--
  Derive LABEL from STARS: "P" when STARS is 3 or higher, otherwise "N".
  The double quotes inside the expression are written as &quot; (and > as &gt;):
  a literal " inside a double-quoted attribute value ends the value early and
  makes the document not well-formed. The parsed expression is still
  if(STARS>=3,"P","N").
-->
<operator activated="true" class="generate_attributes" compatibility="5.3.015" expanded="true"
          height="76" name="Generate Attributes (2)" width="90" x="2120" y="480">
  <list key="function_descriptions">
    <parameter key="LABEL" value="if(STARS&gt;=3,&quot;P&quot;,&quot;N&quot;)"/>
  </list>
</operator>
<!-- Assign roles: REVIEW_DATE and STARS are regular attributes, LABEL is the label; REVIEW_TEXT is named with no target role given. -->
<operator activated="true" class="set_role" compatibility="5.3.015" expanded="true"
          height="76" name="Set Role" width="90" x="2255" y="480">
  <parameter key="attribute_name" value="REVIEW_TEXT"/>
  <list key="set_additional_roles">
    <parameter key="REVIEW_DATE" value="regular"/>
    <parameter key="STARS" value="regular"/>
    <parameter key="LABEL" value="label"/>
  </list>
</operator>
<!-- Parse REVIEW_DATE from its nominal form using the pattern yyyy-MM-dd. -->
<operator activated="true" class="nominal_to_date" compatibility="5.3.015" expanded="true"
          height="76" name="Nominal to Date" width="90" x="2390" y="480">
  <parameter key="attribute_name" value="REVIEW_DATE"/>
  <parameter key="date_format" value="yyyy-MM-dd"/>
</operator>
<!-- Loop input 1 -> Generate Macro -> Log -> loop output 2. -->
<connect from_port="input 1" to_op="Generate Macro" to_port="through 1"/>
<connect from_op="Get Page (3)" from_port="output" to_op="Multiply" to_port="input"/>
<connect from_op="Generate Macro" from_port="through 1" to_op="Log" to_port="through 1"/>
<connect from_op="Log" from_port="through 1" to_port="output 2"/>

<!-- The fetched page is fanned out to the date, review-text and rating branches. -->
<connect from_op="Multiply" from_port="output 1" to_op="Cut Document (5)" to_port="document"/>
<connect from_op="Multiply" from_port="output 2" to_op="Cut Document (2)" to_port="document"/>
<connect from_op="Multiply" from_port="output 3" to_op="Cut Document (3)" to_port="document"/>

<!-- Review-text branch. -->
<connect from_op="Cut Document (2)" from_port="documents" to_op="Process Documents (4)" to_port="documents 1"/>
<connect from_op="Process Documents (4)" from_port="example set" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>

<!-- Rating branch, ending at the right input of Join. -->
<connect from_op="Cut Document (3)" from_port="documents" to_op="Process Documents (2)" to_port="documents 1"/>
<connect from_op="Process Documents (2)" from_port="example set" to_op="Generate ID (2)" to_port="example set input"/>
<connect from_op="Generate ID (2)" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
<connect from_op="Rename (2)" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="right"/>

<!-- Review-date branch, ending at the left input of Join (2). -->
<connect from_op="Cut Document (5)" from_port="documents" to_op="Process Documents (3)" to_port="documents 1"/>
<connect from_op="Process Documents (3)" from_port="example set" to_op="Generate ID (3)" to_port="example set input"/>
<connect from_op="Generate ID (3)" from_port="example set output" to_op="Rename (3)" to_port="example set input"/>
<connect from_op="Rename (3)" from_port="example set output" to_op="Join (2)" to_port="left"/>

<!-- Joins, then type guessing, labelling, role assignment and date parsing, ending at loop output 1. -->
<connect from_op="Set Role (2)" from_port="example set output" to_op="Join (2)" to_port="right"/>
<connect from_op="Join (2)" from_port="join" to_op="Join" to_port="left"/>
<connect from_op="Join" from_port="join" to_op="Guess Types (2)" to_port="example set input"/>
<connect from_op="Guess Types (2)" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
<connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Nominal to Date" to_port="example set input"/>
<connect from_op="Nominal to Date" from_port="example set output" to_port="output 1"/>

<!-- Port layout spacing for the loop's inner process. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
</operator>
<!-- Merge the example sets delivered on the Loop's output 1 into one result set. -->
<operator activated="true" class="append" compatibility="5.3.015" expanded="true"
          height="76" name="Append" width="90" x="246" y="75"/>

<!-- Top-level wiring: process input 1 -> Loop -> Append -> result 1. -->
<connect from_port="input 1" to_op="Loop" to_port="input 1"/>
<connect from_op="Loop" from_port="output 1" to_op="Append" to_port="example set 1"/>
<connect from_op="Append" from_port="merged set" to_port="result 1"/>

<!-- Port layout spacing for the top-level process. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
0 -
any update on this topic?
0 -
Normally you will need to find the root URL of the next page and then write a regular expression for it.
0