XPath multiple record extraction

vbulcan99
vbulcan99 New Altair Community Member
edited November 5 in Community Q&A
Hello everyone,

I am trying to use the "Extract Information" operator with the XPath query type in order to duplicate

the video example: "Web Scraping with RapidMiner and XPath" from: http://vancouverdata.blogspot.com/2011/04/web-scraping-rapidminer-xpath-web.html

The technique works well when I extract one record with one or more attributes per page.

But if I try to extract, for example, all job links (http://vancouver.en.craigslist.ca/jjj/)

using the query: //h:blockquote/h:p[not(@align='center')]/h:a/@href

as was described on the tutorial (and is working on google spreadsheets)

"Extract Information" is returning just one random record instead of multiple records of all available job links from the page

I tried to extend the example a little by using the "Crawl Web" operator in order to get the job links from more than one page,

process them using a macro approach with the 'Loop Examples' operator, and finally aggregate the results into one final record set by using the "Append" operator,

but the system is failing for unexplained reasons.

here is the XML code:


<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.1 process: crawl the Craigslist Vancouver jobs index, fetch
  each crawled page again by its URL, and apply one XPath query per page with
  "Extract Information" to pull out the job-link href values.
-->
<process version="5.1.011">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.011" expanded="true" name="Process">
    <process expanded="true" height="548" width="883">
      <!-- Step 1: crawl from the jobs index; only URLs matching ".*index.*" are followed and stored. -->
      <operator activated="true" class="web:crawl_web" compatibility="5.1.004" expanded="true" height="60" name="Crawl Web (2)" width="90" x="45" y="165">
        <parameter key="url" value="http://vancouver.en.craigslist.ca/jjj/"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".*index.*"/>
          <parameter key="store_with_matching_url" value=".*index.*"/>
        </list>
        <!-- Pages are kept as an attribute of the result set rather than written to output_dir. -->
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Users\Administrator\Documents\traRM"/>
        <parameter key="max_pages" value="3"/>
        <parameter key="max_depth" value="3"/>
        <parameter key="obey_robot_exclusion" value="false"/>
        <parameter key="really_ignore_exclusion" value="true"/>
      </operator>
      <!-- Step 2: add an id attribute to the crawled examples. -->
      <operator activated="true" class="generate_id" compatibility="5.1.011" expanded="true" height="76" name="Generate ID" width="90" x="246" y="165"/>
      <!-- Step 3: iterate over the crawled examples; the current iteration is exposed as macro "id". -->
      <operator activated="true" class="loop_examples" compatibility="5.1.011" expanded="true" height="94" name="Loop Examples (2)" width="90" x="447" y="165">
        <parameter key="iteration_macro" value="id"/>
        <process expanded="true" height="548" width="901">
          <!-- Copy the "Link" value of example number %{id} into macro "website_url". -->
          <operator activated="true" class="extract_macro" compatibility="5.1.011" expanded="true" height="60" name="Extract Macro (3)" width="90" x="246" y="30">
            <parameter key="macro" value="website_url"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="Link"/>
            <parameter key="example_index" value="%{id}"/>
          </operator>
          <!-- Fetch the page addressed by the macro set above. -->
          <operator activated="true" class="web:get_webpage" compatibility="5.1.004" expanded="true" height="60" name="Get Page" width="90" x="112" y="210">
            <parameter key="url" value="%{website_url}"/>
            <list key="query_parameters"/>
          </operator>
          <!-- Turn the fetched document into an example set; no word vector is built and the text is kept. -->
          <operator activated="true" class="text:process_documents" compatibility="5.1.002" expanded="true" height="94" name="Process Documents (2)" width="90" x="313" y="165">
            <parameter key="create_word_vector" value="false"/>
            <parameter key="add_meta_information" value="false"/>
            <parameter key="keep_text" value="true"/>
            <process expanded="true" height="548" width="901">
              <!-- Single XPath query (result name "xpath") selecting the href of every job link paragraph. -->
              <operator activated="true" class="text:extract_information" compatibility="5.1.002" expanded="true" height="60" name="Extract Information (4)" width="90" x="380" y="30">
                <parameter key="query_type" value="XPath"/>
                <!-- NOTE: "string_machting_queries" is spelled this way in the original export; it is kept verbatim as the key RapidMiner wrote. -->
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="xpath" value="//h:blockquote/h:p[not(@align='center')]/h:a/@href"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="document" to_op="Extract Information (4)" to_port="document"/>
              <connect from_op="Extract Information (4)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="example set" to_op="Extract Macro (3)" to_port="example set"/>
          <connect from_op="Extract Macro (3)" from_port="example set" to_port="example set"/>
          <connect from_op="Get Page" from_port="output" to_op="Process Documents (2)" to_port="documents 1"/>
          <connect from_op="Process Documents (2)" from_port="example set" to_port="output 1"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Step 4: merge the per-page results; a breakpoint is set before this operator. -->
      <operator activated="true" breakpoints="before" class="append" compatibility="5.1.011" expanded="true" height="76" name="Append (2)" width="90" x="648" y="165"/>
      <connect from_op="Crawl Web (2)" from_port="Example Set" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Loop Examples (2)" to_port="example set"/>
      <connect from_op="Loop Examples (2)" from_port="output 1" to_op="Append (2)" to_port="example set 1"/>
      <connect from_op="Append (2)" from_port="merged set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>



Could you help me solve this problem, please?

Thanks
Tagged:

Answers

  • Loky
    Loky New Altair Community Member
    I'm working on something similar too.
    You'll have to use Cut Document to be able to extract multiple similar items from one document. Do a search on this forum and you'll find some examples. I started building my process based on this and it works pretty well.

    Good luck.
  • vbulcan99
    vbulcan99 New Altair Community Member
    Hi Loky,

    Thank you very much for the advice to use "Cut Document"; it is working excellently for me.

    For anybody experiencing the same problem, here is the solution:

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      RapidMiner 5.1 process: crawl the Craigslist Vancouver jobs index, fetch
      each crawled page again by its URL, split every page into one document
      per XPath match with "Cut Document", convert the resulting documents to
      an example set and write it to an Excel file.
    -->
    <process version="5.1.011">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.1.011" expanded="true" name="Process">
        <process expanded="true" height="657" width="955">
          <!-- Step 1: crawl from the jobs index; only URLs matching ".*index.*" are followed and stored. -->
          <operator activated="true" class="web:crawl_web" compatibility="5.1.004" expanded="true" height="60" name="Crawl Web (2)" width="90" x="45" y="30">
            <parameter key="url" value="http://vancouver.en.craigslist.ca/jjj/"/>
            <list key="crawling_rules">
              <parameter key="follow_link_with_matching_url" value=".*index.*"/>
              <parameter key="store_with_matching_url" value=".*index.*"/>
            </list>
            <!-- Pages are kept as an attribute of the result set rather than written to output_dir. -->
            <parameter key="write_pages_into_files" value="false"/>
            <parameter key="add_pages_as_attribute" value="true"/>
            <parameter key="output_dir" value="C:\Users\Administrator\Documents\traRM"/>
            <parameter key="max_pages" value="2"/>
            <parameter key="obey_robot_exclusion" value="false"/>
            <parameter key="really_ignore_exclusion" value="true"/>
          </operator>
          <!-- Step 2: add an id attribute to the crawled examples. -->
          <operator activated="true" class="generate_id" compatibility="5.1.011" expanded="true" height="76" name="Generate ID" width="90" x="179" y="30"/>
          <!-- Step 3: iterate over the crawled examples; the current iteration is exposed as macro "id". -->
          <operator activated="true" class="loop_examples" compatibility="5.1.011" expanded="true" height="94" name="Loop Examples (2)" width="90" x="313" y="30">
            <parameter key="iteration_macro" value="id"/>
            <process expanded="true" height="548" width="901">
              <!-- Copy the "Link" value of example number %{id} into macro "website_url". -->
              <operator activated="true" class="extract_macro" compatibility="5.1.011" expanded="true" height="60" name="Extract Macro (3)" width="90" x="246" y="30">
                <parameter key="macro" value="website_url"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="Link"/>
                <parameter key="example_index" value="%{id}"/>
              </operator>
              <!-- Fetch the page addressed by the macro set above. -->
              <operator activated="true" class="web:get_webpage" compatibility="5.1.004" expanded="true" height="60" name="Get Page" width="90" x="112" y="210">
                <parameter key="url" value="%{website_url}"/>
                <list key="query_parameters"/>
              </operator>
              <!-- Cut the fetched page into segments using the XPath query; this replaces the single-result "Extract Information" approach. -->
              <operator activated="true" class="text:cut_document" compatibility="5.1.003" expanded="true" height="60" name="Cut Document (4)" width="90" x="313" y="210">
                <parameter key="query_type" value="XPath"/>
                <!-- NOTE: "string_machting_queries" is spelled this way in the original export; it is kept verbatim as the key RapidMiner wrote. -->
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="xpath" value="//h:blockquote/h:p[not(@align='center')]/h:a/@href"/>
                </list>
                <!-- NOTE(review): the prefix "xx" declared here is not used by the XPath query above, which uses "h:"; presumably "h" is resolved by the operator's defaults. Confirm. -->
                <list key="namespaces">
                  <parameter key="xx" value="xml"/>
                </list>
                <parameter key="ignore_CDATA" value="false"/>
                <list key="index_queries"/>
                <!-- Inner process: each segment is passed through unchanged. -->
                <process expanded="true" height="598" width="984">
                  <connect from_port="segment" to_port="document 1"/>
                  <portSpacing port="source_segment" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <!-- Copy the "Page" value of example number %{id} into macro "website_page". NOTE(review): this macro is not referenced anywhere else in this process. -->
              <operator activated="true" class="extract_macro" compatibility="5.1.011" expanded="true" height="60" name="Extract Macro (2)" width="90" x="447" y="30">
                <parameter key="macro" value="website_page"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="Page"/>
                <parameter key="example_index" value="%{id}"/>
              </operator>
              <connect from_port="example set" to_op="Extract Macro (3)" to_port="example set"/>
              <connect from_op="Extract Macro (3)" from_port="example set" to_op="Extract Macro (2)" to_port="example set"/>
              <connect from_op="Get Page" from_port="output" to_op="Cut Document (4)" to_port="document"/>
              <connect from_op="Cut Document (4)" from_port="documents" to_port="output 1"/>
              <connect from_op="Extract Macro (2)" from_port="example set" to_port="example set"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!-- Step 4: convert the cut documents into an example set; no word vector is built and the text is kept. -->
          <operator activated="true" class="text:process_documents" compatibility="5.1.003" expanded="true" height="94" name="Process Documents" width="90" x="447" y="30">
            <parameter key="create_word_vector" value="false"/>
            <parameter key="add_meta_information" value="false"/>
            <parameter key="keep_text" value="true"/>
            <!-- Inner process: each document is passed through unchanged. -->
            <process expanded="true" height="580" width="966">
              <connect from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Step 5: add ids to the extracted rows, append them, and write the result to an Excel file. -->
          <operator activated="true" class="generate_id" compatibility="5.1.011" expanded="true" height="76" name="Generate ID (2)" width="90" x="581" y="30"/>
          <operator activated="true" class="append" compatibility="5.1.011" expanded="true" height="76" name="Append (2)" width="90" x="715" y="30"/>
          <operator activated="true" class="write_excel" compatibility="5.1.011" expanded="true" height="60" name="Write Excel" width="90" x="849" y="30">
            <parameter key="excel_file" value="D:\Desktop\links.xls"/>
          </operator>
          <connect from_op="Crawl Web (2)" from_port="Example Set" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Loop Examples (2)" to_port="example set"/>
          <connect from_op="Loop Examples (2)" from_port="output 1" to_op="Process Documents" to_port="documents 1"/>
          <connect from_op="Process Documents" from_port="example set" to_op="Generate ID (2)" to_port="example set input"/>
          <connect from_op="Generate ID (2)" from_port="example set output" to_op="Append (2)" to_port="example set 1"/>
          <connect from_op="Append (2)" from_port="merged set" to_op="Write Excel" to_port="input"/>
          <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>