XPath empty results

MrRisoni
MrRisoni New Altair Community Member
edited November 5 in Community Q&A
Hello. I'm trying to mine data from Google Scholar pages using XPath.
I'm trying to get the name, the h-index, and the first 20 publications.

I am using the following queries

substring-before(//title, " - Google Scholar Citations")

//*[contains(.,"h-index")]/../tr[3]//td[2]

//a[contains(@href,'citation_for_view')]

All of them work in Google Docs and in Java, but none of them works in RapidMiner.
I can't figure out what's wrong...
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process (version 5.3.013) from the question: reads HTML files from a
     directory and runs three XPath queries (Name, hindex, Publications) on each document. -->
<process version="5.3.013">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
    <process expanded="true">
      <!-- Crawl step is switched off (activated="false"), so only the file reader below runs.
           NOTE(review): output_dir here is /tmp, while the reader below points at
           /home/phoenix/DataMine/SkolarCrawl; confirm the pages were copied there. -->
      <operator activated="false" class="web:crawl_web" compatibility="5.3.001" expanded="true" height="60" name="Crawl Web" width="90" x="112" y="30">
        <parameter key="url" value="http://scholar.google.gr/citations?view_op=search_authors&amp;hl=el&amp;mauthors=label:web_mining"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+user=.+"/>
          <parameter key="follow_link_with_matching_url" value=".+8J&amp;astart=.+"/>
        </list>
        <parameter key="output_dir" value="/tmp"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="5000"/>
        <parameter key="max_depth" value="1"/>
        <parameter key="max_threads" value="2"/>
        <parameter key="max_page_size" value="300"/>
        <parameter key="user_agent" value=" Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0"/>
      </operator>
      <!-- Reads every file in the configured directory as HTML (content_type="html");
           word vector creation is turned off, so the output comes from the inner process. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="112" y="165">
        <list key="text_directories">
          <parameter key="all" value="/home/phoenix/DataMine/SkolarCrawl"/>
        </list>
        <parameter key="use_file_extension_as_type" value="false"/>
        <parameter key="content_type" value="html"/>
        <parameter key="create_word_vector" value="false"/>
        <process expanded="true">
          <!-- Inner step: applies the XPath queries listed under "xpath_queries" to each document. -->
          <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information" width="90" x="45" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <!-- NOTE(review): these queries use unprefixed element names (title, tr, td, a) and
                 are reported in this thread to return empty results. The process posted in the
                 reply writes the same queries with an h: prefix (h:title, h:tr, h:td, h:a);
                 presumably the HTML is parsed as namespaced XHTML. Confirm before reuse. -->
            <list key="xpath_queries">
              <parameter key="Name" value="substring-before(//title, &quot; - Google Scholar Citations&quot;)"/>
              <parameter key="hindex" value="//*[contains(.,&quot;h-index&quot;)]/../tr[3]//td[2]"/>
              <parameter key="Publications" value="//a[contains(@href,'citation_for_view')]"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:

Answers

  • fras
    fras New Altair Community Member
    The implementation of XPath in RapidMiner works a little differently. The following process uses "Cut Document" in combination
    with "Extract Information". This approach seems to work better in your case. Note that the element names in the XPath queries below carry an "h:" prefix. Please check it, and take into account the use of nested processes.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner process (version 6.0.002) from the reply: reads HTML files, cuts each
         document into segments with XPath, then applies regular expressions to each segment. -->
    <process version="6.0.002">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <process expanded="true">
          <!-- Reads every file in the configured directory as HTML. extract_text_only is set to
               false here; the process in the question does not set this parameter. -->
          <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
            <list key="text_directories">
              <parameter key="all" value="/home/fras/Desktop/ScholarCrawl/"/>
            </list>
            <parameter key="extract_text_only" value="false"/>
            <parameter key="use_file_extension_as_type" value="false"/>
            <parameter key="content_type" value="html"/>
            <parameter key="create_word_vector" value="false"/>
            <process expanded="true">
              <!-- Outer step: segments the document using the queries under "xpath_queries"
                   (query_type is XPath). Element names carry the h: prefix. -->
              <operator activated="true" class="text:cut_document" compatibility="5.3.002" expanded="true" height="60" name="Cut Document" width="90" x="179" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <!-- NOTE(review): this entry stores an XPath expression in the regular expression
                     list while query_type is XPath; presumably a leftover that is not evaluated.
                     Confirm before removing. -->
                <list key="regular_expression_queries">
                  <parameter key="text" value="//h:a[contains(@href,'citation_for_view')]"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="Publications" value="//h:a[contains(@href,'citation_for_view')]"/>
                  <parameter key="hindex" value="//*[contains(.,&quot;h-index&quot;)]/../h:tr[3]//h:td[2]"/>
                  <parameter key="Name" value="//h:title"/>
                </list>
                <!-- NOTE(review): no namespace binding for the h: prefix is listed here;
                     presumably the operator binds it to XHTML by default. Confirm. -->
                <list key="namespaces"/>
                <list key="index_queries"/>
                <process expanded="true">
                  <!-- Inner step: runs on each segment with query_type "Regular Expression".
                       Each pattern has one capture group: digits inside the td element, and
                       the text inside the title element. -->
                  <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information (3)" width="90" x="246" y="30">
                    <parameter key="query_type" value="Regular Expression"/>
                    <list key="string_machting_queries"/>
                    <!-- The patterns match serialized XHTML (note the xmlns attribute in each tag).
                         NOTE(review): both values end with a space after the closing tag; confirm
                         that the segment text really contains that trailing space. -->
                    <list key="regular_expression_queries">
                      <parameter key="extract_index" value="&lt;td xmlns=&quot;http://www.w3.org/1999/xhtml&quot; colspan=&quot;1&quot; rowspan=&quot;1&quot; class=&quot;cit-borderleft cit-data&quot;&gt;(\d+)&lt;/td&gt; "/>
                      <parameter key="extract_title" value="&lt;title xmlns=&quot;http://www.w3.org/1999/xhtml&quot;&gt;(.+)&lt;/title&gt; "/>
                    </list>
                    <list key="regular_region_queries"/>
                    <!-- NOTE(review): query_type above is "Regular Expression", so these XPath
                         entries are presumably not evaluated by this operator. Confirm. -->
                    <list key="xpath_queries">
                      <parameter key="Name" value="//h:title"/>
                      <parameter key="hindex" value="//*[contains(.,&quot;h-index&quot;)]/../h:tr[3]//h:td[2]"/>
                      <parameter key="Publications" value="//h:a[contains(@href,'citation_for_view')]"/>
                    </list>
                    <list key="namespaces"/>
                    <list key="index_queries"/>
                  </operator>
                  <connect from_port="segment" to_op="Extract Information (3)" to_port="document"/>
                  <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_segment" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="document" to_op="Cut Document" to_port="document"/>
              <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>