XPath empty results
MrRisoni
New Altair Community Member
Hello. I'm trying to mine data from Google Scholar pages using XPath.
I'm trying to get the name, the h-index, and the first 20 publications.
I am using the following queries:
substring-before(//title, " - Google Scholar Citations")
//*[contains(.,"h-index")]/../tr[3]//td[2]
//a[contains(@href,'citation_for_view')]
All of them work in Google Docs and in Java, but none of them works in RapidMiner.
I can't figure out what's wrong...
I'm trying to get the name, the h-index, and the first 20 publications.
I am using the following queries:
substring-before(//title, " - Google Scholar Citations")
//*[contains(.,"h-index")]/../tr[3]//td[2]
//a[contains(@href,'citation_for_view')]
All of them work in Google Docs and in Java, but none of them works in RapidMiner.
I can't figure out what's wrong...
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process from the question (RapidMiner 5.3.013).
  Reads HTML files from a directory and runs three XPath queries
  (Name, hindex, Publications) on each document with Extract Information.
  Quotes and ampersands inside attribute values are written as &quot; and
  &amp; so that the document is well-formed; the decoded values are unchanged.
-->
<process version="5.3.013">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
    <process expanded="true">
      <!-- Crawl Web is switched off (activated="false"). Its output_dir is /tmp,
           which differs from the directory read by the operator below. -->
      <operator activated="false" class="web:crawl_web" compatibility="5.3.001" expanded="true" height="60" name="Crawl Web" width="90" x="112" y="30">
        <parameter key="url" value="http://scholar.google.gr/citations?view_op=search_authors&amp;hl=el&amp;mauthors=label:web_mining"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+user=.+"/>
          <parameter key="follow_link_with_matching_url" value=".+8J&amp;astart=.+"/>
        </list>
        <parameter key="output_dir" value="/tmp"/>
        <parameter key="extension" value="html"/>
        <parameter key="max_pages" value="5000"/>
        <parameter key="max_depth" value="1"/>
        <parameter key="max_threads" value="2"/>
        <parameter key="max_page_size" value="300"/>
        <parameter key="user_agent" value=" Mozilla/5.0 (X11; Linux x86_64; rv:27.0) Gecko/20100101 Firefox/27.0"/>
      </operator>
      <!-- Loads every file in the configured directory as HTML; no word vector is built. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="112" y="165">
        <list key="text_directories">
          <parameter key="all" value="/home/phoenix/DataMine/SkolarCrawl"/>
        </list>
        <parameter key="use_file_extension_as_type" value="false"/>
        <parameter key="content_type" value="html"/>
        <parameter key="create_word_vector" value="false"/>
        <process expanded="true">
          <!-- NOTE(review): the element names in these XPath queries carry no
               namespace prefix, whereas the process posted in the reply uses
               h: prefixes for the same elements. Presumably this is why the
               queries return nothing here; confirm against the RapidMiner
               Text Processing documentation. -->
          <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information" width="90" x="45" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="Name" value="substring-before(//title, &quot; - Google Scholar Citations&quot;)"/>
              <parameter key="hindex" value="//*[contains(.,&quot;h-index&quot;)]/../tr[3]//td[2]"/>
              <parameter key="Publications" value="//a[contains(@href,'citation_for_view')]"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:
0
Answers
-
The implementation of XPath in RapidMiner works a little differently. The following process uses "Cut Document" in combination with "Extract Information".
This approach seems to work better in your case. Please check it, and take into account the use of nested processes.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process from the reply (RapidMiner 6.0.002).
  Cut Document (query_type XPath) splits each HTML file into segments, and a
  nested Extract Information (query_type Regular Expression) is applied to
  each segment.
  Quotes and angle brackets inside attribute values are written as &quot;,
  &lt; and &gt; so that the document is well-formed; the decoded values,
  including the trailing space in each regular expression, are unchanged.
-->
<process version="6.0.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
    <process expanded="true">
      <!-- Loads every file in the configured directory as HTML with
           extract_text_only switched off; no word vector is built. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="all" value="/home/fras/Desktop/ScholarCrawl/"/>
        </list>
        <parameter key="extract_text_only" value="false"/>
        <parameter key="use_file_extension_as_type" value="false"/>
        <parameter key="content_type" value="html"/>
        <parameter key="create_word_vector" value="false"/>
        <process expanded="true">
          <!-- Element names in the XPath queries use the h: prefix.
               NOTE(review): query_type is XPath, so the entry under
               regular_expression_queries looks like a leftover; confirm
               that it is ignored. -->
          <operator activated="true" class="text:cut_document" compatibility="5.3.002" expanded="true" height="60" name="Cut Document" width="90" x="179" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries">
              <parameter key="text" value="//h:a[contains(@href,'citation_for_view')]"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="Publications" value="//h:a[contains(@href,'citation_for_view')]"/>
              <parameter key="hindex" value="//*[contains(.,&quot;h-index&quot;)]/../h:tr[3]//h:td[2]"/>
              <parameter key="Name" value="//h:title"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <process expanded="true">
              <!-- Each regular expression spells out an XHTML start tag with
                   its xmlns attribute and has one capturing group.
                   NOTE(review): query_type is Regular Expression, so the
                   xpath_queries list below looks like a leftover; confirm
                   that it is ignored. -->
              <operator activated="true" class="text:extract_information" compatibility="5.3.002" expanded="true" height="60" name="Extract Information (3)" width="90" x="246" y="30">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries">
                  <parameter key="extract_index" value="&lt;td xmlns=&quot;http://www.w3.org/1999/xhtml&quot; colspan=&quot;1&quot; rowspan=&quot;1&quot; class=&quot;cit-borderleft cit-data&quot;&gt;(\d+)&lt;/td&gt; "/>
                  <parameter key="extract_title" value="&lt;title xmlns=&quot;http://www.w3.org/1999/xhtml&quot;&gt;(.+)&lt;/title&gt; "/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="Name" value="//h:title"/>
                  <parameter key="hindex" value="//*[contains(.,&quot;h-index&quot;)]/../h:tr[3]//h:td[2]"/>
                  <parameter key="Publications" value="//h:a[contains(@href,'citation_for_view')]"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="segment" to_op="Extract Information (3)" to_port="document"/>
              <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="document" to_op="Cut Document" to_port="document"/>
          <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
0