Hi,
I am trying to grab the text of abstracts from a journal using XPath queries in the Cut Document operator. I downloaded a test set, saved the pages as HTML, and am using Process Documents from Files with a Cut Document operator nested inside. The site I am testing is here:
http://onlinelibrary.wiley.com/doi/10.1111/j.1467-9221.2010.00797.x/abstract
Using Firebug in Firefox, I inspected the element and determined that the XPath is both:
/html/body/div[3]/div/div[5]/div[4]/div[3]/div/div[2]/p
and,
//div[@class='para']
I simplified the first one to: /div/div/div/div/div/div/div[2]/p. I tested both XPath queries online using Google Docs and the extraction worked fine. However, I have not been able to successfully replicate the result in RapidMiner. Am I missing something in the namespace? I have tried various versions of the XPath syntax and the namespace settings. Note that I have run an Extract Content sequence etc. in parallel with a port multiplier and have not had problems getting the text tokenized, turned into word vectors etc. Here is the XML for just a simple Cut Document inside the Process Documents from Files chain.
My XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: reads the HTML files of one directory with
  "Process Documents from Files" and, for each file, runs a nested
  "Cut Document" operator that extracts segments with an XPath query.
-->
<process version="5.1.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
    <process expanded="true" height="655" width="918">
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="149" y="116">
        <!-- Class label "all-pp" mapped to the directory holding the saved HTML pages. -->
        <list key="text_directories">
          <parameter key="all-pp" value="/Users/williamfchiu/Desktop/politicalpsych_test"/>
        </list>
        <!-- Keep the raw markup so the nested XPath query has elements to match. -->
        <parameter key="extract_text_only" value="false"/>
        <parameter key="create_word_vector" value="false"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="999"/>
        <process expanded="true" height="637" width="867">
          <operator activated="true" class="text:cut_document" compatibility="5.1.001" expanded="true" height="60" name="Cut Document" width="90" x="112" y="210">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <!--
              Element names carry the "h:" prefix: per the Text Processing
              extension documentation, HTML input is handled as XHTML and the
              XHTML namespace is bound to the prefix "h", so unprefixed names
              such as //div select nothing. NOTE(review): confirm the automatic
              "h" binding for the installed extension version; otherwise add a
              matching entry to the "namespaces" list below.
              The query anchors on the class attribute rather than a positional
              div chain, which breaks whenever the page layout shifts.
            -->
            <list key="xpath_queries">
              <parameter key="fulltext" value="//h:div[@class='para']"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <process expanded="true" height="655" width="919">
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
            </process>
          </operator>
          <!-- Each input document goes through Cut Document; its segments are returned. -->
          <connect from_port="document" to_op="Cut Document" to_port="document"/>
          <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level wiring: example set and word list are delivered as process results. -->
      <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
      <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Files" from_port="word list" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
William