I'm trying to extract some data using the XPath parsing capability of the Extract Information component. It's not going very well. I've used the Read Document component to read a text file and now I would like to parse the document. However, when I use the Extract Information component inside a Process Document component I'm getting an error: java.lang.String cannot be cast to org.jdom.Text. Changing the assume_html flags (or any other flags) doesn't seem to make any difference.
Here is my job:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.008">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
<parameter key="logverbosity" value="warning"/>
<parameter key="logfile" value="/Users/wardloving/Documents/Data Mining/log.out"/>
<parameter key="resultfile" value="/Users/wardloving/Documents/Data Mining/results.out"/>
<process expanded="true" height="184" width="807">
<operator activated="true" class="text:read_document" compatibility="5.2.004" expanded="true" height="60" name="Read Document" width="90" x="62" y="49">
<parameter key="file" value="/Users/wardloving/Documents/Projects/Churches/Episcopal Church Pages/5615.html"/>
<parameter key="extract_text_only" value="false"/>
</operator>
<operator activated="true" class="text:process_documents" compatibility="5.2.004" expanded="true" height="94" name="Process Documents" width="90" x="246" y="30">
<parameter key="create_word_vector" value="false"/>
<parameter key="keep_text" value="true"/>
<process expanded="true" height="206" width="708">
<operator activated="true" class="text:extract_information" compatibility="5.2.004" expanded="true" height="60" name="Extract Information" width="90" x="112" y="30">
<parameter key="query_type" value="XPath"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries"/>
<list key="regular_region_queries"/>
<list key="xpath_queries">
<parameter key="Name" value="substring-before(//title, ',')"/>
<parameter key="Staff" value="substring-before(//*[@class = 'field field-type-text field-field-clergy']/div/div/node()[not(self::div)],',')"/>
</list>
<list key="namespaces"/>
<parameter key="ignore_CDATA" value="false"/>
<list key="index_queries"/>
</operator>
<connect from_port="document" to_op="Extract Information" to_port="document"/>
<connect from_op="Extract Information" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_op="Read Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
<connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>