Extract Information Component Fails when using XPath
I'm trying to extract some data using the XPath parsing capability of the Extract Information component, and it's not going very well. I've used the Read Document component to read a text file, and now I would like to parse the document. However, when I use the Extract Information component inside a Process Documents component, I get the error "java.lang.String cannot be cast to org.jdom.Text". Changing the assume_html flag (or any other flag) doesn't seem to make any difference.
Here is my job:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process definition (version 5.2.008).
  Flow, as wired by the connect elements below:
    Read Document  ->  Process Documents [ Extract Information ]  ->  result 1
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>

  <!-- Root operator: holds the logging/result settings and the top-level flow. -->
  <operator activated="true"
            class="process"
            compatibility="5.2.008"
            expanded="true"
            name="Process">
    <parameter key="logverbosity" value="warning"/>
    <parameter key="logfile" value="/Users/wardloving/Documents/Data Mining/log.out"/>
    <parameter key="resultfile" value="/Users/wardloving/Documents/Data Mining/results.out"/>

    <process expanded="true" height="184" width="807">

      <!-- Step 1: load a single HTML file from disk as a document.
           extract_text_only is "false"; presumably this keeps the markup so
           that XPath can address elements. TODO confirm against the operator docs. -->
      <operator activated="true"
                class="text:read_document"
                compatibility="5.2.004"
                expanded="true"
                height="60"
                name="Read Document"
                width="90"
                x="62"
                y="49">
        <parameter key="file" value="/Users/wardloving/Documents/Projects/Churches/Episcopal Church Pages/5615.html"/>
        <parameter key="extract_text_only" value="false"/>
      </operator>

      <!-- Step 2: wrapper that passes each incoming document through the inner
           process. Word-vector creation is switched off and the text is kept. -->
      <operator activated="true"
                class="text:process_documents"
                compatibility="5.2.004"
                expanded="true"
                height="94"
                name="Process Documents"
                width="90"
                x="246"
                y="30">
        <parameter key="create_word_vector" value="false"/>
        <parameter key="keep_text" value="true"/>

        <process expanded="true" height="206" width="708">

          <!-- Inner step: XPath-based extraction. Only xpath_queries is populated;
               the other query lists and the namespaces list are empty. -->
          <operator activated="true"
                    class="text:extract_information"
                    compatibility="5.2.004"
                    expanded="true"
                    height="60"
                    name="Extract Information"
                    width="90"
                    x="112"
                    y="30">
            <parameter key="query_type" value="XPath"/>
            <!-- NOTE(review): the key below is spelled "machting" in the source. It is
                 left untouched because the tool reads it; verify before correcting. -->
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <!-- NOTE(review): both queries wrap their node selection in
                 substring-before(), which in XPath 1.0 returns a string, not a
                 node-set. The reported "java.lang.String cannot be cast to
                 org.jdom.Text" error is consistent with the operator expecting
                 text nodes as the query result. Presumably the fix is to select
                 the text node directly and cut at the comma in a later step;
                 confirm against the operator documentation. -->
            <list key="xpath_queries">
              <parameter key="Name" value="substring-before(//title, ',')"/>
              <parameter key="Staff" value="substring-before(//*[@class = 'field field-type-text field-field-clergy']/div/div/node()[not(self::div)],',')"/>
            </list>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="false"/>
            <list key="index_queries"/>
          </operator>

          <!-- Inner wiring: incoming document -> Extract Information -> "document 1" output. -->
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Outer wiring: Read Document output feeds Process Documents, whose
           example set is delivered to "result 1". -->
      <connect from_op="Read Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>