Sentence analysis - keeping origin of sentence?
Hello there,
After successfully breaking documents into sentences, I stumbled over a problem which didn't seem like one at first.
When extracting the sentences with the linguistic sentence tokenizer, I need to de-pivot the result so that the sentences actually end up in rows. So far so good. However, I can't keep the text itself (which would be possible thanks to the option in the "Process Documents" operator), since the De-Pivot operator then stops because of mismatching types (I think I obviously haven't completely understood that operator), nor can I keep the title of the document (which is also an attribute but gets lost during tokenizing). So is there a way of keeping either the text attribute (which contains the complete text) or (which might be even more suitable) the document title attribute, which is there when retrieving but gets lost during processing?
Maybe this is useful for understanding what I'm doing. My process looks like this:
<?xml version="1.0" encoding="UTF-8"?>
<process version="7.6.001">
  <!--
    RapidMiner process definition (version 7.6.001).
    Operator chain, as wired by the connect elements at the end of the main process:
    Retrieve (2), Generate ID, Nominal to Text, Set Role,
    Process Documents from Data (3), Remove Useless Attributes,
    Select Attributes (3), De-Pivot, result 1.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Loads the repository entry ../Data/HighQualityTestTexts. -->
      <operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve (2)" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../Data/HighQualityTestTexts"/>
      </operator>
      <!-- Generate ID runs with its default parameters (none are stored here). -->
      <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34"/>
      <!-- Only the "attribute" parameter is stored; no attribute filter type is set explicitly.
           NOTE(review): confirm that "Sentences" is really the column this conversion applies to. -->
      <operator activated="true" class="nominal_to_text" compatibility="7.6.001" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="136">
        <parameter key="attribute" value="Sentences"/>
      </operator>
      <!-- Assigns the role "id" to the attribute named "id". -->
      <operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="238">
        <parameter key="attribute_name" value="id"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Text processing step: vector creation is set to Term Frequency.
           Absolute prune bounds (2 and 9999) are stored, but no prune method is stored.
           NOTE(review): verify whether pruning is actually active with these settings. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data (3)" width="90" x="313" y="34">
        <parameter key="vector_creation" value="Term Frequency"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="9999"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!-- Deactivated (activated="false") and not referenced by any connect element below. -->
          <operator activated="false" class="text:aggregate_token_length" compatibility="7.5.000" expanded="true" height="68" name="Aggregate Token Length (2)" width="90" x="715" y="34">
            <parameter key="aggregation" value="count"/>
          </operator>
          <!-- First step of the inner chain; minimum text block length is 3. -->
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content (2)" width="90" x="45" y="34">
            <parameter key="minimum_text_block_length" value="3"/>
          </operator>
          <!-- Tokenizes in "linguistic sentences" mode with language German. -->
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize (3)" width="90" x="179" y="34">
            <parameter key="mode" value="linguistic sentences"/>
            <parameter key="language" value="German"/>
          </operator>
          <!-- Inner wiring: document input, Extract Content (2), Tokenize (3), output "document 1". -->
          <connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
          <connect from_op="Extract Content (2)" from_port="document" to_op="Tokenize (3)" to_port="document"/>
          <connect from_op="Tokenize (3)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Remove Useless Attributes runs with its default parameters. -->
      <operator activated="true" class="remove_useless_attributes" compatibility="7.6.001" expanded="true" height="82" name="Remove Useless Attributes" width="90" x="313" y="136"/>
      <!-- Inverted selection that also covers special attributes.
           Both a single attribute ("id") and an attribute list are stored, but no filter type.
           NOTE(review): confirm which of the two is actually applied; the list names
           Title and Id, which is presumably where the document title gets dropped. -->
      <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes (3)" width="90" x="447" y="34">
        <parameter key="attribute" value="id"/>
        <parameter key="attributes" value="Title|Language|Description|Keywords|Robots|Id"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- De-pivots into the series "ProzentualerAnteil" with index attribute "Sentences".
           The series regex is ".*", which matches every attribute name.
           NOTE(review): a kept text or title column would presumably be pulled into the
           same series as the numeric sentence columns; check whether this causes the
           type mismatch, and consider a narrower regex. -->
      <operator activated="true" class="de_pivot" compatibility="7.6.001" expanded="true" height="82" name="De-Pivot" width="90" x="581" y="34">
        <list key="attribute_name">
          <parameter key="ProzentualerAnteil" value=".*"/>
        </list>
        <parameter key="index_attribute" value="Sentences"/>
        <parameter key="create_nominal_index" value="true"/>
        <parameter key="keep_missings" value="true"/>
      </operator>
      <!-- Main wiring: one linear chain ending at port "result 1". -->
      <connect from_op="Retrieve (2)" from_port="output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data (3)" to_port="example set"/>
      <connect from_op="Process Documents from Data (3)" from_port="example set" to_op="Remove Useless Attributes" to_port="example set input"/>
      <connect from_op="Remove Useless Attributes" from_port="example set output" to_op="Select Attributes (3)" to_port="example set input"/>
      <connect from_op="Select Attributes (3)" from_port="example set output" to_op="De-Pivot" to_port="example set input"/>
      <connect from_op="De-Pivot" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Thanks in advance.
Oliver