A program to recognize and reward our most engaged community members
<!-- Sample posting markup: a wrapper div whose three child divs hold author, time and text, in that order. -->
<div class="posting">
  <div>Author</div>
  <div>Time</div>
  <div>Text</div>
</div>
Author: /div/div[1], Time: /div/div[2], Text: /div/div[3]
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process overview:
    1. "Get Page" fetches a single forum thread page from the configured URL.
    2. "Cut Document" splits the page into segments using the "Segmenter" XPath query.
       For each segment its inner process
         a) removes <br clear="none" /> tags ("Remove Document Parts"),
         b) duplicates the segment ("Multiply"),
         c) runs two further XPath cuts on the copies: "Zitate" (German for "quotes",
            text of italic-styled divs) and "Posting" (text of the posting cell).
    3. "Documents to Data" receives the resulting documents.

  Markup-significant characters inside attribute values (ampersand, angle brackets,
  double quotes) are written as entities so that this file is well-formed XML.
  Parameter keys are kept exactly as found, including the spelling
  "string_machting_queries".
  NOTE(review): presumably that is the operator's real key name; confirm before renaming.
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.11" expanded="true" name="Process">
    <process expanded="true" height="415" width="685">
      <!-- Step 1: download the thread page. The query-string separator must be written as &amp; -->
      <operator activated="true" class="web:get_webpage" compatibility="5.0.4" expanded="true" height="60" name="Get Page" width="90" x="45" y="255">
        <parameter key="url" value="http://forum.spiegel.de/showthread.php?t=22981&amp;page=6"/>
        <list key="query_parameters"/>
      </operator>
      <!-- Step 2: cut the page into segments; each match of the "Segmenter" XPath is one segment. -->
      <operator activated="true" class="text:cut_document" compatibility="5.0.7" expanded="true" height="60" name="Cut Document" width="90" x="313" y="120">
        <parameter key="query_type" value="XPath"/>
        <list key="string_machting_queries"/>
        <list key="regular_expression_queries"/>
        <list key="regular_region_queries"/>
        <list key="xpath_queries">
          <parameter key="Segmenter" value="/h:html/h:body/h:div[4]/h:div[1]/h:div[2]/h:div[2]/h:div[2]/h:div/h:div/h:div/h:div/h:table"/>
        </list>
        <list key="namespaces">
          <parameter key="xx" value="xml"/>
        </list>
        <parameter key="ignore_CDATA" value="false"/>
        <list key="index_queries"/>
        <process expanded="true" height="499" width="750">
          <!-- The regex is the literal tag <br clear="none" />, entity-escaped for use inside an attribute. -->
          <operator activated="true" class="text:remove_document_parts" compatibility="5.0.7" expanded="true" height="60" name="Remove Document Parts" width="90" x="112" y="75">
            <parameter key="deletion_regex" value="(&lt;br clear=&quot;none&quot; /&gt;)"/>
          </operator>
          <!-- Duplicate the cleaned segment so both inner cuts work on the same input. -->
          <operator activated="true" class="multiply" compatibility="5.0.11" expanded="true" height="94" name="Multiply" width="90" x="279" y="97"/>
          <!-- Inner cut 1: text nodes of divs styled italic, under the query name "Zitate". -->
          <operator activated="true" class="text:cut_document" compatibility="5.0.7" expanded="true" height="60" name="Cut Document (2)" width="90" x="447" y="120">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="Zitate" value="//h:div[@style='font-style:italic']/text()"/>
            </list>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="false"/>
            <list key="index_queries"/>
            <process expanded="true" height="499" width="750">
              <connect from_port="segment" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Inner cut 2: posting text; the union covers table layouts with and without a tbody element. -->
          <operator activated="true" class="text:cut_document" compatibility="5.0.7" expanded="true" height="60" name="Cut Document (3)" width="90" x="447" y="255">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="Posting" value="//h:table/h:tr[2]/h:td[2]/h:div[2]/text()[2]|/h:table/h:tbody/h:tr[2]/h:td[2]/h:div[2]/text()"/>
            </list>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="false"/>
            <list key="index_queries"/>
            <process expanded="true" height="499" width="750">
              <connect from_port="segment" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Wiring of the inner process: segment, then clean, then duplicate, then the two cuts, then both outputs. -->
          <connect from_port="segment" to_op="Remove Document Parts" to_port="document"/>
          <connect from_op="Remove Document Parts" from_port="document" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Cut Document (2)" to_port="document"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Cut Document (3)" to_port="document"/>
          <connect from_op="Cut Document (2)" from_port="documents" to_port="document 1"/>
          <connect from_op="Cut Document (3)" from_port="documents" to_port="document 2"/>
          <portSpacing port="source_segment" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
          <portSpacing port="sink_document 3" spacing="0"/>
        </process>
      </operator>
      <!-- Step 3: hand the cut documents to "Documents to Data". -->
      <operator activated="true" class="text:documents_to_data" compatibility="5.0.7" expanded="true" height="76" name="Documents to Data" width="90" x="581" y="120">
        <parameter key="text_attribute" value="Testattr"/>
        <parameter key="label_attribute" value="testattribut"/>
      </operator>
      <!-- Wiring of the top-level process. -->
      <connect from_op="Get Page" from_port="output" to_op="Cut Document" to_port="document"/>
      <connect from_op="Cut Document" from_port="documents" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Sebastian Land wrote: Hi, actually there should be no problem using multiple Cut Document operators inside each other. Memory consumption of Documents should be fairly low... Actually, we could add an option to include all matches as meta information. I will note this down for the next version.