A program to recognize and reward our most engaged community members
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process wiring: Create Document -> Process Documents -> Generate Extract -> result 1.
  Create Document supplies a small HTML snippet; Generate Extract applies the XPath
  query //h:title/text() under the key "title_html".

  The HTML snippet is stored inside the "text" attribute value, so its markup is
  written with entity references: a literal '<' is not allowed inside an XML
  attribute value.

  Parameter keys (for example "string_machting_queries") are reproduced exactly as
  found. NOTE(review): presumably the consuming tool expects these spellings;
  confirm before "correcting" them.
-->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <parameter key="parallelize_main_process" value="true"/>
    <process expanded="true" height="746" width="1091">
      <!-- Source document: inline HTML with a title, headings and a paragraph. -->
      <operator activated="true" class="text:create_document" expanded="true"
                height="60" name="Create Document" width="90" x="313" y="165">
        <parameter key="text" value="&lt;html&gt; &lt;title&gt;Hallo Titel&lt;/title&gt; &lt;h4&gt;Hallo Überschrift 3&lt;/h4&gt; &lt;h3&gt;Hallo Überschrift 3&lt;/h3&gt; &lt;p&gt;&lt;h4&gt;Ein H4&lt;/h4&gt; &lt;span&gt;in einem Paragraph&lt;/span&gt;&lt;/p&gt; &lt;/html&gt;"/>
      </operator>
      <!-- Pass-through: the inner process only forwards the document unchanged. -->
      <operator activated="true" class="text:process_documents" expanded="true"
                height="94" name="Process Documents" width="90" x="581" y="75">
        <process expanded="true" height="724" width="770">
          <connect from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:generate_extract" expanded="true"
                height="60" name="Generate Extract" width="90" x="782" y="75">
        <!-- NOTE(review): nothing in this process visibly creates an attribute named
             "source_ATTR", and Process Documents sets no keep_text parameter here.
             Confirm that this attribute exists in the incoming example set. -->
        <parameter key="source_attribute" value="source_ATTR"/>
        <parameter key="query_type" value="XPath"/>
        <list key="string_machting_queries"/>
        <list key="regular_expression_queries"/>
        <list key="regular_region_queries"/>
        <list key="xpath_queries">
          <parameter key="title_html" value="//h:title/text()"/>
        </list>
        <list key="namespaces"/>
      </operator>
      <connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Generate Extract" to_port="Example Set"/>
      <connect from_op="Generate Extract" from_port="Example Set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process wiring: Create Document -> Process Documents -> Generate Extract -> result 1.
  Process Documents is configured with create_word_vector=false and keep_text=true;
  Generate Extract reads the attribute named "text" and applies the XPath query
  //h:title/text() under the key "title_html".

  The HTML snippet is stored inside the "text" attribute value of Create Document,
  so its markup is written with entity references: a literal '<' is not allowed
  inside an XML attribute value.

  Parameter keys (for example "string_machting_queries") are reproduced exactly as
  found. NOTE(review): presumably the consuming tool expects these spellings;
  confirm before "correcting" them.
-->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <parameter key="parallelize_main_process" value="true"/>
    <process expanded="true" height="746" width="1091">
      <!-- Source document: inline HTML with a title, headings and a paragraph. -->
      <operator activated="true" class="text:create_document" expanded="true"
                height="60" name="Create Document" width="90" x="112" y="75">
        <parameter key="text" value="&lt;html&gt; &lt;title&gt;Hallo Titel&lt;/title&gt; &lt;h4&gt;Hallo Überschrift 3&lt;/h4&gt; &lt;h3&gt;Hallo Überschrift 3&lt;/h3&gt; &lt;p&gt;&lt;h4&gt;Ein H4&lt;/h4&gt; &lt;span&gt;in einem Paragraph&lt;/span&gt;&lt;/p&gt; &lt;/html&gt;"/>
      </operator>
      <!-- The inner process only forwards the document unchanged. -->
      <operator activated="true" class="text:process_documents" expanded="true"
                height="94" name="Process Documents" width="90" x="246" y="75">
        <parameter key="create_word_vector" value="false"/>
        <parameter key="keep_text" value="true"/>
        <process expanded="true" height="724" width="770">
          <connect from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:generate_extract" expanded="true"
                height="60" name="Generate Extract" width="90" x="380" y="75">
        <parameter key="source_attribute" value="text"/>
        <parameter key="query_type" value="XPath"/>
        <list key="string_machting_queries"/>
        <list key="regular_expression_queries"/>
        <list key="regular_region_queries"/>
        <list key="xpath_queries">
          <parameter key="title_html" value="//h:title/text()"/>
        </list>
        <list key="namespaces"/>
      </operator>
      <connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Generate Extract" to_port="Example Set"/>
      <connect from_op="Generate Extract" from_port="Example Set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process wiring: Create Document -> Process Documents -> Generate Extract -> result 1.
  Generate Extract reads the attribute named "text" with assume_html=true and applies
  the XPath query //h:a[text()='Details']/@href under the key "DetailsPage".

  The HTML snippet is stored inside the "text" attribute value of Create Document.
  Its markup is written with entity references because a literal '<' and an
  unescaped double quote (as in href="1") are not allowed inside a double-quoted
  XML attribute value. Tab characters are written as &#9; so that attribute-value
  normalization does not turn them into spaces.

  Parameter keys (for example "string_machting_queries", "prunde_below_percent",
  "value_seperator", "add label") are reproduced exactly as found.
  NOTE(review): presumably the consuming tool expects these spellings; confirm
  before "correcting" them.
-->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <parameter key="logverbosity" value="3"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="1"/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="parallelize_main_process" value="false"/>
    <process expanded="true" height="629" width="950">
      <!-- Source document: ten anchors with link text "Details" and href 1..9, 0. -->
      <operator activated="true" class="text:create_document" expanded="true"
                height="60" name="Create Document" width="90" x="112" y="255">
        <parameter key="text" value="&lt;html&gt; &#9;&lt;a href=&quot;1&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;2&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;3&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;4&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;5&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;6&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;7&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;8&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;9&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;0&quot;&gt;Details&lt;/a&gt; &lt;/html&gt; "/>
        <parameter key="add label" value="false"/>
        <parameter key="label_type" value="0"/>
      </operator>
      <!-- The inner process only forwards the document unchanged. -->
      <operator activated="true" class="text:process_documents" expanded="true"
                height="94" name="Process Documents" width="90" x="447" y="255">
        <parameter key="create_word_vector" value="false"/>
        <parameter key="vector_creation" value="0"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="0"/>
        <parameter key="prunde_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="5.0"/>
        <parameter key="prune_above_rank" value="5.0"/>
        <parameter key="datamanagement" value="7"/>
        <parameter key="parallelize_vector_creation" value="false"/>
        <process expanded="true" height="629" width="950">
          <connect from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:generate_extract" expanded="true"
                height="60" name="Generate Extract" width="90" x="648" y="255">
        <parameter key="source_attribute" value="text"/>
        <parameter key="query_type" value="XPath"/>
        <list key="string_machting_queries"/>
        <parameter key="attribute_type" value="Nominal"/>
        <list key="regular_expression_queries"/>
        <list key="regular_region_queries"/>
        <list key="xpath_queries">
          <parameter key="DetailsPage" value="//h:a[text()='Details']/@href"/>
        </list>
        <list key="namespaces"/>
        <parameter key="ignore_CDATA" value="true"/>
        <parameter key="assume_html" value="true"/>
        <parameter key="value_seperator" value=";"/>
      </operator>
      <connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Generate Extract" to_port="Example Set"/>
      <connect from_op="Generate Extract" from_port="Example Set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
<!-- Sample HTML input: ten <a> elements, all with the link text "Details", whose href values are 1 through 9 followed by 0. -->
<html> <a href="1">Details</a> <a href="2">Details</a> <a href="3">Details</a> <a href="4">Details</a> <a href="5">Details</a> <a href="6">Details</a> <a href="7">Details</a> <a href="8">Details</a> <a href="9">Details</a> <a href="0">Details</a></html>
//a/@href
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process wiring:
    Create Document -> Documents to Data -> Multiply
    Multiply output 1 -> Generate Extract            -> result 1
    Multiply output 2 -> Process Documents from Data -> result 2

  Generate Extract applies the XPath queries //a[1] and //a[2] to the attribute
  named "text". Process Documents from Data runs Cut Document with the XPath
  query //a/@href and passes each segment to Extract Information, which applies
  the regular expression (.*) under the key "hrefNumber". Both XPath operators
  set assume_html=false.

  The HTML snippet is stored inside the "text" attribute value of Create Document.
  Its markup is written with entity references because a literal '<' and an
  unescaped double quote (as in href="1") are not allowed inside a double-quoted
  XML attribute value. Tab characters are written as &#9; so that attribute-value
  normalization does not turn them into spaces.

  Parameter keys (for example "string_machting_queries") are reproduced exactly as
  found. NOTE(review): presumably the consuming tool expects these spellings;
  confirm before "correcting" them.
-->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <process expanded="true" height="296" width="480">
      <!-- Source document: ten anchors with link text "Details" and href 1..9, 0. -->
      <operator activated="true" class="text:create_document" expanded="true"
                height="60" name="Create Document" width="90" x="3" y="45">
        <parameter key="text" value="&lt;html&gt; &#9;&lt;a href=&quot;1&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;2&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;3&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;4&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;5&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;6&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;7&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;8&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;9&quot;&gt;Details&lt;/a&gt; &#9;&lt;a href=&quot;0&quot;&gt;Details&lt;/a&gt; &lt;/html&gt;"/>
      </operator>
      <operator activated="true" class="text:documents_to_data" expanded="true"
                height="76" name="Documents to Data" width="90" x="112" y="120">
        <parameter key="text_attribute" value="text"/>
      </operator>
      <!-- Fans the example set out to the two extraction branches below. -->
      <operator activated="true" class="multiply" expanded="true"
                height="94" name="Multiply" width="90" x="246" y="120"/>
      <!-- Branch for result 2: cut the document by XPath, then extract per segment. -->
      <operator activated="true" class="text:process_document_from_data" expanded="true"
                height="76" name="Process Documents from Data" width="90" x="380" y="210">
        <parameter key="create_word_vector" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true" height="585" width="904">
          <operator activated="true" class="text:cut_document" expanded="true"
                    height="60" name="Cut Document" width="90" x="112" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="unimportant" value="//a/@href"/>
            </list>
            <list key="namespaces"/>
            <parameter key="assume_html" value="false"/>
            <process expanded="true" height="585" width="904">
              <operator activated="true" class="text:extract_information" expanded="true"
                        height="60" name="Extract Information" width="90" x="45" y="30">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries">
                  <parameter key="hrefNumber" value="(.*)"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
              </operator>
              <connect from_port="segment" to_op="Extract Information" to_port="document"/>
              <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="document" to_op="Cut Document" to_port="document"/>
          <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Branch for result 1: one named XPath query per output attribute. -->
      <operator activated="true" class="text:generate_extract" expanded="true"
                height="60" name="Generate Extract" width="90" x="380" y="75">
        <parameter key="source_attribute" value="text"/>
        <parameter key="query_type" value="XPath"/>
        <list key="string_machting_queries"/>
        <list key="regular_expression_queries"/>
        <list key="regular_region_queries"/>
        <list key="xpath_queries">
          <parameter key="AttributeName1" value="//a[1]"/>
          <parameter key="AttributeName2" value="//a[2]"/>
        </list>
        <list key="namespaces"/>
        <parameter key="assume_html" value="false"/>
      </operator>
      <connect from_op="Create Document" from_port="output" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Generate Extract" to_port="Example Set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 2"/>
      <connect from_op="Generate Extract" from_port="Example Set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process wiring:
    Create Document -> Multiply
    Multiply output 1 -> Process Documents     -> Generate ID     -> Union (example set 1)
    Multiply output 2 -> Process Documents (2) -> Generate ID (2) -> Union (example set 2)
    Union -> result 1

  "Process Documents" cuts the document with the XPath query //h:title/text()
  (key "html_title"); "Process Documents (2)" cuts it with //h:a/text()
  (key "html_linktext"). In both branches each segment goes through
  Extract Information with the regular expression (.*) under the key "use_it".

  The HTML snippet is stored inside the "text" attribute value of Create Document.
  Its markup is written with entity references because a literal '<' and an
  unescaped double quote (as in the href values) are not allowed inside a
  double-quoted XML attribute value. Tab characters are written as &#9; so that
  attribute-value normalization does not turn them into spaces.

  Parameter keys (for example "string_machting_queries") are reproduced exactly as
  found. NOTE(review): presumably the consuming tool expects these spellings;
  confirm before "correcting" them.
-->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <process expanded="true" height="546" width="1016">
      <!-- Source document: HTML with two title elements and three links. -->
      <operator activated="true" class="text:create_document" expanded="true"
                height="60" name="Create Document" width="90" x="45" y="165">
        <parameter key="text" value="&lt;html&gt; &#9;&lt;head&gt;&lt;title&gt;Der Titel ist sehr toll&lt;/title&gt;&lt;/head&gt; &#9;&lt;a href=&quot;http://f12010.info&quot;&gt;formel1&lt;/a&gt; &#9; &lt;a href=&quot;http://dsds-2009.info&quot;&gt;und einen dritten link&lt;/a&gt; &#9;&lt;a href=&quot;http://simonknoll.com&quot;&gt;semmel&lt;/a&gt; &#9;&lt;title&gt;Wir Haben auch einen zweitet Titel&lt;/title&gt; &lt;/html&gt;"/>
        <parameter key="label_type" value="numeric"/>
      </operator>
      <!-- Fans the document out to the two extraction branches below. -->
      <operator activated="true" class="multiply" expanded="true"
                height="94" name="Multiply" width="90" x="179" y="165"/>
      <!-- Link-text branch. -->
      <operator activated="true" class="text:process_documents" expanded="true"
                height="94" name="Process Documents (2)" width="90" x="313" y="255">
        <parameter key="create_word_vector" value="false"/>
        <process expanded="true">
          <operator activated="true" class="text:cut_document" expanded="true"
                    height="60" name="Cut Document (2)" width="90" x="394" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="html_linktext" value="//h:a/text()"/>
            </list>
            <list key="namespaces"/>
            <process expanded="true">
              <operator activated="true" class="text:extract_information" expanded="true"
                        height="60" name="Extract Information (2)" width="90" x="394" y="30">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries">
                  <parameter key="use_it" value="(.*)"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
              </operator>
              <connect from_port="segment" to_op="Extract Information (2)" to_port="document"/>
              <connect from_op="Extract Information (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="document" to_op="Cut Document (2)" to_port="document"/>
          <connect from_op="Cut Document (2)" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Title branch. -->
      <operator activated="true" class="text:process_documents" expanded="true"
                height="94" name="Process Documents" width="90" x="313" y="30">
        <parameter key="create_word_vector" value="false"/>
        <process expanded="true">
          <operator activated="true" class="text:cut_document" expanded="true"
                    height="60" name="Cut Document" width="90" x="246" y="165">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="html_title" value="//h:title/text()"/>
            </list>
            <list key="namespaces"/>
            <process expanded="true">
              <operator activated="true" class="text:extract_information" expanded="true"
                        height="60" name="Extract Information" width="90" x="246" y="30">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries">
                  <parameter key="use_it" value="(.*)"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
              </operator>
              <connect from_port="segment" to_op="Extract Information" to_port="document"/>
              <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="document" to_op="Cut Document" to_port="document"/>
          <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="generate_id" expanded="true"
                height="76" name="Generate ID" width="90" x="447" y="30"/>
      <operator activated="true" class="generate_id" expanded="true"
                height="76" name="Generate ID (2)" width="90" x="447" y="255"/>
      <!-- Joins the title rows and the link-text rows into one example set. -->
      <operator activated="true" class="union" expanded="true"
                height="76" name="Union" width="90" x="581" y="120"/>
      <connect from_op="Create Document" from_port="output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Process Documents (2)" to_port="documents 1"/>
      <connect from_op="Process Documents (2)" from_port="example set" to_op="Generate ID (2)" to_port="example set input"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Union" to_port="example set 1"/>
      <connect from_op="Generate ID (2)" from_port="example set output" to_op="Union" to_port="example set 2"/>
      <connect from_op="Union" from_port="union" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Row No.  id   query_key      use_it
-----------------------------------------------------------------
1        1.0  html_title     Der Titel ist sehr toll
2        2.0  html_title     Wir Haben auch einen zweitet Titel
3        1.0  html_linktext  formel1
4        2.0  html_linktext  und einen dritten link
5        3.0  html_linktext  semmel
Row No.  id   query_key      use_it                              weight
---------------------------------------------------------------------------------
1        1.0  html_title     Der Titel ist sehr toll             2
2        2.0  html_title     Wir Haben auch einen zweitet Titel  2
3        1.0  html_linktext  formel1                             1
4        2.0  html_linktext  und einen dritten link              1
5        3.0  html_linktext  semmel                              1