Hello,
How can I filter documents from an input stream so that only those matching one or more keywords are kept, where the keywords come from a collection stored in a word list or similar? Filter Documents (by Content) is not the right solution, as the filter keywords have to be hard-coded into the operator (see example below). I would rather have the filter use a second input stream that can be easily manipulated.
Regards
mrpopper
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process definition with three chained operators:
  a web crawl, a data-to-documents conversion, and a content-based document filter.
  The filter output is wired to the first result port of the process.
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.0" expanded="true" name="Process">
    <parameter key="parallelize_main_process" value="true"/>
    <process expanded="true" height="529" width="705">

      <!--
        Step 1: crawl starting from the configured URL.
        Pages are attached as an attribute (add_pages_as_attribute=true) and not
        written to files (write_pages_into_files=false); output_dir is set anyway.
        NOTE(review): presumably output_dir is ignored while file writing is off; confirm.
      -->
      <operator activated="true"
                class="web:crawl_web"
                compatibility="5.0.3"
                expanded="true"
                height="60"
                name="Crawl Sueddeutsche"
                width="90"
                x="45"
                y="30">
        <parameter key="url" value="http://www.sueddeutsche.de/"/>
        <list key="crawling_rules"/>
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Users\abc\Documents\Rapid_Miner\Tester_Webmining\Sueddeutsche"/>
        <parameter key="max_pages" value="60"/>
        <parameter key="domain" value="server"/>
        <parameter key="max_threads" value="75"/>
        <parameter key="max_page_size" value="250"/>
        <parameter key="user_agent" value="Tester"/>
      </operator>

      <!-- Step 2: turn the crawler's example set into documents; no weights are specified. -->
      <operator activated="true"
                class="text:data_to_documents"
                compatibility="5.0.6"
                expanded="true"
                height="60"
                name="Data to Documents"
                width="90"
                x="246"
                y="30">
        <list key="specify_weights"/>
      </operator>

      <!--
        Step 3: filter the documents by content.
        The keyword is hard-coded in this operator: "string" holds Obama and
        "regular_expression" holds (Obama).
        NOTE(review): which of the two parameters is evaluated presumably depends on a
        condition setting that is not present here; confirm against the operator docs.
      -->
      <operator activated="true"
                class="text:filter_documents_by_content"
                compatibility="5.0.6"
                expanded="true"
                height="76"
                name="Filter nach Keyword(s)"
                width="90"
                x="447"
                y="30">
        <parameter key="string" value="Obama"/>
        <parameter key="regular_expression" value="(Obama)"/>
      </operator>

      <!-- Wiring: crawler, then conversion, then filter, then the first result port. -->
      <connect from_op="Crawl Sueddeutsche"
               from_port="Example Set"
               to_op="Data to Documents"
               to_port="example set"/>
      <connect from_op="Data to Documents"
               from_port="documents"
               to_op="Filter nach Keyword(s)"
               to_port="documents 1"/>
      <connect from_op="Filter nach Keyword(s)" from_port="documents" to_port="result 1"/>

      <!-- Port spacing entries for the process input and result ports. -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
[glow=red,2,300][/glow]