"Webmining: keep only documents that contain certain keywords"

User: "mrpropper"
New Altair Community Member
Updated by Jocelyn
Hello,

How can I filter documents from an input stream so that only those matching one or more keywords from a collection of keywords stored in a word list (or similar) are kept? Filter Documents (by Content) is not the right solution, as the filter keywords have to be hardcoded into the operator (see example below). I would rather have the filter use a second input stream that can be easily manipulated.

Regards
mrpopper

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Example process illustrating the question: a three-operator chain
  (web crawl, then data-to-documents conversion, then content filter)
  in which the filter keyword is hardcoded as an operator parameter.
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.0" expanded="true" name="Process">
    <parameter key="parallelize_main_process" value="true"/>
    <process expanded="true" height="529" width="705">
      <!-- Step 1: crawl starting at the configured URL, limited to 60 pages.
           Pages are not written to files but added as an attribute, so
           output_dir is presumably unused here (confirm against the operator docs). -->
      <operator activated="true" class="web:crawl_web" compatibility="5.0.3" expanded="true" height="60" name="Crawl Sueddeutsche" width="90" x="45" y="30">
        <parameter key="url" value="http://www.sueddeutsche.de/"/>
        <list key="crawling_rules"/>
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="output_dir" value="C:\Users\abc\Documents\Rapid_Miner\Tester_Webmining\Sueddeutsche"/>
        <parameter key="max_pages" value="60"/>
        <parameter key="domain" value="server"/>
        <parameter key="max_threads" value="75"/>
        <parameter key="max_page_size" value="250"/>
        <parameter key="user_agent" value="Tester"/>
      </operator>
      <!-- Step 2: convert the crawler's example set into documents; no weights are specified. -->
      <operator activated="true" class="text:data_to_documents" compatibility="5.0.6" expanded="true" height="60" name="Data to Documents" width="90" x="246" y="30">
        <list key="specify_weights"/>
      </operator>
      <!-- Step 3: filter the documents by content. The keyword "Obama" is hardcoded
           in both the string and regular_expression parameters; this is the
           limitation the post asks about. The operator name is German for
           "Filter by keyword(s)". -->
      <operator activated="true" class="text:filter_documents_by_content" compatibility="5.0.6" expanded="true" height="76" name="Filter nach Keyword(s)" width="90" x="447" y="30">
        <parameter key="string" value="Obama"/>
        <parameter key="regular_expression" value="(Obama)"/>
      </operator>
      <!-- Wiring: crawler example set to converter, converter documents to filter,
           filter documents to the process result port. -->
      <connect from_op="Crawl Sueddeutsche" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Filter nach Keyword(s)" to_port="documents 1"/>
      <connect from_op="Filter nach Keyword(s)" from_port="documents" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
[glow=red,2,300][/glow]

Find more posts tagged with