"Creating a comparing white list of words to a wordlist from a data mined webpage"

User: "idunn"
New Altair Community Member
Updated by Jocelyn

Currently, I need to compare a word list that I have created (the 'White List') to a word list that RapidMiner has created from a webpage. I have the word list from the webpage tokenized and filtered. What I want to do is import the word list I have created into the process, so that I can compare it to the output of the process that works so far and create a matching scheme. E.g., if the 'White List' contains "imaging" and the word list from the output also contains "imaging", this creates a match, and the matching word is moved into a new output file.

 

If you need more information, let me know.

 

Ian

Find more posts tagged with

Sort by:
1 - 1 of 11
    User: "MartinLiebig"
    Altair Employee
    Accepted Answer

    Hi Idunn,

     

    Have a look at the attached process; it is working well for me.

     

    ~Martin

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      RapidMiner process: builds a word list from each of two sample texts,
      converts both word lists to example sets, and joins them on the "word"
      attribute so the two vocabularies can be compared side by side.
    -->
    <process version="7.3.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.3.001" expanded="true" name="Process">
        <process expanded="true">

          <!-- First branch: sample input with a single "text" attribute. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="7.3.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="85">
            <list key="attribute_values">
              <parameter key="text" value="&quot;this is a text&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="nominal_to_text" compatibility="7.3.001" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="85"/>
          <!-- Tokenizes each document; the "word list" output port is used below. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="7.2.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="380" y="85">
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="7.2.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="85"/>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:wordlist_to_data" compatibility="7.2.000" expanded="true" height="82" name="WordList to Data" width="90" x="514" y="85"/>

          <!-- Second branch: same operator chain as the first, applied to a second sample text. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="7.3.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="238">
            <list key="attribute_values">
              <parameter key="text" value="&quot;this is another text&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="nominal_to_text" compatibility="7.3.001" expanded="true" height="82" name="Nominal to Text (2)" width="90" x="179" y="238"/>
          <operator activated="true" class="text:process_document_from_data" compatibility="7.2.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="380" y="238">
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="7.2.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="179" y="85"/>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:wordlist_to_data" compatibility="7.2.000" expanded="true" height="82" name="WordList to Data (2)" width="90" x="514" y="238"/>

          <!--
            Joins the two word-list example sets using "word" as the key on both sides.
            NOTE(review): join_type is "outer", which presumably also keeps words that
            occur in only one of the lists. If only matching words are wanted, "inner"
            may be the better setting; confirm against the Join operator documentation.
          -->
          <operator activated="true" class="join" compatibility="7.3.001" expanded="true" height="82" name="Join" width="90" x="648" y="136">
            <parameter key="remove_double_attributes" value="false"/>
            <parameter key="join_type" value="outer"/>
            <parameter key="use_id_attribute_as_key" value="false"/>
            <list key="key_attributes">
              <parameter key="word" value="word"/>
            </list>
          </operator>

          <!-- Wiring: the first branch feeds the left port of Join. -->
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
          <connect from_op="WordList to Data" from_port="example set" to_op="Join" to_port="left"/>
          <!-- Wiring: the second branch feeds the right port of Join. -->
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Nominal to Text (2)" to_port="example set input"/>
          <connect from_op="Nominal to Text (2)" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
          <connect from_op="Process Documents from Data (2)" from_port="word list" to_op="WordList to Data (2)" to_port="word list"/>
          <connect from_op="WordList to Data (2)" from_port="example set" to_op="Join" to_port="right"/>
          <!-- The joined example set is the only process result. -->
          <connect from_op="Join" from_port="join" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>