text mining filter stopwords (dictionary)

kulturevulture
New Altair Community Member
Updated by Jocelyn
Hello. I have a process that reads an Excel file, processes documents from data, and creates a correlation matrix. The process runs fine except when I add a Filter Stopwords (Dictionary) operator after Filter Stopwords (English) and before Filter Tokens (by Length). The operator seems to ignore my dictionary of stopwords: I've tried both a CSV and a TXT file, but the stopwords still show up in the output. I've set the preference rapidminer.general.encoding to UTF-8, and all the files should be in English. Any ideas?
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.003" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true" height="415" width="547">
      <operator activated="true" class="read_excel" compatibility="5.2.003" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="E:\R-2.14.0\bin\i386\tickets.xls"/>
        <parameter key="imported_cell_range" value="A1:A748"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Rationale.true.text.attribute"/>
        </list>
      </operator>
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="30">
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_absolute" value="10"/>
        <parameter key="prune_above_absolute" value="700"/>
        <list key="specify_weights"/>
        <process expanded="true" height="434" width="557">
          <operator activated="true" class="text:transform_cases" compatibility="5.2.001" expanded="true" height="60" name="Transform Cases" width="90" x="35" y="45"/>
          <operator activated="true" class="text:stem_snowball" compatibility="5.2.001" expanded="true" height="60" name="Stem (Snowball)" width="90" x="154" y="49"/>
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="40" y="144"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="45" y="255"/>
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.2.001" expanded="true" height="60" name="Filter Stopwords (Dictionary)" width="90" x="179" y="300">
            <parameter key="file" value="E:\Rapid Miner\RapidMiner5\Stop_Dictionary05082012.txt"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="270" y="225">
            <parameter key="min_chars" value="2"/>
            <parameter key="max_chars" value="9999"/>
          </operator>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.2.001" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="380" y="165">
            <parameter key="max_length" value="1"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
          <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="correlation_matrix" compatibility="5.2.003" expanded="true" height="94" name="Correlation Matrix" width="90" x="54" y="198"/>
      <connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Correlation Matrix" to_port="example set"/>
      <connect from_op="Correlation Matrix" from_port="example set" to_port="result 1"/>
      <connect from_op="Correlation Matrix" from_port="matrix" to_port="result 2"/>
      <connect from_op="Correlation Matrix" from_port="weights" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
