naive bayes classification - confidences are binary

I'm using Naive Bayes to develop a model and then applying that model to classify a set of new documents as relevant or not relevant. When I do this, all of the classification confidences are binary (0 or 1), matching the predicted group. If I switch out Naive Bayes for k-NN, I do get non-binary confidences. Are these binary confidences correct (which seems unlikely), or is something going wrong?

Thanks in advance.

Find more posts tagged with

AI Studio

Accepted answers

All comments

MariusHelf

Hey, it's unlikely that you get binary confidences. Please post your process setup and, if possible, some data, so that we can reproduce the problem.

Best,
Marius

Aviva

Thanks.

Here is the process for generating the model:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
    <process expanded="true" height="971" width="815">
      <operator activated="false" class="read_excel" compatibility="5.2.006" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\pubs_500energy.xlsx"/>
        <parameter key="imported_cell_range" value="A1:F401"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="pub_id.true.integer.id"/>
          <parameter key="1" value="title.true.text.attribute"/>
          <parameter key="2" value="journal.true.text.attribute"/>
          <parameter key="3" value="keyword_plus.true.text.attribute"/>
          <parameter key="4" value="abstract.true.text.attribute"/>
          <parameter key="5" value="energy.true.binominal.label"/>
        </list>
      </operator>
      <operator activated="false" class="text:data_to_documents" compatibility="5.2.003" expanded="true" height="60" name="Data to Documents" width="90" x="45" y="120">
        <list key="specify_weights"/>
      </operator>
      <operator activated="false" class="read_excel" compatibility="5.2.006" expanded="true" height="60" name="Read Excel (2)" width="90" x="45" y="255">
        <parameter key="excel_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\pubs_500energy_withkeywords_2012-07-10.xlsx"/>
        <parameter key="imported_cell_range" value="A1:AB501"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="pub_id.true.integer.label"/>
          <parameter key="1" value="photovolt.true.binominal.attribute"/>
          <parameter key="2" value="solar energy.true.binominal.attribute"/>
          <parameter key="3" value="energy production.true.binominal.attribute"/>
          <parameter key="4" value="renewable energy.true.binominal.attribute"/>
          <parameter key="5" value="energy storage.true.binominal.attribute"/>
          <parameter key="6" value="fuel cell.true.binominal.attribute"/>
          <parameter key="7" value="batter.true.binominal.attribute"/>
          <parameter key="8" value="energy conversion technolog.true.binominal.attribute"/>
          <parameter key="9" value="solar cell.true.binominal.attribute"/>
          <parameter key="10" value="solar fuel.true.binominal.attribute"/>
          <parameter key="11" value="biofuel.true.binominal.attribute"/>
          <parameter key="12" value="alternative energy.true.binominal.attribute"/>
          <parameter key="13" value="sustainable energy.true.binominal.attribute"/>
          <parameter key="14" value="solar-energy.true.binominal.attribute"/>
          <parameter key="15" value="fuel-cell.true.binominal.attribute"/>
          <parameter key="16" value="hydrogen storage.true.binominal.attribute"/>
          <parameter key="17" value="artificial photosynthesis.true.binominal.attribute"/>
          <parameter key="18" value="h2 storage.true.binominal.attribute"/>
          <parameter key="19" value="energy application.true.binominal.attribute"/>
          <parameter key="20" value="power generation.true.binominal.attribute"/>
          <parameter key="21" value="solar hydrogen.true.binominal.attribute"/>
          <parameter key="22" value="biodiesel.true.binominal.attribute"/>
          <parameter key="23" value="energy generation.true.binominal.attribute"/>
          <parameter key="24" value="fuel.true.binominal.attribute"/>
          <parameter key="25" value="light harvesting.true.binominal.attribute"/>
          <parameter key="26" value="light-harvesting.true.binominal.attribute"/>
          <parameter key="27" value="keyword energy.true.binominal.attribute"/>
        </list>
      </operator>
      <operator activated="false" class="set_role" compatibility="5.2.006" expanded="true" height="76" name="Set Role (3)" width="90" x="121" y="322">
        <parameter key="name" value="pub_id"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Process Documents: takes the documents from "Data to Documents" and produces an
           example set plus a word list (both outputs are wired in the connect section of
           this process; the word list goes to the "Write" operator).
           Pruning is percentual with thresholds 1.0 (below) and 70.0 (above); absolute
           thresholds (2 and 888) are stored as well.
           NOTE(review): presumably the absolute thresholds are unused while prune_method
           is "percentual"; confirm against the Text Processing extension documentation.
           NOTE(review): the key "prunde_below_percent" is spelled the same way in every
           process in this post; presumably that is the extension's real key name, so do
           not "correct" it without checking.
           Inner pipeline, in connection order: Tokenize, Transform Cases,
           Filter Stopwords (English), Filter Tokens (by Length, 2 to 888 chars),
           Stem (Snowball), Generate n-Grams (Terms, max length 3).
           All operators here are deactivated (activated="false") in this posted version. -->
      <operator activated="false" class="text:process_documents" compatibility="5.2.003" expanded="true" height="94" name="Process Documents" width="90" x="179" y="30">
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prunde_below_percent" value="1.0"/>
        <parameter key="prune_above_percent" value="70.0"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="888"/>
        <process expanded="true" height="422" width="634">
          <operator activated="false" class="text:tokenize" compatibility="5.2.003" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
          <operator activated="false" class="text:transform_cases" compatibility="5.2.003" expanded="true" height="60" name="Transform Cases" width="90" x="70" y="150"/>
          <operator activated="false" class="text:filter_stopwords_english" compatibility="5.2.003" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="102" y="226"/>
          <operator activated="false" class="text:filter_by_length" compatibility="5.2.003" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="232" y="224">
            <parameter key="min_chars" value="2"/>
            <parameter key="max_chars" value="888"/>
          </operator>
          <operator activated="false" class="text:stem_snowball" compatibility="5.2.003" expanded="true" height="60" name="Stem (Snowball)" width="90" x="372" y="189"/>
          <operator activated="false" class="text:generate_n_grams_terms" compatibility="5.2.003" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="501" y="158">
            <parameter key="max_length" value="3"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="false" class="write" compatibility="5.2.006" expanded="true" height="60" name="Write" width="90" x="179" y="165">
        <parameter key="object_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\wordlist_500Energy"/>
      </operator>
      <operator activated="false" class="set_role" compatibility="5.2.006" expanded="true" height="76" name="Set Role (2)" width="90" x="313" y="30">
        <parameter key="name" value="pub_id"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="false" class="join" compatibility="5.2.006" expanded="true" height="76" name="Join" width="90" x="246" y="300">
        <list key="key_attributes"/>
      </operator>
      <operator activated="false" class="set_role" compatibility="5.2.006" expanded="true" height="76" name="Set Role" width="90" x="380" y="300">
        <parameter key="name" value="energy"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="false" class="generate_weight_stratification" compatibility="5.2.006" expanded="true" height="76" name="Generate Weight (Stratification)" width="90" x="447" y="30"/>
      <!-- Validation (x_validation) with two inner processes:
           1. training side: Naive Bayes is fed from the "training" port and its model is
              passed to the "model" sink;
           2. testing side: Apply Model scores the "test set" with that model and the
              labelled data goes into a binominal classification Performance operator,
              whose result leaves through "averagable 1".
           Neither Naive Bayes nor Validation has explicit parameters here, so whatever
           defaults RapidMiner 5.2 uses apply.
           Deactivated (activated="false") in this posted version. -->
      <operator activated="false" class="x_validation" compatibility="5.2.006" expanded="true" height="112" name="Validation" width="90" x="514" y="165">
        <process expanded="true" height="422" width="292">
          <operator activated="false" class="naive_bayes" compatibility="5.2.006" expanded="true" height="76" name="Naive Bayes" width="90" x="84" y="26"/>
          <connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
          <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true" height="422" width="292">
          <operator activated="false" class="apply_model" compatibility="5.2.006" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="false" class="performance_binominal_classification" compatibility="5.2.006" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="false" class="write_model" compatibility="5.2.006" expanded="true" height="60" name="Write Model" width="90" x="581" y="30">
        <parameter key="model_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\model_500Energy_NaiveBayes"/>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Read Excel (2)" from_port="output" to_op="Set Role (3)" to_port="example set input"/>
      <connect from_op="Set Role (3)" from_port="example set output" to_op="Join" to_port="right"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Set Role (2)" to_port="example set input"/>
      <connect from_op="Process Documents" from_port="word list" to_op="Write" to_port="object"/>
      <connect from_op="Set Role (2)" from_port="example set output" to_op="Join" to_port="left"/>
      <connect from_op="Join" from_port="join" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Generate Weight (Stratification)" to_port="example set input"/>
      <connect from_op="Generate Weight (Stratification)" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_op="Write Model" to_port="input"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Aviva

And here is the code for applying the model to new documents for categorization:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
    <process expanded="true" height="971" width="815">
      <operator activated="true" class="read_model" compatibility="5.2.006" expanded="true" height="60" name="Read Model" width="90" x="514" y="435">
        <parameter key="model_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\model_500Energy_NaiveBayes"/>
      </operator>
      <!-- Read Excel (3): imports the new publications to be classified (range A1:E101,
           row 0 annotated as the name row). The sheet has pub_id plus the four text
           columns; there is no "energy" column.
           NOTE(review): pub_id is imported with role "label" and is only switched to
           "id" later by "Set Role (4)", whereas "Read Excel (4)" imports pub_id as "id"
           directly. Presumably a leftover from the import wizard; confirm that carrying
           pub_id as a label through "Process Documents (2)" has no side effects. -->
      <operator activated="true" class="read_excel" compatibility="5.2.006" expanded="true" height="60" name="Read Excel (3)" width="90" x="12" y="467">
        <parameter key="excel_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\cat100new_2012-07-11.xlsx"/>
        <parameter key="imported_cell_range" value="A1:E101"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="pub_id.true.integer.label"/>
          <parameter key="1" value="title.true.text.attribute"/>
          <parameter key="2" value="journal.true.text.attribute"/>
          <parameter key="3" value="keyword_plus.true.text.attribute"/>
          <parameter key="4" value="abstract.true.text.attribute"/>
        </list>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="5.2.003" expanded="true" height="60" name="Data to Documents (2)" width="90" x="112" y="525">
        <list key="specify_weights"/>
      </operator>
      <!-- Read: loads a stored object of type WordList; its output is connected to the
           "word list" input of "Process Documents (2)". The path is the same file that
           the training process writes with its "Write" operator. -->
      <operator activated="true" class="read" compatibility="5.2.006" expanded="true" height="60" name="Read" width="90" x="112" y="435">
        <parameter key="object_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\wordlist_500Energy"/>
        <parameter key="io_object" value="WordList"/>
      </operator>
      <!-- Process Documents (2): processes the new documents from "Data to Documents (2)"
           and additionally receives the stored word list from "Read" on its "word list"
           input. Its example set output goes to "Set Role (4)".
           The pruning parameters and the inner pipeline (Tokenize, Transform Cases,
           Filter Stopwords, Filter Tokens 2 to 888 chars, Stem, Generate n-Grams with
           max length 3) carry the same values as in the training process.
           NOTE(review): pruning is configured here even though a word list is supplied;
           presumably the word list determines the output attributes. Confirm how the
           Text Processing extension combines pruning with a given word list, because
           the attributes must match the ones the model was trained on. -->
      <operator activated="true" class="text:process_documents" compatibility="5.2.003" expanded="true" height="94" name="Process Documents (2)" width="90" x="246" y="525">
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prunde_below_percent" value="1.0"/>
        <parameter key="prune_above_percent" value="70.0"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="888"/>
        <process expanded="true" height="404" width="643">
          <operator activated="true" class="text:tokenize" compatibility="5.2.003" expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
          <operator activated="true" class="text:transform_cases" compatibility="5.2.003" expanded="true" height="60" name="Transform Cases (2)" width="90" x="180" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.003" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="315" y="30"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.003" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="450" y="30">
            <parameter key="min_chars" value="2"/>
            <parameter key="max_chars" value="888"/>
          </operator>
          <operator activated="true" class="text:stem_snowball" compatibility="5.2.003" expanded="true" height="60" name="Stem (2)" width="90" x="380" y="165"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.2.003" expanded="true" height="60" name="Generate n-Grams (2)" width="90" x="447" y="255">
            <parameter key="max_length" value="3"/>
          </operator>
          <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
          <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
          <connect from_op="Filter Tokens (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
          <connect from_op="Stem (2)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
          <connect from_op="Generate n-Grams (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.2.006" expanded="true" height="76" name="Set Role (4)" width="90" x="380" y="570">
        <parameter key="name" value="pub_id"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="true" class="read_excel" compatibility="5.2.006" expanded="true" height="60" name="Read Excel (4)" width="90" x="43" y="660">
        <parameter key="excel_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\cat100new_withkeywords_2012-07-11.xlsx"/>
        <parameter key="imported_cell_range" value="A1:AB101"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="pub_id.true.integer.id"/>
          <parameter key="1" value="photovolt.true.binominal.attribute"/>
          <parameter key="2" value="solar energy.true.binominal.attribute"/>
          <parameter key="3" value="energy production.true.binominal.attribute"/>
          <parameter key="4" value="renewable energy.true.binominal.attribute"/>
          <parameter key="5" value="energy storage.true.binominal.attribute"/>
          <parameter key="6" value="fuel cell.true.binominal.attribute"/>
          <parameter key="7" value="batter.true.binominal.attribute"/>
          <parameter key="8" value="energy conversion technolog.true.binominal.attribute"/>
          <parameter key="9" value="solar cell.true.binominal.attribute"/>
          <parameter key="10" value="solar fuel.true.binominal.attribute"/>
          <parameter key="11" value="biofuel.true.binominal.attribute"/>
          <parameter key="12" value="alternative energy.true.binominal.attribute"/>
          <parameter key="13" value="sustainable energy.true.binominal.attribute"/>
          <parameter key="14" value="solar-energy.true.binominal.attribute"/>
          <parameter key="15" value="fuel-cell.true.binominal.attribute"/>
          <parameter key="16" value="hydrogen storage.true.binominal.attribute"/>
          <parameter key="17" value="artificial photosynthesis.true.binominal.attribute"/>
          <parameter key="18" value="h2 storage.true.binominal.attribute"/>
          <parameter key="19" value="energy application.true.binominal.attribute"/>
          <parameter key="20" value="power generation.true.binominal.attribute"/>
          <parameter key="21" value="solar hydrogen.true.binominal.attribute"/>
          <parameter key="22" value="biodiesel.true.binominal.attribute"/>
          <parameter key="23" value="energy generation.true.binominal.attribute"/>
          <parameter key="24" value="fuel.true.binominal.attribute"/>
          <parameter key="25" value="light harvesting.true.binominal.attribute"/>
          <parameter key="26" value="light-harvesting.true.binominal.attribute"/>
          <parameter key="27" value="keyword energy.true.binominal.attribute"/>
        </list>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.2.006" expanded="true" height="76" name="Set Role (5)" width="90" x="178" y="660">
        <parameter key="name" value="pub_id"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Join (2): combines the processed documents from "Set Role (4)" (left input) with
           the keyword flags from "Set Role (5)" (right input); the result feeds
           "Apply Model (2)". No key attributes are listed.
           NOTE(review): presumably the join then matches on the id attribute (pub_id is
           given role "id" on both inputs) using the operator's default join type; confirm. -->
      <operator activated="true" class="join" compatibility="5.2.006" expanded="true" height="76" name="Join (2)" width="90" x="447" y="705">
        <list key="key_attributes"/>
      </operator>
      <!-- Apply Model (2): applies the model loaded by "Read Model" to the joined example
           set from "Join (2)". Both the labelled data and the model are delivered as
           process results (result 1 and result 2). No application parameters are set. -->
      <operator activated="true" class="apply_model" compatibility="5.2.006" expanded="true" height="76" name="Apply Model (2)" width="90" x="648" y="525">
        <list key="application_parameters"/>
      </operator>
      <connect from_op="Read Model" from_port="output" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Read Excel (3)" from_port="output" to_op="Data to Documents (2)" to_port="example set"/>
      <connect from_op="Data to Documents (2)" from_port="documents" to_op="Process Documents (2)" to_port="documents 1"/>
      <connect from_op="Read" from_port="output" to_op="Process Documents (2)" to_port="word list"/>
      <connect from_op="Process Documents (2)" from_port="example set" to_op="Set Role (4)" to_port="example set input"/>
      <connect from_op="Set Role (4)" from_port="example set output" to_op="Join (2)" to_port="left"/>
      <connect from_op="Read Excel (4)" from_port="output" to_op="Set Role (5)" to_port="example set input"/>
      <connect from_op="Set Role (5)" from_port="example set output" to_op="Join (2)" to_port="right"/>
      <connect from_op="Join (2)" from_port="join" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model (2)" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Not sure how I would post the data. The model is based on a set of 500 documents with 4 text attributes, an id (pub_id), and a label (energy); these are joined with a set of binomial attributes indicating whether specific keywords are contained in the document. Happy to provide more information.

Aviva

In case this provides pertinent information:

W-BayesLogisticRegression also only gives binary confidences whereas W-BayesNet does give non-0/1 confidences but all of the documents classified as irrelevant have the same confidence, with a little more variation for those classified as relevant.

I am currently using a sample of 500 documents that have been coded relevant/irrelevant and I am using the model to predict 100 new documents.

And here's a simpler version of the process that suffers from the same problem:

To generate the model:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
    <process expanded="true" height="971" width="815">
      <operator activated="true" class="read_excel" compatibility="5.2.006" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\pubs_500energy.xlsx"/>
        <parameter key="imported_cell_range" value="A1:F501"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="pub_id.true.integer.id"/>
          <parameter key="1" value="title.true.text.attribute"/>
          <parameter key="2" value="journal.true.text.attribute"/>
          <parameter key="3" value="keyword_plus.true.text.attribute"/>
          <parameter key="4" value="abstract.true.text.attribute"/>
          <parameter key="5" value="energy.true.binominal.label"/>
        </list>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="5.2.003" expanded="true" height="60" name="Data to Documents" width="90" x="45" y="120">
        <list key="specify_weights"/>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="5.2.003" expanded="true" height="94" name="Process Documents" width="90" x="179" y="30">
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prunde_below_percent" value="1.0"/>
        <parameter key="prune_above_percent" value="70.0"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="888"/>
        <process expanded="true" height="422" width="634">
          <operator activated="true" class="text:tokenize" compatibility="5.2.003" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
          <operator activated="true" class="text:transform_cases" compatibility="5.2.003" expanded="true" height="60" name="Transform Cases" width="90" x="70" y="150"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.003" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="102" y="226"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.003" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="232" y="224">
            <parameter key="min_chars" value="2"/>
            <parameter key="max_chars" value="888"/>
          </operator>
          <operator activated="true" class="text:stem_snowball" compatibility="5.2.003" expanded="true" height="60" name="Stem (Snowball)" width="90" x="372" y="189"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.2.003" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="501" y="158">
            <parameter key="max_length" value="3"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="write" compatibility="5.2.006" expanded="true" height="60" name="Write" width="90" x="179" y="165">
        <parameter key="object_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\wordlist_500Energy"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.2.006" expanded="true" height="76" name="Set Role" width="90" x="313" y="75">
        <parameter key="name" value="energy"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="true" class="generate_weight_stratification" compatibility="5.2.006" expanded="true" height="76" name="Generate Weight (Stratification)" width="90" x="447" y="30"/>
      <!-- Validation (x_validation) with two inner processes:
           1. training side: Naive Bayes is fed from the "training" port and its model is
              passed to the "model" sink;
           2. testing side: Apply Model scores the "test set" with that model and the
              labelled data goes into a binominal classification Performance operator,
              whose result leaves through "averagable 1".
           Its input comes from "Generate Weight (Stratification)"; its model output is
           stored by "Write Model".
           Neither Naive Bayes nor Validation has explicit parameters here, so whatever
           defaults RapidMiner 5.2 uses apply. -->
      <operator activated="true" class="x_validation" compatibility="5.2.006" expanded="true" height="112" name="Validation" width="90" x="514" y="165">
        <process expanded="true" height="422" width="292">
          <operator activated="true" class="naive_bayes" compatibility="5.2.006" expanded="true" height="76" name="Naive Bayes" width="90" x="64" y="36"/>
          <connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
          <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true" height="422" width="292">
          <operator activated="true" class="apply_model" compatibility="5.2.006" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_binominal_classification" compatibility="5.2.006" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Write Model: stores the model output of "Validation" under the "NoKW" file
           name, which is the same path the matching apply process loads with its
           "Read Model" operator. -->
      <operator activated="true" class="write_model" compatibility="5.2.006" expanded="true" height="60" name="Write Model" width="90" x="581" y="30">
        <parameter key="model_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\model_500Energy_NaiveBayesNoKW"/>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Process Documents" from_port="word list" to_op="Write" to_port="object"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Generate Weight (Stratification)" to_port="example set input"/>
      <connect from_op="Generate Weight (Stratification)" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_op="Write Model" to_port="input"/>
      <connect from_op="Validation" from_port="training" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

And here is the process that applies the model to a new data set:

 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
    <process expanded="true" height="971" width="815">
      <operator activated="true" class="read_model" compatibility="5.2.006" expanded="true" height="60" name="Read Model" width="90" x="514" y="435">
        <parameter key="model_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\model_500Energy_NaiveBayesNoKW"/>
      </operator>
      <operator activated="true" class="read_excel" compatibility="5.2.006" expanded="true" height="60" name="Read Excel (3)" width="90" x="12" y="467">
        <parameter key="excel_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\cat100new_2012-07-11.xlsx"/>
        <parameter key="imported_cell_range" value="A1:E101"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="pub_id.true.integer.label"/>
          <parameter key="1" value="title.true.text.attribute"/>
          <parameter key="2" value="journal.true.text.attribute"/>
          <parameter key="3" value="keyword_plus.true.text.attribute"/>
          <parameter key="4" value="abstract.true.text.attribute"/>
        </list>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="5.2.003" expanded="true" height="60" name="Data to Documents (2)" width="90" x="112" y="525">
        <list key="specify_weights"/>
      </operator>
      <operator activated="true" class="read" compatibility="5.2.006" expanded="true" height="60" name="Read" width="90" x="112" y="435">
        <parameter key="object_file" value="C:\Users\litovitz\Documents\dissertation\rapidminer\wordlist_500Energy"/>
        <parameter key="io_object" value="WordList"/>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="5.2.003" expanded="true" height="94" name="Process Documents (2)" width="90" x="246" y="525">
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prunde_below_percent" value="1.0"/>
        <parameter key="prune_above_percent" value="70.0"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="888"/>
        <process expanded="true" height="404" width="643">
          <operator activated="true" class="text:tokenize" compatibility="5.2.003" expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
          <operator activated="true" class="text:transform_cases" compatibility="5.2.003" expanded="true" height="60" name="Transform Cases (2)" width="90" x="180" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.003" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="315" y="30"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.003" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="450" y="30">
            <parameter key="min_chars" value="2"/>
            <parameter key="max_chars" value="888"/>
          </operator>
          <operator activated="true" class="text:stem_snowball" compatibility="5.2.003" expanded="true" height="60" name="Stem (2)" width="90" x="380" y="165"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.2.003" expanded="true" height="60" name="Generate n-Grams (2)" width="90" x="447" y="255">
            <parameter key="max_length" value="3"/>
          </operator>
          <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
          <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
          <connect from_op="Filter Tokens (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
          <connect from_op="Stem (2)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
          <connect from_op="Generate n-Grams (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="apply_model" compatibility="5.2.006" expanded="true" height="76" name="Apply Model (2)" width="90" x="648" y="525">
        <list key="application_parameters"/>
      </operator>
      <connect from_op="Read Model" from_port="output" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Read Excel (3)" from_port="output" to_op="Data to Documents (2)" to_port="example set"/>
      <connect from_op="Data to Documents (2)" from_port="documents" to_op="Process Documents (2)" to_port="documents 1"/>
      <connect from_op="Read" from_port="output" to_op="Process Documents (2)" to_port="word list"/>
      <connect from_op="Process Documents (2)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model (2)" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

If there’s any other information that would help shed light on this, please let me know.

Thanks!

MariusHelf

Hm, your processes look fine. It's still a bit strange, but as long as your model performs well, there's nothing bad about this.

Best,
~Marius