🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Selected Attribute Not Appearing in Output

User: "minerthreat"
New Altair Community Member
Updated by Jocelyn
I am running a Naive Bayes analysis on textual data. The Naive Bayes model itself is built in another process and is fed into the Apply Model operator in the process described below. The input data for this process is a 162-row dataset in MySQL. 'title' is one of the columns/attributes in this table; its values are simply the titles of various news articles from around the web. As my XML code below shows, I want 'title' to be included in my output. However, it does not appear, even though the process completes successfully and the other selected attributes do appear.

My log contains the following warnings:

WARNING: SimpleDistribution: The number of regular attributes of the given example set does not fit the number of attributes of the training example set, training: 26228, application: 3162
WARNING: SimpleDistribution: The given example set does not contain a regular attribute with name 'aa_batteri'. This might cause problems for some models depending on this particular attribute.

The second warning repeats many times over for separate n-grams.

My XML:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process that applies a stored model to rows read from MySQL
  and writes the selected result columns back to MySQL.

  Data flow, as wired by the connect elements at the end of the outer process:
    Read Database, then Select Attributes, then Nominal to Text,
    then Process Documents from Data (2), then Apply Model (2),
    then Select Attributes (2), then Write Database, then result 1.
  Retrieve Model feeds the "model" port of Apply Model (2).
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- Loads the model object from the repository entry given below. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve Model" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//NewLocalRepository/Virtualization/Disruption Prediction Model/Realtime Predictions/Modeling/Model"/>
      </operator>
      <!-- Reads only the title and clean_text columns; the character reference in the query value is a line feed. -->
      <operator activated="true" class="read_database" compatibility="5.3.015" expanded="true" height="60" name="Read Database" width="90" x="45" y="210">
        <parameter key="connection" value="MySQL"/>
        <parameter key="query" value="SELECT `title`, `clean_text`&#10;FROM `potential_tech_disruptions_clean_data_tbl`"/>
        <enumeration key="parameters"/>
      </operator>
      <!-- Subset filter on clean_text and title, which are the same two columns the query already returns. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="210">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="clean_text|title|"/>
      </operator>
      <!--
        Lists title and clean_text, but unlike the Select Attributes operators this one
        sets no attribute_filter_type.
        NOTE(review): presumably the operator's default filter type applies, so this list
        may not be what selects the attributes; confirm against the operator documentation.
        NOTE(review): title is included in the conversion to text here. If the next
        operator consumes text attributes, this is the likely point where title stops
        being an ordinary column; consider converting only clean_text. TODO confirm.
      -->
      <operator activated="true" class="nominal_to_text" compatibility="5.3.015" expanded="true" height="76" name="Nominal to Text" width="90" x="313" y="210">
        <parameter key="attributes" value="|title|clean_text"/>
      </operator>
      <!--
        Builds the document vectors that are passed to Apply Model (2).
        Pruning is set to "absolute" with bounds 2 and 99999; the percent and rank
        values are also present but presumably unused under this method (verify).
        NOTE(review): no keep_text parameter is set here. Presumably the default drops
        the source text attributes after vectorisation, which would explain why title
        is missing downstream; verify, and either enable keep_text or keep title out of
        the text conversion.
        NOTE(review): no connection to a word list input is visible in this process, so
        the vocabulary is presumably rebuilt from these rows rather than reused from
        training. That would match the reported attribute count mismatch
        (training 26228, application 3162); confirm.
      -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="447" y="210">
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_percent" value="0.0"/>
        <parameter key="prune_above_percent" value="10.0"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="99999"/>
        <parameter key="prune_below_rank" value="0.5"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!--
            Per-document pipeline, in the order wired below: tokenize, transform cases,
            filter English stopwords, Snowball stemming, n-grams up to length 3,
            then keep tokens of 3 to 125 characters.
          -->
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
          <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="180" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="315" y="30"/>
          <operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (2)" width="90" x="450" y="30"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.3.002" expanded="true" height="60" name="Generate n-Grams (2)" width="90" x="313" y="165">
            <parameter key="max_length" value="3"/>
          </operator>
          <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="447" y="165">
            <parameter key="min_chars" value="3"/>
            <parameter key="max_chars" value="125"/>
          </operator>
          <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
          <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
          <connect from_op="Stem (2)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
          <connect from_op="Generate n-Grams (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
          <connect from_op="Filter Tokens (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Applies the retrieved model to the vectorised rows; no application parameters are set. -->
      <operator activated="true" class="apply_model" compatibility="5.3.015" expanded="true" height="76" name="Apply Model (2)" width="90" x="179" y="30">
        <list key="application_parameters"/>
      </operator>
      <!--
        Subset filter asking for title, confidence(N), confidence(Y) and prediction(Disruptive).
        title can only be kept here if it still exists in the output of Apply Model (2).
      -->
      <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes (2)" width="90" x="313" y="30">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|title|confidence(N)|confidence(Y)|prediction(Disruptive)"/>
      </operator>
      <!-- Writes the selected columns to predicted_disruption_tbl2 using overwrite mode and a default varchar length of 255. -->
      <operator activated="true" class="write_database" compatibility="5.3.015" expanded="true" height="60" name="Write Database" width="90" x="447" y="30">
        <parameter key="connection" value="MySQL"/>
        <parameter key="table_name" value="predicted_disruption_tbl2"/>
        <parameter key="overwrite_mode" value="overwrite"/>
        <parameter key="set_default_varchar_length" value="true"/>
        <parameter key="default_varchar_length" value="255"/>
      </operator>
      <!-- Wiring of the outer process; only the "example set" output of Process Documents from Data (2) is connected. -->
      <connect from_op="Retrieve Model" from_port="output" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Read Database" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
      <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Write Database" to_port="input"/>
      <connect from_op="Write Database" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Thank you for any help that can be offered.

Find more posts tagged with