Impute Missing Values Weird behaviour

amerkel
amerkel New Altair Community Member
edited November 5 in Community Q&A
Hello,

I wanted to compare clustering of a dataset with missing values vs. imputed values. Unfortunately, as soon as I insert a branch that imputes the values and run it, the imputation seems to be applied to all steps of the process. Even when I try to look at the data right after the import, the missing values are replaced.
(A second but less important question is why "impute values" component needs a label. It is easy to work around that by labeling an attribute before imputation and unlabeling it afterwards, but....)

Here is my process.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Process from the question. Retrieve feeds Multiply, whose three outputs go to:
     output 1: W-EM on the data as retrieved,
     output 2: Set Role, Impute Missing Values, Set Role (2), W-EM (2),
     output 3: directly to result 1. -->
<process version="5.1.006">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
   <process expanded="true" height="672" width="949">
     <!-- Loads the repository entry "Marketing". -->
     <operator activated="true" class="retrieve" compatibility="5.1.006" expanded="true" height="60" name="Retrieve" width="90" x="45" y="165">
       <parameter key="repository_entry" value="Marketing"/>
     </operator>
     <!-- Fans the retrieved example set out to the three branches wired in the connect elements below. -->
     <operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Multiply" width="90" x="45" y="255"/>
     <!-- Gives attribute SEX the label role before imputation (the workaround mentioned in the post). -->
     <operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role" width="90" x="112" y="390">
       <parameter key="name" value="SEX"/>
       <parameter key="target_role" value="label"/>
       <list key="set_additional_roles"/>
     </operator>
     <!-- Imputation step. NOTE(review): the filter "no_missing_values" combined with
          invert_selection="true" presumably restricts the step to attributes that contain
          missing values; confirm against the operator documentation. -->
     <operator activated="true" class="impute_missing_values" compatibility="5.1.006" expanded="true" height="60" name="Impute Missing Values" width="90" x="246" y="435">
       <parameter key="attribute_filter_type" value="no_missing_values"/>
       <parameter key="invert_selection" value="true"/>
       <parameter key="include_special_attributes" value="true"/>
       <parameter key="iterate" value="false"/>
       <!-- Nested learner: the example set source feeds the Naive Bayes training set and
            the resulting model is passed to the model sink. -->
       <process expanded="true" height="690" width="911">
         <operator activated="true" class="naive_bayes" compatibility="5.1.006" expanded="true" height="76" name="Naive Bayes" width="90" x="149" y="83"/>
         <connect from_port="example set source" to_op="Naive Bayes" to_port="training set"/>
         <connect from_op="Naive Bayes" from_port="model" to_port="model sink"/>
         <portSpacing port="source_example set source" spacing="0"/>
         <portSpacing port="sink_model sink" spacing="0"/>
       </process>
     </operator>
     <!-- Second Set Role on SEX with no target_role parameter; per the post this undoes the
          labeling after imputation (presumably by falling back to the default role; verify). -->
     <operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (2)" width="90" x="380" y="435">
       <parameter key="name" value="SEX"/>
       <list key="set_additional_roles"/>
     </operator>
     <!-- Clustering of the imputed branch. No connect element below routes an output of
          W-EM (2) to a result port. -->
     <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM (2)" width="90" x="581" y="435">
       <parameter key="N" value="2.0"/>
       <parameter key="add_as_label" value="true"/>
     </operator>
     <!-- Clustering of the branch without imputation, configured with the same N and add_as_label values. -->
     <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM" width="90" x="246" y="300">
       <parameter key="N" value="2.0"/>
       <parameter key="add_as_label" value="true"/>
     </operator>
     <!-- Wiring: result 1 receives Multiply output 3 unchanged; result 2 receives the
          clustered set of W-EM. -->
     <connect from_op="Retrieve" from_port="output" to_op="Multiply" to_port="input"/>
     <connect from_op="Multiply" from_port="output 1" to_op="W-EM" to_port="example set"/>
     <connect from_op="Multiply" from_port="output 2" to_op="Set Role" to_port="example set input"/>
     <connect from_op="Multiply" from_port="output 3" to_port="result 1"/>
     <connect from_op="Set Role" from_port="example set output" to_op="Impute Missing Values" to_port="example set in"/>
     <connect from_op="Impute Missing Values" from_port="example set out" to_op="Set Role (2)" to_port="example set input"/>
     <connect from_op="Set Role (2)" from_port="example set output" to_op="W-EM (2)" to_port="example set"/>
     <connect from_op="W-EM" from_port="clustered set" to_port="result 2"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
     <portSpacing port="sink_result 3" spacing="0"/>
   </process>
 </operator>
</process>
thanks,

Alex
Tagged:

Answers

  • Hello

    The Multiply operator doesn't take a copy; it actually provides a reference, so if the data in one branch changes, they all do.

    One way out of this is to use the "Materialize Data" operator.

    I made some changes - see the enclosed.

    regards

    Andrew
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Revised process from the answer. Materialize Data is inserted between Multiply
         output 2 and Impute Missing Values; the Set Role operators of the question's
         process are not present here. -->
    <process version="5.1.006">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
        <process expanded="true" height="672" width="949">
          <!-- Loads the sample entry //Samples/data/Labor-Negotiations instead of the
               question's "Marketing" entry. -->
          <operator activated="true" class="retrieve" compatibility="5.1.006" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
            <parameter key="repository_entry" value="//Samples/data/Labor-Negotiations"/>
          </operator>
          <!-- Two outputs are used: output 1 to W-EM, output 2 to Materialize Data. -->
          <operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="94" name="Multiply" width="90" x="45" y="255"/>
          <!-- Per the answer text, Multiply hands out a reference rather than a copy; this
               operator sits in front of the imputation so that branch works on its own data. -->
          <operator activated="true" class="materialize_data" compatibility="5.1.006" expanded="true" height="76" name="Materialize Data" width="90" x="179" y="345"/>
          <!-- Imputation step; no attribute_filter_type or invert_selection parameters are set here. -->
          <operator activated="true" class="impute_missing_values" compatibility="5.1.006" expanded="true" height="60" name="Impute Missing Values" width="90" x="380" y="345">
            <parameter key="include_special_attributes" value="true"/>
            <parameter key="iterate" value="false"/>
            <!-- Nested learner: the example set source feeds the k-NN training set and the
                 resulting model is passed to the model sink. -->
            <process expanded="true" height="690" width="911">
              <operator activated="true" class="k_nn" compatibility="5.1.006" expanded="true" height="76" name="k-NN" width="90" x="495" y="30"/>
              <connect from_port="example set source" to_op="k-NN" to_port="training set"/>
              <connect from_op="k-NN" from_port="model" to_port="model sink"/>
              <portSpacing port="source_example set source" spacing="0"/>
              <portSpacing port="sink_model sink" spacing="0"/>
            </process>
          </operator>
          <!-- Clustering of the imputed branch. -->
          <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM (2)" width="90" x="581" y="345">
            <parameter key="N" value="2.0"/>
            <parameter key="add_as_label" value="true"/>
          </operator>
          <!-- Clustering of the branch without imputation, configured with the same N and add_as_label values. -->
          <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM" width="90" x="447" y="165">
            <parameter key="N" value="2.0"/>
            <parameter key="add_as_label" value="true"/>
          </operator>
          <!-- Wiring: results 1 and 2 receive the cluster model and clustered set of W-EM;
               results 3 and 4 receive the same two outputs of W-EM (2). -->
          <connect from_op="Retrieve" from_port="output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="W-EM" to_port="example set"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Materialize Data" to_port="example set input"/>
          <connect from_op="Materialize Data" from_port="example set output" to_op="Impute Missing Values" to_port="example set in"/>
          <connect from_op="Impute Missing Values" from_port="example set out" to_op="W-EM (2)" to_port="example set"/>
          <connect from_op="W-EM (2)" from_port="cluster model" to_port="result 3"/>
          <connect from_op="W-EM (2)" from_port="clustered set" to_port="result 4"/>
          <connect from_op="W-EM" from_port="cluster model" to_port="result 1"/>
          <connect from_op="W-EM" from_port="clustered set" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
          <portSpacing port="sink_result 5" spacing="0"/>
        </process>
      </operator>
    </process>
  • amerkel
    amerkel New Altair Community Member
    Thanks a lot

    Alex