Impute Missing Values Weird behaviour
amerkel
New Altair Community Member
Hello,
I wanted to compare clustering of a Dataset with missing values vs imputed values. Unfortunately, as soon as I insert a branch that imputes the values and run it, this seems to be applied to all steps of the process. Even when I try to look at the data right after the import, the missing values are replaced.
(A second but less important question is why "impute values" component needs a label. It is easy to work around that by labeling an attribute before imputation and unlabeling it afterwards, but....)
Here is my process.
Alex
I wanted to compare clustering of a Dataset with missing values vs imputed values. Unfortunately, as soon as I insert a branch that imputes the values and run it, this seems to be applied to all steps of the process. Even when I try to look at the data right after the import, the missing values are replaced.
(A second but less important question is why "impute values" component needs a label. It is easy to work around that by labeling an attribute before imputation and unlabeling it afterwards, but....)
Here is my process.
thanks,
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process posted with the question.
  Retrieve feeds Multiply, which fans out into three branches:
    output 1 : W-EM clustering of the data as retrieved
    output 2 : Set Role, Impute Missing Values, Set Role (2), W-EM (2)
    output 3 : passed straight to result 1
  The poster reports that once the imputation branch runs, the missing
  values appear replaced on the other branches as well.
-->
<process version="5.1.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
    <process expanded="true" height="672" width="949">

      <!-- Load the example set stored under the repository entry "Marketing". -->
      <operator activated="true" class="retrieve" compatibility="5.1.006" expanded="true" height="60" name="Retrieve" width="90" x="45" y="165">
        <parameter key="repository_entry" value="Marketing"/>
      </operator>

      <!-- Fan the retrieved example set out to the three branches wired below. -->
      <operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Multiply" width="90" x="45" y="255"/>

      <!-- Give SEX the label role before imputation (the poster's workaround
           for Impute Missing Values asking for a label). -->
      <operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role" width="90" x="112" y="390">
        <parameter key="name" value="SEX"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>

      <!-- Imputation step. The attribute filter is "no_missing_values" with the
           selection inverted; the nested process supplies the learner. -->
      <operator activated="true" class="impute_missing_values" compatibility="5.1.006" expanded="true" height="60" name="Impute Missing Values" width="90" x="246" y="435">
        <parameter key="attribute_filter_type" value="no_missing_values"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="iterate" value="false"/>
        <process expanded="true" height="690" width="911">
          <!-- Inner learner: Naive Bayes trained on the example set source,
               its model handed to the model sink. -->
          <operator activated="true" class="naive_bayes" compatibility="5.1.006" expanded="true" height="76" name="Naive Bayes" width="90" x="149" y="83"/>
          <connect from_port="example set source" to_op="Naive Bayes" to_port="training set"/>
          <connect from_op="Naive Bayes" from_port="model" to_port="model sink"/>
          <portSpacing port="source_example set source" spacing="0"/>
          <portSpacing port="sink_model sink" spacing="0"/>
        </process>
      </operator>

      <!-- Second Set Role on SEX with no target_role parameter given; the
           poster describes this step as unlabeling the attribute again. -->
      <operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (2)" width="90" x="380" y="435">
        <parameter key="name" value="SEX"/>
        <list key="set_additional_roles"/>
      </operator>

      <!-- EM clustering (N = 2.0) on the imputed branch. Note: no connection
           below routes this operator's output to a result port. -->
      <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM (2)" width="90" x="581" y="435">
        <parameter key="N" value="2.0"/>
        <parameter key="add_as_label" value="true"/>
      </operator>

      <!-- EM clustering (N = 2.0) on the branch taken directly from Multiply. -->
      <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM" width="90" x="246" y="300">
        <parameter key="N" value="2.0"/>
        <parameter key="add_as_label" value="true"/>
      </operator>

      <!-- Wiring -->
      <connect from_op="Retrieve" from_port="output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="W-EM" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 3" to_port="result 1"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Impute Missing Values" to_port="example set in"/>
      <connect from_op="Impute Missing Values" from_port="example set out" to_op="Set Role (2)" to_port="example set input"/>
      <connect from_op="Set Role (2)" from_port="example set output" to_op="W-EM (2)" to_port="example set"/>
      <connect from_op="W-EM" from_port="clustered set" to_port="result 2"/>

      <!-- Port layout -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Alex
Tagged:
0
Answers
-
Hello
The multiply operator doesn't take a copy, it actually provides a reference so if the data in one changes, they all do.
One way out of this is to use the "Materialize Data" operator.
I made some changes - see the enclosed.
regards
Andrew
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.1.006">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
  <!--
    Revised process from the reply.
    Retrieve feeds Multiply, which fans out into two branches:
      output 1 : W-EM clustering of the data as retrieved
      output 2 : Materialize Data, Impute Missing Values, W-EM (2)
    The reply states that Multiply hands out a reference rather than a copy,
    and suggests Materialize Data as a way around that.
  -->
  <process expanded="true" height="672" width="949">

    <!-- Load the sample data set //Samples/data/Labor-Negotiations. -->
    <operator activated="true" class="retrieve" compatibility="5.1.006" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
      <parameter key="repository_entry" value="//Samples/data/Labor-Negotiations"/>
    </operator>

    <!-- Fan the retrieved example set out to the two branches wired below. -->
    <operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="94" name="Multiply" width="90" x="45" y="255"/>

    <!-- Placed at the head of the imputation branch, ahead of Impute Missing Values. -->
    <operator activated="true" class="materialize_data" compatibility="5.1.006" expanded="true" height="76" name="Materialize Data" width="90" x="179" y="345"/>

    <!-- Imputation step; the nested process supplies the learner. -->
    <operator activated="true" class="impute_missing_values" compatibility="5.1.006" expanded="true" height="60" name="Impute Missing Values" width="90" x="380" y="345">
      <parameter key="include_special_attributes" value="true"/>
      <parameter key="iterate" value="false"/>
      <process expanded="true" height="690" width="911">
        <!-- Inner learner: k-NN trained on the example set source,
             its model handed to the model sink. -->
        <operator activated="true" class="k_nn" compatibility="5.1.006" expanded="true" height="76" name="k-NN" width="90" x="495" y="30"/>
        <connect from_port="example set source" to_op="k-NN" to_port="training set"/>
        <connect from_op="k-NN" from_port="model" to_port="model sink"/>
        <portSpacing port="source_example set source" spacing="0"/>
        <portSpacing port="sink_model sink" spacing="0"/>
      </process>
    </operator>

    <!-- EM clustering (N = 2.0) on the imputed branch. -->
    <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM (2)" width="90" x="581" y="345">
      <parameter key="N" value="2.0"/>
      <parameter key="add_as_label" value="true"/>
    </operator>

    <!-- EM clustering (N = 2.0) on the branch taken directly from Multiply. -->
    <operator activated="true" class="weka:W-EM" compatibility="5.1.000" expanded="true" height="76" name="W-EM" width="90" x="447" y="165">
      <parameter key="N" value="2.0"/>
      <parameter key="add_as_label" value="true"/>
    </operator>

    <!-- Wiring: results 1 and 2 come from W-EM, results 3 and 4 from W-EM (2). -->
    <connect from_op="Retrieve" from_port="output" to_op="Multiply" to_port="input"/>
    <connect from_op="Multiply" from_port="output 1" to_op="W-EM" to_port="example set"/>
    <connect from_op="Multiply" from_port="output 2" to_op="Materialize Data" to_port="example set input"/>
    <connect from_op="Materialize Data" from_port="example set output" to_op="Impute Missing Values" to_port="example set in"/>
    <connect from_op="Impute Missing Values" from_port="example set out" to_op="W-EM (2)" to_port="example set"/>
    <connect from_op="W-EM (2)" from_port="cluster model" to_port="result 3"/>
    <connect from_op="W-EM (2)" from_port="clustered set" to_port="result 4"/>
    <connect from_op="W-EM" from_port="cluster model" to_port="result 1"/>
    <connect from_op="W-EM" from_port="clustered set" to_port="result 2"/>

    <!-- Port layout -->
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    <portSpacing port="sink_result 5" spacing="0"/>
  </process>
</operator>
</process>
0 -
Thanks a lot
Alex
0