"Impute Missing Values help"

User: "diogo_fp_couto"
New Altair Community Member
Updated by Jocelyn

Hello,

 

I am running a process to impute missing values in my dataset. The values are numeric. I am using the Impute Missing Values operator with k-NN, and I have it running on a server. My dataset consists of about 140000 examples, of which about 30000 have a missing value (in only one attribute). The problem is that it shows k-nn(62) (I am not sure what the 62 means), and it has been running for 4 days and 22 hours, as you can see in the picture. Will it take much longer? Does anyone know how to make it faster?
I am running out of time to deliver my work, so any help would be appreciated.

Thanks in advance.

Find more posts tagged with

Sort by:
1 - 1 of 11
    User: "sgenzer"
    Altair Employee
    Accepted Answer

    Hi @diogo_fp_couto - I'm not sure why you're dummy coding at this point.  I would just leave the data as is while you impute the missing values.  It should take a little while because you do have a lot of data and you are using k-NN, which is a slow algorithm.  I'm running this now...

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
    <!-- RapidMiner process export. Active chain (see the connect elements near the end):
         Retrieve dataset, Select Attributes, Remove Unused Values, Declare Missing Value,
         Set Role, Impute Missing Values, then result 1.
         The two Nominal to Numerical operators and the Store operator are deactivated. -->
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
    <!-- Loads the example set from the repository entry below. -->
    <operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve dataset" width="90" x="45" y="34">
    <parameter key="repository_entry" value="//RapidMiner OneDrive/random community stuff/dataset"/>
    </operator>
    <!-- Single-attribute filter on "CS Site-Specific Factor 25" with invert_selection=true;
         presumably this keeps every attribute except that one (confirm against the operator docs). -->
    <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="CS Site-Specific Factor 25"/>
    <parameter key="invert_selection" value="true"/>
    </operator>
    <!-- No parameters are set here, so the operator runs with its defaults. -->
    <operator activated="true" class="remove_unused_values" compatibility="7.6.001" expanded="true" height="103" name="Remove Unused Values" width="90" x="313" y="34"/>
    <!-- Declares the value 999 as missing for the single attribute "CS Tumor Size".
         NOTE(review): the "attributes" list is also set although the filter type is "single";
         it looks like a leftover from a previous "subset" configuration.
         NOTE(review): no "mode" parameter is set, so the operator default decides whether
         numeric_value (999.0) or nominal_value (999) is the one applied; confirm. -->
    <operator activated="true" class="declare_missing_value" compatibility="7.6.001" expanded="true" height="82" name="Declare Missing Value" width="90" x="447" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="CS Tumor Size"/>
    <parameter key="attributes" value="CS Tumor Size|CS Site-Specific Factor 2|CS Lymph Nodes"/>
    <parameter key="numeric_value" value="999.0"/>
    <parameter key="nominal_value" value="999"/>
    </operator>
    <!-- Assigns the label role to "Survival months". -->
    <operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role" width="90" x="581" y="34">
    <parameter key="attribute_name" value="Survival months"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
    </operator>
    <!-- Deactivated (activated="false"): unique-integer coding of "CS Tumor Size".
         It is wired only to "Nominal to Numerical (2)", not to the active chain. -->
    <operator activated="false" class="nominal_to_numerical" compatibility="7.6.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="715" y="187">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="CS Tumor Size"/>
    <parameter key="coding_type" value="unique integers"/>
    <list key="comparison_groups"/>
    </operator>
    <!-- Deactivated (activated="false"), with a breakpoint set after it. No coding_type is given,
         so the operator default would apply to the three listed attributes if it were enabled. -->
    <operator activated="false" breakpoints="after" class="nominal_to_numerical" compatibility="7.6.001" expanded="true" height="103" name="Nominal to Numerical (2)" width="90" x="849" y="187">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attributes" value="Histologic Type ICD-O-3|RX Summ—Radiation|RX Summ—Surg / Rad Seq"/>
    <list key="comparison_groups"/>
    </operator>
    <!-- Imputes missing values using the model produced by the inner subprocess (k-NN).
         Special attributes are included (include_special_attributes=true) and order is "random".
         NOTE(review): "attribute" and "attributes" name "CS Tumor Size", but no
         attribute_filter_type is set here; whether those values take effect depends on the
         operator's default filter type. Confirm, since this may widen the set of attributes
         that get imputed. -->
    <operator activated="true" class="impute_missing_values" compatibility="7.6.001" expanded="true" height="68" name="Impute Missing Values" width="90" x="715" y="34">
    <parameter key="attribute" value="CS Tumor Size"/>
    <parameter key="attributes" value="CS Tumor Size"/>
    <parameter key="include_special_attributes" value="true"/>
    <parameter key="order" value="random"/>
    <process expanded="true">
    <!-- Inner learner: k-NN with no parameters set (operator defaults), trained on the
         example set source and delivering its model to the model sink. -->
    <operator activated="true" class="k_nn" compatibility="7.6.001" expanded="true" height="82" name="k-NN" width="90" x="112" y="34"/>
    <connect from_port="example set source" to_op="k-NN" to_port="training set"/>
    <connect from_op="k-NN" from_port="model" to_port="model sink"/>
    <portSpacing port="source_example set source" spacing="0"/>
    <portSpacing port="sink_model sink" spacing="0"/>
    </process>
    </operator>
    <!-- Deactivated (activated="false") and not referenced by any connect element below. -->
    <operator activated="false" class="store" compatibility="7.6.001" expanded="true" height="68" name="Store" width="90" x="916" y="187">
    <parameter key="repository_entry" value="//Tese/TESE/Data/ContinuousSurvival/Colon/colonTrainNumericImputed"/>
    </operator>
    <!-- Wiring of the top-level process. Only the chain ending in "Impute Missing Values"
         reaches a result port; the Nominal to Numerical pair is connected only to itself. -->
    <connect from_op="Retrieve dataset" from_port="output" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Remove Unused Values" to_port="example set input"/>
    <connect from_op="Remove Unused Values" from_port="example set output" to_op="Declare Missing Value" to_port="example set input"/>
    <connect from_op="Declare Missing Value" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Impute Missing Values" to_port="example set in"/>
    <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Nominal to Numerical (2)" to_port="example set input"/>
    <connect from_op="Impute Missing Values" from_port="example set out" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>

    Scott