Hi,
I am working with random forest learners and I was wondering whether normalisation affects the outcome of Random Forest predictions. Against my expectations, it does. Here is the workflow I used, with randomly generated data (so everyone can run it).
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.0 process comparing Random Forest predictions on the same
  generated data set under three preprocessing variants:
    1. no normalisation            (subprocess "CV-DT + performance (3)")
    2. Normalize, default method   (operator "Z-Transformation",
                                    subprocess "CV-DT + performance (2)")
    3. Normalize, range transform  (operator "Range Transortmation",
                                    subprocess "CV-DT + performance")
  Each subprocess trains a Random Forest, applies the model to the example
  set passed through by the learner (i.e. the training data itself), computes
  binominal classification performance, and delivers the prediction column
  renamed to no_normalisation / z_normalisation / range_normalisation.
  The three prediction sets are joined and a per-example standard deviation
  over the *normalisation* columns is generated; the performance vectors are
  routed through the Log operator.

  NOTE(review): none of the Random Forest operators sets a random seed
  parameter. If the learners draw from a shared random generator, the three
  forests may differ regardless of normalisation. TODO confirm against the
  Random Forest operator documentation.
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.8" expanded="true" name="Process">
    <process expanded="true" height="476" width="1552">
      <!-- Data preparation: generate 200 examples with 4 attributes, discretize the label into two classes, add an id. -->
      <operator activated="true" class="generate_data" compatibility="5.0.8" expanded="true" height="60" name="Generate Data" width="90" x="45" y="30">
        <parameter key="number_examples" value="200"/>
        <parameter key="number_of_attributes" value="4"/>
      </operator>
      <operator activated="true" class="discretize_by_user_specification" compatibility="5.0.8" expanded="true" height="94" name="Discretize" width="90" x="179" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="include_special_attributes" value="true"/>
        <list key="classes">
          <parameter key="0" value="0.5"/>
          <parameter key="1" value="1.0"/>
        </list>
      </operator>
      <operator activated="true" class="nominal_to_binominal" compatibility="5.0.8" expanded="true" height="94" name="Nominal to Binominal" width="90" x="313" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <operator activated="true" class="generate_id" compatibility="5.0.8" expanded="true" height="76" name="Generate ID" width="90" x="447" y="30"/>
      <operator activated="true" class="multiply" compatibility="5.0.8" expanded="true" height="112" name="Multiply" width="90" x="581" y="30"/>
      <!-- Branch 1: Random Forest on the data without normalisation. -->
      <operator activated="true" class="subprocess" compatibility="5.0.8" expanded="true" height="112" name="CV-DT + performance (3)" width="90" x="849" y="30">
        <process expanded="true" height="550" width="915">
          <operator activated="true" class="random_forest" compatibility="5.0.8" expanded="true" height="76" name="Random Forest (2)" width="90" x="45" y="30">
            <parameter key="criterion" value="gini_index"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model (6)" width="90" x="180" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.0.8" expanded="true" height="94" name="Multiply (4)" width="90" x="315" y="30"/>
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="Performance no" width="90" x="447" y="30">
            <parameter key="youden" value="true"/>
            <parameter key="psep" value="true"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="5.0.8" expanded="true" height="76" name="Select Attributes (4)" width="90" x="447" y="120">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value=".*prediction.*|label|id"/>
          </operator>
          <operator activated="true" class="rename" compatibility="5.0.8" expanded="true" height="76" name="Rename (4)" width="90" x="581" y="120">
            <parameter key="old_name" value="prediction(label)"/>
            <parameter key="new_name" value="no_normalisation"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="5.0.8" expanded="true" height="76" name="Set Role (3)" width="90" x="715" y="120">
            <parameter key="name" value="no_normalisation"/>
          </operator>
          <connect from_port="in 1" to_op="Random Forest (2)" to_port="training set"/>
          <connect from_op="Random Forest (2)" from_port="model" to_op="Apply Model (6)" to_port="model"/>
          <connect from_op="Random Forest (2)" from_port="exampleSet" to_op="Apply Model (6)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (6)" from_port="labelled data" to_op="Multiply (4)" to_port="input"/>
          <connect from_op="Apply Model (6)" from_port="model" to_port="out 2"/>
          <connect from_op="Multiply (4)" from_port="output 1" to_op="Performance no" to_port="labelled data"/>
          <connect from_op="Multiply (4)" from_port="output 2" to_op="Select Attributes (4)" to_port="example set input"/>
          <connect from_op="Performance no" from_port="performance" to_port="out 1"/>
          <connect from_op="Select Attributes (4)" from_port="example set output" to_op="Rename (4)" to_port="example set input"/>
          <connect from_op="Rename (4)" from_port="example set output" to_op="Set Role (3)" to_port="example set input"/>
          <connect from_op="Set Role (3)" from_port="example set output" to_port="out 3"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <!-- Branch 2: Normalize with its default method, then Random Forest. -->
      <operator activated="true" class="normalize" compatibility="5.0.8" expanded="true" height="94" name="Z-Transformation" width="90" x="715" y="165"/>
      <operator activated="true" class="subprocess" compatibility="5.0.8" expanded="true" height="112" name="CV-DT + performance (2)" width="90" x="849" y="165">
        <process expanded="true" height="550" width="892">
          <operator activated="true" class="random_forest" compatibility="5.0.8" expanded="true" height="76" name="Random Forest (3)" width="90" x="45" y="30">
            <parameter key="criterion" value="gini_index"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model (4)" width="90" x="180" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.0.8" expanded="true" height="94" name="Multiply (3)" width="90" x="315" y="30"/>
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="Performance z" width="90" x="447" y="30">
            <parameter key="youden" value="true"/>
            <parameter key="psep" value="true"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="5.0.8" expanded="true" height="76" name="Select Attributes (3)" width="90" x="447" y="120">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value=".*prediction.*|label|id"/>
          </operator>
          <operator activated="true" class="rename" compatibility="5.0.8" expanded="true" height="76" name="Rename (3)" width="90" x="581" y="120">
            <parameter key="old_name" value="prediction(label)"/>
            <parameter key="new_name" value="z_normalisation"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="5.0.8" expanded="true" height="76" name="Set Role" width="90" x="715" y="120">
            <parameter key="name" value="z_normalisation"/>
          </operator>
          <connect from_port="in 1" to_op="Random Forest (3)" to_port="training set"/>
          <connect from_op="Random Forest (3)" from_port="model" to_op="Apply Model (4)" to_port="model"/>
          <connect from_op="Random Forest (3)" from_port="exampleSet" to_op="Apply Model (4)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (4)" from_port="labelled data" to_op="Multiply (3)" to_port="input"/>
          <connect from_op="Apply Model (4)" from_port="model" to_port="out 2"/>
          <connect from_op="Multiply (3)" from_port="output 1" to_op="Performance z" to_port="labelled data"/>
          <connect from_op="Multiply (3)" from_port="output 2" to_op="Select Attributes (3)" to_port="example set input"/>
          <connect from_op="Performance z" from_port="performance" to_port="out 1"/>
          <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Rename (3)" to_port="example set input"/>
          <connect from_op="Rename (3)" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_port="out 3"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="join" compatibility="5.0.8" expanded="true" height="76" name="Join" width="90" x="983" y="30"/>
      <!-- Branch 3: Normalize with method "range transformation", then Random Forest. -->
      <operator activated="true" class="normalize" compatibility="5.0.8" expanded="true" height="94" name="Range Transortmation" width="90" x="715" y="300">
        <parameter key="method" value="range transformation"/>
      </operator>
      <operator activated="true" class="subprocess" compatibility="5.0.8" expanded="true" height="112" name="CV-DT + performance" width="90" x="849" y="300">
        <process expanded="true" height="550" width="910">
          <operator activated="true" class="random_forest" compatibility="5.0.8" expanded="true" height="76" name="Random Forest" width="90" x="45" y="30">
            <parameter key="criterion" value="gini_index"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true" height="76" name="Apply Model" width="90" x="179" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.0.8" expanded="true" height="94" name="Multiply (2)" width="90" x="313" y="30"/>
          <operator activated="true" class="performance_binominal_classification" compatibility="5.0.8" expanded="true" height="76" name="Performance range" width="90" x="447" y="30">
            <parameter key="youden" value="true"/>
            <parameter key="psep" value="true"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="5.0.8" expanded="true" height="76" name="Select Attributes (2)" width="90" x="449" y="120">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value=".*prediction.*|label|id"/>
          </operator>
          <operator activated="true" class="rename" compatibility="5.0.8" expanded="true" height="76" name="Rename (2)" width="90" x="581" y="120">
            <parameter key="old_name" value="prediction(label)"/>
            <parameter key="new_name" value="range_normalisation"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="5.0.8" expanded="true" height="76" name="Set Role (2)" width="90" x="715" y="120">
            <parameter key="name" value="range_normalisation"/>
          </operator>
          <connect from_port="in 1" to_op="Random Forest" to_port="training set"/>
          <connect from_op="Random Forest" from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_op="Random Forest" from_port="exampleSet" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Apply Model" from_port="model" to_port="out 2"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="Performance range" to_port="labelled data"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Performance range" from_port="performance" to_port="out 1"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
          <connect from_op="Rename (2)" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_port="out 3"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <!-- Logging: youden and accuracy values of the three Performance operators. -->
      <operator activated="true" class="log" compatibility="5.0.8" expanded="true" height="112" name="Log" width="90" x="1117" y="300">
        <list key="log">
          <parameter key="youden_range" value="operator.Performance range.value.youden"/>
          <parameter key="youden_z" value="operator.Performance z.value.youden"/>
          <parameter key="youden_no" value="operator.Performance no.value.youden"/>
          <parameter key="accuracy_range" value="operator.Performance range.value.accuracy"/>
          <parameter key="accuracy_z" value="operator.Performance z.value.accuracy"/>
          <parameter key="accuracy_no" value="operator.Performance no.value.accuracy"/>
        </list>
      </operator>
      <operator activated="true" class="log_to_data" compatibility="5.0.8" expanded="true" height="94" name="Log to Data" width="90" x="1251" y="300"/>
      <!-- Comparison: join the three prediction sets and compute the standard deviation across the *normalisation* columns. -->
      <operator activated="true" class="join" compatibility="5.0.8" expanded="true" height="76" name="Join (2)" width="90" x="1117" y="30"/>
      <operator activated="true" class="nominal_to_numerical" compatibility="5.0.8" expanded="true" height="94" name="Nominal to Numerical" width="90" x="1251" y="30">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value=".*normalisation.*"/>
      </operator>
      <operator activated="true" class="generate_aggregation" compatibility="5.0.8" expanded="true" height="76" name="Stdev over all labels" width="90" x="1385" y="30">
        <parameter key="attribute_name" value="stdev_normalisation"/>
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value=".*normalisation.*"/>
        <parameter key="aggregation_function" value="standard_deviation"/>
      </operator>
      <connect from_op="Generate Data" from_port="output" to_op="Discretize" to_port="example set input"/>
      <connect from_op="Discretize" from_port="example set output" to_op="Nominal to Binominal" to_port="example set input"/>
      <connect from_op="Nominal to Binominal" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="CV-DT + performance (3)" to_port="in 1"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Z-Transformation" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 3" to_op="Range Transortmation" to_port="example set input"/>
      <connect from_op="CV-DT + performance (3)" from_port="out 1" to_op="Log" to_port="through 1"/>
      <connect from_op="CV-DT + performance (3)" from_port="out 3" to_op="Join" to_port="left"/>
      <connect from_op="Z-Transformation" from_port="example set output" to_op="CV-DT + performance (2)" to_port="in 1"/>
      <connect from_op="CV-DT + performance (2)" from_port="out 1" to_op="Log" to_port="through 2"/>
      <connect from_op="CV-DT + performance (2)" from_port="out 3" to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join" to_op="Join (2)" to_port="left"/>
      <connect from_op="Range Transortmation" from_port="example set output" to_op="CV-DT + performance" to_port="in 1"/>
      <connect from_op="CV-DT + performance" from_port="out 1" to_op="Log" to_port="through 3"/>
      <connect from_op="CV-DT + performance" from_port="out 3" to_op="Join (2)" to_port="right"/>
      <connect from_op="Log" from_port="through 1" to_op="Log to Data" to_port="through 1"/>
      <connect from_op="Join (2)" from_port="join" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Stdev over all labels" to_port="example set input"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process>
|
I don't know why normalisation affects the performance and even the classification of some examples. I would be happy if someone could explain this to me.
Cheers,
Markus