I have a process with a training and testing part. In the testing branch, I have:
Read Database --> Set Role (for ID) --> Numerical to Binominal --> Apply Model (normalization model from the training part)
--> Set Role (for Label) --> Apply Model (SVM classifier) --> Performance (Binominal Classification)
I would like to keep the original data in the output data set for reference, so I inserted a 'Generate Copy in Testing' operator before Numerical to Binominal. This completely changes the performance result.
Here is what I don't understand - are these bugs or features?
- Obviously, my testing dataset now has one more attribute (the copied one). Is this expected to cause problems?
- I find that even connecting the 'ori' output of Generate Copy results in the same unexpected performance change...
- Hence, I took Generate Copy out of the testing thread: I now have an IO Multiplier and just look at the two outputs (exa, ori) of Generate Copy directly. Both contain the same copied attribute.
- In additional testing, I inserted Generate Copy in the training flow. I now have the eventual label in two columns: once as raw data, and once to be converted to a binominal label for training purposes. So, in effect, I have a perfect predictor for the SVM. However, I'm surprised by the result I get with libSVM: I don't see the new attribute in the weight table (so it appears that the copied attribute hasn't made it to the SVM), but the weights of the other attributes have changed (so it must have had some impact).
I could spend a little more time making this a documented test case - but before I spend my time on it, I'd like to know what I should expect ...
Thanks for any help! Stefan
PS: This is on RM 5.0.001
Here is the code I refer to in the above text, with the exception that, for the sake of simplicity, I added both copy operators. In each case described above, only one of them was present. I also removed the two Read Database operators in front of the two Set Role operators that assign the ID role.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
<!-- Process context: one empty input location, five empty output
     locations, and no macros defined. -->
<context>
  <input>
    <location/>
  </input>
  <output>
    <location/>
    <location/>
    <location/>
    <location/>
    <location/>
  </output>
  <macros/>
</context>
<operator activated="true" class="process" expanded="true" name="Process">
<process expanded="true" height="791" width="882">
<!-- Training branch entry point: gives attribute "die_id" the "id" role.
     Its example set output is wired to "Generate Copy in Training".
     No operator is connected to its input in this listing. -->
<operator name="Set Role (3)"
          class="set_role"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="179"
          y="30">
  <parameter key="name" value="die_id"/>
  <parameter key="target_role" value="id"/>
</operator>
<!-- Testing branch entry point: gives attribute "die_id" the "id" role.
     Its example set output is wired to "Generate Copy in Testing".
     No operator is connected to its input in this listing. -->
<operator name="Set Role (2)"
          class="set_role"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="179"
          y="345">
  <parameter key="name" value="die_id"/>
  <parameter key="target_role" value="id"/>
</operator>
<!-- Training branch: copies attribute
     "32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_" into a new attribute
     named "32401_original". The source attribute is the one that later
     receives the "label" role in operator "Set Role".
     NOTE(review): no operator in this process assigns a role to
     "32401_original", so it presumably stays a regular attribute and is
     passed on to Normalize and SVM together with the other inputs.
     Confirm whether that is intended. -->
<operator name="Generate Copy in Training"
          class="generate_copy"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="246"
          y="165">
  <parameter key="attribute_name" value="32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_"/>
  <parameter key="new_name" value="32401_original"/>
</operator>
<!-- Training branch: numerical_to_binominal conversion restricted to the
     single attribute "32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_"
     (attribute_filter_type = single), with "max" set to 73.0.
     NOTE(review): "min" is not set in this listing, so the operator's
     default applies. Confirm the intended value range. -->
<operator name="Numerical to Binominal"
          class="numerical_to_binominal"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="313"
          y="30">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_"/>
  <parameter key="max" value="73.0"/>
</operator>
<!-- Training branch: normalize operator with no parameters set in this
     listing. Its example set output goes to "Set Role"; its preprocessing
     model output goes to "Apply Model (2)" in the testing branch. -->
<operator name="Normalize"
          class="normalize"
          activated="true"
          expanded="true"
          height="94"
          width="90"
          x="447"
          y="30"/>
<!-- Training branch: gives attribute
     "32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_" the "label" role.
     Receives the example set from "Normalize" and feeds "SVM". -->
<operator name="Set Role"
          class="set_role"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="581"
          y="30">
  <parameter key="name" value="32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_"/>
  <parameter key="target_role" value="label"/>
</operator>
<!-- Training branch: libSVM learner (class support_vector_machine_libsvm)
     with gamma = 1.0E-4, C = 200.0 and an empty class_weights list. No
     other parameters are set in this listing. Its model output goes to
     "Apply Model"; its exampleSet output goes to process result 4. -->
<operator name="SVM"
          class="support_vector_machine_libsvm"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="715"
          y="30">
  <parameter key="gamma" value="1.0E-4"/>
  <parameter key="C" value="200.0"/>
  <list key="class_weights"/>
</operator>
<!-- Testing branch: copies attribute
     "32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_" into a new attribute
     named "32401_original", using the same parameters as
     "Generate Copy in Training".
     NOTE(review): no operator in this process assigns a role to
     "32401_original", so it presumably reaches both Apply Model operators
     as a regular attribute. If the training data does not contain the same
     attribute, the two attribute sets differ. Confirm how the applied
     models handle that. -->
<operator name="Generate Copy in Testing"
          class="generate_copy"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="313"
          y="480">
  <parameter key="attribute_name" value="32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_"/>
  <parameter key="new_name" value="32401_original"/>
</operator>
<!-- Testing branch: numerical_to_binominal conversion restricted to the
     single attribute "32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_"
     (attribute_filter_type = single), with "max" set to 73.0. These are
     the same parameter values as in "Numerical to Binominal" on the
     training side.
     NOTE(review): "min" is not set in this listing, so the operator's
     default applies. Confirm the intended value range. -->
<operator name="Numerical to Binominal (2)"
          class="numerical_to_binominal"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="380"
          y="345">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_"/>
  <parameter key="max" value="73.0"/>
</operator>
<!-- Testing branch: applies the preprocessing model produced by
     "Normalize" to the example set coming from
     "Numerical to Binominal (2)". The application_parameters list is
     empty. Output goes to "Set Role (4)". -->
<operator name="Apply Model (2)"
          class="apply_model"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="447"
          y="165">
  <list key="application_parameters"/>
</operator>
<!-- Testing branch: gives attribute
     "32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_" the "label" role.
     Receives the example set from "Apply Model (2)" and feeds
     "Apply Model". -->
<operator name="Set Role (4)"
          class="set_role"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="581"
          y="300">
  <parameter key="name" value="32401_IFA_IM_6dB_L1_3_24_IF_IM_IFAD_P1_1_"/>
  <parameter key="target_role" value="label"/>
</operator>
<!-- Testing branch: applies the model produced by "SVM" to the example
     set coming from "Set Role (4)". The application_parameters list is
     empty. Its labelled data goes to "Performance (2)"; its model output
     goes to process result 1. -->
<operator name="Apply Model"
          class="apply_model"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="715"
          y="165">
  <list key="application_parameters"/>
</operator>
<!-- Testing branch: binominal classification performance on the labelled
     data from "Apply Model". The criteria switched on here are
     AUC (optimistic), precision, recall, false_positive, false_negative,
     true_positive and true_negative. The performance output goes to
     process result 2; the example set is passed on to "Write CSV". -->
<operator name="Performance (2)"
          class="performance_binominal_classification"
          activated="true"
          expanded="true"
          height="76"
          width="90"
          x="715"
          y="300">
  <parameter key="AUC (optimistic)" value="true"/>
  <parameter key="precision" value="true"/>
  <parameter key="recall" value="true"/>
  <parameter key="false_positive" value="true"/>
  <parameter key="false_negative" value="true"/>
  <parameter key="true_positive" value="true"/>
  <parameter key="true_negative" value="true"/>
</operator>
<!-- Testing branch: writes the example set received from
     "Performance (2)" to the CSV file named below, using "," as the
     column separator. Its "through" output goes to process result 3.
     NOTE(review): csv_file is an absolute, user-specific Windows path.
     Anyone else running this process will need to change it. -->
<operator name="Write CSV"
          class="write_csv"
          activated="true"
          expanded="true"
          height="60"
          width="90"
          x="782"
          y="480">
  <parameter key="csv_file" value="C:\Users\eichenbe\Documents\Backup\Laptop\LiveCopy\Software\RapidMiner_5\SVC_3Lots.csv"/>
  <parameter key="column_separator" value=","/>
</operator>
<!-- Wiring between operators. The original order of the connect elements
     is kept. "Set Role (3)" and "Set Role (2)" have no incoming
     connection in this listing. -->

<!-- Entry of the training branch, then entry of the testing branch. -->
<connect from_op="Set Role (3)" from_port="example set output"
         to_op="Generate Copy in Training" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output"
         to_op="Generate Copy in Testing" to_port="example set input"/>

<!-- Training branch: copy, binominal conversion, normalization, label
     role, SVM. The normalization model and the SVM model are handed to
     the testing branch. -->
<connect from_op="Generate Copy in Training" from_port="example set output"
         to_op="Numerical to Binominal" to_port="example set input"/>
<connect from_op="Numerical to Binominal" from_port="example set output"
         to_op="Normalize" to_port="example set input"/>
<connect from_op="Normalize" from_port="example set output"
         to_op="Set Role" to_port="example set input"/>
<connect from_op="Normalize" from_port="preprocessing model"
         to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Set Role" from_port="example set output"
         to_op="SVM" to_port="training set"/>
<connect from_op="SVM" from_port="model"
         to_op="Apply Model" to_port="model"/>
<connect from_op="SVM" from_port="exampleSet"
         to_port="result 4"/>

<!-- Testing branch: copy, binominal conversion, apply normalization
     model, label role, apply SVM model, performance, CSV export. -->
<connect from_op="Generate Copy in Testing" from_port="example set output"
         to_op="Numerical to Binominal (2)" to_port="example set input"/>
<connect from_op="Numerical to Binominal (2)" from_port="example set output"
         to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Apply Model (2)" from_port="labelled data"
         to_op="Set Role (4)" to_port="example set input"/>
<connect from_op="Set Role (4)" from_port="example set output"
         to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data"
         to_op="Performance (2)" to_port="labelled data"/>
<connect from_op="Apply Model" from_port="model"
         to_port="result 1"/>
<connect from_op="Performance (2)" from_port="performance"
         to_port="result 2"/>
<connect from_op="Performance (2)" from_port="example set"
         to_op="Write CSV" to_port="input"/>
<connect from_op="Write CSV" from_port="through"
         to_port="result 3"/>
<!-- Port spacing entries for the process-level ports: one source input
     and five result sinks, all with spacing 0. Only results 1 to 4 are
     connected in this process. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1"  spacing="0"/>
<portSpacing port="sink_result 2"  spacing="0"/>
<portSpacing port="sink_result 3"  spacing="0"/>
<portSpacing port="sink_result 4"  spacing="0"/>
<portSpacing port="sink_result 5"  spacing="0"/>
</process>
</operator>
</process>