<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (version 5.0).
  Operator chain, as wired by the connect elements below:
    Generate Churn Data (1000 examples)
      to Nominal to Binominal
      to Remap Binominals (attribute "label": negative value "ok", positive value "terminate")
      to Validation (split_validation, split_ratio 0.7, stratified sampling).
  Inside Validation, the first subprocess trains a Decision Tree on the "training" port;
  the second applies the model to the "test set" port and feeds the result to Performance.
  The Validation model and its first averagable output go to process results 1 and 2.
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <parameter key="logverbosity" value="3"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="1"/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="parallelize_main_process" value="false"/>
    <process expanded="true" height="422" width="547">
      <operator activated="true" class="generate_churn_data" expanded="true" height="60" name="Generate Churn Data" width="90" x="45" y="75">
        <parameter key="number_examples" value="1000"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <operator activated="true" class="nominal_to_binominal" expanded="true" height="94" name="Nominal to Binominal" width="90" x="179" y="120">
        <parameter key="return_preprocessing_model" value="false"/>
        <parameter key="create_view" value="false"/>
        <parameter key="attribute_filter_type" value="0"/>
        <parameter key="attribute" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="0"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="4"/>
        <parameter key="block_type" value="0"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="0"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="transform_binominal" value="false"/>
        <parameter key="use_underscore_in_name" value="false"/>
      </operator>
      <operator activated="true" class="remap_binominals" expanded="true" height="76" name="Remap Binominals" width="90" x="246" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="0"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="0"/>
        <parameter key="block_type" value="0"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="0"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="negative_value" value="ok"/>
        <parameter key="positive_value" value="terminate"/>
      </operator>
      <operator activated="true" class="split_validation" expanded="true" height="112" name="Validation" width="90" x="380" y="75">
        <parameter key="create_complete_model" value="false"/>
        <parameter key="split" value="1"/>
        <parameter key="split_ratio" value="0.7"/>
        <parameter key="training_set_size" value="100"/>
        <parameter key="test_set_size" value="-1"/>
        <parameter key="sampling_type" value="stratified sampling"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="parallelize_training" value="false"/>
        <parameter key="parallelize_testing" value="false"/>
        <process expanded="true" height="443" width="207">
          <operator activated="true" class="decision_tree" expanded="true" height="76" name="Decision Tree" width="90" x="45" y="30">
            <parameter key="criterion" value="gain_ratio"/>
            <parameter key="minimal_size_for_split" value="4"/>
            <parameter key="minimal_leaf_size" value="2"/>
            <parameter key="minimal_gain" value="0.04"/>
            <parameter key="maximal_depth" value="20"/>
            <parameter key="confidence" value="0.25"/>
            <parameter key="number_of_prepruning_alternatives" value="3"/>
            <parameter key="no_pre_pruning" value="false"/>
            <parameter key="no_pruning" value="false"/>
          </operator>
          <connect from_port="training" to_op="Decision Tree" to_port="training set"/>
          <connect from_op="Decision Tree" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true" height="443" width="255">
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="performance" expanded="true" height="76" name="Performance" width="90" x="155" y="30">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Generate Churn Data" from_port="output" to_op="Nominal to Binominal" to_port="example set input"/>
      <connect from_op="Nominal to Binominal" from_port="example set output" to_op="Remap Binominals" to_port="example set input"/>
      <connect from_op="Remap Binominals" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="36"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="162"/>
      <portSpacing port="sink_result 4" spacing="54"/>
    </process>
  </operator>
</process>
You can also see the plot of the ROC curve produced by RM in this case: indeed, the area under this curve is 1. Therefore, the AUC indicator should evaluate to 1.
However, this suggestion may be more useful to consider once the ROC analysis implemented in RapidMiner has been revised, as it is still unreliable in this package (i.e. the AUC calculation needs corrections, as I have shown on the forum at http://rapid-i.com/rapidforum/index.php?PHPSESSID=18d6261d2d63b2ca946477f03c2552bc&topic=2237.0, and the Find Threshold operator does not find the best threshold as expected but provides suboptimal solutions - I emailed a complete report to the RM development team, with relevant processes illustrating this).
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (version 5.0, operators marked compatibility 5.0.0).
  Generate Churn Data (1000 examples) is connected directly to Validation
  (split_validation, stratified sampling). The Nominal to Binominal and
  Remap Binominals operators are present but deactivated (activated="false")
  and no connect element references them.
  Inside Validation, the first subprocess trains a Decision Tree on the "training" port;
  the second applies the model to the "test set" port and feeds the result to Performance.
  The Validation model and its first averagable output go to process results 1 and 2.
-->
<process version="5.0">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.0" expanded="true" name="Process">
    <process expanded="true" height="422" width="547">
      <operator activated="true" class="generate_churn_data" compatibility="5.0.0" expanded="true" height="60" name="Generate Churn Data" width="90" x="45" y="75">
        <parameter key="number_examples" value="1000"/>
      </operator>
      <operator activated="false" class="nominal_to_binominal" compatibility="5.0.0" expanded="true" height="94" name="Nominal to Binominal" width="90" x="179" y="120"/>
      <operator activated="false" class="remap_binominals" compatibility="5.0.0" expanded="true" height="76" name="Remap Binominals" width="90" x="246" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="negative_value" value="ok"/>
        <parameter key="positive_value" value="terminate"/>
      </operator>
      <operator activated="true" class="split_validation" compatibility="5.0.0" expanded="true" height="112" name="Validation" width="90" x="380" y="75">
        <parameter key="sampling_type" value="stratified sampling"/>
        <process expanded="true" height="443" width="207">
          <operator activated="true" class="decision_tree" compatibility="5.0.0" expanded="true" height="76" name="Decision Tree" width="90" x="45" y="30">
            <parameter key="minimal_gain" value="0.04"/>
          </operator>
          <connect from_port="training" to_op="Decision Tree" to_port="training set"/>
          <connect from_op="Decision Tree" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true" height="443" width="255">
          <operator activated="true" class="apply_model" compatibility="5.0.0" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.0.0" expanded="true" height="76" name="Performance" width="90" x="155" y="30"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Generate Churn Data" from_port="output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="36"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="162"/>
    </process>
  </operator>
</process>
Am I missing something?