Hi,
During my last calculations I did some runs with the Random Forest and the Weka Random Forest operators using various options. Although I tried to make both operators equivalent (using the same number of trees, same local random seed, no minimal split features etc.), the performance of the two operators was still different. Here is the workflow I used for benchmarking the two operators.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Benchmark process comparing two random forest learners on the same data:
  the operator of class "random_forest" (upper row, y="30") and the operator
  of class "weka:W-RandomForest" (lower row, y="120").
  Each learner's model is applied to the example set coming out of that
  learner's own "exampleSet" port, and the resulting performance vector is
  delivered to result port 1 (RM) or result port 2 (Weka).
-->
<process version="5.1.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="5.1.001"
            expanded="true"
            name="Process">
    <process expanded="true" height="269" width="949">

      <!-- Shared data preparation: generated data with 20 attributes. -->
      <operator activated="true"
                class="generate_data"
                compatibility="5.1.001"
                expanded="true"
                height="60"
                name="Generate Data"
                width="90"
                x="45"
                y="30">
        <parameter key="number_of_attributes" value="20"/>
      </operator>

      <!-- Discretizes only the "label" attribute using the two entries of the
           "classes" list: "first" with value 0.5 and "last" with value Infinity. -->
      <operator activated="true"
                class="discretize_by_user_specification"
                compatibility="5.1.001"
                expanded="true"
                height="94"
                name="Discretize"
                width="90"
                x="179"
                y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="include_special_attributes" value="true"/>
        <list key="classes">
          <parameter key="first" value="0.5"/>
          <parameter key="last" value="Infinity"/>
        </list>
      </operator>

      <!-- Converts the "label" attribute to binominal; the performance operators
           below are of class performance_binominal_classification. -->
      <operator activated="true"
                class="nominal_to_binominal"
                compatibility="5.1.001"
                expanded="true"
                height="94"
                name="Nominal to Binominal"
                width="90"
                x="313"
                y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="label"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>

      <!-- Fans the prepared example set out to both learners
           ("output 1" to Random Forest, "output 2" to W-RandomForest). -->
      <operator activated="true"
                class="multiply"
                compatibility="5.1.001"
                expanded="true"
                height="94"
                name="Multiply"
                width="90"
                x="447"
                y="30"/>

      <!-- Weka branch. Only S, depth and D are set explicitly; every other
           option of this operator is left at whatever its default is. -->
      <operator activated="true"
                class="weka:W-RandomForest"
                compatibility="5.0.001"
                expanded="true"
                height="76"
                name="W-RandomForest"
                width="90"
                x="581"
                y="120">
        <parameter key="S" value="1992.0"/>
        <parameter key="depth" value="5"/>
        <parameter key="D" value="true"/>
      </operator>
      <operator activated="true"
                class="apply_model"
                compatibility="5.1.001"
                expanded="true"
                height="76"
                name="Apply Model (2)"
                width="90"
                x="715"
                y="120">
        <list key="application_parameters"/>
      </operator>
      <operator activated="true"
                class="performance_binominal_classification"
                compatibility="5.1.001"
                expanded="true"
                height="76"
                name="Weka Random Forest"
                width="90"
                x="849"
                y="120">
        <parameter key="f_measure" value="true"/>
        <parameter key="youden" value="true"/>
      </operator>

      <!-- RapidMiner branch. use_local_random_seed is enabled, but no explicit
           local_random_seed value and no number of trees is given in this file,
           so the operator's defaults apply for those.
           NOTE(review): confirm these defaults match the Weka settings above. -->
      <operator activated="true"
                class="random_forest"
                compatibility="5.1.001"
                expanded="true"
                height="76"
                name="Random Forest"
                width="90"
                x="581"
                y="30">
        <parameter key="minimal_size_for_split" value="1"/>
        <parameter key="minimal_leaf_size" value="1"/>
        <parameter key="minimal_gain" value="0.0"/>
        <parameter key="maximal_depth" value="5"/>
        <parameter key="confidence" value="1.0E-7"/>
        <parameter key="no_pre_pruning" value="true"/>
        <parameter key="no_pruning" value="true"/>
        <parameter key="use_local_random_seed" value="true"/>
      </operator>
      <operator activated="true"
                class="apply_model"
                compatibility="5.1.001"
                expanded="true"
                height="76"
                name="Apply Model"
                width="90"
                x="715"
                y="30">
        <list key="application_parameters"/>
      </operator>
      <operator activated="true"
                class="performance_binominal_classification"
                compatibility="5.1.001"
                expanded="true"
                height="76"
                name="RM Random Forest"
                width="90"
                x="849"
                y="30">
        <parameter key="f_measure" value="true"/>
        <parameter key="youden" value="true"/>
      </operator>

      <!-- Wiring: shared preparation chain. -->
      <connect from_op="Generate Data" from_port="output"
               to_op="Discretize" to_port="example set input"/>
      <connect from_op="Discretize" from_port="example set output"
               to_op="Nominal to Binominal" to_port="example set input"/>
      <connect from_op="Nominal to Binominal" from_port="example set output"
               to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1"
               to_op="Random Forest" to_port="training set"/>
      <connect from_op="Multiply" from_port="output 2"
               to_op="W-RandomForest" to_port="training set"/>

      <!-- Wiring: Weka branch, ending in result 2. -->
      <connect from_op="W-RandomForest" from_port="model"
               to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="W-RandomForest" from_port="exampleSet"
               to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Apply Model (2)" from_port="labelled data"
               to_op="Weka Random Forest" to_port="labelled data"/>
      <connect from_op="Weka Random Forest" from_port="performance"
               to_port="result 2"/>

      <!-- Wiring: RapidMiner branch, ending in result 1. -->
      <connect from_op="Random Forest" from_port="model"
               to_op="Apply Model" to_port="model"/>
      <connect from_op="Random Forest" from_port="exampleSet"
               to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Apply Model" from_port="labelled data"
               to_op="RM Random Forest" to_port="labelled data"/>
      <connect from_op="RM Random Forest" from_port="performance"
               to_port="result 1"/>

      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
|
Is RM using a different implementation of Random Forest, and if so, what are the differences?
Best regards,
Markus