🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Weka Random forest constantly better than Random Forest Rapidminer

User: "Fred12"
New Altair Community Member
Updated by Jocelyn

hi,

I teste W-RAndom Forest and Random Forest from Rapidminer on the same dataset, for W-RF, I got around 89%, whereas for Random Forest I got only 76%, why is that? I thought the Algorithm / Method is the same? Are the implementations so entirely different that I get such a performance discrepancy?

Find more posts tagged with

Sort by:
1 - 3 of 31
    User: "Thomas_Ott"
    New Altair Community Member

    Are you comparing it with the same splitting criteron? This post says that W-RF uses information criteron to split: http://stackoverflow.com/questions/30150970/what-splitting-criterion-does-random-tree-in-weka-3-7-11-use-for-numerical-attri

     

    When I do that, the results of the attached Iris data set works the same.

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.4.000">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.4.000" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="retrieve" compatibility="7.4.000" expanded="true" height="68" name="Retrieve Iris" width="90" x="45" y="187">
    <parameter key="repository_entry" value="//Samples/data/Iris"/>
    </operator>
    <operator activated="true" class="multiply" compatibility="7.4.000" expanded="true" height="103" name="Multiply" width="90" x="179" y="187"/>
    <operator activated="true" class="concurrency:cross_validation" compatibility="7.4.000" expanded="true" height="145" name="Validation (2)" width="90" x="313" y="238">
    <parameter key="sampling_type" value="stratified sampling"/>
    <process expanded="true">
    <operator activated="true" class="weka:W-RandomForest" compatibility="7.3.000" expanded="true" height="82" name="W-RandomForest" width="90" x="112" y="34">
    <parameter key="depth" value="20"/>
    </operator>
    <connect from_port="training set" to_op="W-RandomForest" to_port="training set"/>
    <connect from_op="W-RandomForest" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
    </process>
    <process expanded="true">
    <operator activated="true" class="apply_model" compatibility="7.4.000" expanded="true" height="82" name="Apply Model (2)" width="90" x="45" y="34">
    <list key="application_parameters"/>
    </operator>
    <operator activated="true" class="performance" compatibility="7.4.000" expanded="true" height="82" name="Performance (2)" width="90" x="179" y="34"/>
    <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
    <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
    <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
    <connect from_op="Performance (2)" from_port="performance" to_port="performance 1"/>
    <connect from_op="Performance (2)" from_port="example set" to_port="test set results"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
    <description align="left" color="blue" colored="true" height="103" resized="false" width="315" x="38" y="137">The model created in the Training step is applied to the current test set (10 %).&lt;br/&gt;The performance is evaluated and sent to the operator results.</description>
    </process>
    </operator>
    <operator activated="true" class="concurrency:cross_validation" compatibility="7.4.000" expanded="true" height="145" name="Validation" width="90" x="313" y="34">
    <parameter key="sampling_type" value="stratified sampling"/>
    <process expanded="true">
    <operator activated="true" class="concurrency:parallel_random_forest" compatibility="7.4.000" expanded="true" height="82" name="Random Forest" width="90" x="179" y="34">
    <parameter key="criterion" value="information_gain"/>
    </operator>
    <connect from_port="training set" to_op="Random Forest" to_port="training set"/>
    <connect from_op="Random Forest" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
    </process>
    <process expanded="true">
    <operator activated="true" class="apply_model" compatibility="7.4.000" expanded="true" height="82" name="Apply Model" width="90" x="45" y="34">
    <list key="application_parameters"/>
    </operator>
    <operator activated="true" class="performance" compatibility="7.4.000" expanded="true" height="82" name="Performance" width="90" x="179" y="34"/>
    <connect from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
    <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
    <connect from_op="Performance" from_port="example set" to_port="test set results"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
    <description align="left" color="blue" colored="true" height="103" resized="true" width="315" x="38" y="137">The model created in the Training step is applied to the current test set (10 %).&lt;br/&gt;The performance is evaluated and sent to the operator results.</description>
    </process>
    </operator>
    <connect from_op="Retrieve Iris" from_port="output" to_op="Multiply" to_port="input"/>
    <connect from_op="Multiply" from_port="output 1" to_op="Validation" to_port="example set"/>
    <connect from_op="Multiply" from_port="output 2" to_op="Validation (2)" to_port="example set"/>
    <connect from_op="Validation (2)" from_port="performance 1" to_port="result 2"/>
    <connect from_op="Validation" from_port="performance 1" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    </process>
    </operator>
    </process>

     

    User: "Fred12"
    New Altair Community Member
    OP

    that might be the problem, I used gain ratio I will try out information gain

    EDIT: with information gain I also got around 77%-... but my dataset is far harder than iris data...

    User: "Fred12"
    New Altair Community Member
    OP

    is there a solution found to that now?

     

    I think it is rather the Random forest implementation from Rapidminer that causes the results rather than any parameter settings...

    I mean its a quite big difference, someone should check that...