Altair RISE

A program to recognize and reward our most engaged community members

Nominate Yourself Now!

Test set beating training set

Hello,

I have begun using RapidMiner recently and am having a strange problem with one of my workflows. I have split a dataset using the Split Data operator, then built a random forest on the 90% training set and applied that model to the 10% test set. However, when I assess the performance, the test set consistently does better, even as I vary the seeds. This result seems counterintuitive, and I'm wondering whether I have interpreted one of the parameters wrongly or am missing a detail.

By the way, I am aware that there are more efficient ways to set up this flow; I am trying alternative ways as a bit of practice.

Thanks

Find more posts tagged with

AI Studio

Accepted answers

All comments

Thomas_Ott

Ok, you have to be careful here with your setup because the results are misleading based on your choice of partition size for the Split operator. Why 90% and 10%? Why not 85% and 15%? You will get varying results based on the size of your split, seed, and how you split the data. I noticed you used stratified sampling, which samples your data according to the class distribution of survivorship (Yes/No), so you can get strange results there.

What I suggest is to use a Cross Validation operator, as your setup appears to try to mimic that thought process. I ran the process below after changing the seed and found that the Training Perf is slightly better than the Test Perf. Then I added a Cross Validation and measured the results there.

Also, you can use the Select Attributes operator to select the attributes you want in lieu of the R script if you want.

<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
  <!-- Compares two ways of evaluating a random forest on the Titanic sample data:
       (1) a single 90/10 Split Data hold-out, scored on both the training and the test partition;
       (2) a Cross Validation over the full data set.
       Delivered results: result 1 = training performance, result 2 = hold-out test performance,
       result 3 = cross-validated performance. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.003" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Data preparation: load the sample data, keep four attributes, mark Survived as the label. -->
      <operator activated="true" class="retrieve" compatibility="7.5.003" expanded="true" height="68" name="Retrieve Titanic" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Samples/data/Titanic"/>
      </operator>
      <!-- Deactivated and not connected to anything. The script selects the columns
           Sex, Passenger.Class, Age and Survived; Select Attributes below does that job instead. -->
      <operator activated="false" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="68" name="Execute R" width="90" x="45" y="340">
        <parameter key="script" value="&#10;rm_main = function(data)&#10;{&#10;  return(data[,c(&quot;Sex&quot;, &quot;Passenger.Class&quot;, &quot;Age&quot;, &quot;Survived&quot;)])&#10;}&#10;"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="7.5.003" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="Passenger Class|Sex|Survived|Age"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="7.5.003" expanded="true" height="82" name="Set Role" width="90" x="313" y="34">
        <parameter key="attribute_name" value="Survived"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Feeds the same prepared data to both the hold-out branch and the cross validation. -->
      <operator activated="true" class="multiply" compatibility="7.5.003" expanded="true" height="103" name="Multiply" width="90" x="179" y="289"/>
      <!-- Branch 2: cross validation. Uses shuffled sampling, whereas Split Data below uses
           stratified sampling. No local random seed is set on this operator. -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="7.5.003" expanded="true" height="145" name="Validation" width="90" x="380" y="340">
        <parameter key="sampling_type" value="shuffled sampling"/>
        <process expanded="true">
          <operator activated="true" class="concurrency:parallel_random_forest" compatibility="7.5.003" expanded="true" height="82" name="RF for Xval" width="90" x="179" y="34">
            <parameter key="use_local_random_seed" value="true"/>
            <parameter key="local_random_seed" value="2110"/>
          </operator>
          <connect from_port="training set" to_op="RF for Xval" to_port="training set"/>
          <connect from_op="RF for Xval" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
          <description align="left" color="green" colored="true" height="113" resized="true" width="284" x="85" y="148">Builds a random forest on the current training data set (90 % of the data by default, 10 times).</description>
        </process>
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="7.5.003" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="7.5.003" expanded="true" height="82" name="Performance" width="90" x="179" y="34"/>
          <connect from_port="model" to_op="Apply Model (3)" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
          <connect from_op="Performance" from_port="example set" to_port="test set results"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
          <description align="left" color="blue" colored="true" height="107" resized="true" width="333" x="28" y="139">Applies the model built from the training data set on the current test set (10 % by default).&lt;br/&gt;The Performance operator calculates performance indicators and sends them to the operator result.</description>
        </process>
        <description align="center" color="transparent" colored="false" width="126">A cross validation including a random forest.</description>
      </operator>
      <!-- Branch 1: single hold-out split. Partition 1 (ratio 0.9) is the training set and
           partition 2 (ratio 0.1) is the test set. The local random seed set here is the one that
           governs this split; the seed on Random Forest is a separate parameter of the learner. -->
      <operator activated="true" class="split_data" compatibility="7.5.003" expanded="true" height="103" name="Split Data" width="90" x="380" y="187">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.9"/>
          <parameter key="ratio" value="0.1"/>
        </enumeration>
        <parameter key="sampling_type" value="stratified sampling"/>
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="2000"/>
      </operator>
      <!-- The training partition goes both to the learner and to Apply Model (2) for the training score. -->
      <operator activated="true" class="multiply" compatibility="7.5.003" expanded="true" height="103" name="Multiply (2)" width="90" x="514" y="34"/>
      <!-- Same local random seed (2110) as RF for Xval inside the cross validation. -->
      <operator activated="true" class="concurrency:parallel_random_forest" compatibility="7.5.003" expanded="true" height="82" name="Random Forest" width="90" x="514" y="187">
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="2110"/>
      </operator>
      <!-- The one trained model is applied to both the training and the test partition. -->
      <operator activated="true" class="multiply" compatibility="7.5.003" expanded="true" height="103" name="Multiply (3)" width="90" x="648" y="136"/>
      <operator activated="true" class="apply_model" compatibility="7.5.003" expanded="true" height="82" name="Apply Model (2)" width="90" x="715" y="34">
        <list key="application_parameters"/>
      </operator>
      <!-- Performance on the data the model was trained on (delivered as result 1). -->
      <operator activated="true" class="performance_classification" compatibility="7.5.003" expanded="true" height="82" name="Training Perf" width="90" x="849" y="34">
        <list key="class_weights"/>
      </operator>
      <operator activated="true" class="apply_model" compatibility="7.5.003" expanded="true" height="82" name="Apply Model" width="90" x="782" y="289">
        <list key="application_parameters"/>
      </operator>
      <!-- Performance on the held-out 10 % partition (delivered as result 2). -->
      <operator activated="true" class="performance_classification" compatibility="7.5.003" expanded="true" height="82" name="Test Perf" width="90" x="916" y="187">
        <list key="class_weights"/>
      </operator>
      <connect from_op="Retrieve Titanic" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Split Data" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Validation" to_port="example set"/>
      <connect from_op="Validation" from_port="performance 1" to_port="result 3"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Multiply (2)" to_port="input"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Multiply (2)" from_port="output 1" to_op="Random Forest" to_port="training set"/>
      <connect from_op="Multiply (2)" from_port="output 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Random Forest" from_port="model" to_op="Multiply (3)" to_port="input"/>
      <connect from_op="Multiply (3)" from_port="output 1" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Multiply (3)" from_port="output 2" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Training Perf" to_port="labelled data"/>
      <connect from_op="Training Perf" from_port="performance" to_port="result 1"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Test Perf" to_port="labelled data"/>
      <connect from_op="Test Perf" from_port="performance" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="273"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

alan_jeffares

Thanks for the response

Yes, I am aware that some of my parameters were a bit weird, but I was varying them all and getting similar results. It turns out I was changing the seed in the wrong operator; a silly mistake.

Regarding the choice of operators, I was using things such as Execute R just to try out different operators and get a feel for how everything works. Thanks for the help.