[SOLVED] Automatic dataset shuffle
wessel
New Altair Community Member
Dear All,
I have 5 different datasets (from 5 different users).
I wish to do "user-cross-validation".
Meaning, I wish to test on user n, and train on all other users, for n = 1, ..., 5.
Any way to do this automatically?
I can retrieve all 5 data sets, but after this, I should "dynamically" join them.
Best regards,
Wessel
I have 5 different datasets (from 5 different users).
I wish to do "user-cross-validation".
Meaning, I wish to test on user n, and train on all other users, for n = 1, ..., 5.
Any way to do this automatically?
I can retrieve all 5 data sets, but after this, I should "dynamically" join them.
Best regards,
Wessel
Tagged:
0
Answers
-
Should I join all 5 files into 1 big data set?
And then use 'linear sampling' option?
Best regards,
Wessel0 -
If your five datasets are equal-sized that should work.0
-
Yes, but they are not! :P
Marcin wrote:
If your five datasets are equal-sized that should work.0 -
OK, unfortunately there is no easy out-of-the-box-with-a-single-operator method for this. But - because of the almighty tool-box power of RapidMiner - we can try to mimic a cross-validation with your desired behaviour!
There are actually several methods for this. One could work like this: you append all of your datasets, but first add a special attribute, let us say 'set_id', to every single dataset. This attribute contains the number of the example set (1, 2, 3, ..., k). After this you can loop k times and, in each iteration, use this attribute to filter the data into a test set (the examples whose 'set_id' equals the current iteration number) and a training set (all other examples). After you calculate the performance of each iteration you can build an average.
Here is an example of such a process with 5 identical Iris datasets:
If you find a more elegant or remarkable way to achieve this, feel free to post it here.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Leave-one-dataset-out validation built from basic operators.

  Flow: five Retrieve operators feed the "Append with set_id" subprocess, which
  tags each input with a constant set_id (1 to 5) and appends them. The Loop then
  runs five times; each iteration trains Naive Bayes on every example whose
  set_id differs from the iteration macro k, tests on the examples whose set_id
  equals k, and emits a performance vector. Average combines those vectors.
-->
<process version="5.3.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
    <process expanded="true" height="550" width="815">

      <!-- Inputs: the same Iris sample is loaded five times as a stand-in for five separate datasets. -->
      <operator activated="true" class="retrieve" compatibility="5.3.000" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <operator activated="true" class="retrieve" compatibility="5.3.000" expanded="true" height="60" name="Retrieve (2)" width="90" x="45" y="120">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <operator activated="true" class="retrieve" compatibility="5.3.000" expanded="true" height="60" name="Retrieve (3)" width="90" x="45" y="210">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <operator activated="true" class="retrieve" compatibility="5.3.000" expanded="true" height="60" name="Retrieve (4)" width="90" x="45" y="300">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <operator activated="true" class="retrieve" compatibility="5.3.000" expanded="true" height="60" name="Retrieve (5)" width="90" x="45" y="390">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>

      <!-- Tagging and merging: input n receives set_id = n, then all five are appended into one example set. -->
      <operator activated="true" class="subprocess" compatibility="5.3.000" expanded="true" height="148" name="Append with set_id" width="90" x="246" y="30">
        <process expanded="true" height="538" width="893">
          <operator activated="true" class="generate_attributes" compatibility="5.3.000" expanded="true" height="76" name="Generate Attributes" width="90" x="45" y="30">
            <list key="function_descriptions">
              <parameter key="set_id" value="1"/>
            </list>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="5.3.000" expanded="true" height="76" name="Generate Attributes (2)" width="90" x="45" y="120">
            <list key="function_descriptions">
              <parameter key="set_id" value="2"/>
            </list>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="5.3.000" expanded="true" height="76" name="Generate Attributes (3)" width="90" x="45" y="210">
            <list key="function_descriptions">
              <parameter key="set_id" value="3"/>
            </list>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="5.3.000" expanded="true" height="76" name="Generate Attributes (4)" width="90" x="45" y="300">
            <list key="function_descriptions">
              <parameter key="set_id" value="4"/>
            </list>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="5.3.000" expanded="true" height="76" name="Generate Attributes (5)" width="90" x="45" y="390">
            <list key="function_descriptions">
              <parameter key="set_id" value="5"/>
            </list>
          </operator>
          <operator activated="true" class="append" compatibility="5.3.000" expanded="true" height="148" name="Append (2)" width="90" x="246" y="30"/>
          <!-- set_id is given the role "set"; presumably this keeps the learner from using it as a regular attribute (TODO confirm). -->
          <operator activated="true" class="set_role" compatibility="5.3.000" expanded="true" height="76" name="Set Role" width="90" x="447" y="30">
            <parameter key="name" value="set_id"/>
            <parameter key="target_role" value="set"/>
            <list key="set_additional_roles"/>
          </operator>
          <connect from_port="in 1" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_port="in 2" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_port="in 3" to_op="Generate Attributes (3)" to_port="example set input"/>
          <connect from_port="in 4" to_op="Generate Attributes (4)" to_port="example set input"/>
          <connect from_port="in 5" to_op="Generate Attributes (5)" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Append (2)" to_port="example set 1"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Append (2)" to_port="example set 2"/>
          <connect from_op="Generate Attributes (3)" from_port="example set output" to_op="Append (2)" to_port="example set 3"/>
          <connect from_op="Generate Attributes (4)" from_port="example set output" to_op="Append (2)" to_port="example set 4"/>
          <connect from_op="Generate Attributes (5)" from_port="example set output" to_op="Append (2)" to_port="example set 5"/>
          <connect from_op="Append (2)" from_port="merged set" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="source_in 3" spacing="0"/>
          <portSpacing port="source_in 4" spacing="0"/>
          <portSpacing port="source_in 5" spacing="0"/>
          <portSpacing port="source_in 6" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>

      <!--
        One iteration per dataset. The iteration number is exposed as macro k.
        NOTE(review): assumes k starts at 1 so that it lines up with set_id 1 to 5; confirm
        against the Loop operator's start value. "iterations" must match the number of inputs.
      -->
      <operator activated="true" class="loop" compatibility="5.3.000" expanded="true" height="76" name="Loop" width="90" x="380" y="30">
        <parameter key="set_iteration_macro" value="true"/>
        <parameter key="macro_name" value="k"/>
        <parameter key="iterations" value="5"/>
        <process expanded="true" height="538" width="893">
          <!-- Training data: the filter is inverted, so everything with set_id different from k is kept. -->
          <operator activated="true" class="filter_examples" compatibility="5.3.000" expanded="true" height="76" name="Get Train" width="90" x="45" y="30">
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="parameter_string" value="set_id=%{k}"/>
            <parameter key="invert_filter" value="true"/>
          </operator>
          <operator activated="true" class="naive_bayes" compatibility="5.3.000" expanded="true" height="76" name="Naive Bayes" width="90" x="313" y="30"/>
          <!-- Test data: reads the unfiltered set from Get Train's "original" port and keeps only set_id equal to k. -->
          <operator activated="true" class="filter_examples" compatibility="5.3.000" expanded="true" height="76" name="Get Test" width="90" x="179" y="165">
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="parameter_string" value="set_id=%{k}"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.3.000" expanded="true" height="76" name="Apply Model" width="90" x="447" y="120">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.000" expanded="true" height="76" name="Performance" width="90" x="581" y="120"/>
          <connect from_port="input 1" to_op="Get Train" to_port="example set input"/>
          <connect from_op="Get Train" from_port="example set output" to_op="Naive Bayes" to_port="training set"/>
          <connect from_op="Get Train" from_port="original" to_op="Get Test" to_port="example set input"/>
          <connect from_op="Naive Bayes" from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_op="Get Test" from_port="example set output" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>

      <!-- Combines the per-iteration performance vectors delivered by Loop into a single result. -->
      <operator activated="true" class="average" compatibility="5.3.000" expanded="true" height="76" name="Average" width="90" x="514" y="30"/>

      <connect from_op="Retrieve" from_port="output" to_op="Append with set_id" to_port="in 1"/>
      <connect from_op="Retrieve (2)" from_port="output" to_op="Append with set_id" to_port="in 2"/>
      <connect from_op="Retrieve (3)" from_port="output" to_op="Append with set_id" to_port="in 3"/>
      <connect from_op="Retrieve (4)" from_port="output" to_op="Append with set_id" to_port="in 4"/>
      <connect from_op="Retrieve (5)" from_port="output" to_op="Append with set_id" to_port="in 5"/>
      <connect from_op="Append with set_id" from_port="out 1" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Average" to_port="averagable 1"/>
      <connect from_op="Average" from_port="average" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
0 -
Thanks a lot!0