Use sampling inverse output

StefanRei
New Altair Community Member
Best Answer
-
Hello @StefanRei
Actually, you can do the same with the Split Data operator in one step. We can also solve your exact requirement. After playing with some operators, I think this is what you are looking for.
I used the Titanic dataset for this and added an ID column using the Generate ID operator. I then applied sampling (your requirement) based on probability (0.7). The output of the sample is connected to Multiply, as we need two copies: one for comparison and the other for training the model. Then a Set Minus operator is used to remove the sampled data from the original data, so that it gives us a dataset with the examples that are not present in our sample. You can use one output of the Multiply operator for training and the "exa" output of the Set Minus operator for testing your model. The XML code is provided below; click on SHOW.
XML:
<?xml version="1.0" encoding="UTF-8"?>
<process version="9.3.001">
<!-- Process context: no inputs, outputs or macros are declared. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<!--
  Root operator of the process.
  Data flow: Retrieve Titanic -> Generate ID -> Sample. The sampled examples
  pass through Multiply into the "subtrahend" port of Set Minus, while the
  "original" output of Sample feeds the "example set input" port of Set Minus.
-->
<operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
  <parameter key="logverbosity" value="init"/>
  <!-- Process-level random seed. -->
  <parameter key="random_seed" value="2001"/>
  <parameter key="send_mail" value="never"/>
  <parameter key="notification_email" value=""/>
  <parameter key="process_duration_for_mail" value="30"/>
  <parameter key="encoding" value="SYSTEM"/>
  <process expanded="true">
    <!-- Load the Titanic data set from the Samples repository. -->
    <operator activated="true" class="retrieve" compatibility="9.3.001" expanded="true" height="68" name="Retrieve Titanic" width="90" x="45" y="85">
      <parameter key="repository_entry" value="//Samples/data/Titanic"/>
    </operator>
    <!-- Add an ID attribute (non-nominal IDs, offset 0) before sampling. -->
    <operator activated="true" class="generate_id" compatibility="9.3.001" expanded="true" height="82" name="Generate ID" width="90" x="246" y="85">
      <parameter key="create_nominal_ids" value="false"/>
      <parameter key="offset" value="0"/>
    </operator>
    <!--
      Sample in "probability" mode with sample_probability 0.7.
      sample_size and sample_ratio are also set; presumably they are not
      used in probability mode (TODO confirm against the Sample operator docs).
      use_local_random_seed is false, so local_random_seed is presumably not applied.
    -->
    <operator activated="true" class="sample" compatibility="9.3.001" expanded="true" height="82" name="Sample" width="90" x="380" y="85">
      <parameter key="sample" value="probability"/>
      <parameter key="balance_data" value="false"/>
      <parameter key="sample_size" value="100"/>
      <parameter key="sample_ratio" value="0.1"/>
      <parameter key="sample_probability" value="0.7"/>
      <list key="sample_size_per_class"/>
      <list key="sample_ratio_per_class"/>
      <list key="sample_probability_per_class"/>
      <parameter key="use_local_random_seed" value="false"/>
      <parameter key="local_random_seed" value="1992"/>
    </operator>
    <!--
      Multiply receives the sampled examples. Only "output 1" is connected below.
      NOTE(review): the sample itself is not delivered to any result port;
      wire a further Multiply output if the sampled set is needed downstream.
    -->
    <operator activated="true" class="multiply" compatibility="9.3.001" expanded="true" height="82" name="Multiply" width="90" x="581" y="34"/>
    <!-- Set Minus: input is the Sample operator's "original" output, subtrahend is the sample. -->
    <operator activated="true" class="set_minus" compatibility="9.3.001" expanded="true" height="82" name="Set Minus" width="90" x="648" y="187"/>

    <!-- Wiring between operators. -->
    <connect from_op="Retrieve Titanic" from_port="output" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_op="Sample" to_port="example set input"/>
    <connect from_op="Sample" from_port="example set output" to_op="Multiply" to_port="input"/>
    <connect from_op="Sample" from_port="original" to_op="Set Minus" to_port="example set input"/>
    <connect from_op="Multiply" from_port="output 1" to_op="Set Minus" to_port="subtrahend"/>
    <!-- Process results: "result 2" gets the Set Minus output, "result 1" gets its "original" port. -->
    <connect from_op="Set Minus" from_port="example set output" to_port="result 2"/>
    <connect from_op="Set Minus" from_port="original" to_port="result 1"/>

    <!-- Port spacing entries for the process source and sink ports. -->
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
  </process>
</operator>
</process>
Hope this helps.2
Answers
-
Hi @StefanRei - this is MarlaBot. I found these great videos on our RapidMiner Academy that you may find helpful:
Please LIKE my comment if it helps! 👇Instructional Video: Sampling & Weighting demo (Viewing time: ~7m) Instructional Video: Sampling & Weighting intro (Viewing time: ~5m)
MarlaBot0 -
Hello @StefanRei
Actually, you can do the same with the Split Data operator in one step. We can also solve your exact requirement. After playing with some operators, I think this is what you are looking for.
I used the Titanic dataset for this and added an ID column using the Generate ID operator. I then applied sampling (your requirement) based on probability (0.7). The output of the sample is connected to Multiply, as we need two copies: one for comparison and the other for training the model. Then a Set Minus operator is used to remove the sampled data from the original data, so that it gives us a dataset with the examples that are not present in our sample. You can use one output of the Multiply operator for training and the "exa" output of the Set Minus operator for testing your model. The XML code is provided below; click on SHOW.
XML:
<?xml version="1.0" encoding="UTF-8"?>
<process version="9.3.001">
<!-- Process context: no inputs, outputs or macros are declared. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<!--
  Root operator of the process.
  Data flow: Retrieve Titanic -> Generate ID -> Sample. The sampled examples
  pass through Multiply into the "subtrahend" port of Set Minus, while the
  "original" output of Sample feeds the "example set input" port of Set Minus.
-->
<operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
  <parameter key="logverbosity" value="init"/>
  <!-- Process-level random seed. -->
  <parameter key="random_seed" value="2001"/>
  <parameter key="send_mail" value="never"/>
  <parameter key="notification_email" value=""/>
  <parameter key="process_duration_for_mail" value="30"/>
  <parameter key="encoding" value="SYSTEM"/>
  <process expanded="true">
    <!-- Load the Titanic data set from the Samples repository. -->
    <operator activated="true" class="retrieve" compatibility="9.3.001" expanded="true" height="68" name="Retrieve Titanic" width="90" x="45" y="85">
      <parameter key="repository_entry" value="//Samples/data/Titanic"/>
    </operator>
    <!-- Add an ID attribute (non-nominal IDs, offset 0) before sampling. -->
    <operator activated="true" class="generate_id" compatibility="9.3.001" expanded="true" height="82" name="Generate ID" width="90" x="246" y="85">
      <parameter key="create_nominal_ids" value="false"/>
      <parameter key="offset" value="0"/>
    </operator>
    <!--
      Sample in "probability" mode with sample_probability 0.7.
      sample_size and sample_ratio are also set; presumably they are not
      used in probability mode (TODO confirm against the Sample operator docs).
      use_local_random_seed is false, so local_random_seed is presumably not applied.
    -->
    <operator activated="true" class="sample" compatibility="9.3.001" expanded="true" height="82" name="Sample" width="90" x="380" y="85">
      <parameter key="sample" value="probability"/>
      <parameter key="balance_data" value="false"/>
      <parameter key="sample_size" value="100"/>
      <parameter key="sample_ratio" value="0.1"/>
      <parameter key="sample_probability" value="0.7"/>
      <list key="sample_size_per_class"/>
      <list key="sample_ratio_per_class"/>
      <list key="sample_probability_per_class"/>
      <parameter key="use_local_random_seed" value="false"/>
      <parameter key="local_random_seed" value="1992"/>
    </operator>
    <!--
      Multiply receives the sampled examples. Only "output 1" is connected below.
      NOTE(review): the sample itself is not delivered to any result port;
      wire a further Multiply output if the sampled set is needed downstream.
    -->
    <operator activated="true" class="multiply" compatibility="9.3.001" expanded="true" height="82" name="Multiply" width="90" x="581" y="34"/>
    <!-- Set Minus: input is the Sample operator's "original" output, subtrahend is the sample. -->
    <operator activated="true" class="set_minus" compatibility="9.3.001" expanded="true" height="82" name="Set Minus" width="90" x="648" y="187"/>

    <!-- Wiring between operators. -->
    <connect from_op="Retrieve Titanic" from_port="output" to_op="Generate ID" to_port="example set input"/>
    <connect from_op="Generate ID" from_port="example set output" to_op="Sample" to_port="example set input"/>
    <connect from_op="Sample" from_port="example set output" to_op="Multiply" to_port="input"/>
    <connect from_op="Sample" from_port="original" to_op="Set Minus" to_port="example set input"/>
    <connect from_op="Multiply" from_port="output 1" to_op="Set Minus" to_port="subtrahend"/>
    <!-- Process results: "result 2" gets the Set Minus output, "result 1" gets its "original" port. -->
    <connect from_op="Set Minus" from_port="example set output" to_port="result 2"/>
    <connect from_op="Set Minus" from_port="original" to_port="result 1"/>

    <!-- Port spacing entries for the process source and sink ports. -->
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
  </process>
</operator>
</process>
Hope this helps.2 -
Perfect, thank you very much!0