🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Execute Python failed in an Optimization / Cross Validation operator

User: "lionelderkrikor"
New Altair Community Member
Updated by Jocelyn

Hi,

 

 

I use the "Execute Python" operator to generate dummy variables for a dataset.

I know that this can be done with the "Nominal to Numerical" operator, or skipped altogether...

However, I discovered that, without Cross Validation/Optimization, the resulting decision tree (and its associated predictions/accuracy) is not the same when the dummy variables are generated by "Nominal to Numerical" as when they are generated by "Execute Python", which seems odd.

 

In my case, the two "Execute Python" operators, located respectively in the training and testing parts of a "Cross Validation" operator that is itself inside an "Optimization" operator, do not seem to be executed, and the process then fails.

 

Here is my process:

 

<?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
<!--
  Overview: a decision tree is tuned by a parameter grid on the Golf sample
  data (Optimize Parameters (Grid) wrapping a Cross Validation), and the model
  delivered on "result 1" of the optimizer is applied to the Golf-Testset data.
  Dummy variables for Outlook and Wind are created with pandas.get_dummies in
  three separate Execute Python operators (outer test branch, training
  subprocess, testing subprocess). Every Nominal to Numerical operator in this
  file is deactivated.
-->
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
<process expanded="true">
<!-- Outer test branch: loads the Golf-Testset sample data. -->
<operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Golf-Testset" width="90" x="45" y="391">
<parameter key="repository_entry" value="//Samples/data/Golf-Testset"/>
</operator>
<!-- Encodes Outlook and Wind with pandas.get_dummies; rm_main returns the encoded table as its single result. -->
<operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="82" name="Execute Python (3)" width="90" x="179" y="391">
<parameter key="script" value="import pandas as pd&#10;&#10;# rm_main is a mandatory function, &#10;# the number of arguments has to be the number of input ports (can be none)&#10;def rm_main(data):&#10; data = pd.get_dummies(data,columns = ['Outlook', 'Wind'] )&#10;&#10; # connect 2 output ports to see the results&#10; return data"/>
</operator>
<!-- Deactivated alternative to the Python encoding: single-attribute filter on Play with inverted selection. No connection in this process references it. -->
<operator activated="false" class="nominal_to_numerical" compatibility="7.6.001" expanded="true" height="103" name="Nominal to Numerical (2)" width="90" x="112" y="544">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Play"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
<list key="comparison_groups"/>
</operator>
<!-- Golf sample data; connected to "input 1" of the optimizer below. -->
<operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="85">
<parameter key="repository_entry" value="//Samples/data/Golf"/>
</operator>
<!-- Parameter grid over four settings of the nested "Decision Tree" operator: criterion, apply_pruning, apply_prepruning and maximal_depth. -->
<operator activated="true" class="optimize_parameters_grid" compatibility="7.6.001" expanded="true" height="145" name="Optimize Parameters (Grid)" width="90" x="313" y="85">
<list key="parameters">
<parameter key="Decision Tree.criterion" value="gain_ratio,information_gain,gini_index,accuracy"/>
<parameter key="Decision Tree.apply_pruning" value="true,false"/>
<parameter key="Decision Tree.apply_prepruning" value="true,false"/>
<parameter key="Decision Tree.maximal_depth" value="[-1.0;20;20;linear]"/>
</list>
<process expanded="true">
<!-- Cross Validation with a local random seed. Its first nested process receives the "training set" port, its second the "test set" and "model" ports. -->
<operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001" expanded="true" height="145" name="Cross Validation" width="90" x="380" y="34">
<parameter key="use_local_random_seed" value="true"/>
<!-- Training subprocess: training set to Execute Python, to Set Role (Play as label), to Decision Tree; the tree is delivered on the "model" port. -->
<process expanded="true">
<!-- Deactivated; no connection in this subprocess references it. -->
<operator activated="false" class="nominal_to_numerical" compatibility="7.6.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="45" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Play"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
<list key="comparison_groups"/>
</operator>
<!-- NOTE(review): get_dummies is called on the training fold only, so the generated columns presumably depend on the category values present in that fold. Confirm that they match the columns produced by Execute Python (2) in the testing subprocess. -->
<operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="82" name="Execute Python" width="90" x="45" y="136">
<parameter key="script" value="import pandas as pd&#10;&#10;# rm_main is a mandatory function, &#10;# the number of arguments has to be the number of input ports (can be none)&#10;def rm_main(data):&#10; data = pd.get_dummies(data,columns = ['Outlook', 'Wind'] )&#10;&#10; # connect 2 output ports to see the results&#10; return data"/>
</operator>
<!-- Marks Play as the label before training. -->
<operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="136">
<parameter key="attribute_name" value="Play"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<!-- The learner whose parameters are varied by the grid above; maximal_depth is set to -1 here. -->
<operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.6.001" expanded="true" height="82" name="Decision Tree" width="90" x="313" y="136">
<parameter key="maximal_depth" value="-1"/>
</operator>
<connect from_port="training set" to_op="Execute Python" to_port="input 1"/>
<connect from_op="Execute Python" from_port="output 1" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Decision Tree" to_port="training set"/>
<connect from_op="Decision Tree" from_port="model" to_port="model"/>
<portSpacing port="source_training set" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<!-- Testing subprocess: test set to Execute Python (2), to Set Role (3), to Apply Model (2), to Performance (2); the result goes to "performance 1". -->
<process expanded="true">
<!-- Deactivated; no connection in this subprocess references it. -->
<operator activated="false" class="nominal_to_numerical" compatibility="7.6.001" expanded="true" height="103" name="Nominal to Numerical (3)" width="90" x="45" y="238">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Play"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
<list key="comparison_groups"/>
</operator>
<!-- Same script as in the training subprocess, applied to the test fold on its own. -->
<operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="82" name="Execute Python (2)" width="90" x="112" y="85">
<parameter key="script" value="import pandas as pd&#10;&#10;# rm_main is a mandatory function, &#10;# the number of arguments has to be the number of input ports (can be none)&#10;def rm_main(data):&#10; data = pd.get_dummies(data,columns = ['Outlook', 'Wind'] )&#10;&#10; # connect 2 output ports to see the results&#10; return data"/>
</operator>
<operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role (3)" width="90" x="246" y="187">
<parameter key="attribute_name" value="Play"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="246" y="34">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance_classification" compatibility="7.6.001" expanded="true" height="82" name="Performance (2)" width="90" x="380" y="34">
<list key="class_weights"/>
</operator>
<connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
<connect from_port="test set" to_op="Execute Python (2)" to_port="input 1"/>
<connect from_op="Execute Python (2)" from_port="output 1" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
<connect from_op="Performance (2)" from_port="performance" to_port="performance 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_test set results" spacing="0"/>
<portSpacing port="sink_performance 1" spacing="0"/>
<portSpacing port="sink_performance 2" spacing="0"/>
</process>
</operator>
<!-- Optimizer-level wiring: the Cross Validation model leaves on "result 1", its example set on "result 2" and its performance on "performance". -->
<connect from_port="input 1" to_op="Cross Validation" to_port="example set"/>
<connect from_op="Cross Validation" from_port="model" to_port="result 1"/>
<connect from_op="Cross Validation" from_port="example set" to_port="result 2"/>
<connect from_op="Cross Validation" from_port="performance 1" to_port="performance"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_performance" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
<!-- Marks Play as the label on the encoded Golf-Testset data. -->
<operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role (2)" width="90" x="313" y="391">
<parameter key="attribute_name" value="Play"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<!-- Applies the model from optimizer port "result 1" to the encoded Golf-Testset data. -->
<operator activated="true" class="apply_model" compatibility="7.1.001" expanded="true" height="82" name="Apply Model" width="90" x="514" y="340">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance_classification" compatibility="7.6.001" expanded="true" height="82" name="Performance" width="90" x="715" y="391">
<list key="class_weights"/>
</operator>
<!-- Outer wiring. Process results: 1 = Performance output, 2 = model passed through Apply Model, 3 = example set from Performance, 4 = optimizer performance, 5 = optimizer parameter output, 6 = optimizer "result 2". -->
<connect from_op="Retrieve Golf-Testset" from_port="output" to_op="Execute Python (3)" to_port="input 1"/>
<connect from_op="Execute Python (3)" from_port="output 1" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Retrieve Golf" from_port="output" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
<connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 4"/>
<connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_port="result 5"/>
<connect from_op="Optimize Parameters (Grid)" from_port="result 1" to_op="Apply Model" to_port="model"/>
<connect from_op="Optimize Parameters (Grid)" from_port="result 2" to_port="result 6"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Apply Model" from_port="model" to_port="result 2"/>
<connect from_op="Performance" from_port="performance" to_port="result 1"/>
<connect from_op="Performance" from_port="example set" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
<portSpacing port="sink_result 6" spacing="0"/>
<portSpacing port="sink_result 7" spacing="0"/>
</process>
</operator>
</process>

 

My approach may seem pointless, but there might be a bug in the "Execute Python" operator, and reporting it may help those who use this operator for more useful tasks.

 

Thank you for your help,

 

Regards,

 

Lionel 

 

 

 

 

 

 

 

Find more posts tagged with

Sort by:
1 - 1 of 11
    User: "JEdward"
    New Altair Community Member
    Accepted Answer

    The problem with your Golf dataset is that the attributes don't match. Using breakpoints, I can see that your test data fold contains only one record (and your training fold also has only a small number of records).

    Because you are converting to dummy variables separately on the training side and the testing side, it is quite likely that some attributes won't match your model, as your test data might be missing some of the category values.

     

    This is bad practice and I recommend that you feed your preprocessing model through the RapidMiner process to work on it. 

     

    However, since you stated that you want to do it this way, you need to ensure that the attributes of your dataset match the output. You can do this with operators like Superset. See the XML below.

     

    Maybe you could also post an example of the incorrect results you're getting with the Nom to Num operator? 

     

    <?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
    <!--
      Overview: same layout as a grid-optimized, cross-validated decision tree on
      the Golf sample data, with three additions visible in this file:
      a Remember operator in the training subprocess stores the Decision Tree's
      "exampleSet" output under the name "myDataSet" and passes it to "through 1";
      a Superset operator in the testing subprocess combines the encoded test fold
      with that "through 1" data before Apply Model (2);
      a Recall plus Superset (2) pair does the same for the Golf-Testset branch.
      NOTE(review): Superset presumably extends each input with the attributes it
      lacks so that both share one attribute set; confirm against the operator
      documentation.
    -->
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
    <!-- Outer test branch: loads the Golf-Testset sample data. -->
    <operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Golf-Testset" width="90" x="45" y="391">
    <parameter key="repository_entry" value="//Samples/data/Golf-Testset"/>
    </operator>
    <!-- Encodes Outlook and Wind with pandas.get_dummies; rm_main returns the encoded table as its single result. -->
    <operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="82" name="Execute Python (3)" width="90" x="179" y="391">
    <parameter key="script" value="import pandas as pd&#10;&#10;# rm_main is a mandatory function, &#10;# the number of arguments has to be the number of input ports (can be none)&#10;def rm_main(data):&#10; data = pd.get_dummies(data,columns = ['Outlook', 'Wind'] )&#10;&#10; # connect 2 output ports to see the results&#10; return data"/>
    </operator>
    <!-- Deactivated; no connection in this process references it. -->
    <operator activated="false" class="nominal_to_numerical" compatibility="7.6.001" expanded="true" height="103" name="Nominal to Numerical (2)" width="90" x="112" y="544">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Play"/>
    <parameter key="invert_selection" value="true"/>
    <parameter key="include_special_attributes" value="true"/>
    <list key="comparison_groups"/>
    </operator>
    <!-- Golf sample data; connected to "input 1" of the optimizer below. -->
    <operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="85">
    <parameter key="repository_entry" value="//Samples/data/Golf"/>
    </operator>
    <!-- Parameter grid over four settings of the nested "Decision Tree" operator: criterion, apply_pruning, apply_prepruning and maximal_depth. -->
    <operator activated="true" class="optimize_parameters_grid" compatibility="7.6.001" expanded="true" height="166" name="Optimize Parameters (Grid)" width="90" x="313" y="85">
    <list key="parameters">
    <parameter key="Decision Tree.criterion" value="gain_ratio,information_gain,gini_index,accuracy"/>
    <parameter key="Decision Tree.apply_pruning" value="true,false"/>
    <parameter key="Decision Tree.apply_prepruning" value="true,false"/>
    <parameter key="Decision Tree.maximal_depth" value="[-1.0;20;20;linear]"/>
    </list>
    <process expanded="true">
    <!-- Cross Validation with a local random seed. Its first nested process receives the "training set" port, its second the "model", "test set" and "through 1" ports. -->
    <operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001" expanded="true" height="145" name="Cross Validation" width="90" x="380" y="34">
    <parameter key="use_local_random_seed" value="true"/>
    <!-- Training subprocess: training set to Execute Python, to Set Role (Play as label), to Decision Tree. The tree goes to "model"; the tree's "exampleSet" output goes through Remember to "through 1". -->
    <process expanded="true">
    <!-- Deactivated; no connection in this subprocess references it. -->
    <operator activated="false" class="nominal_to_numerical" compatibility="7.6.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="112" y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Play"/>
    <parameter key="invert_selection" value="true"/>
    <parameter key="include_special_attributes" value="true"/>
    <list key="comparison_groups"/>
    <parameter key="use_underscore_in_name" value="true"/>
    </operator>
    <!-- Encodes Outlook and Wind on the training fold only. -->
    <operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="82" name="Execute Python" width="90" x="45" y="289">
    <parameter key="script" value="import pandas as pd&#10;&#10;# rm_main is a mandatory function, &#10;# the number of arguments has to be the number of input ports (can be none)&#10;def rm_main(data):&#10; data = pd.get_dummies(data,columns = ['Outlook', 'Wind'] )&#10;&#10; # connect 2 output ports to see the results&#10; return data"/>
    </operator>
    <!-- Marks Play as the label before training. -->
    <operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role" width="90" x="112" y="187">
    <parameter key="attribute_name" value="Play"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
    </operator>
    <!-- The learner whose parameters are varied by the grid above; maximal_depth is set to -1 here. -->
    <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.6.001" expanded="true" height="82" name="Decision Tree" width="90" x="246" y="85">
    <parameter key="maximal_depth" value="-1"/>
    </operator>
    <!-- Stores its input under the name "myDataSet"; the matching Recall operator sits in the outer process. -->
    <operator activated="true" class="remember" compatibility="7.6.001" expanded="true" height="68" name="Remember" width="90" x="313" y="187">
    <parameter key="name" value="myDataSet"/>
    </operator>
    <connect from_port="training set" to_op="Execute Python" to_port="input 1"/>
    <connect from_op="Execute Python" from_port="output 1" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Decision Tree" to_port="training set"/>
    <connect from_op="Decision Tree" from_port="model" to_port="model"/>
    <connect from_op="Decision Tree" from_port="exampleSet" to_op="Remember" to_port="store"/>
    <connect from_op="Remember" from_port="stored" to_port="through 1"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="126"/>
    <portSpacing port="sink_through 2" spacing="0"/>
    </process>
    <!-- Testing subprocess: test set to Execute Python (2), to Set Role (3), to Superset input 1; "through 1" feeds Superset input 2; "superset 1" goes to Apply Model (2), then Performance (2). -->
    <process expanded="true">
    <!-- Same script as in the training subprocess, applied to the test fold on its own. -->
    <operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="82" name="Execute Python (2)" width="90" x="45" y="85">
    <parameter key="script" value="import pandas as pd&#10;&#10;# rm_main is a mandatory function, &#10;# the number of arguments has to be the number of input ports (can be none)&#10;def rm_main(data):&#10; data = pd.get_dummies(data,columns = ['Outlook', 'Wind'] )&#10;&#10; # connect 2 output ports to see the results&#10; return data"/>
    </operator>
    <operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role (3)" width="90" x="179" y="187">
    <parameter key="attribute_name" value="Play"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
    </operator>
    <!-- Input 1 is the encoded test fold, input 2 the training data from "through 1"; only output "superset 1" is used downstream. -->
    <operator activated="true" class="superset" compatibility="7.6.001" expanded="true" height="82" name="Superset" width="90" x="313" y="238"/>
    <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="246" y="34">
    <list key="application_parameters"/>
    </operator>
    <operator activated="true" class="performance_classification" compatibility="7.6.001" expanded="true" height="82" name="Performance (2)" width="90" x="380" y="34">
    <list key="class_weights"/>
    </operator>
    <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
    <connect from_port="test set" to_op="Execute Python (2)" to_port="input 1"/>
    <connect from_port="through 1" to_op="Superset" to_port="example set 2"/>
    <connect from_op="Execute Python (2)" from_port="output 1" to_op="Set Role (3)" to_port="example set input"/>
    <connect from_op="Set Role (3)" from_port="example set output" to_op="Superset" to_port="example set 1"/>
    <connect from_op="Superset" from_port="superset 1" to_op="Apply Model (2)" to_port="unlabelled data"/>
    <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
    <connect from_op="Performance (2)" from_port="performance" to_port="performance 1"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="105"/>
    <portSpacing port="source_through 2" spacing="21"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
    <description align="center" color="yellow" colored="false" height="87" resized="true" width="237" x="254" y="337">There should really be a replace missing values here too, but I didn't feel like adding it. :P</description>
    </process>
    </operator>
    <!-- Optimizer-level wiring: the Cross Validation model leaves on "result 1", its example set on "result 2", its test result set on "result 3" and its performance on "performance". -->
    <connect from_port="input 1" to_op="Cross Validation" to_port="example set"/>
    <connect from_op="Cross Validation" from_port="model" to_port="result 1"/>
    <connect from_op="Cross Validation" from_port="example set" to_port="result 2"/>
    <connect from_op="Cross Validation" from_port="test result set" to_port="result 3"/>
    <connect from_op="Cross Validation" from_port="performance 1" to_port="performance"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_performance" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    </process>
    </operator>
    <!-- Marks Play as the label on the encoded Golf-Testset data. -->
    <operator activated="true" class="set_role" compatibility="7.6.001" expanded="true" height="82" name="Set Role (2)" width="90" x="313" y="391">
    <parameter key="attribute_name" value="Play"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
    </operator>
    <!-- Reads back the object stored under "myDataSet" by the Remember operator in the training subprocess. It is listed after Optimize Parameters (Grid) in this file. -->
    <operator activated="true" class="recall" compatibility="7.6.001" expanded="true" height="68" name="Recall" width="90" x="313" y="493">
    <parameter key="name" value="myDataSet"/>
    <description align="center" color="transparent" colored="false" width="126">This needs to happen AFTER the Optimize has run.</description>
    </operator>
    <!-- Input 1 is the encoded Golf-Testset data, input 2 the recalled data; only output "superset 1" is used downstream. -->
    <operator activated="true" class="superset" compatibility="7.6.001" expanded="true" height="82" name="Superset (2)" width="90" x="447" y="442"/>
    <!-- Applies the model from optimizer port "result 1" to the "superset 1" output of Superset (2). -->
    <operator activated="true" class="apply_model" compatibility="7.1.001" expanded="true" height="82" name="Apply Model" width="90" x="514" y="340">
    <list key="application_parameters"/>
    </operator>
    <operator activated="true" class="performance_classification" compatibility="7.6.001" expanded="true" height="82" name="Performance" width="90" x="715" y="391">
    <list key="class_weights"/>
    </operator>
    <!-- Outer wiring. Process results: 1 = Performance output, 2 = model passed through Apply Model, 3 = example set from Performance, 4 = optimizer performance, 5 = optimizer parameter output, 6 = optimizer "result 2". -->
    <connect from_op="Retrieve Golf-Testset" from_port="output" to_op="Execute Python (3)" to_port="input 1"/>
    <connect from_op="Execute Python (3)" from_port="output 1" to_op="Set Role (2)" to_port="example set input"/>
    <connect from_op="Retrieve Golf" from_port="output" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
    <connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 4"/>
    <connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_port="result 5"/>
    <connect from_op="Optimize Parameters (Grid)" from_port="result 1" to_op="Apply Model" to_port="model"/>
    <connect from_op="Optimize Parameters (Grid)" from_port="result 2" to_port="result 6"/>
    <connect from_op="Set Role (2)" from_port="example set output" to_op="Superset (2)" to_port="example set 1"/>
    <connect from_op="Recall" from_port="result" to_op="Superset (2)" to_port="example set 2"/>
    <connect from_op="Superset (2)" from_port="superset 1" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
    <connect from_op="Apply Model" from_port="model" to_port="result 2"/>
    <connect from_op="Performance" from_port="performance" to_port="result 1"/>
    <connect from_op="Performance" from_port="example set" to_port="result 3"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    <portSpacing port="sink_result 5" spacing="0"/>
    <portSpacing port="sink_result 6" spacing="0"/>
    <portSpacing port="sink_result 7" spacing="0"/>
    </process>
    </operator>
    </process>