"Cross Validation test examples are more than input samples"
Hi,
I am storing cross validation predictions using Write Excel. I see that the output samples (570553) of Cross validation 5 fold for Gradient Boosted Tree are more than input samples (316974). I am not sure if I am doing something wrong. Please see XML below.
Thanks,
Varun
I am storing cross validation predictions using Write Excel. I see that the output samples (570553) of Cross validation 5 fold for Gradient Boosted Tree are more than input samples (316974). I am not sure if I am doing something wrong. Please see XML below.
<?xml version="1.0" encoding="UTF-8"?><process version="9.1.000"> <context> <input/> <output/> <macros/> </context> <operator activated="true" class="process" compatibility="9.1.000" expanded="true" name="Process"> <parameter key="logverbosity" value="init"/> <parameter key="random_seed" value="2001"/> <parameter key="send_mail" value="never"/> <parameter key="notification_email" value=""/> <parameter key="process_duration_for_mail" value="30"/> <parameter key="encoding" value="SYSTEM"/> <process expanded="true"> <operator activated="true" class="retrieve" compatibility="9.1.000" expanded="true" height="68" name="Retrieve Assistment_Data_Labels" width="90" x="45" y="85"> <parameter key="repository_entry" value="../../data/AIED_New/Assistment_Data_Labels"/> </operator> <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="124" name="Multiply (6)" width="90" x="179" y="85"/> <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes (3)" width="90" x="380" y="442"> <parameter key="attribute_filter_type" value="subset"/> <parameter key="attribute" value=""/> <parameter key="attributes" value="Ln|Ln-1|NumActions|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrAttempted|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/> <parameter key="use_except_expression" value="false"/> <parameter key="value_type" value="attribute_value"/> <parameter key="use_value_type_exception" 
value="false"/> <parameter key="except_value_type" value="time"/> <parameter key="block_type" value="attribute_block"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="value_matrix_row_start"/> <parameter key="invert_selection" value="false"/> <parameter key="include_special_attributes" value="false"/> </operator> <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="166" name="Cross Validation (4)" width="90" x="581" y="442"> <parameter key="split_on_batch_attribute" value="false"/> <parameter key="leave_one_out" value="false"/> <parameter key="number_of_folds" value="5"/> <parameter key="sampling_type" value="automatic"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="enable_parallel_execution" value="true"/> <process expanded="true"> <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees (3)" width="90" x="112" y="34"> <parameter key="number_of_trees" value="20"/> <parameter key="reproducible" value="false"/> <parameter key="maximum_number_of_threads" value="4"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="maximal_depth" value="20"/> <parameter key="min_rows" value="10.0"/> <parameter key="min_split_improvement" value="0.0"/> <parameter key="number_of_bins" value="20"/> <parameter key="learning_rate" value="0.1"/> <parameter key="sample_rate" value="1.0"/> <parameter key="distribution" value="AUTO"/> <parameter key="early_stopping" value="false"/> <parameter key="stopping_rounds" value="1"/> <parameter key="stopping_metric" value="AUTO"/> <parameter key="stopping_tolerance" value="0.001"/> <parameter key="max_runtime_seconds" value="0"/> <list key="expert_parameters"/> </operator> <connect from_port="training set" 
to_op="Gradient Boosted Trees (3)" to_port="training set"/> <connect from_op="Gradient Boosted Trees (3)" from_port="model" to_port="model"/> <portSpacing port="source_training set" spacing="0"/> <portSpacing port="sink_model" spacing="0"/> <portSpacing port="sink_through 1" spacing="0"/> </process> <process expanded="true"> <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (4)" width="90" x="45" y="34"> <list key="application_parameters"/> <parameter key="create_view" value="false"/> </operator> <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="103" name="Multiply (3)" width="90" x="45" y="136"/> <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance (3)" width="90" x="246" y="340"> <parameter key="use_example_weights" value="true"/> </operator> <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (4)" width="90" x="246" y="238"> <parameter key="main_criterion" value="first"/> <parameter key="accuracy" value="true"/> <parameter key="classification_error" value="false"/> <parameter key="kappa" value="true"/> <parameter key="weighted_mean_recall" value="false"/> <parameter key="weighted_mean_precision" value="false"/> <parameter key="spearman_rho" value="false"/> <parameter key="kendall_tau" value="false"/> <parameter key="absolute_error" value="false"/> <parameter key="relative_error" value="false"/> <parameter key="relative_error_lenient" value="false"/> <parameter key="relative_error_strict" value="false"/> <parameter key="normalized_absolute_error" value="false"/> <parameter key="root_mean_squared_error" value="true"/> <parameter key="root_relative_squared_error" value="false"/> <parameter key="squared_error" value="false"/> <parameter key="correlation" value="false"/> <parameter key="squared_correlation" 
value="false"/> <parameter key="cross-entropy" value="false"/> <parameter key="margin" value="false"/> <parameter key="soft_margin_loss" value="false"/> <parameter key="logistic_loss" value="false"/> <parameter key="skip_undefined_labels" value="true"/> <parameter key="use_example_weights" value="true"/> <list key="class_weights"/> </operator> <connect from_port="model" to_op="Apply Model (4)" to_port="model"/> <connect from_port="test set" to_op="Apply Model (4)" to_port="unlabelled data"/> <connect from_op="Apply Model (4)" from_port="labelled data" to_op="Multiply (3)" to_port="input"/> <connect from_op="Multiply (3)" from_port="output 1" to_op="Performance (4)" to_port="labelled data"/> <connect from_op="Multiply (3)" from_port="output 2" to_op="Performance (3)" to_port="labelled data"/> <connect from_op="Performance (3)" from_port="performance" to_port="performance 2"/> <connect from_op="Performance (3)" from_port="example set" to_port="test set results"/> <connect from_op="Performance (4)" from_port="performance" to_port="performance 1"/> <portSpacing port="source_model" spacing="0"/> <portSpacing port="source_test set" spacing="0"/> <portSpacing port="source_through 1" spacing="0"/> <portSpacing port="sink_test set results" spacing="0"/> <portSpacing port="sink_performance 1" spacing="0"/> <portSpacing port="sink_performance 2" spacing="0"/> <portSpacing port="sink_performance 3" spacing="0"/> </process> </operator> <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel (3)" width="90" x="707" y="595"> <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_40_Attributes.xlsx"/> <parameter key="file_format" value="xlsx"/> <parameter key="encoding" value="SYSTEM"/> <parameter key="sheet_name" value="RapidMiner Data"/> <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/> <parameter key="number_format" value="#.0"/> </operator> <operator activated="true" 
class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes" width="90" x="380" y="34"> <parameter key="attribute_filter_type" value="subset"/> <parameter key="attribute" value=""/> <parameter key="attributes" value="Ln|Ln-1|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/> <parameter key="use_except_expression" value="false"/> <parameter key="value_type" value="attribute_value"/> <parameter key="use_value_type_exception" value="false"/> <parameter key="except_value_type" value="time"/> <parameter key="block_type" value="attribute_block"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="value_matrix_row_start"/> <parameter key="invert_selection" value="false"/> <parameter key="include_special_attributes" value="false"/> </operator> <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="187" name="Cross Validation (3)" width="90" x="581" y="34"> <parameter key="split_on_batch_attribute" value="false"/> <parameter key="leave_one_out" value="false"/> <parameter key="number_of_folds" value="5"/> <parameter key="sampling_type" value="automatic"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="enable_parallel_execution" value="true"/> <process expanded="true"> <operator activated="true" 
class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34"> <parameter key="number_of_trees" value="20"/> <parameter key="reproducible" value="false"/> <parameter key="maximum_number_of_threads" value="4"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="maximal_depth" value="20"/> <parameter key="min_rows" value="10.0"/> <parameter key="min_split_improvement" value="0.0"/> <parameter key="number_of_bins" value="20"/> <parameter key="learning_rate" value="0.1"/> <parameter key="sample_rate" value="1.0"/> <parameter key="distribution" value="AUTO"/> <parameter key="early_stopping" value="false"/> <parameter key="stopping_rounds" value="1"/> <parameter key="stopping_metric" value="AUTO"/> <parameter key="stopping_tolerance" value="0.001"/> <parameter key="max_runtime_seconds" value="0"/> <list key="expert_parameters"/> </operator> <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/> <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/> <portSpacing port="source_training set" spacing="0"/> <portSpacing port="sink_model" spacing="0"/> <portSpacing port="sink_through 1" spacing="0"/> </process> <process expanded="true"> <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34"> <list key="application_parameters"/> <parameter key="create_view" value="false"/> </operator> <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="124" name="Multiply (4)" width="90" x="45" y="136"/> <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance" width="90" x="179" y="340"> <parameter key="use_example_weights" value="true"/> </operator> <operator activated="true" 
class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (5)" width="90" x="246" y="238"> <parameter key="main_criterion" value="first"/> <parameter key="accuracy" value="true"/> <parameter key="classification_error" value="false"/> <parameter key="kappa" value="true"/> <parameter key="weighted_mean_recall" value="false"/> <parameter key="weighted_mean_precision" value="false"/> <parameter key="spearman_rho" value="false"/> <parameter key="kendall_tau" value="false"/> <parameter key="absolute_error" value="false"/> <parameter key="relative_error" value="false"/> <parameter key="relative_error_lenient" value="false"/> <parameter key="relative_error_strict" value="false"/> <parameter key="normalized_absolute_error" value="false"/> <parameter key="root_mean_squared_error" value="true"/> <parameter key="root_relative_squared_error" value="false"/> <parameter key="squared_error" value="false"/> <parameter key="correlation" value="false"/> <parameter key="squared_correlation" value="false"/> <parameter key="cross-entropy" value="false"/> <parameter key="margin" value="false"/> <parameter key="soft_margin_loss" value="false"/> <parameter key="logistic_loss" value="false"/> <parameter key="skip_undefined_labels" value="true"/> <parameter key="use_example_weights" value="true"/> <list key="class_weights"/> </operator> <connect from_port="model" to_op="Apply Model (3)" to_port="model"/> <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/> <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Multiply (4)" to_port="input"/> <connect from_op="Multiply (4)" from_port="output 2" to_op="Performance (5)" to_port="labelled data"/> <connect from_op="Multiply (4)" from_port="output 3" to_op="Performance" to_port="labelled data"/> <connect from_op="Performance" from_port="performance" to_port="performance 3"/> <connect from_op="Performance" from_port="example set" to_port="test set results"/> 
<connect from_op="Performance (5)" from_port="performance" to_port="performance 1"/> <portSpacing port="source_model" spacing="0"/> <portSpacing port="source_test set" spacing="0"/> <portSpacing port="source_through 1" spacing="0"/> <portSpacing port="sink_test set results" spacing="0"/> <portSpacing port="sink_performance 1" spacing="0"/> <portSpacing port="sink_performance 2" spacing="0"/> <portSpacing port="sink_performance 3" spacing="0"/> <portSpacing port="sink_performance 4" spacing="0"/> </process> </operator> <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel" width="90" x="715" y="187"> <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_38_Attributes.xlsx"/> <parameter key="file_format" value="xlsx"/> <parameter key="encoding" value="SYSTEM"/> <parameter key="sheet_name" value="RapidMiner Data"/> <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/> <parameter key="number_format" value="#.0"/> </operator> <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="380" y="238"> <parameter key="attribute_filter_type" value="subset"/> <parameter key="attribute" value=""/> <parameter key="attributes" value="Ln|Ln-1|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrAttempted|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/> <parameter key="use_except_expression" 
value="false"/> <parameter key="value_type" value="attribute_value"/> <parameter key="use_value_type_exception" value="false"/> <parameter key="except_value_type" value="time"/> <parameter key="block_type" value="attribute_block"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="value_matrix_row_start"/> <parameter key="invert_selection" value="false"/> <parameter key="include_special_attributes" value="false"/> </operator> <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="166" name="Cross Validation (2)" width="90" x="581" y="238"> <parameter key="split_on_batch_attribute" value="false"/> <parameter key="leave_one_out" value="false"/> <parameter key="number_of_folds" value="5"/> <parameter key="sampling_type" value="automatic"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="enable_parallel_execution" value="true"/> <process expanded="true"> <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees (2)" width="90" x="112" y="34"> <parameter key="number_of_trees" value="20"/> <parameter key="reproducible" value="false"/> <parameter key="maximum_number_of_threads" value="4"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="maximal_depth" value="20"/> <parameter key="min_rows" value="10.0"/> <parameter key="min_split_improvement" value="0.0"/> <parameter key="number_of_bins" value="20"/> <parameter key="learning_rate" value="0.1"/> <parameter key="sample_rate" value="1.0"/> <parameter key="distribution" value="AUTO"/> <parameter key="early_stopping" value="false"/> <parameter key="stopping_rounds" value="1"/> <parameter key="stopping_metric" value="AUTO"/> <parameter key="stopping_tolerance" value="0.001"/> <parameter 
key="max_runtime_seconds" value="0"/> <list key="expert_parameters"/> </operator> <connect from_port="training set" to_op="Gradient Boosted Trees (2)" to_port="training set"/> <connect from_op="Gradient Boosted Trees (2)" from_port="model" to_port="model"/> <portSpacing port="source_training set" spacing="0"/> <portSpacing port="sink_model" spacing="0"/> <portSpacing port="sink_through 1" spacing="0"/> </process> <process expanded="true"> <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (2)" width="90" x="45" y="34"> <list key="application_parameters"/> <parameter key="create_view" value="false"/> </operator> <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="103" name="Multiply (2)" width="90" x="45" y="136"/> <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance (8)" width="90" x="246" y="340"> <parameter key="use_example_weights" value="true"/> </operator> <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (2)" width="90" x="246" y="85"> <parameter key="main_criterion" value="first"/> <parameter key="accuracy" value="true"/> <parameter key="classification_error" value="false"/> <parameter key="kappa" value="true"/> <parameter key="weighted_mean_recall" value="false"/> <parameter key="weighted_mean_precision" value="false"/> <parameter key="spearman_rho" value="false"/> <parameter key="kendall_tau" value="false"/> <parameter key="absolute_error" value="false"/> <parameter key="relative_error" value="false"/> <parameter key="relative_error_lenient" value="false"/> <parameter key="relative_error_strict" value="false"/> <parameter key="normalized_absolute_error" value="false"/> <parameter key="root_mean_squared_error" value="true"/> <parameter key="root_relative_squared_error" value="false"/> <parameter 
key="squared_error" value="false"/> <parameter key="correlation" value="false"/> <parameter key="squared_correlation" value="false"/> <parameter key="cross-entropy" value="false"/> <parameter key="margin" value="false"/> <parameter key="soft_margin_loss" value="false"/> <parameter key="logistic_loss" value="false"/> <parameter key="skip_undefined_labels" value="true"/> <parameter key="use_example_weights" value="true"/> <list key="class_weights"/> </operator> <connect from_port="model" to_op="Apply Model (2)" to_port="model"/> <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/> <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Multiply (2)" to_port="input"/> <connect from_op="Multiply (2)" from_port="output 1" to_op="Performance (2)" to_port="labelled data"/> <connect from_op="Multiply (2)" from_port="output 2" to_op="Performance (8)" to_port="labelled data"/> <connect from_op="Performance (8)" from_port="performance" to_port="performance 1"/> <connect from_op="Performance (8)" from_port="example set" to_port="test set results"/> <connect from_op="Performance (2)" from_port="performance" to_port="performance 2"/> <portSpacing port="source_model" spacing="0"/> <portSpacing port="source_test set" spacing="0"/> <portSpacing port="source_through 1" spacing="0"/> <portSpacing port="sink_test set results" spacing="0"/> <portSpacing port="sink_performance 1" spacing="0"/> <portSpacing port="sink_performance 2" spacing="0"/> <portSpacing port="sink_performance 3" spacing="0"/> </process> </operator> <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel (2)" width="90" x="707" y="340"> <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_39_Attributes.xlsx"/> <parameter key="file_format" value="xlsx"/> <parameter key="encoding" value="SYSTEM"/> <parameter key="sheet_name" value="RapidMiner Data"/> <parameter 
key="date_format" value="yyyy-MM-dd HH:mm:ss"/> <parameter key="number_format" value="#.0"/> </operator> <connect from_op="Retrieve Assistment_Data_Labels" from_port="output" to_op="Multiply (6)" to_port="input"/> <connect from_op="Multiply (6)" from_port="output 1" to_op="Select Attributes" to_port="example set input"/> <connect from_op="Multiply (6)" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/> <connect from_op="Multiply (6)" from_port="output 3" to_op="Select Attributes (3)" to_port="example set input"/> <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Cross Validation (4)" to_port="example set"/> <connect from_op="Cross Validation (4)" from_port="test result set" to_op="Write Excel (3)" to_port="input"/> <connect from_op="Cross Validation (4)" from_port="performance 1" to_port="result 5"/> <connect from_op="Cross Validation (4)" from_port="performance 2" to_port="result 6"/> <connect from_op="Write Excel (3)" from_port="through" to_port="result 9"/> <connect from_op="Select Attributes" from_port="example set output" to_op="Cross Validation (3)" to_port="example set"/> <connect from_op="Cross Validation (3)" from_port="test result set" to_op="Write Excel" to_port="input"/> <connect from_op="Cross Validation (3)" from_port="performance 1" to_port="result 1"/> <connect from_op="Cross Validation (3)" from_port="performance 2" to_port="result 2"/> <connect from_op="Cross Validation (3)" from_port="performance 3" to_port="result 10"/> <connect from_op="Write Excel" from_port="through" to_port="result 7"/> <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Cross Validation (2)" to_port="example set"/> <connect from_op="Cross Validation (2)" from_port="test result set" to_op="Write Excel (2)" to_port="input"/> <connect from_op="Cross Validation (2)" from_port="performance 1" to_port="result 3"/> <connect from_op="Cross Validation (2)" from_port="performance 2" to_port="result 
4"/> <connect from_op="Write Excel (2)" from_port="through" to_port="result 8"/> <portSpacing port="source_input 1" spacing="0"/> <portSpacing port="sink_result 1" spacing="0"/> <portSpacing port="sink_result 2" spacing="0"/> <portSpacing port="sink_result 3" spacing="0"/> <portSpacing port="sink_result 4" spacing="0"/> <portSpacing port="sink_result 5" spacing="0"/> <portSpacing port="sink_result 6" spacing="0"/> <portSpacing port="sink_result 7" spacing="0"/> <portSpacing port="sink_result 8" spacing="0"/> <portSpacing port="sink_result 9" spacing="0"/> <portSpacing port="sink_result 10" spacing="0"/> <portSpacing port="sink_result 11" spacing="0"/> </process> </operator> </process>
Varun
Find more posts tagged with
Sort by:
1 - 11 of
111

Update on this: I see that there are duplicate columns in Excel. Once I removed the duplicates, it gave me the correct number of samples, but I am not sure why duplicate values are showing up in Excel. I also see that the confusion matrix in the performance result has the correct number of samples. @mschmitz and @David_A, any suggestions?
Hi @varunm1,
Interesting and amusing problem !
I think it's linked to the 2 Performance operators you are using in the CV operator. RM performs a 5-fold CV for each Performance operator...
but when you connect the Performance operators in series, you will retrieve the original number of rows of your dataset (it's not a priori linked to the Write Excel operator).
However, what I don't understand is why you don't obtain [2 * number of rows] but (10-1)/5 = 9/5 * [number of rows] (the second CV is not complete...).
Regards,
Lionel
Interesting and amusing problem !
I think it's linked to the 2 Performance operators you are using in the CV operator. RM performs a 5-fold CV for each Performance operator...
but when you connect the Performance operators in series, you will retrieve the original number of rows of your dataset (it's not a priori linked to the Write Excel operator).
However, what I don't understand is why you don't obtain [2 * number of rows] but (10-1)/5 = 9/5 * [number of rows] (the second CV is not complete...).
Regards,
Lionel
Hi @lionelderkrikor,
That's what confuses me as well. But when I check the confusion matrix, it gives correct sample numbers (316974). Only while writing to excel or csv this is happening. But it's giving both performance results.

Thanks,
Varun
That's what confuses me as well. But when I check the confusion matrix, it gives correct sample numbers (316974). Only while writing to excel or csv this is happening. But it's giving both performance results.

Thanks,
Varun
@mschmitz any suggestion on this issue? Thanks
Hi @varunm1 ,
I can reproduce this, and it looks like a bug that is somehow connected to the Multiply operator placed after Apply Model in the testing subprocess. I'll create a ticket for this.
As a workaround you can just append the performance vectors like this:
<?xml version="1.0" encoding="UTF-8"?><process version="9.1.000"> <context> <input/> <output/> <macros/> </context> <operator activated="true" class="process" compatibility="9.1.000" expanded="true" name="Process"> <parameter key="logverbosity" value="init"/> <parameter key="random_seed" value="2001"/> <parameter key="send_mail" value="never"/> <parameter key="notification_email" value=""/> <parameter key="process_duration_for_mail" value="30"/> <parameter key="encoding" value="SYSTEM"/> <process expanded="true"> <operator activated="true" class="generate_data" compatibility="9.1.000" expanded="true" height="68" name="Generate Data" width="90" x="45" y="238"> <parameter key="target_function" value="one third classification"/> <parameter key="number_examples" value="100"/> <parameter key="number_of_attributes" value="5"/> <parameter key="attributes_lower_bound" value="-10.0"/> <parameter key="attributes_upper_bound" value="10.0"/> <parameter key="gaussian_standard_deviation" value="10.0"/> <parameter key="largest_radius" value="10.0"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="datamanagement" value="double_array"/> <parameter key="data_management" value="auto"/> </operator> <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="145" name="Cross Validation (3)" width="90" x="313" y="238"> <parameter key="split_on_batch_attribute" value="false"/> <parameter key="leave_one_out" value="false"/> <parameter key="number_of_folds" value="5"/> <parameter key="sampling_type" value="automatic"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="enable_parallel_execution" value="true"/> <process expanded="true"> <operator 
activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34"> <parameter key="number_of_trees" value="20"/> <parameter key="reproducible" value="false"/> <parameter key="maximum_number_of_threads" value="4"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="maximal_depth" value="20"/> <parameter key="min_rows" value="10.0"/> <parameter key="min_split_improvement" value="0.0"/> <parameter key="number_of_bins" value="20"/> <parameter key="learning_rate" value="0.1"/> <parameter key="sample_rate" value="1.0"/> <parameter key="distribution" value="AUTO"/> <parameter key="early_stopping" value="false"/> <parameter key="stopping_rounds" value="1"/> <parameter key="stopping_metric" value="AUTO"/> <parameter key="stopping_tolerance" value="0.001"/> <parameter key="max_runtime_seconds" value="0"/> <list key="expert_parameters"/> </operator> <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/> <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/> <portSpacing port="source_training set" spacing="0"/> <portSpacing port="sink_model" spacing="0"/> <portSpacing port="sink_through 1" spacing="0"/> </process> <process expanded="true"> <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34"> <list key="application_parameters"/> <parameter key="create_view" value="false"/> </operator> <operator activated="false" class="multiply" compatibility="9.1.000" expanded="true" height="68" name="Multiply (4)" width="90" x="45" y="136"/> <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" 
name="Performance (5)" width="90" x="179" y="34"> <parameter key="main_criterion" value="first"/> <parameter key="accuracy" value="true"/> <parameter key="classification_error" value="false"/> <parameter key="kappa" value="true"/> <parameter key="weighted_mean_recall" value="false"/> <parameter key="weighted_mean_precision" value="false"/> <parameter key="spearman_rho" value="false"/> <parameter key="kendall_tau" value="false"/> <parameter key="absolute_error" value="false"/> <parameter key="relative_error" value="false"/> <parameter key="relative_error_lenient" value="false"/> <parameter key="relative_error_strict" value="false"/> <parameter key="normalized_absolute_error" value="false"/> <parameter key="root_mean_squared_error" value="true"/> <parameter key="root_relative_squared_error" value="false"/> <parameter key="squared_error" value="false"/> <parameter key="correlation" value="false"/> <parameter key="squared_correlation" value="false"/> <parameter key="cross-entropy" value="false"/> <parameter key="margin" value="false"/> <parameter key="soft_margin_loss" value="false"/> <parameter key="logistic_loss" value="false"/> <parameter key="skip_undefined_labels" value="true"/> <parameter key="use_example_weights" value="true"/> <list key="class_weights"/> </operator> <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance" width="90" x="313" y="34"> <parameter key="use_example_weights" value="true"/> </operator> <operator activated="false" class="collect" compatibility="9.1.000" expanded="true" height="68" name="Collect" width="90" x="380" y="187"> <parameter key="unfold" value="false"/> </operator> <connect from_port="model" to_op="Apply Model (3)" to_port="model"/> <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/> <connect 
from_op="Apply Model (3)" from_port="labelled data" to_op="Performance (5)" to_port="labelled data"/> <connect from_op="Performance (5)" from_port="performance" to_op="Performance" to_port="performance"/> <connect from_op="Performance (5)" from_port="example set" to_op="Performance" to_port="labelled data"/> <connect from_op="Performance" from_port="performance" to_port="performance 1"/> <connect from_op="Performance" from_port="example set" to_port="test set results"/> <portSpacing port="source_model" spacing="0"/> <portSpacing port="source_test set" spacing="0"/> <portSpacing port="source_through 1" spacing="0"/> <portSpacing port="sink_test set results" spacing="0"/> <portSpacing port="sink_performance 1" spacing="0"/> <portSpacing port="sink_performance 2" spacing="0"/> </process> </operator> <connect from_op="Generate Data" from_port="output" to_op="Cross Validation (3)" to_port="example set"/> <connect from_op="Cross Validation (3)" from_port="test result set" to_port="result 1"/> <connect from_op="Cross Validation (3)" from_port="performance 1" to_port="result 2"/> <portSpacing port="source_input 1" spacing="0"/> <portSpacing port="sink_result 1" spacing="0"/> <portSpacing port="sink_result 2" spacing="0"/> <portSpacing port="sink_result 3" spacing="0"/> </process> </operator></process>
BR,
Martin
Thanks @mschmitz
@mschmitz one quick question. Does this bug impact performance metrics (accuracy etc.) by any chance? I see the confusion matrix has the correct number of samples but just want to cross check.
Thanks @Marco_Boeck for confirming.
@Marco_Boeck please advise if this should get pushed to Product Feedback.
@sgenzer you can push it there, but it will be fixed in 9.2.