"Cross Validation test examples are more than input samples"
varunm1
New Altair Community Member
Hi,
I am storing cross validation predictions using Write Excel. I see that the output samples (570553) of Cross validation 5 fold for Gradient Boosted Tree are more than input samples (316974). I am not sure if I am doing something wrong. Please see XML below.
Thanks,
Varun
I am storing cross validation predictions using Write Excel. I see that the output samples (570553) of Cross validation 5 fold for Gradient Boosted Tree are more than input samples (316974). I am not sure if I am doing something wrong. Please see XML below.
<?xml version="1.0" encoding="UTF-8"?><process version="9.1.000"> <context> <input/> <output/> <macros/> </context> <operator activated="true" class="process" compatibility="9.1.000" expanded="true" name="Process"> <parameter key="logverbosity" value="init"/> <parameter key="random_seed" value="2001"/> <parameter key="send_mail" value="never"/> <parameter key="notification_email" value=""/> <parameter key="process_duration_for_mail" value="30"/> <parameter key="encoding" value="SYSTEM"/> <process expanded="true"> <operator activated="true" class="retrieve" compatibility="9.1.000" expanded="true" height="68" name="Retrieve Assistment_Data_Labels" width="90" x="45" y="85"> <parameter key="repository_entry" value="../../data/AIED_New/Assistment_Data_Labels"/> </operator> <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="124" name="Multiply (6)" width="90" x="179" y="85"/> <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes (3)" width="90" x="380" y="442"> <parameter key="attribute_filter_type" value="subset"/> <parameter key="attribute" value=""/> <parameter key="attributes" value="Ln|Ln-1|NumActions|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrAttempted|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/> <parameter key="use_except_expression" value="false"/> <parameter key="value_type" value="attribute_value"/> <parameter key="use_value_type_exception" 
value="false"/> <parameter key="except_value_type" value="time"/> <parameter key="block_type" value="attribute_block"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="value_matrix_row_start"/> <parameter key="invert_selection" value="false"/> <parameter key="include_special_attributes" value="false"/> </operator> <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="166" name="Cross Validation (4)" width="90" x="581" y="442"> <parameter key="split_on_batch_attribute" value="false"/> <parameter key="leave_one_out" value="false"/> <parameter key="number_of_folds" value="5"/> <parameter key="sampling_type" value="automatic"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="enable_parallel_execution" value="true"/> <process expanded="true"> <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees (3)" width="90" x="112" y="34"> <parameter key="number_of_trees" value="20"/> <parameter key="reproducible" value="false"/> <parameter key="maximum_number_of_threads" value="4"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="maximal_depth" value="20"/> <parameter key="min_rows" value="10.0"/> <parameter key="min_split_improvement" value="0.0"/> <parameter key="number_of_bins" value="20"/> <parameter key="learning_rate" value="0.1"/> <parameter key="sample_rate" value="1.0"/> <parameter key="distribution" value="AUTO"/> <parameter key="early_stopping" value="false"/> <parameter key="stopping_rounds" value="1"/> <parameter key="stopping_metric" value="AUTO"/> <parameter key="stopping_tolerance" value="0.001"/> <parameter key="max_runtime_seconds" value="0"/> <list key="expert_parameters"/> </operator> <connect from_port="training set" 
to_op="Gradient Boosted Trees (3)" to_port="training set"/> <connect from_op="Gradient Boosted Trees (3)" from_port="model" to_port="model"/> <portSpacing port="source_training set" spacing="0"/> <portSpacing port="sink_model" spacing="0"/> <portSpacing port="sink_through 1" spacing="0"/> </process> <process expanded="true"> <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (4)" width="90" x="45" y="34"> <list key="application_parameters"/> <parameter key="create_view" value="false"/> </operator> <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="103" name="Multiply (3)" width="90" x="45" y="136"/> <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance (3)" width="90" x="246" y="340"> <parameter key="use_example_weights" value="true"/> </operator> <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (4)" width="90" x="246" y="238"> <parameter key="main_criterion" value="first"/> <parameter key="accuracy" value="true"/> <parameter key="classification_error" value="false"/> <parameter key="kappa" value="true"/> <parameter key="weighted_mean_recall" value="false"/> <parameter key="weighted_mean_precision" value="false"/> <parameter key="spearman_rho" value="false"/> <parameter key="kendall_tau" value="false"/> <parameter key="absolute_error" value="false"/> <parameter key="relative_error" value="false"/> <parameter key="relative_error_lenient" value="false"/> <parameter key="relative_error_strict" value="false"/> <parameter key="normalized_absolute_error" value="false"/> <parameter key="root_mean_squared_error" value="true"/> <parameter key="root_relative_squared_error" value="false"/> <parameter key="squared_error" value="false"/> <parameter key="correlation" value="false"/> <parameter key="squared_correlation" 
value="false"/> <parameter key="cross-entropy" value="false"/> <parameter key="margin" value="false"/> <parameter key="soft_margin_loss" value="false"/> <parameter key="logistic_loss" value="false"/> <parameter key="skip_undefined_labels" value="true"/> <parameter key="use_example_weights" value="true"/> <list key="class_weights"/> </operator> <connect from_port="model" to_op="Apply Model (4)" to_port="model"/> <connect from_port="test set" to_op="Apply Model (4)" to_port="unlabelled data"/> <connect from_op="Apply Model (4)" from_port="labelled data" to_op="Multiply (3)" to_port="input"/> <connect from_op="Multiply (3)" from_port="output 1" to_op="Performance (4)" to_port="labelled data"/> <connect from_op="Multiply (3)" from_port="output 2" to_op="Performance (3)" to_port="labelled data"/> <connect from_op="Performance (3)" from_port="performance" to_port="performance 2"/> <connect from_op="Performance (3)" from_port="example set" to_port="test set results"/> <connect from_op="Performance (4)" from_port="performance" to_port="performance 1"/> <portSpacing port="source_model" spacing="0"/> <portSpacing port="source_test set" spacing="0"/> <portSpacing port="source_through 1" spacing="0"/> <portSpacing port="sink_test set results" spacing="0"/> <portSpacing port="sink_performance 1" spacing="0"/> <portSpacing port="sink_performance 2" spacing="0"/> <portSpacing port="sink_performance 3" spacing="0"/> </process> </operator> <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel (3)" width="90" x="707" y="595"> <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_40_Attributes.xlsx"/> <parameter key="file_format" value="xlsx"/> <parameter key="encoding" value="SYSTEM"/> <parameter key="sheet_name" value="RapidMiner Data"/> <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/> <parameter key="number_format" value="#.0"/> </operator> <operator activated="true" 
class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes" width="90" x="380" y="34"> <parameter key="attribute_filter_type" value="subset"/> <parameter key="attribute" value=""/> <parameter key="attributes" value="Ln|Ln-1|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/> <parameter key="use_except_expression" value="false"/> <parameter key="value_type" value="attribute_value"/> <parameter key="use_value_type_exception" value="false"/> <parameter key="except_value_type" value="time"/> <parameter key="block_type" value="attribute_block"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="value_matrix_row_start"/> <parameter key="invert_selection" value="false"/> <parameter key="include_special_attributes" value="false"/> </operator> <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="187" name="Cross Validation (3)" width="90" x="581" y="34"> <parameter key="split_on_batch_attribute" value="false"/> <parameter key="leave_one_out" value="false"/> <parameter key="number_of_folds" value="5"/> <parameter key="sampling_type" value="automatic"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="enable_parallel_execution" value="true"/> <process expanded="true"> <operator activated="true" 
class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34"> <parameter key="number_of_trees" value="20"/> <parameter key="reproducible" value="false"/> <parameter key="maximum_number_of_threads" value="4"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="maximal_depth" value="20"/> <parameter key="min_rows" value="10.0"/> <parameter key="min_split_improvement" value="0.0"/> <parameter key="number_of_bins" value="20"/> <parameter key="learning_rate" value="0.1"/> <parameter key="sample_rate" value="1.0"/> <parameter key="distribution" value="AUTO"/> <parameter key="early_stopping" value="false"/> <parameter key="stopping_rounds" value="1"/> <parameter key="stopping_metric" value="AUTO"/> <parameter key="stopping_tolerance" value="0.001"/> <parameter key="max_runtime_seconds" value="0"/> <list key="expert_parameters"/> </operator> <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/> <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/> <portSpacing port="source_training set" spacing="0"/> <portSpacing port="sink_model" spacing="0"/> <portSpacing port="sink_through 1" spacing="0"/> </process> <process expanded="true"> <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34"> <list key="application_parameters"/> <parameter key="create_view" value="false"/> </operator> <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="124" name="Multiply (4)" width="90" x="45" y="136"/> <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance" width="90" x="179" y="340"> <parameter key="use_example_weights" value="true"/> </operator> <operator activated="true" 
class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (5)" width="90" x="246" y="238"> <parameter key="main_criterion" value="first"/> <parameter key="accuracy" value="true"/> <parameter key="classification_error" value="false"/> <parameter key="kappa" value="true"/> <parameter key="weighted_mean_recall" value="false"/> <parameter key="weighted_mean_precision" value="false"/> <parameter key="spearman_rho" value="false"/> <parameter key="kendall_tau" value="false"/> <parameter key="absolute_error" value="false"/> <parameter key="relative_error" value="false"/> <parameter key="relative_error_lenient" value="false"/> <parameter key="relative_error_strict" value="false"/> <parameter key="normalized_absolute_error" value="false"/> <parameter key="root_mean_squared_error" value="true"/> <parameter key="root_relative_squared_error" value="false"/> <parameter key="squared_error" value="false"/> <parameter key="correlation" value="false"/> <parameter key="squared_correlation" value="false"/> <parameter key="cross-entropy" value="false"/> <parameter key="margin" value="false"/> <parameter key="soft_margin_loss" value="false"/> <parameter key="logistic_loss" value="false"/> <parameter key="skip_undefined_labels" value="true"/> <parameter key="use_example_weights" value="true"/> <list key="class_weights"/> </operator> <connect from_port="model" to_op="Apply Model (3)" to_port="model"/> <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/> <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Multiply (4)" to_port="input"/> <connect from_op="Multiply (4)" from_port="output 2" to_op="Performance (5)" to_port="labelled data"/> <connect from_op="Multiply (4)" from_port="output 3" to_op="Performance" to_port="labelled data"/> <connect from_op="Performance" from_port="performance" to_port="performance 3"/> <connect from_op="Performance" from_port="example set" to_port="test set results"/> 
<connect from_op="Performance (5)" from_port="performance" to_port="performance 1"/> <portSpacing port="source_model" spacing="0"/> <portSpacing port="source_test set" spacing="0"/> <portSpacing port="source_through 1" spacing="0"/> <portSpacing port="sink_test set results" spacing="0"/> <portSpacing port="sink_performance 1" spacing="0"/> <portSpacing port="sink_performance 2" spacing="0"/> <portSpacing port="sink_performance 3" spacing="0"/> <portSpacing port="sink_performance 4" spacing="0"/> </process> </operator> <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel" width="90" x="715" y="187"> <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_38_Attributes.xlsx"/> <parameter key="file_format" value="xlsx"/> <parameter key="encoding" value="SYSTEM"/> <parameter key="sheet_name" value="RapidMiner Data"/> <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/> <parameter key="number_format" value="#.0"/> </operator> <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="380" y="238"> <parameter key="attribute_filter_type" value="subset"/> <parameter key="attribute" value=""/> <parameter key="attributes" value="Ln|Ln-1|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrAttempted|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/> <parameter key="use_except_expression" 
value="false"/> <parameter key="value_type" value="attribute_value"/> <parameter key="use_value_type_exception" value="false"/> <parameter key="except_value_type" value="time"/> <parameter key="block_type" value="attribute_block"/> <parameter key="use_block_type_exception" value="false"/> <parameter key="except_block_type" value="value_matrix_row_start"/> <parameter key="invert_selection" value="false"/> <parameter key="include_special_attributes" value="false"/> </operator> <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="166" name="Cross Validation (2)" width="90" x="581" y="238"> <parameter key="split_on_batch_attribute" value="false"/> <parameter key="leave_one_out" value="false"/> <parameter key="number_of_folds" value="5"/> <parameter key="sampling_type" value="automatic"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="enable_parallel_execution" value="true"/> <process expanded="true"> <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees (2)" width="90" x="112" y="34"> <parameter key="number_of_trees" value="20"/> <parameter key="reproducible" value="false"/> <parameter key="maximum_number_of_threads" value="4"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="maximal_depth" value="20"/> <parameter key="min_rows" value="10.0"/> <parameter key="min_split_improvement" value="0.0"/> <parameter key="number_of_bins" value="20"/> <parameter key="learning_rate" value="0.1"/> <parameter key="sample_rate" value="1.0"/> <parameter key="distribution" value="AUTO"/> <parameter key="early_stopping" value="false"/> <parameter key="stopping_rounds" value="1"/> <parameter key="stopping_metric" value="AUTO"/> <parameter key="stopping_tolerance" value="0.001"/> <parameter 
key="max_runtime_seconds" value="0"/> <list key="expert_parameters"/> </operator> <connect from_port="training set" to_op="Gradient Boosted Trees (2)" to_port="training set"/> <connect from_op="Gradient Boosted Trees (2)" from_port="model" to_port="model"/> <portSpacing port="source_training set" spacing="0"/> <portSpacing port="sink_model" spacing="0"/> <portSpacing port="sink_through 1" spacing="0"/> </process> <process expanded="true"> <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (2)" width="90" x="45" y="34"> <list key="application_parameters"/> <parameter key="create_view" value="false"/> </operator> <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="103" name="Multiply (2)" width="90" x="45" y="136"/> <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance (8)" width="90" x="246" y="340"> <parameter key="use_example_weights" value="true"/> </operator> <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (2)" width="90" x="246" y="85"> <parameter key="main_criterion" value="first"/> <parameter key="accuracy" value="true"/> <parameter key="classification_error" value="false"/> <parameter key="kappa" value="true"/> <parameter key="weighted_mean_recall" value="false"/> <parameter key="weighted_mean_precision" value="false"/> <parameter key="spearman_rho" value="false"/> <parameter key="kendall_tau" value="false"/> <parameter key="absolute_error" value="false"/> <parameter key="relative_error" value="false"/> <parameter key="relative_error_lenient" value="false"/> <parameter key="relative_error_strict" value="false"/> <parameter key="normalized_absolute_error" value="false"/> <parameter key="root_mean_squared_error" value="true"/> <parameter key="root_relative_squared_error" value="false"/> <parameter 
key="squared_error" value="false"/> <parameter key="correlation" value="false"/> <parameter key="squared_correlation" value="false"/> <parameter key="cross-entropy" value="false"/> <parameter key="margin" value="false"/> <parameter key="soft_margin_loss" value="false"/> <parameter key="logistic_loss" value="false"/> <parameter key="skip_undefined_labels" value="true"/> <parameter key="use_example_weights" value="true"/> <list key="class_weights"/> </operator> <connect from_port="model" to_op="Apply Model (2)" to_port="model"/> <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/> <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Multiply (2)" to_port="input"/> <connect from_op="Multiply (2)" from_port="output 1" to_op="Performance (2)" to_port="labelled data"/> <connect from_op="Multiply (2)" from_port="output 2" to_op="Performance (8)" to_port="labelled data"/> <connect from_op="Performance (8)" from_port="performance" to_port="performance 1"/> <connect from_op="Performance (8)" from_port="example set" to_port="test set results"/> <connect from_op="Performance (2)" from_port="performance" to_port="performance 2"/> <portSpacing port="source_model" spacing="0"/> <portSpacing port="source_test set" spacing="0"/> <portSpacing port="source_through 1" spacing="0"/> <portSpacing port="sink_test set results" spacing="0"/> <portSpacing port="sink_performance 1" spacing="0"/> <portSpacing port="sink_performance 2" spacing="0"/> <portSpacing port="sink_performance 3" spacing="0"/> </process> </operator> <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel (2)" width="90" x="707" y="340"> <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_39_Attributes.xlsx"/> <parameter key="file_format" value="xlsx"/> <parameter key="encoding" value="SYSTEM"/> <parameter key="sheet_name" value="RapidMiner Data"/> <parameter 
key="date_format" value="yyyy-MM-dd HH:mm:ss"/> <parameter key="number_format" value="#.0"/> </operator> <connect from_op="Retrieve Assistment_Data_Labels" from_port="output" to_op="Multiply (6)" to_port="input"/> <connect from_op="Multiply (6)" from_port="output 1" to_op="Select Attributes" to_port="example set input"/> <connect from_op="Multiply (6)" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/> <connect from_op="Multiply (6)" from_port="output 3" to_op="Select Attributes (3)" to_port="example set input"/> <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Cross Validation (4)" to_port="example set"/> <connect from_op="Cross Validation (4)" from_port="test result set" to_op="Write Excel (3)" to_port="input"/> <connect from_op="Cross Validation (4)" from_port="performance 1" to_port="result 5"/> <connect from_op="Cross Validation (4)" from_port="performance 2" to_port="result 6"/> <connect from_op="Write Excel (3)" from_port="through" to_port="result 9"/> <connect from_op="Select Attributes" from_port="example set output" to_op="Cross Validation (3)" to_port="example set"/> <connect from_op="Cross Validation (3)" from_port="test result set" to_op="Write Excel" to_port="input"/> <connect from_op="Cross Validation (3)" from_port="performance 1" to_port="result 1"/> <connect from_op="Cross Validation (3)" from_port="performance 2" to_port="result 2"/> <connect from_op="Cross Validation (3)" from_port="performance 3" to_port="result 10"/> <connect from_op="Write Excel" from_port="through" to_port="result 7"/> <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Cross Validation (2)" to_port="example set"/> <connect from_op="Cross Validation (2)" from_port="test result set" to_op="Write Excel (2)" to_port="input"/> <connect from_op="Cross Validation (2)" from_port="performance 1" to_port="result 3"/> <connect from_op="Cross Validation (2)" from_port="performance 2" to_port="result 
4"/> <connect from_op="Write Excel (2)" from_port="through" to_port="result 8"/> <portSpacing port="source_input 1" spacing="0"/> <portSpacing port="sink_result 1" spacing="0"/> <portSpacing port="sink_result 2" spacing="0"/> <portSpacing port="sink_result 3" spacing="0"/> <portSpacing port="sink_result 4" spacing="0"/> <portSpacing port="sink_result 5" spacing="0"/> <portSpacing port="sink_result 6" spacing="0"/> <portSpacing port="sink_result 7" spacing="0"/> <portSpacing port="sink_result 8" spacing="0"/> <portSpacing port="sink_result 9" spacing="0"/> <portSpacing port="sink_result 10" spacing="0"/> <portSpacing port="sink_result 11" spacing="0"/> </process> </operator> </process>
Varun
Tagged:
0
Answers
-
Update on this: I see that there are duplicate columns in Excel. Once I remove the duplicates, I get the correct number of samples, but I am not sure why duplicate values end up in Excel. The confusion matrix in the performance output has the correct number of samples. @mschmitz and @David_A, any suggestions?
0
-
Hi @varunm1,
Interesting and amusing problem!
I think it's linked to the 2 Performance operators you are using inside the CV operator. RM performs a 5-fold CV for each Performance operator...
But when you connect the Performance operators in series, you retrieve the original number of rows of your dataset (so it's not a priori linked to the Write Excel operator).
However, what I don't understand is why you don't obtain [2 * number of rows] but (10-1)/5 = 9/5 * [number of rows] (the second CV is not complete...).
Regards,
Lionel
3 -
Hi @lionelderkrikor,
That's what confuses me as well. When I check the confusion matrix, it gives the correct number of samples (316974); the extra rows appear only when writing to Excel or CSV. It does give both performance results, though.
Thanks,
Varun
1 -
Hi @varunm1,
I can reproduce this, and it looks like a bug that is somewhat connected to the Multiply operator after your Apply Model. I'll create a ticket for this.
As a workaround, you can just append the performance vectors like this:
<?xml version="1.0" encoding="UTF-8"?><process version="9.1.000"> <context> <input/> <output/> <macros/> </context> <operator activated="true" class="process" compatibility="9.1.000" expanded="true" name="Process"> <parameter key="logverbosity" value="init"/> <parameter key="random_seed" value="2001"/> <parameter key="send_mail" value="never"/> <parameter key="notification_email" value=""/> <parameter key="process_duration_for_mail" value="30"/> <parameter key="encoding" value="SYSTEM"/> <process expanded="true"> <operator activated="true" class="generate_data" compatibility="9.1.000" expanded="true" height="68" name="Generate Data" width="90" x="45" y="238"> <parameter key="target_function" value="one third classification"/> <parameter key="number_examples" value="100"/> <parameter key="number_of_attributes" value="5"/> <parameter key="attributes_lower_bound" value="-10.0"/> <parameter key="attributes_upper_bound" value="10.0"/> <parameter key="gaussian_standard_deviation" value="10.0"/> <parameter key="largest_radius" value="10.0"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="datamanagement" value="double_array"/> <parameter key="data_management" value="auto"/> </operator> <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="145" name="Cross Validation (3)" width="90" x="313" y="238"> <parameter key="split_on_batch_attribute" value="false"/> <parameter key="leave_one_out" value="false"/> <parameter key="number_of_folds" value="5"/> <parameter key="sampling_type" value="automatic"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="enable_parallel_execution" value="true"/> <process expanded="true"> <operator 
activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34"> <parameter key="number_of_trees" value="20"/> <parameter key="reproducible" value="false"/> <parameter key="maximum_number_of_threads" value="4"/> <parameter key="use_local_random_seed" value="false"/> <parameter key="local_random_seed" value="1992"/> <parameter key="maximal_depth" value="20"/> <parameter key="min_rows" value="10.0"/> <parameter key="min_split_improvement" value="0.0"/> <parameter key="number_of_bins" value="20"/> <parameter key="learning_rate" value="0.1"/> <parameter key="sample_rate" value="1.0"/> <parameter key="distribution" value="AUTO"/> <parameter key="early_stopping" value="false"/> <parameter key="stopping_rounds" value="1"/> <parameter key="stopping_metric" value="AUTO"/> <parameter key="stopping_tolerance" value="0.001"/> <parameter key="max_runtime_seconds" value="0"/> <list key="expert_parameters"/> </operator> <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/> <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/> <portSpacing port="source_training set" spacing="0"/> <portSpacing port="sink_model" spacing="0"/> <portSpacing port="sink_through 1" spacing="0"/> </process> <process expanded="true"> <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34"> <list key="application_parameters"/> <parameter key="create_view" value="false"/> </operator> <operator activated="false" class="multiply" compatibility="9.1.000" expanded="true" height="68" name="Multiply (4)" width="90" x="45" y="136"/> <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" 
name="Performance (5)" width="90" x="179" y="34"> <parameter key="main_criterion" value="first"/> <parameter key="accuracy" value="true"/> <parameter key="classification_error" value="false"/> <parameter key="kappa" value="true"/> <parameter key="weighted_mean_recall" value="false"/> <parameter key="weighted_mean_precision" value="false"/> <parameter key="spearman_rho" value="false"/> <parameter key="kendall_tau" value="false"/> <parameter key="absolute_error" value="false"/> <parameter key="relative_error" value="false"/> <parameter key="relative_error_lenient" value="false"/> <parameter key="relative_error_strict" value="false"/> <parameter key="normalized_absolute_error" value="false"/> <parameter key="root_mean_squared_error" value="true"/> <parameter key="root_relative_squared_error" value="false"/> <parameter key="squared_error" value="false"/> <parameter key="correlation" value="false"/> <parameter key="squared_correlation" value="false"/> <parameter key="cross-entropy" value="false"/> <parameter key="margin" value="false"/> <parameter key="soft_margin_loss" value="false"/> <parameter key="logistic_loss" value="false"/> <parameter key="skip_undefined_labels" value="true"/> <parameter key="use_example_weights" value="true"/> <list key="class_weights"/> </operator> <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance" width="90" x="313" y="34"> <parameter key="use_example_weights" value="true"/> </operator> <operator activated="false" class="collect" compatibility="9.1.000" expanded="true" height="68" name="Collect" width="90" x="380" y="187"> <parameter key="unfold" value="false"/> </operator> <connect from_port="model" to_op="Apply Model (3)" to_port="model"/> <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/> <connect 
from_op="Apply Model (3)" from_port="labelled data" to_op="Performance (5)" to_port="labelled data"/> <connect from_op="Performance (5)" from_port="performance" to_op="Performance" to_port="performance"/> <connect from_op="Performance (5)" from_port="example set" to_op="Performance" to_port="labelled data"/> <connect from_op="Performance" from_port="performance" to_port="performance 1"/> <connect from_op="Performance" from_port="example set" to_port="test set results"/> <portSpacing port="source_model" spacing="0"/> <portSpacing port="source_test set" spacing="0"/> <portSpacing port="source_through 1" spacing="0"/> <portSpacing port="sink_test set results" spacing="0"/> <portSpacing port="sink_performance 1" spacing="0"/> <portSpacing port="sink_performance 2" spacing="0"/> </process> </operator> <connect from_op="Generate Data" from_port="output" to_op="Cross Validation (3)" to_port="example set"/> <connect from_op="Cross Validation (3)" from_port="test result set" to_port="result 1"/> <connect from_op="Cross Validation (3)" from_port="performance 1" to_port="result 2"/> <portSpacing port="source_input 1" spacing="0"/> <portSpacing port="sink_result 1" spacing="0"/> <portSpacing port="sink_result 2" spacing="0"/> <portSpacing port="sink_result 3" spacing="0"/> </process> </operator></process>
BR,
Martin
1 -
Hi varunm1,
We just looked at the code, and it's only duplicating rows in the final test set that the test output port returns. So the performance metrics are not affected.
Regards,
Marco
4 -
Thanks @Marco_Boeck for confirming.
0
-
@Marco_Boeck, please advise whether this should be pushed to Product Feedback.
0