"Cross Validation test examples are more than input samples"

varunm1
varunm1 New Altair Community Member
edited November 5 in Community Q&A
Hi,

I am storing cross validation predictions using Write Excel. I see that the number of output samples (570553) from a 5-fold Cross Validation of Gradient Boosted Trees is larger than the number of input samples (316974). I am not sure if I am doing something wrong. Please see the XML below.

<?xml version="1.0" encoding="UTF-8"?><process version="9.1.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.1.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Loads the example set from the repository entry Assistment_Data_Labels (relative repository path below). -->
      <operator activated="true" class="retrieve" compatibility="9.1.000" expanded="true" height="68" name="Retrieve Assistment_Data_Labels" width="90" x="45" y="85">
        <parameter key="repository_entry" value="../../data/AIED_New/Assistment_Data_Labels"/>
      </operator>
      <!-- Copies the retrieved data; outputs 1 to 3 feed the three Select Attributes branches (see the connect list at the end of this process). -->
      <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="124" name="Multiply (6)" width="90" x="179" y="85"/>
      <!-- Attribute subset for the branch whose predictions go to CV_Predictions_GBT_40_Attributes.xlsx.
           The list below names 39 attributes; it is the largest of the three subsets (it is the only one containing NumActions).
           NOTE(review): the "40" in the output file name presumably counts the label as well; confirm. -->
      <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes (3)" width="90" x="380" y="442">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="Ln|Ln-1|NumActions|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrAttempted|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- Cross validation with number_of_folds = 5 for the 39-name attribute subset (input wired from Select Attributes (3)).
           First subprocess trains the model, second subprocess applies it to the test fold and measures performance. -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="166" name="Cross Validation (4)" width="90" x="581" y="442">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="5"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <!-- Training subprocess: H2O Gradient Boosted Trees (20 trees, maximal depth 20, learning rate 0.1) fitted on the training set port. -->
          <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees (3)" width="90" x="112" y="34">
            <parameter key="number_of_trees" value="20"/>
            <parameter key="reproducible" value="false"/>
            <parameter key="maximum_number_of_threads" value="4"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="maximal_depth" value="20"/>
            <parameter key="min_rows" value="10.0"/>
            <parameter key="min_split_improvement" value="0.0"/>
            <parameter key="number_of_bins" value="20"/>
            <parameter key="learning_rate" value="0.1"/>
            <parameter key="sample_rate" value="1.0"/>
            <parameter key="distribution" value="AUTO"/>
            <parameter key="early_stopping" value="false"/>
            <parameter key="stopping_rounds" value="1"/>
            <parameter key="stopping_metric" value="AUTO"/>
            <parameter key="stopping_tolerance" value="0.001"/>
            <parameter key="max_runtime_seconds" value="0"/>
            <list key="expert_parameters"/>
          </operator>
          <connect from_port="training set" to_op="Gradient Boosted Trees (3)" to_port="training set"/>
          <connect from_op="Gradient Boosted Trees (3)" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <!-- Testing subprocess: Apply Model output is copied by Multiply (3) and sent to two Performance operators in parallel.
               NOTE(review): replies in this thread link the inflated row count of the test result set to this layout
               (a Multiply feeding two parallel Performance operators); connecting the Performance operators in series is the suggested workaround. -->
          <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (4)" width="90" x="45" y="34">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="103" name="Multiply (3)" width="90" x="45" y="136"/>
          <!-- Generic performance operator; its "example set" output is what reaches the "test set results" sink below. -->
          <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance (3)" width="90" x="246" y="340">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <!-- Classification performance with accuracy, kappa and root_mean_squared_error enabled; all other criteria are switched off. -->
          <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (4)" width="90" x="246" y="238">
            <parameter key="main_criterion" value="first"/>
            <parameter key="accuracy" value="true"/>
            <parameter key="classification_error" value="false"/>
            <parameter key="kappa" value="true"/>
            <parameter key="weighted_mean_recall" value="false"/>
            <parameter key="weighted_mean_precision" value="false"/>
            <parameter key="spearman_rho" value="false"/>
            <parameter key="kendall_tau" value="false"/>
            <parameter key="absolute_error" value="false"/>
            <parameter key="relative_error" value="false"/>
            <parameter key="relative_error_lenient" value="false"/>
            <parameter key="relative_error_strict" value="false"/>
            <parameter key="normalized_absolute_error" value="false"/>
            <parameter key="root_mean_squared_error" value="true"/>
            <parameter key="root_relative_squared_error" value="false"/>
            <parameter key="squared_error" value="false"/>
            <parameter key="correlation" value="false"/>
            <parameter key="squared_correlation" value="false"/>
            <parameter key="cross-entropy" value="false"/>
            <parameter key="margin" value="false"/>
            <parameter key="soft_margin_loss" value="false"/>
            <parameter key="logistic_loss" value="false"/>
            <parameter key="skip_undefined_labels" value="true"/>
            <parameter key="use_example_weights" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model (4)" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model (4)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (4)" from_port="labelled data" to_op="Multiply (3)" to_port="input"/>
          <connect from_op="Multiply (3)" from_port="output 1" to_op="Performance (4)" to_port="labelled data"/>
          <connect from_op="Multiply (3)" from_port="output 2" to_op="Performance (3)" to_port="labelled data"/>
          <connect from_op="Performance (3)" from_port="performance" to_port="performance 2"/>
          <connect from_op="Performance (3)" from_port="example set" to_port="test set results"/>
          <connect from_op="Performance (4)" from_port="performance" to_port="performance 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
          <portSpacing port="sink_performance 3" spacing="0"/>
        </process>
      </operator>
      <!-- Writes the "test result set" output of Cross Validation (4) to an .xlsx file at an absolute Windows path. -->
      <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel (3)" width="90" x="707" y="595">
        <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_40_Attributes.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="sheet_name" value="RapidMiner Data"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
      </operator>
      <!-- Attribute subset for the branch whose predictions go to CV_Predictions_GBT_38_Attributes.xlsx.
           The list below names 37 attributes; unlike the other two subsets it contains neither NumActions nor totalFrAttempted.
           NOTE(review): the "38" in the output file name presumably counts the label as well; confirm. -->
      <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes" width="90" x="380" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="Ln|Ln-1|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- Cross validation with number_of_folds = 5 for the 37-name attribute subset (input wired from Select Attributes).
           First subprocess trains the model, second subprocess applies it to the test fold and measures performance. -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="187" name="Cross Validation (3)" width="90" x="581" y="34">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="5"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <!-- Training subprocess: H2O Gradient Boosted Trees (20 trees, maximal depth 20, learning rate 0.1) fitted on the training set port. -->
          <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34">
            <parameter key="number_of_trees" value="20"/>
            <parameter key="reproducible" value="false"/>
            <parameter key="maximum_number_of_threads" value="4"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="maximal_depth" value="20"/>
            <parameter key="min_rows" value="10.0"/>
            <parameter key="min_split_improvement" value="0.0"/>
            <parameter key="number_of_bins" value="20"/>
            <parameter key="learning_rate" value="0.1"/>
            <parameter key="sample_rate" value="1.0"/>
            <parameter key="distribution" value="AUTO"/>
            <parameter key="early_stopping" value="false"/>
            <parameter key="stopping_rounds" value="1"/>
            <parameter key="stopping_metric" value="AUTO"/>
            <parameter key="stopping_tolerance" value="0.001"/>
            <parameter key="max_runtime_seconds" value="0"/>
            <list key="expert_parameters"/>
          </operator>
          <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/>
          <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <!-- Testing subprocess: Apply Model output is copied by Multiply (4) and sent to two Performance operators in parallel.
               Only outputs 2 and 3 of Multiply (4) are connected (output 1 is unused), and only the "performance 1" and
               "performance 3" sinks are fed; nothing in this subprocess is connected to "performance 2".
               NOTE(review): replies in this thread link the inflated row count of the test result set to this layout
               (a Multiply feeding two parallel Performance operators); connecting the Performance operators in series is the suggested workaround. -->
          <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="124" name="Multiply (4)" width="90" x="45" y="136"/>
          <!-- Generic performance operator; its "example set" output is what reaches the "test set results" sink below. -->
          <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance" width="90" x="179" y="340">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <!-- Classification performance with accuracy, kappa and root_mean_squared_error enabled; all other criteria are switched off. -->
          <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (5)" width="90" x="246" y="238">
            <parameter key="main_criterion" value="first"/>
            <parameter key="accuracy" value="true"/>
            <parameter key="classification_error" value="false"/>
            <parameter key="kappa" value="true"/>
            <parameter key="weighted_mean_recall" value="false"/>
            <parameter key="weighted_mean_precision" value="false"/>
            <parameter key="spearman_rho" value="false"/>
            <parameter key="kendall_tau" value="false"/>
            <parameter key="absolute_error" value="false"/>
            <parameter key="relative_error" value="false"/>
            <parameter key="relative_error_lenient" value="false"/>
            <parameter key="relative_error_strict" value="false"/>
            <parameter key="normalized_absolute_error" value="false"/>
            <parameter key="root_mean_squared_error" value="true"/>
            <parameter key="root_relative_squared_error" value="false"/>
            <parameter key="squared_error" value="false"/>
            <parameter key="correlation" value="false"/>
            <parameter key="squared_correlation" value="false"/>
            <parameter key="cross-entropy" value="false"/>
            <parameter key="margin" value="false"/>
            <parameter key="soft_margin_loss" value="false"/>
            <parameter key="logistic_loss" value="false"/>
            <parameter key="skip_undefined_labels" value="true"/>
            <parameter key="use_example_weights" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model (3)" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Multiply (4)" to_port="input"/>
          <connect from_op="Multiply (4)" from_port="output 2" to_op="Performance (5)" to_port="labelled data"/>
          <connect from_op="Multiply (4)" from_port="output 3" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="performance 3"/>
          <connect from_op="Performance" from_port="example set" to_port="test set results"/>
          <connect from_op="Performance (5)" from_port="performance" to_port="performance 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
          <portSpacing port="sink_performance 3" spacing="0"/>
          <portSpacing port="sink_performance 4" spacing="0"/>
        </process>
      </operator>
      <!-- Writes the "test result set" output of Cross Validation (3) to an .xlsx file at an absolute Windows path. -->
      <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel" width="90" x="715" y="187">
        <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_38_Attributes.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="sheet_name" value="RapidMiner Data"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
      </operator>
      <!-- Attribute subset for the branch whose predictions go to CV_Predictions_GBT_39_Attributes.xlsx.
           The list below names 38 attributes; it contains totalFrAttempted but not NumActions.
           NOTE(review): the "39" in the output file name presumably counts the label as well; confirm. -->
      <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="380" y="238">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="Ln|Ln-1|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrAttempted|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- Cross validation with number_of_folds = 5 for the 38-name attribute subset (input wired from Select Attributes (2)).
           First subprocess trains the model, second subprocess applies it to the test fold and measures performance. -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="166" name="Cross Validation (2)" width="90" x="581" y="238">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="5"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <!-- Training subprocess: H2O Gradient Boosted Trees (20 trees, maximal depth 20, learning rate 0.1) fitted on the training set port. -->
          <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees (2)" width="90" x="112" y="34">
            <parameter key="number_of_trees" value="20"/>
            <parameter key="reproducible" value="false"/>
            <parameter key="maximum_number_of_threads" value="4"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="maximal_depth" value="20"/>
            <parameter key="min_rows" value="10.0"/>
            <parameter key="min_split_improvement" value="0.0"/>
            <parameter key="number_of_bins" value="20"/>
            <parameter key="learning_rate" value="0.1"/>
            <parameter key="sample_rate" value="1.0"/>
            <parameter key="distribution" value="AUTO"/>
            <parameter key="early_stopping" value="false"/>
            <parameter key="stopping_rounds" value="1"/>
            <parameter key="stopping_metric" value="AUTO"/>
            <parameter key="stopping_tolerance" value="0.001"/>
            <parameter key="max_runtime_seconds" value="0"/>
            <list key="expert_parameters"/>
          </operator>
          <connect from_port="training set" to_op="Gradient Boosted Trees (2)" to_port="training set"/>
          <connect from_op="Gradient Boosted Trees (2)" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <!-- Testing subprocess: Apply Model output is copied by Multiply (2) and sent to two Performance operators in parallel.
               Here the generic Performance (8) feeds "performance 1" and the classification Performance (2) feeds "performance 2".
               NOTE(review): replies in this thread link the inflated row count of the test result set to this layout
               (a Multiply feeding two parallel Performance operators); connecting the Performance operators in series is the suggested workaround. -->
          <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (2)" width="90" x="45" y="34">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="103" name="Multiply (2)" width="90" x="45" y="136"/>
          <!-- Generic performance operator; its "example set" output is what reaches the "test set results" sink below. -->
          <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance (8)" width="90" x="246" y="340">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <!-- Classification performance with accuracy, kappa and root_mean_squared_error enabled; all other criteria are switched off. -->
          <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (2)" width="90" x="246" y="85">
            <parameter key="main_criterion" value="first"/>
            <parameter key="accuracy" value="true"/>
            <parameter key="classification_error" value="false"/>
            <parameter key="kappa" value="true"/>
            <parameter key="weighted_mean_recall" value="false"/>
            <parameter key="weighted_mean_precision" value="false"/>
            <parameter key="spearman_rho" value="false"/>
            <parameter key="kendall_tau" value="false"/>
            <parameter key="absolute_error" value="false"/>
            <parameter key="relative_error" value="false"/>
            <parameter key="relative_error_lenient" value="false"/>
            <parameter key="relative_error_strict" value="false"/>
            <parameter key="normalized_absolute_error" value="false"/>
            <parameter key="root_mean_squared_error" value="true"/>
            <parameter key="root_relative_squared_error" value="false"/>
            <parameter key="squared_error" value="false"/>
            <parameter key="correlation" value="false"/>
            <parameter key="squared_correlation" value="false"/>
            <parameter key="cross-entropy" value="false"/>
            <parameter key="margin" value="false"/>
            <parameter key="soft_margin_loss" value="false"/>
            <parameter key="logistic_loss" value="false"/>
            <parameter key="skip_undefined_labels" value="true"/>
            <parameter key="use_example_weights" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="Performance (2)" to_port="labelled data"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="Performance (8)" to_port="labelled data"/>
          <connect from_op="Performance (8)" from_port="performance" to_port="performance 1"/>
          <connect from_op="Performance (8)" from_port="example set" to_port="test set results"/>
          <connect from_op="Performance (2)" from_port="performance" to_port="performance 2"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
          <portSpacing port="sink_performance 3" spacing="0"/>
        </process>
      </operator>
      <!-- Writes the "test result set" output of Cross Validation (2) to an .xlsx file at an absolute Windows path. -->
      <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel (2)" width="90" x="707" y="340">
        <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_39_Attributes.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="sheet_name" value="RapidMiner Data"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
      </operator>
      <!-- Top-level wiring: the retrieved data is copied three times, one copy per attribute subset. -->
      <connect from_op="Retrieve Assistment_Data_Labels" from_port="output" to_op="Multiply (6)" to_port="input"/>
      <connect from_op="Multiply (6)" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Multiply (6)" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Multiply (6)" from_port="output 3" to_op="Select Attributes (3)" to_port="example set input"/>
      <!-- Branch: Select Attributes (3), then Cross Validation (4), then Write Excel (3); performances go to results 5 and 6. -->
      <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Cross Validation (4)" to_port="example set"/>
      <connect from_op="Cross Validation (4)" from_port="test result set" to_op="Write Excel (3)" to_port="input"/>
      <connect from_op="Cross Validation (4)" from_port="performance 1" to_port="result 5"/>
      <connect from_op="Cross Validation (4)" from_port="performance 2" to_port="result 6"/>
      <connect from_op="Write Excel (3)" from_port="through" to_port="result 9"/>
      <!-- Branch: Select Attributes, then Cross Validation (3), then Write Excel; performances go to results 1, 2 and 10.
           Note: inside Cross Validation (3) only the "performance 1" and "performance 3" sinks are fed, so the
           "performance 2" connection below has no producer in that operator's testing subprocess. -->
      <connect from_op="Select Attributes" from_port="example set output" to_op="Cross Validation (3)" to_port="example set"/>
      <connect from_op="Cross Validation (3)" from_port="test result set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Cross Validation (3)" from_port="performance 1" to_port="result 1"/>
      <connect from_op="Cross Validation (3)" from_port="performance 2" to_port="result 2"/>
      <connect from_op="Cross Validation (3)" from_port="performance 3" to_port="result 10"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 7"/>
      <!-- Branch: Select Attributes (2), then Cross Validation (2), then Write Excel (2); performances go to results 3 and 4. -->
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Cross Validation (2)" to_port="example set"/>
      <connect from_op="Cross Validation (2)" from_port="test result set" to_op="Write Excel (2)" to_port="input"/>
      <connect from_op="Cross Validation (2)" from_port="performance 1" to_port="result 3"/>
      <connect from_op="Cross Validation (2)" from_port="performance 2" to_port="result 4"/>
      <connect from_op="Write Excel (2)" from_port="through" to_port="result 8"/>
      <!-- Port spacing entries for the process source and result sinks; all are 0.
           NOTE(review): presumably GUI layout hints only; confirm before relying on that. -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <portSpacing port="sink_result 6" spacing="0"/>
      <portSpacing port="sink_result 7" spacing="0"/>
      <portSpacing port="sink_result 8" spacing="0"/>
      <portSpacing port="sink_result 9" spacing="0"/>
      <portSpacing port="sink_result 10" spacing="0"/>
      <portSpacing port="sink_result 11" spacing="0"/>
    </process>
  </operator>
</process>

Thanks,
Varun

Answers

  • varunm1
    varunm1 New Altair Community Member
    edited January 2019
    Update on this. I see that there are duplicate columns in Excel. Once I removed the duplicates, it gave me the correct number of samples, but I am not sure why duplicate values are appearing in Excel. I see that the confusion matrix in the performance has the correct number of samples. @mschmitz and @David_A, any suggestions?
  • lionelderkrikor
    lionelderkrikor New Altair Community Member
    Hi @varunm1,

    Interesting and amusing problem !

    I think it's linked to the 2 Performance operators you are using in the CV operator. RM performs a 5-fold CV for each Performance operator...
    but when you connect the Performance operators in series, you will retrieve the original number of rows of your dataset (so it's not a priori linked to the Write Excel operator).

    However, what I don't understand is why you don't obtain [2 * number of rows] but (10-1)/5 = 9/5 * [number of rows] (the second CV is not complete...).

    Regards,

    Lionel

  • varunm1
    varunm1 New Altair Community Member
    edited January 2019
    Hi @lionelderkrikor,

    That's what confuses me as well. But when I check the confusion matrix, it gives correct sample numbers (316974). Only while writing to excel or csv this is happening. But it's giving both performance results.



    Thanks,
    Varun
  • varunm1
    varunm1 New Altair Community Member
    @mschmitz any suggestion on this issue? Thanks
  • MartinLiebig
    MartinLiebig
    Altair Employee
    I can reproduce and this looks like a bug, which is somewhat connected to the multiply in your apply model. I'll create a ticket on this.

    As a workaround you can just append the performance vectors like this:
    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      RapidMiner process (version 9.1.000) posted in this thread.
      Pipeline: Generate Data (100 examples, 5 attributes) feeds a 5-fold
      Cross Validation whose training subprocess is an H2O Gradient Boosted
      Trees learner. The Cross Validation "test result set" port is delivered
      to "result 1" and its "performance 1" port to "result 2".
    -->
    <process version="9.1.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.1.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Synthetic input data: 100 examples, 5 attributes. -->
          <operator activated="true" class="generate_data" compatibility="9.1.000" expanded="true" height="68" name="Generate Data" width="90" x="45" y="238">
            <parameter key="target_function" value="one third classification"/>
            <parameter key="number_examples" value="100"/>
            <parameter key="number_of_attributes" value="5"/>
            <parameter key="attributes_lower_bound" value="-10.0"/>
            <parameter key="attributes_upper_bound" value="10.0"/>
            <parameter key="gaussian_standard_deviation" value="10.0"/>
            <parameter key="largest_radius" value="10.0"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <!-- 5-fold cross validation with parallel execution enabled. -->
          <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="145" name="Cross Validation (3)" width="90" x="313" y="238">
            <parameter key="split_on_batch_attribute" value="false"/>
            <parameter key="leave_one_out" value="false"/>
            <parameter key="number_of_folds" value="5"/>
            <parameter key="sampling_type" value="automatic"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="enable_parallel_execution" value="true"/>
            <!-- Training subprocess: fits the Gradient Boosted Trees model on the training set. -->
            <process expanded="true">
              <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34">
                <parameter key="number_of_trees" value="20"/>
                <parameter key="reproducible" value="false"/>
                <parameter key="maximum_number_of_threads" value="4"/>
                <parameter key="use_local_random_seed" value="false"/>
                <parameter key="local_random_seed" value="1992"/>
                <parameter key="maximal_depth" value="20"/>
                <parameter key="min_rows" value="10.0"/>
                <parameter key="min_split_improvement" value="0.0"/>
                <parameter key="number_of_bins" value="20"/>
                <parameter key="learning_rate" value="0.1"/>
                <parameter key="sample_rate" value="1.0"/>
                <parameter key="distribution" value="AUTO"/>
                <parameter key="early_stopping" value="false"/>
                <parameter key="stopping_rounds" value="1"/>
                <parameter key="stopping_metric" value="AUTO"/>
                <parameter key="stopping_tolerance" value="0.001"/>
                <parameter key="max_runtime_seconds" value="0"/>
                <list key="expert_parameters"/>
              </operator>
              <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/>
              <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/>
              <portSpacing port="source_training set" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <!-- Testing subprocess: applies the model to the test set and computes performance. -->
            <process expanded="true">
              <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34">
                <list key="application_parameters"/>
                <parameter key="create_view" value="false"/>
              </operator>
              <!-- Deactivated operator; it has no connections in this subprocess. -->
              <operator activated="false" class="multiply" compatibility="9.1.000" expanded="true" height="68" name="Multiply (4)" width="90" x="45" y="136"/>
              <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (5)" width="90" x="179" y="34">
                <parameter key="main_criterion" value="first"/>
                <parameter key="accuracy" value="true"/>
                <parameter key="classification_error" value="false"/>
                <parameter key="kappa" value="true"/>
                <parameter key="weighted_mean_recall" value="false"/>
                <parameter key="weighted_mean_precision" value="false"/>
                <parameter key="spearman_rho" value="false"/>
                <parameter key="kendall_tau" value="false"/>
                <parameter key="absolute_error" value="false"/>
                <parameter key="relative_error" value="false"/>
                <parameter key="relative_error_lenient" value="false"/>
                <parameter key="relative_error_strict" value="false"/>
                <parameter key="normalized_absolute_error" value="false"/>
                <parameter key="root_mean_squared_error" value="true"/>
                <parameter key="root_relative_squared_error" value="false"/>
                <parameter key="squared_error" value="false"/>
                <parameter key="correlation" value="false"/>
                <parameter key="squared_correlation" value="false"/>
                <parameter key="cross-entropy" value="false"/>
                <parameter key="margin" value="false"/>
                <parameter key="soft_margin_loss" value="false"/>
                <parameter key="logistic_loss" value="false"/>
                <parameter key="skip_undefined_labels" value="true"/>
                <parameter key="use_example_weights" value="true"/>
                <list key="class_weights"/>
              </operator>
              <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance" width="90" x="313" y="34">
                <parameter key="use_example_weights" value="true"/>
              </operator>
              <!-- Deactivated operator; it has no connections in this subprocess. -->
              <operator activated="false" class="collect" compatibility="9.1.000" expanded="true" height="68" name="Collect" width="90" x="380" y="187">
                <parameter key="unfold" value="false"/>
              </operator>
              <connect from_port="model" to_op="Apply Model (3)" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/>
              <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Performance (5)" to_port="labelled data"/>
              <connect from_op="Performance (5)" from_port="performance" to_op="Performance" to_port="performance"/>
              <connect from_op="Performance (5)" from_port="example set" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
              <connect from_op="Performance" from_port="example set" to_port="test set results"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_test set results" spacing="0"/>
              <portSpacing port="sink_performance 1" spacing="0"/>
              <portSpacing port="sink_performance 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Generate Data" from_port="output" to_op="Cross Validation (3)" to_port="example set"/>
          <connect from_op="Cross Validation (3)" from_port="test result set" to_port="result 1"/>
          <connect from_op="Cross Validation (3)" from_port="performance 1" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>

    BR,
    Martin

  • varunm1
    varunm1 New Altair Community Member
    Thanks @mschmitz
  • varunm1
    varunm1 New Altair Community Member
    edited February 2019
    @mschmitz one quick question. Does this bug impact performance metrics (accuracy etc.) by any chance? I see the confusion matrix has the correct number of samples but just want to cross check.
  • Marco_Boeck
    Marco_Boeck New Altair Community Member
    Hi varunm1,

    We just looked at the code, and it's only duplicating rows in the final test set that the test output port returns. So the performance metrics are not affected.

    Regards,
    Marco
  • varunm1
    varunm1 New Altair Community Member
    Thanks @Marco_Boeck for confirming.
  • sgenzer
    sgenzer
    Altair Employee
    @Marco_Boeck please advise if this should get pushed to Product Feedback.
  • jczogalla
    jczogalla New Altair Community Member
    @sgenzer you can push it there, but it will be fixed in 9.2.