🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

"Cross Validation test examples are more than input samples"

User: "varunm1"
New Altair Community Member
Updated by Jocelyn
Hi,

I am storing cross-validation predictions using Write Excel. I see that the 5-fold Cross Validation for Gradient Boosted Trees outputs more samples (570553) than there are input samples (316974). I am not sure if I am doing something wrong. Please see the XML below.

<?xml version="1.0" encoding="UTF-8"?><process version="9.1.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.1.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Loads the input example set from the relative repository entry ../../data/AIED_New/Assistment_Data_Labels. -->
      <operator activated="true" class="retrieve" compatibility="9.1.000" expanded="true" height="68" name="Retrieve Assistment_Data_Labels" width="90" x="45" y="85">
        <parameter key="repository_entry" value="../../data/AIED_New/Assistment_Data_Labels"/>
      </operator>
      <!-- Fans the retrieved data out to three outputs; the connect elements of this process route them to Select Attributes, Select Attributes (2) and Select Attributes (3). -->
      <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="124" name="Multiply (6)" width="90" x="179" y="85"/>
      <!-- Third branch: keeps the attribute subset named in "attributes". This list includes NumActions and totalFrAttempted; the list in Select Attributes omits both and the list in Select Attributes (2) omits NumActions. -->
      <!-- NOTE(review): include_special_attributes=false presumably means special attributes (e.g. the label) are not subject to this filter; confirm against the operator documentation. -->
      <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes (3)" width="90" x="380" y="442">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="Ln|Ln-1|NumActions|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrAttempted|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- 5-fold cross validation (number_of_folds=5, sampling_type=automatic, parallel execution enabled) over the output of Select Attributes (3); its test result set is consumed by Write Excel (3). -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="166" name="Cross Validation (4)" width="90" x="581" y="442">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="5"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <!-- Training subprocess: fits an H2O Gradient Boosted Trees model (20 trees, maximal depth 20, learning rate 0.1) on the training set port and delivers it on the model port. -->
        <process expanded="true">
          <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees (3)" width="90" x="112" y="34">
            <parameter key="number_of_trees" value="20"/>
            <parameter key="reproducible" value="false"/>
            <parameter key="maximum_number_of_threads" value="4"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="maximal_depth" value="20"/>
            <parameter key="min_rows" value="10.0"/>
            <parameter key="min_split_improvement" value="0.0"/>
            <parameter key="number_of_bins" value="20"/>
            <parameter key="learning_rate" value="0.1"/>
            <parameter key="sample_rate" value="1.0"/>
            <parameter key="distribution" value="AUTO"/>
            <parameter key="early_stopping" value="false"/>
            <parameter key="stopping_rounds" value="1"/>
            <parameter key="stopping_metric" value="AUTO"/>
            <parameter key="stopping_tolerance" value="0.001"/>
            <parameter key="max_runtime_seconds" value="0"/>
            <list key="expert_parameters"/>
          </operator>
          <connect from_port="training set" to_op="Gradient Boosted Trees (3)" to_port="training set"/>
          <connect from_op="Gradient Boosted Trees (3)" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: Apply Model (4) scores the test set, Multiply (3) copies the labelled data to Performance (4) (accuracy, kappa, RMSE) and Performance (3); Performance (3) also forwards its example set to the "test set results" port. -->
        <!-- NOTE(review): the accompanying discussion reports that this Multiply-based fan-out yields more rows in the test result set than the input contains, and a reply suggests chaining the two Performance operators in series instead. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (4)" width="90" x="45" y="34">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="103" name="Multiply (3)" width="90" x="45" y="136"/>
          <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance (3)" width="90" x="246" y="340">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (4)" width="90" x="246" y="238">
            <parameter key="main_criterion" value="first"/>
            <parameter key="accuracy" value="true"/>
            <parameter key="classification_error" value="false"/>
            <parameter key="kappa" value="true"/>
            <parameter key="weighted_mean_recall" value="false"/>
            <parameter key="weighted_mean_precision" value="false"/>
            <parameter key="spearman_rho" value="false"/>
            <parameter key="kendall_tau" value="false"/>
            <parameter key="absolute_error" value="false"/>
            <parameter key="relative_error" value="false"/>
            <parameter key="relative_error_lenient" value="false"/>
            <parameter key="relative_error_strict" value="false"/>
            <parameter key="normalized_absolute_error" value="false"/>
            <parameter key="root_mean_squared_error" value="true"/>
            <parameter key="root_relative_squared_error" value="false"/>
            <parameter key="squared_error" value="false"/>
            <parameter key="correlation" value="false"/>
            <parameter key="squared_correlation" value="false"/>
            <parameter key="cross-entropy" value="false"/>
            <parameter key="margin" value="false"/>
            <parameter key="soft_margin_loss" value="false"/>
            <parameter key="logistic_loss" value="false"/>
            <parameter key="skip_undefined_labels" value="true"/>
            <parameter key="use_example_weights" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model (4)" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model (4)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (4)" from_port="labelled data" to_op="Multiply (3)" to_port="input"/>
          <connect from_op="Multiply (3)" from_port="output 1" to_op="Performance (4)" to_port="labelled data"/>
          <connect from_op="Multiply (3)" from_port="output 2" to_op="Performance (3)" to_port="labelled data"/>
          <connect from_op="Performance (3)" from_port="performance" to_port="performance 2"/>
          <connect from_op="Performance (3)" from_port="example set" to_port="test set results"/>
          <connect from_op="Performance (4)" from_port="performance" to_port="performance 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
          <portSpacing port="sink_performance 3" spacing="0"/>
        </process>
      </operator>
      <!-- Writes the test result set of Cross Validation (4) to a fixed absolute path on drive E: as an xlsx sheet named "RapidMiner Data"; numbers are formatted with "#.0". -->
      <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel (3)" width="90" x="707" y="595">
        <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_40_Attributes.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="sheet_name" value="RapidMiner Data"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
      </operator>
      <!-- First branch: keeps the attribute subset named in "attributes". Compared with the list in Select Attributes (3), this list omits NumActions and totalFrAttempted. -->
      <!-- NOTE(review): include_special_attributes=false presumably means special attributes (e.g. the label) are not subject to this filter; confirm against the operator documentation. -->
      <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes" width="90" x="380" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="Ln|Ln-1|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- 5-fold cross validation (number_of_folds=5, sampling_type=automatic, parallel execution enabled) over the output of Select Attributes; its test result set is consumed by Write Excel. -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="187" name="Cross Validation (3)" width="90" x="581" y="34">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="5"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <!-- Training subprocess: fits an H2O Gradient Boosted Trees model (20 trees, maximal depth 20, learning rate 0.1) on the training set port and delivers it on the model port. -->
        <process expanded="true">
          <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34">
            <parameter key="number_of_trees" value="20"/>
            <parameter key="reproducible" value="false"/>
            <parameter key="maximum_number_of_threads" value="4"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="maximal_depth" value="20"/>
            <parameter key="min_rows" value="10.0"/>
            <parameter key="min_split_improvement" value="0.0"/>
            <parameter key="number_of_bins" value="20"/>
            <parameter key="learning_rate" value="0.1"/>
            <parameter key="sample_rate" value="1.0"/>
            <parameter key="distribution" value="AUTO"/>
            <parameter key="early_stopping" value="false"/>
            <parameter key="stopping_rounds" value="1"/>
            <parameter key="stopping_metric" value="AUTO"/>
            <parameter key="stopping_tolerance" value="0.001"/>
            <parameter key="max_runtime_seconds" value="0"/>
            <list key="expert_parameters"/>
          </operator>
          <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/>
          <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: Apply Model (3) scores the test set and Multiply (4) copies the labelled data via its outputs 2 and 3 (output 1 is not connected) to Performance (5) and Performance; Performance also forwards its example set to the "test set results" port. -->
        <!-- NOTE(review): only the sink ports "performance 1" and "performance 3" are fed here; "performance 2" has no incoming connection. The accompanying discussion associates the Multiply fan-out with the inflated row count of the test result set. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="124" name="Multiply (4)" width="90" x="45" y="136"/>
          <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance" width="90" x="179" y="340">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (5)" width="90" x="246" y="238">
            <parameter key="main_criterion" value="first"/>
            <parameter key="accuracy" value="true"/>
            <parameter key="classification_error" value="false"/>
            <parameter key="kappa" value="true"/>
            <parameter key="weighted_mean_recall" value="false"/>
            <parameter key="weighted_mean_precision" value="false"/>
            <parameter key="spearman_rho" value="false"/>
            <parameter key="kendall_tau" value="false"/>
            <parameter key="absolute_error" value="false"/>
            <parameter key="relative_error" value="false"/>
            <parameter key="relative_error_lenient" value="false"/>
            <parameter key="relative_error_strict" value="false"/>
            <parameter key="normalized_absolute_error" value="false"/>
            <parameter key="root_mean_squared_error" value="true"/>
            <parameter key="root_relative_squared_error" value="false"/>
            <parameter key="squared_error" value="false"/>
            <parameter key="correlation" value="false"/>
            <parameter key="squared_correlation" value="false"/>
            <parameter key="cross-entropy" value="false"/>
            <parameter key="margin" value="false"/>
            <parameter key="soft_margin_loss" value="false"/>
            <parameter key="logistic_loss" value="false"/>
            <parameter key="skip_undefined_labels" value="true"/>
            <parameter key="use_example_weights" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model (3)" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Multiply (4)" to_port="input"/>
          <connect from_op="Multiply (4)" from_port="output 2" to_op="Performance (5)" to_port="labelled data"/>
          <connect from_op="Multiply (4)" from_port="output 3" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="performance 3"/>
          <connect from_op="Performance" from_port="example set" to_port="test set results"/>
          <connect from_op="Performance (5)" from_port="performance" to_port="performance 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
          <portSpacing port="sink_performance 3" spacing="0"/>
          <portSpacing port="sink_performance 4" spacing="0"/>
        </process>
      </operator>
      <!-- Writes the test result set of Cross Validation (3) to a fixed absolute path on drive E: as an xlsx sheet named "RapidMiner Data"; numbers are formatted with "#.0". -->
      <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel" width="90" x="715" y="187">
        <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_38_Attributes.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="sheet_name" value="RapidMiner Data"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
      </operator>
      <!-- Second branch: keeps the attribute subset named in "attributes". Compared with the list in Select Attributes (3), this list omits NumActions but keeps totalFrAttempted. -->
      <!-- NOTE(review): include_special_attributes=false presumably means special attributes (e.g. the label) are not subject to this filter; confirm against the operator documentation. -->
      <operator activated="true" class="select_attributes" compatibility="9.1.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="380" y="238">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="Ln|Ln-1|RES_BORED|RES_CONFUSED|RES_FRUSTRATED|RES_GAMING|RES_OFFTASK|attemptCount|correct|endsWithScaffolding|frIsHelpRequest|frIsHelpRequestScaffolding|frPast5HelpRequest|frPast5WrongCount|frPast8HelpRequest|frPast8WrongCount|frTotalSkillOpportunitiesScaffolding|hint|hintCount|hintTotal|manywrong|original|past8BottomOut|problemType|scaffold|skill|sumRight|sumTimePerSkill|timeGreater10SecAndNextActionRight|timeSinceSkill|timeTaken|totalFrAttempted|totalFrPastWrongCount|totalFrPercentPastWrong|totalFrSkillOpportunities|totalFrSkillOpportunitiesByScaffolding|totalFrTimeOnSkill|totalTimeByPercentCorrectForskill"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- 5-fold cross validation (number_of_folds=5, sampling_type=automatic, parallel execution enabled) over the output of Select Attributes (2); its test result set is consumed by Write Excel (2). -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="166" name="Cross Validation (2)" width="90" x="581" y="238">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="5"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <!-- Training subprocess: fits an H2O Gradient Boosted Trees model (20 trees, maximal depth 20, learning rate 0.1) on the training set port and delivers it on the model port. -->
        <process expanded="true">
          <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees (2)" width="90" x="112" y="34">
            <parameter key="number_of_trees" value="20"/>
            <parameter key="reproducible" value="false"/>
            <parameter key="maximum_number_of_threads" value="4"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="maximal_depth" value="20"/>
            <parameter key="min_rows" value="10.0"/>
            <parameter key="min_split_improvement" value="0.0"/>
            <parameter key="number_of_bins" value="20"/>
            <parameter key="learning_rate" value="0.1"/>
            <parameter key="sample_rate" value="1.0"/>
            <parameter key="distribution" value="AUTO"/>
            <parameter key="early_stopping" value="false"/>
            <parameter key="stopping_rounds" value="1"/>
            <parameter key="stopping_metric" value="AUTO"/>
            <parameter key="stopping_tolerance" value="0.001"/>
            <parameter key="max_runtime_seconds" value="0"/>
            <list key="expert_parameters"/>
          </operator>
          <connect from_port="training set" to_op="Gradient Boosted Trees (2)" to_port="training set"/>
          <connect from_op="Gradient Boosted Trees (2)" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: Apply Model (2) scores the test set, Multiply (2) copies the labelled data to Performance (2) (accuracy, kappa, RMSE) and Performance (8); Performance (8) feeds "performance 1" and also forwards its example set to the "test set results" port, while Performance (2) feeds "performance 2". -->
        <!-- NOTE(review): the accompanying discussion associates this Multiply-based fan-out with the inflated row count of the test result set and suggests chaining the Performance operators in series instead. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (2)" width="90" x="45" y="34">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.1.000" expanded="true" height="103" name="Multiply (2)" width="90" x="45" y="136"/>
          <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance (8)" width="90" x="246" y="340">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (2)" width="90" x="246" y="85">
            <parameter key="main_criterion" value="first"/>
            <parameter key="accuracy" value="true"/>
            <parameter key="classification_error" value="false"/>
            <parameter key="kappa" value="true"/>
            <parameter key="weighted_mean_recall" value="false"/>
            <parameter key="weighted_mean_precision" value="false"/>
            <parameter key="spearman_rho" value="false"/>
            <parameter key="kendall_tau" value="false"/>
            <parameter key="absolute_error" value="false"/>
            <parameter key="relative_error" value="false"/>
            <parameter key="relative_error_lenient" value="false"/>
            <parameter key="relative_error_strict" value="false"/>
            <parameter key="normalized_absolute_error" value="false"/>
            <parameter key="root_mean_squared_error" value="true"/>
            <parameter key="root_relative_squared_error" value="false"/>
            <parameter key="squared_error" value="false"/>
            <parameter key="correlation" value="false"/>
            <parameter key="squared_correlation" value="false"/>
            <parameter key="cross-entropy" value="false"/>
            <parameter key="margin" value="false"/>
            <parameter key="soft_margin_loss" value="false"/>
            <parameter key="logistic_loss" value="false"/>
            <parameter key="skip_undefined_labels" value="true"/>
            <parameter key="use_example_weights" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="Performance (2)" to_port="labelled data"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="Performance (8)" to_port="labelled data"/>
          <connect from_op="Performance (8)" from_port="performance" to_port="performance 1"/>
          <connect from_op="Performance (8)" from_port="example set" to_port="test set results"/>
          <connect from_op="Performance (2)" from_port="performance" to_port="performance 2"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
          <portSpacing port="sink_performance 3" spacing="0"/>
        </process>
      </operator>
      <!-- Writes the test result set of Cross Validation (2) to a fixed absolute path on drive E: as an xlsx sheet named "RapidMiner Data"; numbers are formatted with "#.0". -->
      <operator activated="true" class="write_excel" compatibility="9.1.000" expanded="true" height="82" name="Write Excel (2)" width="90" x="707" y="340">
        <parameter key="excel_file" value="E:\AIED_New\AIED_GBT_Test_Predictions_CV\CV_Predictions_GBT_39_Attributes.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="sheet_name" value="RapidMiner Data"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
      </operator>
      <!-- Main-process wiring: the retrieved data is multiplied into three branches; each branch runs Select Attributes, then a Cross Validation whose test result set goes to a Write Excel operator. Performance vectors and the Write Excel pass-through outputs are delivered to result ports 1 to 10. -->
      <!-- NOTE(review): Cross Validation (3) "performance 2" is wired to result 2, but its testing subprocess only feeds the sink ports "performance 1" and "performance 3", so result 2 presumably stays empty; confirm. -->
      <connect from_op="Retrieve Assistment_Data_Labels" from_port="output" to_op="Multiply (6)" to_port="input"/>
      <connect from_op="Multiply (6)" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Multiply (6)" from_port="output 2" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Multiply (6)" from_port="output 3" to_op="Select Attributes (3)" to_port="example set input"/>
      <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Cross Validation (4)" to_port="example set"/>
      <connect from_op="Cross Validation (4)" from_port="test result set" to_op="Write Excel (3)" to_port="input"/>
      <connect from_op="Cross Validation (4)" from_port="performance 1" to_port="result 5"/>
      <connect from_op="Cross Validation (4)" from_port="performance 2" to_port="result 6"/>
      <connect from_op="Write Excel (3)" from_port="through" to_port="result 9"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Cross Validation (3)" to_port="example set"/>
      <connect from_op="Cross Validation (3)" from_port="test result set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Cross Validation (3)" from_port="performance 1" to_port="result 1"/>
      <connect from_op="Cross Validation (3)" from_port="performance 2" to_port="result 2"/>
      <connect from_op="Cross Validation (3)" from_port="performance 3" to_port="result 10"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 7"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Cross Validation (2)" to_port="example set"/>
      <connect from_op="Cross Validation (2)" from_port="test result set" to_op="Write Excel (2)" to_port="input"/>
      <connect from_op="Cross Validation (2)" from_port="performance 1" to_port="result 3"/>
      <connect from_op="Cross Validation (2)" from_port="performance 2" to_port="result 4"/>
      <connect from_op="Write Excel (2)" from_port="through" to_port="result 8"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <portSpacing port="sink_result 6" spacing="0"/>
      <portSpacing port="sink_result 7" spacing="0"/>
      <portSpacing port="sink_result 8" spacing="0"/>
      <portSpacing port="sink_result 9" spacing="0"/>
      <portSpacing port="sink_result 10" spacing="0"/>
      <portSpacing port="sink_result 11" spacing="0"/>
    </process>
  </operator>
</process>

Thanks,
Varun

Find more posts tagged with

Sort by:
1 - 11 of 111
    User: "varunm1"
    New Altair Community Member
    OP
    Updated by varunm1
    Update on this: I see that there are duplicate columns in Excel. Once I removed the duplicates, I got the correct number of samples, but I am not sure why duplicate values are ending up in Excel. I see that the confusion matrix in the performance result has the correct number of samples. @mschmitz and @David_A, any suggestions?
    User: "lionelderkrikor"
    New Altair Community Member
    Hi @varunm1,

    Interesting and amusing problem !

    I think it's linked to the 2 Performance operators you are using in the CV operator. RM performs a 5-fold CV for each Performance operator...
    but when you connect the Performance operators in series, you will retrieve the original number of rows of your dataset (so it's not, a priori, linked to the Write Excel operator).

    However, what I don't understand is why you don't obtain [2 * number of rows] but (10-1)/5 = 9/5 * [number of rows] (the second CV is not complete...).

    Regards,

    Lionel

    User: "varunm1"
    New Altair Community Member
    OP
    Updated by varunm1
    Hi @lionelderkrikor,

    That's what confuses me as well. But when I check the confusion matrix, it gives the correct number of samples (316974). This only happens when writing to Excel or CSV. It does give both performance results, though.



    Thanks,
    Varun
    User: "varunm1"
    New Altair Community Member
    OP
    @mschmitz any suggestion on this issue? Thanks
    I can reproduce this, and it looks like a bug that is somehow connected to the Multiply used with your Apply Model. I'll create a ticket on this.

    As a workaround you can just append the performance vectors like this:
    <?xml version="1.0" encoding="UTF-8"?><process version="9.1.000">
      <!-- Process overview: Generate Data feeds a 5-fold Cross Validation whose
           training side is an H2O Gradient Boosted Trees learner. The cross
           validation's "test result set" goes to result 1 and its
           "performance 1" goes to result 2. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.1.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Synthetic input: 100 examples with 5 attributes, target function
               "one third classification". -->
          <operator activated="true" class="generate_data" compatibility="9.1.000" expanded="true" height="68" name="Generate Data" width="90" x="45" y="238">
            <parameter key="target_function" value="one third classification"/>
            <parameter key="number_examples" value="100"/>
            <parameter key="number_of_attributes" value="5"/>
            <parameter key="attributes_lower_bound" value="-10.0"/>
            <parameter key="attributes_upper_bound" value="10.0"/>
            <parameter key="gaussian_standard_deviation" value="10.0"/>
            <parameter key="largest_radius" value="10.0"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <!-- 5-fold cross validation with parallel execution enabled. -->
          <operator activated="true" class="concurrency:cross_validation" compatibility="9.1.000" expanded="true" height="145" name="Cross Validation (3)" width="90" x="313" y="238">
            <parameter key="split_on_batch_attribute" value="false"/>
            <parameter key="leave_one_out" value="false"/>
            <parameter key="number_of_folds" value="5"/>
            <parameter key="sampling_type" value="automatic"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
            <parameter key="enable_parallel_execution" value="true"/>
            <!-- Training subprocess: the training set goes into the learner and
                 the resulting model is handed to the "model" sink. -->
            <process expanded="true">
              <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="9.0.000" expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="112" y="34">
                <parameter key="number_of_trees" value="20"/>
                <parameter key="reproducible" value="false"/>
                <parameter key="maximum_number_of_threads" value="4"/>
                <parameter key="use_local_random_seed" value="false"/>
                <parameter key="local_random_seed" value="1992"/>
                <parameter key="maximal_depth" value="20"/>
                <parameter key="min_rows" value="10.0"/>
                <parameter key="min_split_improvement" value="0.0"/>
                <parameter key="number_of_bins" value="20"/>
                <parameter key="learning_rate" value="0.1"/>
                <parameter key="sample_rate" value="1.0"/>
                <parameter key="distribution" value="AUTO"/>
                <parameter key="early_stopping" value="false"/>
                <parameter key="stopping_rounds" value="1"/>
                <parameter key="stopping_metric" value="AUTO"/>
                <parameter key="stopping_tolerance" value="0.001"/>
                <parameter key="max_runtime_seconds" value="0"/>
                <list key="expert_parameters"/>
              </operator>
              <connect from_port="training set" to_op="Gradient Boosted Trees" to_port="training set"/>
              <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/>
              <portSpacing port="source_training set" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <!-- Testing subprocess: the model is applied to the test set, the
                 labelled data passes through "Performance (5)" and then
                 "Performance"; the example set leaving "Performance" is wired
                 to the "test set results" sink. -->
            <process expanded="true">
              <operator activated="true" class="apply_model" compatibility="9.1.000" expanded="true" height="82" name="Apply Model (3)" width="90" x="45" y="34">
                <list key="application_parameters"/>
                <parameter key="create_view" value="false"/>
              </operator>
              <!-- Deactivated and not referenced by any connection below. -->
              <operator activated="false" class="multiply" compatibility="9.1.000" expanded="true" height="68" name="Multiply (4)" width="90" x="45" y="136"/>
              <operator activated="true" class="performance_classification" compatibility="9.1.000" expanded="true" height="82" name="Performance (5)" width="90" x="179" y="34">
                <parameter key="main_criterion" value="first"/>
                <parameter key="accuracy" value="true"/>
                <parameter key="classification_error" value="false"/>
                <parameter key="kappa" value="true"/>
                <parameter key="weighted_mean_recall" value="false"/>
                <parameter key="weighted_mean_precision" value="false"/>
                <parameter key="spearman_rho" value="false"/>
                <parameter key="kendall_tau" value="false"/>
                <parameter key="absolute_error" value="false"/>
                <parameter key="relative_error" value="false"/>
                <parameter key="relative_error_lenient" value="false"/>
                <parameter key="relative_error_strict" value="false"/>
                <parameter key="normalized_absolute_error" value="false"/>
                <parameter key="root_mean_squared_error" value="true"/>
                <parameter key="root_relative_squared_error" value="false"/>
                <parameter key="squared_error" value="false"/>
                <parameter key="correlation" value="false"/>
                <parameter key="squared_correlation" value="false"/>
                <parameter key="cross-entropy" value="false"/>
                <parameter key="margin" value="false"/>
                <parameter key="soft_margin_loss" value="false"/>
                <parameter key="logistic_loss" value="false"/>
                <parameter key="skip_undefined_labels" value="true"/>
                <parameter key="use_example_weights" value="true"/>
                <list key="class_weights"/>
              </operator>
              <operator activated="true" class="performance" compatibility="9.1.000" expanded="true" height="82" name="Performance" width="90" x="313" y="34">
                <parameter key="use_example_weights" value="true"/>
              </operator>
              <!-- Deactivated and not referenced by any connection below. -->
              <operator activated="false" class="collect" compatibility="9.1.000" expanded="true" height="68" name="Collect" width="90" x="380" y="187">
                <parameter key="unfold" value="false"/>
              </operator>
              <connect from_port="model" to_op="Apply Model (3)" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/>
              <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Performance (5)" to_port="labelled data"/>
              <connect from_op="Performance (5)" from_port="performance" to_op="Performance" to_port="performance"/>
              <connect from_op="Performance (5)" from_port="example set" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
              <connect from_op="Performance" from_port="example set" to_port="test set results"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_test set results" spacing="0"/>
              <portSpacing port="sink_performance 1" spacing="0"/>
              <portSpacing port="sink_performance 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Generate Data" from_port="output" to_op="Cross Validation (3)" to_port="example set"/>
          <connect from_op="Cross Validation (3)" from_port="test result set" to_port="result 1"/>
          <connect from_op="Cross Validation (3)" from_port="performance 1" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>

    BR,
    Martin

    User: "varunm1"
    New Altair Community Member
    OP
    Thanks @mschmitz
    User: "varunm1"
    New Altair Community Member
    OP
    Updated by varunm1
    @mschmitz one quick question. Does this bug impact performance metrics (accuracy, etc.) by any chance? I see the confusion matrix has the correct number of samples but just want to cross-check.
    User: "Marco_Boeck"
    New Altair Community Member
    Hi varunm1,

    We just looked at the code, and it's only duplicating rows in the final test set that the test output port returns. So the performance metrics are not affected.

    Regards,
    Marco
    User: "varunm1"
    New Altair Community Member
    OP
    Thanks @Marco_Boeck for confirming.
    User: "sgenzer"
    Altair Employee
    @Marco_Boeck please advise if this should get pushed to Product Feedback.
    User: "jczogalla"
    New Altair Community Member
    @sgenzer you can push it there, but it will be fixed in 9.2.