🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

"Regression problem with cross-validation"

dramhamptonUser: "dramhampton"
New Altair Community Member
Updated by Jocelyn
Hi all

I have a concern about the output from cross-validation with regression.  The CV operator should break the data into (say) 10 segments and sequentially use each 10% of the data as a test set for a model built with the other 90% to measure performance - but when reporting out its model, that should be done with all the data, and the predictions made with the model using all the data.

That means that if you have a single attribute to use as a predictor, and plot the predicted value against this, you should get a straight line.

However, I get a jerky line.  This is specific to CV, if I try the same exercise with split validation it works fine.

Am I misunderstanding the way CV works or...?

To make it easier to see the problem I have adapted the Iris dataset to illustrate it, with this process:

<?xml version="1.0" encoding="UTF-8"?><process version="9.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.2.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="9.2.000" expanded="true" height="68" name="Retrieve Iris" width="90" x="45" y="85">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <operator activated="true" class="filter_examples" compatibility="9.2.000" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="85">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="label.equals.Iris-virginica"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="9.2.000" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="85">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value="label"/>
        <parameter key="attributes" value="a4|a2"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="9.2.000" expanded="true" height="82" name="Set Role" width="90" x="447" y="85">
        <parameter key="attribute_name" value="a4"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="true" class="concurrency:cross_validation" compatibility="9.2.000" expanded="true" height="145" name="Cross Validation" width="90" x="380" y="238">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="10"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="linear_regression" compatibility="9.2.000" expanded="true" height="103" name="Linear Regression" width="90" x="112" y="34">
            <parameter key="feature_selection" value="M5 prime"/>
            <parameter key="alpha" value="0.05"/>
            <parameter key="max_iterations" value="10"/>
            <parameter key="forward_alpha" value="0.05"/>
            <parameter key="backward_alpha" value="0.05"/>
            <parameter key="eliminate_colinear_features" value="true"/>
            <parameter key="min_tolerance" value="0.05"/>
            <parameter key="use_bias" value="true"/>
            <parameter key="ridge" value="1.0E-8"/>
          </operator>
          <connect from_port="training set" to_op="Linear Regression" to_port="training set"/>
          <connect from_op="Linear Regression" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="9.2.000" expanded="true" height="82" name="Apply Model" width="90" x="45" y="34">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="performance_regression" compatibility="9.2.000" expanded="true" height="82" name="Performance" width="90" x="179" y="34">
            <parameter key="main_criterion" value="first"/>
            <parameter key="root_mean_squared_error" value="false"/>
            <parameter key="absolute_error" value="false"/>
            <parameter key="relative_error" value="false"/>
            <parameter key="relative_error_lenient" value="false"/>
            <parameter key="relative_error_strict" value="false"/>
            <parameter key="normalized_absolute_error" value="false"/>
            <parameter key="root_relative_squared_error" value="false"/>
            <parameter key="squared_error" value="false"/>
            <parameter key="correlation" value="false"/>
            <parameter key="squared_correlation" value="true"/>
            <parameter key="prediction_average" value="false"/>
            <parameter key="spearman_rho" value="false"/>
            <parameter key="kendall_tau" value="false"/>
            <parameter key="skip_undefined_labels" value="true"/>
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
          <connect from_op="Performance" from_port="example set" to_port="test set results"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="apply_model" compatibility="9.2.000" expanded="true" height="82" name="Apply Model (2)" width="90" x="581" y="238">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <connect from_op="Retrieve Iris" from_port="output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Cross Validation" to_port="example set"/>
      <connect from_op="Cross Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Cross Validation" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Cross Validation" from_port="test result set" to_port="result 3"/>
      <connect from_op="Cross Validation" from_port="performance 1" to_port="result 4"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model (2)" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="210"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="63"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>


Many thanks for your help

David

Find more posts tagged with

Sort by:
1 - 2 of 21