X-Validation logged value changes a lot outside of the optimization loop on the same training data

njasaj
njasaj New Altair Community Member
edited November 5 in Community Q&A
Hi,
I am trying to find the best parameters for SVM regression using cross-validation and parameter optimization. I have used the Log operator to log the optimization process and its values. In the log window, I search for the best parameters based on the correlation coefficient criterion; for example, its logged value is 84.5. The problem is that the logged performance does not remain fixed and differs a lot (it changes to 78) every time I apply the same parameters and data in a new X-Validation. Even when I change the number of steps, for example from 30 to 60, I get the same result in the log window but a different result when I test those parameters in a new X-Validation.
Thanks a lot.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process: grid search over the SVM parameters C and gamma, where each
     grid point is scored by an X-Validation and written to a Log operator. -->
<process version="5.2.008">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
   <process expanded="true" height="314" width="748">
     <!-- Read CSV: both lists are empty and no file or column settings are stored in
          this copy of the process. -->
     <operator activated="true" class="read_csv" compatibility="5.2.008" expanded="true" height="60" name="Read CSV" width="90" x="48" y="117">
       <list key="annotations"/>
       <list key="data_set_meta_data_information"/>
     </operator>
     <!-- Normalize: range transformation. include_special_attributes is true, so
          special-role attributes are rescaled as well.
          NOTE(review): presumably this includes the label; confirm. -->
     <operator activated="true" class="normalize" compatibility="5.2.008" expanded="true" height="94" name="Normalize" width="90" x="313" y="210">
       <parameter key="include_special_attributes" value="true"/>
       <parameter key="method" value="range transformation"/>
     </operator>
     <!-- Filter Examples: wired after Normalize (see the connections at the end of this
          process). invert_filter is true, so the condition well-id=0.5 is negated and
          matching examples are dropped.
          NOTE(review): 0.5 is presumably a normalized well-id value; confirm. -->
     <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Filter Examples" width="90" x="447" y="30">
       <parameter key="condition_class" value="attribute_value_filter"/>
       <parameter key="parameter_string" value="well-id=0.5"/>
       <parameter key="invert_filter" value="true"/>
     </operator>
     <!-- Optimize Parameters (Grid): varies SVM.C and SVM.gamma and runs the inner
          process (Validation followed by Log) for the combinations.
          NOTE(review): the bracket values read as [min;max;steps;scale]; confirm
          against the operator documentation. -->
     <operator activated="true" class="optimize_parameters_grid" compatibility="5.2.008" expanded="true" height="94" name="Optimize Parameters (Grid)" width="90" x="648" y="30">
       <list key="parameters">
         <parameter key="SVM.C" value="[30;3000;30;linear]"/>
         <parameter key="SVM.gamma" value="[0.1;0.15;4;linear]"/>
       </list>
       <process expanded="true" height="629" width="950">
         <!-- Validation: shuffled sampling. No use_local_random_seed or
              local_random_seed parameter is set on this operator, so the split is
              not pinned to a fixed seed.
              NOTE(review): likely reason the logged values cannot be reproduced in a
              separate X-Validation; confirm. -->
         <operator activated="true" class="x_validation" compatibility="5.2.008" expanded="true" height="112" name="Validation" width="90" x="179" y="210">
           <parameter key="sampling_type" value="shuffled sampling"/>
           <!-- Training subprocess: a LibSVM learner in epsilon-SVR mode.
                NOTE(review): the stored gamma and C are presumably overwritten by the
                optimizer for every grid point; confirm. -->
           <process expanded="true" height="629" width="450">
             <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.2.008" expanded="true" height="76" name="SVM" width="90" x="179" y="30">
               <parameter key="svm_type" value="epsilon-SVR"/>
               <parameter key="gamma" value="0.1"/>
               <parameter key="C" value="875.75"/>
               <parameter key="p" value="0.01"/>
               <list key="class_weights"/>
             </operator>
             <connect from_port="training" to_op="SVM" to_port="training set"/>
             <connect from_op="SVM" from_port="model" to_port="model"/>
             <portSpacing port="source_training" spacing="0"/>
             <portSpacing port="sink_model" spacing="0"/>
             <portSpacing port="sink_through 1" spacing="0"/>
           </process>
           <!-- Testing subprocess: applies the model to the test set and computes a
                regression performance vector. The main criterion is
                squared_correlation; absolute_error and correlation are also enabled. -->
           <process expanded="true" height="629" width="450">
             <operator activated="true" class="apply_model" compatibility="5.2.008" expanded="true" height="76" name="Apply Model" width="90" x="97" y="32">
               <list key="application_parameters"/>
             </operator>
             <operator activated="true" class="performance_regression" compatibility="5.2.008" expanded="true" height="76" name="Performance" width="90" x="246" y="30">
               <parameter key="main_criterion" value="squared_correlation"/>
               <parameter key="absolute_error" value="true"/>
               <parameter key="correlation" value="true"/>
               <parameter key="skip_undefined_labels" value="false"/>
             </operator>
             <connect from_port="model" to_op="Apply Model" to_port="model"/>
             <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
             <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
             <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
             <portSpacing port="source_model" spacing="0"/>
             <portSpacing port="source_test set" spacing="0"/>
             <portSpacing port="source_through 1" spacing="0"/>
             <portSpacing port="sink_averagable 1" spacing="0"/>
             <portSpacing port="sink_averagable 2" spacing="0"/>
           </process>
         </operator>
         <!-- Log: records the SVM parameters C and gamma together with several values
              read from the Validation operator.
              NOTE(review): the column labels (C OF D, RMSE, ABS, CC) assume a
              particular order of performance, performance1, performance2 and
              performance3; verify that each label matches the criterion it logs. -->
         <operator activated="true" class="log" compatibility="5.2.008" expanded="true" height="76" name="Log" width="90" x="514" y="210">
           <list key="log">
             <parameter key="C" value="operator.SVM.parameter.C"/>
             <parameter key="Gamma" value="operator.SVM.parameter.gamma"/>
             <parameter key="C OF D" value="operator.Validation.value.performance"/>
             <parameter key="RMSE" value="operator.Validation.value.performance1"/>
             <parameter key="ABS" value="operator.Validation.value.performance2"/>
             <parameter key="CC" value="operator.Validation.value.performance3"/>
             <parameter key="DEVIATION" value="operator.Validation.value.deviation"/>
             <parameter key="VARIANCE" value="operator.Validation.value.variance"/>
           </list>
         </operator>
         <!-- The performance vector passes through Log unchanged and is returned to
              the optimizer via the "performance" port. -->
         <connect from_port="input 1" to_op="Validation" to_port="training"/>
         <connect from_op="Validation" from_port="averagable 1" to_op="Log" to_port="through 1"/>
         <connect from_op="Log" from_port="through 1" to_port="performance"/>
         <portSpacing port="source_input 1" spacing="0"/>
         <portSpacing port="source_input 2" spacing="0"/>
         <portSpacing port="sink_performance" spacing="0"/>
         <portSpacing port="sink_result 1" spacing="0"/>
       </process>
     </operator>
     <!-- Top-level wiring: Read CSV, Normalize, Filter Examples, Optimize Parameters
          (Grid); the optimizer's performance and parameter outputs are the results. -->
     <connect from_op="Read CSV" from_port="output" to_op="Normalize" to_port="example set input"/>
     <connect from_op="Normalize" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
     <connect from_op="Filter Examples" from_port="example set output" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
     <connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 1"/>
     <connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_port="result 2"/>
     <portSpacing port="source_input 1" spacing="162"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
     <portSpacing port="sink_result 3" spacing="0"/>
   </process>
 </operator>
</process>

Answers

  • MariusHelf
    MariusHelf New Altair Community Member
    Hi,

    as always, it is a good idea to describe your data in detail. In this case it would be very interesting to know the number of examples you are using.
    Additionally, the performance vector does not only show the value of interest, but also its standard deviation. If that value is high, then with a high probability the performance will differ when you apply the learner to new data. It means that your data is very hard to analyse, or that the model/parameters/preprocessing you are using do not fit the properties and requirements of the data.

    In this case, the performance of the X-Validation also depends heavily on the split - if you have "easy" examples on the testing side, then you will get a higher estimate of the performance. So even if you apply a cross-validation twice on the same data with the same model and parameters, the results may differ. To overcome this, you can fix the local random seed of both X-Validations to the same value. This will cause it to always create the same split. Just be careful not to optimize the random seed instead of the model parameters ;)

    Best, Marius
  • njasaj
    njasaj New Altair Community Member
    Hi Marius,
    Thanks for your reply. I have 1100 samples for training and 400 samples. The deviation of the coefficient of determination in cross-validation is about 0.08. I tried to use the parameters with the lowest cross-validation deviation.
    I'm currently just working on my training set. Now, by fixing the random seed, cross-validation is much more stable than before, but there is still a change in the performance criteria when I apply cross-validation on the same data set twice or more. I'm really in trouble with this.  :'(

    Regards
  • MariusHelf
    MariusHelf New Altair Community Member
    Hi,

    a standard deviation of 0.08 means that with a probability of 67% the performance will be within 0.84 ± 0.08. An X-Validation with a new random seed can be interpreted as new data, so a drop to 74% is perfectly valid and expected.

    Please post the process where you apply the X-Validation twice on the same data and get different results, so that I can have a look at it (please try to reduce the process to the minimal relevant parts).

    Best,
    Marius
  • njasaj
    njasaj New Altair Community Member
    Hi Marius,
    I try to find the parameters from the log window of the previous code (note that I have used a random seed of 640 since your reply, but this is not implemented in the previous post).
    I copy those parameters into the SVM settings of the code below (here I also use a random seed of 640). The result differs a lot. Even after restarting the program I get different results from the code below with exactly the same parameters and data.

    Thank you a lot for your patience and replies.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- RapidMiner process: a single X-Validation of an epsilon-SVR model with fixed
         gamma and C, used to re-check parameters taken from the optimization log. -->
    <process version="5.2.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
        <process expanded="true" height="629" width="366">
          <!-- Read CSV: comma-separated, windows-1252 encoded; row 0 is annotated as
               "Name". The meta data list declares 17 columns, with new_llog as the
               label and ID as the id.
               NOTE(review): each entry reads as name.selected.type.role, so columns
               13 to 15 (flag "false") are presumably not loaded; confirm. -->
          <operator activated="true" class="read_csv" compatibility="5.2.008" expanded="true" height="60" name="Read CSV" width="90" x="45" y="75">
            <parameter key="column_separators" value=","/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="encoding" value="windows-1252"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="LD.true.real.attribute"/>
              <parameter key="1" value="well-id.true.integer.attribute"/>
              <parameter key="2" value="DT.true.real.attribute"/>
              <parameter key="3" value="HCAL.true.real.attribute"/>
              <parameter key="4" value="GR.true.real.attribute"/>
              <parameter key="5" value="HLLD.true.real.attribute"/>
              <parameter key="6" value="HLLS.true.real.attribute"/>
              <parameter key="7" value="NPHI.true.real.attribute"/>
              <parameter key="8" value="PEFZ.true.real.attribute"/>
              <parameter key="9" value="RHOZ.true.real.attribute"/>
              <parameter key="10" value="RXOZ.true.real.attribute"/>
              <parameter key="11" value="cluster_att.true.real.attribute"/>
              <parameter key="12" value="ID.true.integer.id"/>
              <parameter key="13" value="Perm(scaled).false.real.attribute"/>
              <parameter key="14" value="prediction.false.real.attribute"/>
              <parameter key="15" value="tafavot.false.polynominal.attribute"/>
              <parameter key="16" value="new_llog.true.real.label"/>
            </list>
          </operator>
          <!-- Normalize: range transformation. include_special_attributes is true, so
               special-role attributes are rescaled as well.
               NOTE(review): presumably this includes the new_llog label and the ID
               column; confirm. -->
          <operator activated="true" class="normalize" compatibility="5.2.008" expanded="true" height="94" name="Normalize" width="90" x="45" y="210">
            <parameter key="include_special_attributes" value="true"/>
            <parameter key="method" value="range transformation"/>
          </operator>
          <!-- Filter Examples: wired after Normalize. invert_filter is true, so the
               condition well-id=0.5 is negated and matching examples are dropped.
               NOTE(review): 0.5 is presumably a normalized well-id value; confirm. -->
          <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Filter Examples" width="90" x="179" y="75">
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="parameter_string" value="well-id=0.5"/>
            <parameter key="invert_filter" value="true"/>
          </operator>
          <!-- Validation: shuffled sampling with use_local_random_seed enabled and
               local_random_seed set to 640. -->
          <operator activated="true" class="x_validation" compatibility="5.2.008" expanded="true" height="112" name="Validation" width="90" x="246" y="390">
            <parameter key="sampling_type" value="shuffled sampling"/>
            <parameter key="use_local_random_seed" value="true"/>
            <parameter key="local_random_seed" value="640"/>
            <!-- Training subprocess: LibSVM learner in epsilon-SVR mode with the fixed
                 parameters gamma=0.11, C=1600.0 and p=0.01. -->
            <process expanded="true" height="606" width="418">
              <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.2.008" expanded="true" height="76" name="SVM" width="90" x="179" y="30">
                <parameter key="svm_type" value="epsilon-SVR"/>
                <parameter key="gamma" value="0.11"/>
                <parameter key="C" value="1600.0"/>
                <parameter key="p" value="0.01"/>
                <list key="class_weights"/>
              </operator>
              <connect from_port="training" to_op="SVM" to_port="training set"/>
              <connect from_op="SVM" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <!-- Testing subprocess: applies the model to the test set and computes a
                 regression performance vector. The main criterion is
                 squared_correlation; absolute_error and correlation are also enabled. -->
            <process expanded="true" height="606" width="418">
              <operator activated="true" class="apply_model" compatibility="5.2.008" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="performance_regression" compatibility="5.2.008" expanded="true" height="76" name="Performance" width="90" x="231" y="30">
                <parameter key="main_criterion" value="squared_correlation"/>
                <parameter key="absolute_error" value="true"/>
                <parameter key="correlation" value="true"/>
                <parameter key="skip_undefined_labels" value="false"/>
              </operator>
              <connect from_port="model" to_op="Apply Model" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
              <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Top-level wiring: Read CSV, Normalize, Filter Examples, Validation; the
               validation's averaged performance is the only process result. -->
          <connect from_op="Read CSV" from_port="output" to_op="Normalize" to_port="example set input"/>
          <connect from_op="Normalize" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Validation" to_port="training"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="162"/>
          <portSpacing port="sink_result 1" spacing="396"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>