"[SOLVED] Confidence for Regression Problems"

New Altair Community Member

Mar 19, 2013

Updated Nov 5, 2024 by Jocelyn

Hi,
I have a basic regression problem with several numeric attributes. First, I perform feature subset selection, then model training, and then prediction on test data. By the way, it is awesome that RapidMiner lets me do all those things without hassle. However, I encountered some situations where I would like to get more information regarding the stability of the methods I used.

I have two questions:
(1) Is it possible to get the variance of an averagable from "Loop and Average", in addition to the mean? This works for performance vectors, but not for weight vectors.

I built a module for feature subset selection which returns the best subset. It is represented by an attribute weight vector, i.e. 0 = not selected, 1 = selected. This process has a random component, and I repeat it with "Loop and Average" to get different random seeds and average the selection vectors. The result should be a table listing each attribute with its relative selection count as mean and variance. However, I only get the mean. I tried a workaround with "Log". I can get the performances for each cross-validation fold, and also for each cross-validation itself, using "applycount" as primary and foreign keys. However, I cannot get any information from the "Optimize Selection" operator, e.g. "feature_names" always delivers "?". Please have a look at the logs in the example.


<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Example process for question (1): repeated feature subset selection.
  Pipeline: Retrieve (Polynomial sample data), then Sample (sample_size 50),
  then Loop and Average wrapping Optimize Selection (Brute Force), which in
  turn wraps an X-Validation of a polynomial-kernel SVM.
  Three Log operators record values at fold, validation and loop level.
-->
<process version="5.3.005">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the Polynomial data set from the Samples repository. -->
      <operator activated="true" class="retrieve" compatibility="5.3.005" expanded="true" height="60" name="Retrieve" width="90" x="112" y="30">
        <parameter key="repository_entry" value="//Samples/data/Polynomial"/>
      </operator>
      <!-- Reduce the data with sample_size set to 50; the per-class lists are left empty. -->
      <operator activated="true" class="sample" compatibility="5.3.005" expanded="true" height="76" name="Sample" width="90" x="313" y="30">
        <parameter key="sample_size" value="50"/>
        <list key="sample_size_per_class"/>
        <list key="sample_ratio_per_class"/>
        <list key="sample_probability_per_class"/>
      </operator>
      <!-- Repeats the inner feature selection and delivers its averagable outputs (performance and weights). -->
      <operator activated="true" class="loop_and_average" compatibility="5.3.005" expanded="true" height="94" name="Loop and Average" width="90" x="514" y="30">
        <process expanded="true">
          <!-- Brute-force feature subset search with max_number_of_attributes set to 3. -->
          <operator activated="true" class="optimize_selection_brute_force" compatibility="5.3.005" expanded="true" height="94" name="Optimize Selection (Brute Force)" width="90" x="380" y="30">
            <parameter key="max_number_of_attributes" value="3"/>
            <process expanded="true">
              <!-- Cross-validation with shuffled sampling, used to score each candidate subset. -->
              <operator activated="true" class="x_validation" compatibility="5.3.005" expanded="true" height="112" name="X-Validation" width="90" x="380" y="30">
                <parameter key="sampling_type" value="shuffled sampling"/>
                <!-- Training subprocess: SVM with polynomial kernel, degree 3.0, C 1.0. -->
                <process expanded="true">
                  <operator activated="true" class="support_vector_machine" compatibility="5.3.005" expanded="true" height="112" name="SVM" width="90" x="179" y="30">
                    <parameter key="kernel_type" value="polynomial"/>
                    <parameter key="kernel_degree" value="3.0"/>
                    <parameter key="C" value="1.0"/>
                  </operator>
                  <connect from_port="training" to_op="SVM" to_port="training set"/>
                  <connect from_op="SVM" from_port="model" to_port="model"/>
                  <portSpacing port="source_training" spacing="0"/>
                  <portSpacing port="sink_model" spacing="0"/>
                  <portSpacing port="sink_through 1" spacing="0"/>
                </process>
                <!-- Testing subprocess: apply the model, compute regression performance, log per fold. -->
                <process expanded="true">
                  <operator activated="true" class="apply_model" compatibility="5.3.005" expanded="true" height="76" name="Apply Model" width="90" x="112" y="30">
                    <list key="application_parameters"/>
                  </operator>
                  <!-- Regression performance with correlation enabled and chosen as main criterion. -->
                  <operator activated="true" class="performance_regression" compatibility="5.3.005" expanded="true" height="76" name="Performance" width="90" x="313" y="30">
                    <parameter key="main_criterion" value="correlation"/>
                    <parameter key="correlation" value="true"/>
                  </operator>
                  <!-- Fold-level log: FoldID (applycount of Performance), its performance value,
                       and ValidationID (applycount of X-Validation) to relate folds to a validation run. -->
                  <operator activated="true" class="log" compatibility="5.3.005" expanded="true" height="76" name="Log Folds" width="90" x="514" y="30">
                    <list key="log">
                      <parameter key="FoldID" value="operator.Performance.value.applycount"/>
                      <parameter key="Performance" value="operator.Performance.value.performance"/>
                      <parameter key="ValidationID" value="operator.X-Validation.value.applycount"/>
                    </list>
                  </operator>
                  <connect from_port="model" to_op="Apply Model" to_port="model"/>
                  <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
                  <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
                  <connect from_op="Performance" from_port="performance" to_op="Log Folds" to_port="through 1"/>
                  <connect from_op="Log Folds" from_port="through 1" to_port="averagable 1"/>
                  <portSpacing port="source_model" spacing="0"/>
                  <portSpacing port="source_test set" spacing="0"/>
                  <portSpacing port="source_through 1" spacing="0"/>
                  <portSpacing port="sink_averagable 1" spacing="0"/>
                  <portSpacing port="sink_averagable 2" spacing="0"/>
                </process>
              </operator>
              <!-- Validation-level log: ValidationID, performance and deviation of X-Validation,
                   plus LoopID (applycount of Optimize Selection) to relate a validation to a loop run. -->
              <operator activated="true" class="log" compatibility="5.3.005" expanded="true" height="76" name="Log Validation" width="90" x="581" y="30">
                <list key="log">
                  <parameter key="ValidationID" value="operator.X-Validation.value.applycount"/>
                  <parameter key="Performance" value="operator.X-Validation.value.performance"/>
                  <parameter key="Deviation" value="operator.X-Validation.value.deviation"/>
                  <parameter key="LoopID" value="operator.Optimize Selection (Brute Force).value.applycount"/>
                </list>
              </operator>
              <connect from_port="example set" to_op="X-Validation" to_port="training"/>
              <connect from_op="X-Validation" from_port="averagable 1" to_op="Log Validation" to_port="through 1"/>
              <connect from_op="Log Validation" from_port="through 1" to_port="performance"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_performance" spacing="0"/>
            </process>
          </operator>
          <!-- Loop-level log: LoopID, feature_names, performance and best of Optimize Selection.
               The accompanying post reports that the feature_names value is always logged as "?". -->
          <operator activated="true" class="log" compatibility="5.3.005" expanded="true" height="112" name="Log Loop" width="90" x="581" y="30">
            <list key="log">
              <parameter key="LoopID" value="operator.Optimize Selection (Brute Force).value.applycount"/>
              <parameter key="Feature Names" value="operator.Optimize Selection (Brute Force).value.feature_names"/>
              <parameter key="Performance" value="operator.Optimize Selection (Brute Force).value.performance"/>
              <parameter key="Best" value="operator.Optimize Selection (Brute Force).value.best"/>
            </list>
          </operator>
          <!-- Wiring: weights pass through Log Loop (through 1) to averagable 2, performance
               (through 2) to averagable 1. The example set enters through 3 and is not forwarded. -->
          <connect from_port="in 1" to_op="Optimize Selection (Brute Force)" to_port="example set in"/>
          <connect from_op="Optimize Selection (Brute Force)" from_port="example set out" to_op="Log Loop" to_port="through 3"/>
          <connect from_op="Optimize Selection (Brute Force)" from_port="weights" to_op="Log Loop" to_port="through 1"/>
          <connect from_op="Optimize Selection (Brute Force)" from_port="performance" to_op="Log Loop" to_port="through 2"/>
          <connect from_op="Log Loop" from_port="through 1" to_port="averagable 2"/>
          <connect from_op="Log Loop" from_port="through 2" to_port="averagable 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
          <portSpacing port="sink_averagable 3" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level wiring: result 1 receives averagable 1 (performance), result 2 receives averagable 2 (weights). -->
      <connect from_op="Retrieve" from_port="output" to_op="Sample" to_port="example set input"/>
      <connect from_op="Sample" from_port="example set output" to_op="Loop and Average" to_port="in 1"/>
      <connect from_op="Loop and Average" from_port="averagable 1" to_port="result 1"/>
      <connect from_op="Loop and Average" from_port="averagable 2" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

(2) Is it possible to get the variance of the predicted label from "Bagging" in combination with "Apply Model", in addition to the mean? This also does not seem to work with "X-Prediction".

I built a module for model training and prediction. Training is basically bagging applied to the induction algorithm. Then, I use "Apply Model" for prediction, which should internally apply each bag model to the sample and compute the average. However, it just adds a prediction column with the mean, but not the variance.


<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Example process for question (2): bagged regression model applied to held-out data.
  Pipeline: Retrieve (Polynomial sample data), then Split Data (ratios 0.7 and 0.3).
  Partition 1 goes through Select Attributes into Bagging (100 iterations of a
  polynomial-kernel SVM); partition 2 is scored with Apply Model and Performance.
-->
<process version="5.3.005">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the Polynomial data set from the Samples repository. -->
      <operator activated="true" class="retrieve" compatibility="5.3.005" expanded="true" height="60" name="Retrieve" width="90" x="112" y="30">
        <parameter key="repository_entry" value="//Samples/data/Polynomial"/>
      </operator>
      <!-- Two partitions with ratios 0.7 and 0.3; the wiring below uses them for training and testing. -->
      <operator activated="true" class="split_data" compatibility="5.3.005" expanded="true" height="94" name="Split Data" width="90" x="313" y="30">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.7"/>
          <parameter key="ratio" value="0.3"/>
        </enumeration>
      </operator>
      <!-- Keep the subset a3, a2, a1 and label. Only partition 1 is routed through this operator. -->
      <operator activated="true" class="select_attributes" compatibility="5.3.005" expanded="true" height="76" name="Select Attributes" width="90" x="514" y="30">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="|a3|a2|a1|label"/>
      </operator>
      <!-- Bagging with 100 iterations around the inner SVM learner. -->
      <operator activated="true" class="bagging" compatibility="5.3.005" expanded="true" height="76" name="Bagging" width="90" x="715" y="30">
        <parameter key="iterations" value="100"/>
        <process expanded="true">
          <!-- Base learner: SVM with polynomial kernel, degree 3.0, C 1.0. -->
          <operator activated="true" class="support_vector_machine" compatibility="5.3.005" expanded="true" height="112" name="SVM" width="90" x="514" y="30">
            <parameter key="kernel_type" value="polynomial"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="C" value="1.0"/>
          </operator>
          <connect from_port="training set" to_op="SVM" to_port="training set"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
        </process>
      </operator>
      <!-- Applies the Bagging model to partition 2 of Split Data. -->
      <operator activated="true" class="apply_model" compatibility="5.3.005" expanded="true" height="76" name="Apply Model" width="90" x="715" y="255">
        <list key="application_parameters"/>
      </operator>
      <!-- Regression performance with correlation enabled and chosen as main criterion. -->
      <operator activated="true" class="performance_regression" compatibility="5.3.005" expanded="true" height="76" name="Performance" width="90" x="916" y="255">
        <parameter key="main_criterion" value="correlation"/>
        <parameter key="correlation" value="true"/>
      </operator>
      <!-- Wiring: partition 1 is the training path, partition 2 the test path.
           NOTE(review): partition 2 bypasses Select Attributes, so the scored data keeps all
           original attributes. Presumably the model only reads the ones it was trained on; confirm. -->
      <connect from_op="Retrieve" from_port="output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Bagging" to_port="training set"/>
      <connect from_op="Bagging" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
      <!-- Results: result 1 is the performance vector, result 2 the example set output of Performance. -->
      <connect from_op="Performance" from_port="performance" to_port="result 1"/>
      <connect from_op="Performance" from_port="example set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Best regards,
Stefan

Find more posts tagged with

AI Studio

Performance

Regression

🎉Community Raffle - Win $25

"[SOLVED] Confidence for Regression Problems"

Find more posts tagged with

Quick Links