🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

"[SOLVED] Confidence for Regression Problems"

User: "Stefan"
New Altair Community Member
Updated by Jocelyn
Hi,
I have a basic regression problem with several numeric attributes. First, I perform feature subset selection, then model training, and then prediction on test data. By the way, it is awesome that RapidMiner lets me do all those things without hassle. However, I encountered some situations where I would like to get more information regarding the stability of the methods I used.

I have two questions:
(1) Is it possible to get the variance of the averagable from "Loop and Average" in addition to the mean? This works for performance vectors, but not for weight vectors.

I built a module for feature subset selection which returns the best subset. It is represented by an attribute weight vector, i.e. 0 = not selected, 1 = selected. This process has a random component and I repeat it with "Loop and Average" to get different random seeds, and average the selection vectors. The result should be a table of each attribute and its relative selection count with mean and variance. However, I only get the mean. I tried a workaround with "Log". I can get the performances for each cross-validation fold, and also each cross-validation itself using "applycount" as primary and foreign keys. However, I cannot get any information from the "Optimize Selection" operator, e.g. "feature_names" always delivers "?". Please have a look at the logs in the example.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3.005 process illustrating question (1) of the post.
  Data flow: Retrieve (//Samples/data/Polynomial) to Sample (sample_size = 50)
  to "Loop and Average", whose inner process runs
  "Optimize Selection (Brute Force)" around an X-Validation of a
  polynomial-kernel SVM.
  Three Log operators record values at fold level ("Log Folds"),
  validation level ("Log Validation") and loop level ("Log Loop").
  Process results: result 1 = averaged performance, result 2 = averaged weights.
-->
<process version="5.3.005">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
   <process expanded="true">
     <operator activated="true" class="retrieve" compatibility="5.3.005" expanded="true" height="60" name="Retrieve" width="90" x="112" y="30">
       <parameter key="repository_entry" value="//Samples/data/Polynomial"/>
     </operator>
     <operator activated="true" class="sample" compatibility="5.3.005" expanded="true" height="76" name="Sample" width="90" x="313" y="30">
       <parameter key="sample_size" value="50"/>
       <list key="sample_size_per_class"/>
       <list key="sample_ratio_per_class"/>
       <list key="sample_probability_per_class"/>
     </operator>
     <!-- Outer loop: its inner process delivers the performance on "averagable 1"
          and the attribute weights on "averagable 2" (see the connections below). -->
     <operator activated="true" class="loop_and_average" compatibility="5.3.005" expanded="true" height="94" name="Loop and Average" width="90" x="514" y="30">
       <process expanded="true">
         <!-- Feature subset search, limited to at most 3 attributes; each candidate
              subset is evaluated by the nested X-Validation. -->
         <operator activated="true" class="optimize_selection_brute_force" compatibility="5.3.005" expanded="true" height="94" name="Optimize Selection (Brute Force)" width="90" x="380" y="30">
           <parameter key="max_number_of_attributes" value="3"/>
           <process expanded="true">
             <operator activated="true" class="x_validation" compatibility="5.3.005" expanded="true" height="112" name="X-Validation" width="90" x="380" y="30">
               <parameter key="sampling_type" value="shuffled sampling"/>
               <!-- Training subprocess: fits a degree-3 polynomial SVM on the training data. -->
               <process expanded="true">
                 <operator activated="true" class="support_vector_machine" compatibility="5.3.005" expanded="true" height="112" name="SVM" width="90" x="179" y="30">
                   <parameter key="kernel_type" value="polynomial"/>
                   <parameter key="kernel_degree" value="3.0"/>
                   <parameter key="C" value="1.0"/>
                 </operator>
                 <connect from_port="training" to_op="SVM" to_port="training set"/>
                 <connect from_op="SVM" from_port="model" to_port="model"/>
                 <portSpacing port="source_training" spacing="0"/>
                 <portSpacing port="sink_model" spacing="0"/>
                 <portSpacing port="sink_through 1" spacing="0"/>
               </process>
               <!-- Testing subprocess: applies the model to the test set, computes the
                    regression performance (main criterion: correlation) and logs it. -->
               <process expanded="true">
                 <operator activated="true" class="apply_model" compatibility="5.3.005" expanded="true" height="76" name="Apply Model" width="90" x="112" y="30">
                   <list key="application_parameters"/>
                 </operator>
                 <operator activated="true" class="performance_regression" compatibility="5.3.005" expanded="true" height="76" name="Performance" width="90" x="313" y="30">
                   <parameter key="main_criterion" value="correlation"/>
                   <parameter key="correlation" value="true"/>
                 </operator>
                 <!-- Fold-level log: FoldID is the applycount of "Performance";
                      ValidationID is the applycount of "X-Validation" and is also
                      logged by "Log Validation", so the two logs can be joined on it. -->
                 <operator activated="true" class="log" compatibility="5.3.005" expanded="true" height="76" name="Log Folds" width="90" x="514" y="30">
                   <list key="log">
                     <parameter key="FoldID" value="operator.Performance.value.applycount"/>
                     <parameter key="Performance" value="operator.Performance.value.performance"/>
                     <parameter key="ValidationID" value="operator.X-Validation.value.applycount"/>
                   </list>
                 </operator>
                 <connect from_port="model" to_op="Apply Model" to_port="model"/>
                 <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
                 <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
                 <connect from_op="Performance" from_port="performance" to_op="Log Folds" to_port="through 1"/>
                 <connect from_op="Log Folds" from_port="through 1" to_port="averagable 1"/>
                 <portSpacing port="source_model" spacing="0"/>
                 <portSpacing port="source_test set" spacing="0"/>
                 <portSpacing port="source_through 1" spacing="0"/>
                 <portSpacing port="sink_averagable 1" spacing="0"/>
                 <portSpacing port="sink_averagable 2" spacing="0"/>
               </process>
             </operator>
             <!-- Validation-level log: performance and deviation of "X-Validation",
                  keyed by its own applycount (ValidationID) and by the applycount of
                  "Optimize Selection (Brute Force)" (LoopID). -->
             <operator activated="true" class="log" compatibility="5.3.005" expanded="true" height="76" name="Log Validation" width="90" x="581" y="30">
               <list key="log">
                 <parameter key="ValidationID" value="operator.X-Validation.value.applycount"/>
                 <parameter key="Performance" value="operator.X-Validation.value.performance"/>
                 <parameter key="Deviation" value="operator.X-Validation.value.deviation"/>
                 <parameter key="LoopID" value="operator.Optimize Selection (Brute Force).value.applycount"/>
               </list>
             </operator>
             <connect from_port="example set" to_op="X-Validation" to_port="training"/>
             <connect from_op="X-Validation" from_port="averagable 1" to_op="Log Validation" to_port="through 1"/>
             <connect from_op="Log Validation" from_port="through 1" to_port="performance"/>
             <portSpacing port="source_example set" spacing="0"/>
             <portSpacing port="source_through 1" spacing="0"/>
             <portSpacing port="sink_performance" spacing="0"/>
           </process>
         </operator>
         <!-- Loop-level log of values exposed by "Optimize Selection (Brute Force)".
              NOTE(review): the accompanying post reports that "feature_names" is
              always logged as "?". -->
         <operator activated="true" class="log" compatibility="5.3.005" expanded="true" height="112" name="Log Loop" width="90" x="581" y="30">
           <list key="log">
             <parameter key="LoopID" value="operator.Optimize Selection (Brute Force).value.applycount"/>
             <parameter key="Feature Names" value="operator.Optimize Selection (Brute Force).value.feature_names"/>
             <parameter key="Performance" value="operator.Optimize Selection (Brute Force).value.performance"/>
             <parameter key="Best" value="operator.Optimize Selection (Brute Force).value.best"/>
           </list>
         </operator>
         <!-- Wiring through "Log Loop": through 1 carries the weights, through 2 the
              performance. The example set enters on through 3 but is not forwarded
              to any sink. -->
         <connect from_port="in 1" to_op="Optimize Selection (Brute Force)" to_port="example set in"/>
         <connect from_op="Optimize Selection (Brute Force)" from_port="example set out" to_op="Log Loop" to_port="through 3"/>
         <connect from_op="Optimize Selection (Brute Force)" from_port="weights" to_op="Log Loop" to_port="through 1"/>
         <connect from_op="Optimize Selection (Brute Force)" from_port="performance" to_op="Log Loop" to_port="through 2"/>
         <connect from_op="Log Loop" from_port="through 1" to_port="averagable 2"/>
         <connect from_op="Log Loop" from_port="through 2" to_port="averagable 1"/>
         <portSpacing port="source_in 1" spacing="0"/>
         <portSpacing port="source_in 2" spacing="0"/>
         <portSpacing port="sink_averagable 1" spacing="0"/>
         <portSpacing port="sink_averagable 2" spacing="0"/>
         <portSpacing port="sink_averagable 3" spacing="0"/>
       </process>
     </operator>
     <connect from_op="Retrieve" from_port="output" to_op="Sample" to_port="example set input"/>
     <connect from_op="Sample" from_port="example set output" to_op="Loop and Average" to_port="in 1"/>
     <connect from_op="Loop and Average" from_port="averagable 1" to_port="result 1"/>
     <connect from_op="Loop and Average" from_port="averagable 2" to_port="result 2"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
     <portSpacing port="sink_result 3" spacing="0"/>
   </process>
 </operator>
</process>
(2) Is it possible to get the variance of the predicted label from "Bagging" in combination with "Apply Model", in addition to the mean? This also does not seem to work with "X-Prediction".

I built a module for model training and prediction. Training is basically bagging applied to the induction algorithm. Then, I use "Apply Model" for prediction, which should internally apply each bag model to the sample and compute the average. However, it just adds a prediction column with the mean, but not the variance.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner 5.3.005 process illustrating question (2) of the post.
  Data flow: Retrieve (//Samples/data/Polynomial) to Split Data (ratios 0.7 / 0.3).
  Partition 1 goes through Select Attributes into Bagging (100 iterations of a
  polynomial-kernel SVM); partition 2 goes to Apply Model together with the
  bagged model, followed by a regression Performance operator.
  Process results: result 1 = performance, result 2 = the example set leaving
  "Performance".
-->
<process version="5.3.005">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
   <process expanded="true">
     <operator activated="true" class="retrieve" compatibility="5.3.005" expanded="true" height="60" name="Retrieve" width="90" x="112" y="30">
       <parameter key="repository_entry" value="//Samples/data/Polynomial"/>
     </operator>
     <!-- Two partitions: ratio 0.7 (used for training) and ratio 0.3 (used for testing). -->
     <operator activated="true" class="split_data" compatibility="5.3.005" expanded="true" height="94" name="Split Data" width="90" x="313" y="30">
       <enumeration key="partitions">
         <parameter key="ratio" value="0.7"/>
         <parameter key="ratio" value="0.3"/>
       </enumeration>
     </operator>
     <!-- Restricts the training partition to the subset a3, a2, a1 and label. -->
     <operator activated="true" class="select_attributes" compatibility="5.3.005" expanded="true" height="76" name="Select Attributes" width="90" x="514" y="30">
       <parameter key="attribute_filter_type" value="subset"/>
       <parameter key="attributes" value="|a3|a2|a1|label"/>
     </operator>
     <!-- Bagging with 100 iterations; the inner learner is a degree-3 polynomial SVM. -->
     <operator activated="true" class="bagging" compatibility="5.3.005" expanded="true" height="76" name="Bagging" width="90" x="715" y="30">
       <parameter key="iterations" value="100"/>
       <process expanded="true">
         <operator activated="true" class="support_vector_machine" compatibility="5.3.005" expanded="true" height="112" name="SVM" width="90" x="514" y="30">
           <parameter key="kernel_type" value="polynomial"/>
           <parameter key="kernel_degree" value="3.0"/>
           <parameter key="C" value="1.0"/>
         </operator>
         <connect from_port="training set" to_op="SVM" to_port="training set"/>
         <connect from_op="SVM" from_port="model" to_port="model"/>
         <portSpacing port="source_training set" spacing="0"/>
         <portSpacing port="sink_model" spacing="0"/>
       </process>
     </operator>
     <operator activated="true" class="apply_model" compatibility="5.3.005" expanded="true" height="76" name="Apply Model" width="90" x="715" y="255">
       <list key="application_parameters"/>
     </operator>
     <operator activated="true" class="performance_regression" compatibility="5.3.005" expanded="true" height="76" name="Performance" width="90" x="916" y="255">
       <parameter key="main_criterion" value="correlation"/>
       <parameter key="correlation" value="true"/>
     </operator>
     <!-- NOTE(review): partition 2 is connected to Apply Model directly and does not
          pass through Select Attributes, so the test data keeps all of its
          attributes. Confirm that this is intended. -->
     <connect from_op="Retrieve" from_port="output" to_op="Split Data" to_port="example set"/>
     <connect from_op="Split Data" from_port="partition 1" to_op="Select Attributes" to_port="example set input"/>
     <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
     <connect from_op="Select Attributes" from_port="example set output" to_op="Bagging" to_port="training set"/>
     <connect from_op="Bagging" from_port="model" to_op="Apply Model" to_port="model"/>
     <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
     <connect from_op="Performance" from_port="performance" to_port="result 1"/>
     <connect from_op="Performance" from_port="example set" to_port="result 2"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
     <portSpacing port="sink_result 3" spacing="0"/>
   </process>
 </operator>
</process>
Best regards,
Stefan

Find more posts tagged with