🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

"Why Matlab and Rapidminer give different results for SVM optimization"

User: "Sarah"
New Altair Community Member
Updated by Jocelyn
Hi,

I'm using both Matlab and Rapidminer to do SVM classification with parameter optimization. The data I used has 5000 observations, 36 integer attributes and one binomial label. I was expecting similar results, yet they turned out to be different. The C statistic (AUC) from Matlab is 0.672 while that from Rapidminer is 0.598. Also, they give different choices of optimal parameters for C and gamma: Rapidminer gives 0.25 and 0.25 respectively, and Matlab gives 4 and 0.25. I would greatly appreciate your help!

Below is the process code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.008">
  <!--
    Pipeline (as wired by the connect elements below):
      Retrieve -> Normalize -> Split Data (0.8 / 0.2, stratified)
      partition 1 -> Optimize Parameters (Grid) -> model
      partition 2 -> Apply Model (2) -> Performance (2) (AUC)
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="5.3.008" expanded="true" height="60" name="Retrieve donation_sarah5" width="90" x="45" y="30">
        <parameter key="repository_entry" value="../data/donation_sarah5"/>
      </operator>
      <!-- NOTE(review): Normalize runs on the full data set before Split Data, so the
           held-out partition appears to contribute to the normalization statistics.
           Confirm whether that is intended. -->
      <operator activated="true" class="normalize" compatibility="5.3.008" expanded="true" height="94" name="Normalize" width="90" x="45" y="120"/>
      <operator activated="true" class="split_data" compatibility="5.3.008" expanded="true" height="94" name="Split Data" width="90" x="179" y="255">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.8"/>
          <parameter key="ratio" value="0.2"/>
        </enumeration>
        <parameter key="sampling_type" value="stratified sampling"/>
      </operator>
      <!-- Grid of 3 x 3 combinations over SVM.C and SVM.kernel_gamma, each scored by the
           inner Validation operator. -->
      <operator activated="true" class="optimize_parameters_grid" compatibility="5.3.008" expanded="true" height="112" name="Optimize Parameters (Grid)" width="90" x="246" y="30">
        <list key="parameters">
          <parameter key="SVM.C" value=".25,1,4"/>
          <parameter key="SVM.kernel_gamma" value=".25,1,4"/>
        </list>
        <process expanded="true">
          <operator activated="true" class="x_validation" compatibility="5.3.008" expanded="true" height="112" name="Validation" width="90" x="45" y="30">
            <description>A cross-validation evaluating an SVM model.</description>
            <process expanded="true">
              <!-- kernel_gamma and C below are presumably placeholders: the enclosing grid
                   operator lists SVM.C and SVM.kernel_gamma as the parameters it varies. -->
              <operator activated="true" class="support_vector_machine" compatibility="5.3.008" expanded="true" height="112" name="SVM" width="90" x="112" y="30">
                <parameter key="kernel_type" value="radial"/>
                <parameter key="kernel_gamma" value="4"/>
                <parameter key="C" value="4"/>
                <parameter key="scale" value="false"/>
              </operator>
              <connect from_port="training" to_op="SVM" to_port="training set"/>
              <connect from_op="SVM" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <process expanded="true">
              <operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <!-- Accuracy is switched off and AUC switched on for the validation folds. -->
              <operator activated="true" class="performance_binominal_classification" compatibility="5.3.008" expanded="true" height="76" name="Performance" width="90" x="226" y="30">
                <parameter key="accuracy" value="false"/>
                <parameter key="AUC" value="true"/>
              </operator>
              <connect from_port="model" to_op="Apply Model" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
              <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="input 1" to_op="Validation" to_port="training"/>
          <connect from_op="Validation" from_port="model" to_port="result 1"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="performance"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_performance" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
      <!-- Hold-out evaluation: the model from the grid search is applied to partition 2
           of Split Data and scored by AUC. -->
      <operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model (2)" width="90" x="380" y="165">
        <list key="application_parameters"/>
      </operator>
      <operator activated="true" class="performance_binominal_classification" compatibility="5.3.008" expanded="true" height="76" name="Performance (2)" width="90" x="581" y="255">
        <parameter key="accuracy" value="false"/>
        <parameter key="AUC" value="true"/>
      </operator>
      <connect from_op="Retrieve donation_sarah5" from_port="output" to_op="Normalize" to_port="example set input"/>
      <connect from_op="Normalize" from_port="example set output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_port="result 4"/>
      <connect from_op="Optimize Parameters (Grid)" from_port="result 1" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
      <connect from_op="Apply Model (2)" from_port="model" to_port="result 1"/>
      <connect from_op="Performance (2)" from_port="performance" to_port="result 2"/>
      <connect from_op="Performance (2)" from_port="example set" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>

And here is my Matlab code:
% Grid search for an RBF-kernel SVM with ten-fold cross validation on an
% 80/20 random split of the donation data, followed by an evaluation of the
% selected parameters on the held-out 20%.
%
% Inputs : donation.mat providing the matrix `donation`
%          (column 40 = donation amount, columns 2..36 = predictors).
% Outputs: printed AUC per grid cell (AUCtrain), the selected parameters,
%          and the AUC on the held-out data (AUCtest).
clear all;
load donation;

Y = donation(:,40);
X = donation(:,2:36);
% Binary label: 1 when the donation exceeds the 80th percentile, else 0.
B = Y > prctile(Y,80);
B1 = double(B);

%check what percentage of donation are from the 20% people
ptg = sum(Y.*B1)/sum(Y)*100;
disp('percentage of donation that are from the top 20% people');
disp(ptg);

%randomly split the data into 80% and 20%
% NOTE(review): the split is purely random (not stratified) and unseeded, so
% results vary between runs; seed the generator if reproducibility is needed.
A = [X B1];
numA = size(A, 1);
nFeat = size(X, 2);              % label sits in column nFeat+1 of A
trainsize = floor(0.8 * numA);
testsize = numA - trainsize;
ridx = randperm(numA);
traindata = A(ridx(1:trainsize),:);
testdata = A(ridx(trainsize + 1 : end),:);
Xtestdata = testdata(:,1:nFeat);
B1testdata = testdata(:,nFeat + 1);
Xtraindata = traindata(:,1:nFeat);
B1traindata = traindata(:,nFeat + 1);
n = size(B1traindata,1);

%cross-validation
%Gaussian Radial Basis Function kernel
% NOTE(review): RBF_Sigma is a kernel width, not a gamma. If the kernel is
% exp(-||u-v||^2/(2*sigma^2)), then gamma = 1/(2*sigma^2), so this grid is
% not the same grid as gamma = [1/4 1 4] -- confirm against the toolbox docs.
L = [1/4 1 4];
nFolds = 10;
% Draw the folds once so every parameter pair is scored on identical
% partitions; otherwise fold-to-fold noise is mixed into the comparison.
indices = crossvalind('Kfold', n, nFolds);

nGrid = numel(L)^2;
AUCtrain = zeros(nGrid,1);       % same ordering as before: sigma outer, C inner
gridSigma = zeros(nGrid,1);
gridC = zeros(nGrid,1);
k = 0;
for j = L                         % j: RBF_Sigma
    for m = L                     % m: BoxConstraint
        k = k + 1;
        Bp = zeros(n,1);          % out-of-fold predictions, in training-row order
        for i = 1:nFolds
            test = (indices == i); train = ~test;
            SVMStruct = svmtrain(Xtraindata(train,:),B1traindata(train,:), ...
                'kernel_function','rbf','RBF_Sigma', j ,'BoxConstraint', m);
            Bp(test) = svmclassify(SVMStruct,Xtraindata(test,:));
        end
        % NOTE(review): Bp holds hard class labels, so this ROC has a single
        % operating point; an AUC from continuous decision scores would
        % likely differ -- confirm which one the other tool reports.
        [X1,Y1,T,AUCij] = perfcurve(B1traindata,Bp,1);
        AUCtrain(k) = AUCij;
        gridSigma(k) = j;
        gridC(k) = m;
    end
end
disp ('SVM_C statistics on the training data with ten-fold cross validation');
disp (AUCtrain');

%use the optimal parameter for the testdata
% Pick the grid cell with the highest cross-validated AUC (first one on ties)
% instead of hard-coding values that may not match this run's optimum.
[bestAUC, bestIdx] = max(AUCtrain);
bestSigma = gridSigma(bestIdx);
bestC = gridC(bestIdx);
disp ('selected RBF_Sigma and BoxConstraint');
disp ([bestSigma bestC]);

SVMStruct = svmtrain(Xtraindata,B1traindata,'kernel_function','rbf','RBF_Sigma', bestSigma ,'BoxConstraint', bestC);
Group = svmclassify(SVMStruct,Xtestdata);
[X1,Y1,T,AUCtest] = perfcurve(B1testdata,Group,1);
disp ('SVM_C statistics on the test data after ten-fold cross validation');
disp (AUCtest);


Best,
Sarah

Find more posts tagged with