"Why Matlab and Rapidminer give different results for SVM optimization"
Hi,
I'm using both Matlab and RapidMiner to do SVM classification with parameter optimization. The data set has 5000 observations, 36 integer attributes and one binomial label. I expected similar results, yet they turned out to be different: the C statistic (AUC) from Matlab is 0.672, while that from RapidMiner is 0.598. They also give different choices of optimal parameters for C and gamma: RapidMiner gives 0.25 and 0.25 respectively, and Matlab gives 4 and 0.25. I would greatly appreciate your help!
Below is the process code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process: Retrieve, Normalize, stratified 80/20 Split Data, grid search over SVM.C and SVM.kernel_gamma scored by cross-validated AUC, then AUC of the resulting model on the 20% partition. -->
<process version="5.3.008">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
<process expanded="true">
<!-- Load the example set from the repository entry ../data/donation_sarah5. -->
<operator activated="true" class="retrieve" compatibility="5.3.008" expanded="true" height="60" name="Retrieve donation_sarah5" width="90" x="45" y="30">
<parameter key="repository_entry" value="../data/donation_sarah5"/>
</operator>
<!-- Normalize has no parameters set, so the operator's defaults apply. It is wired in before Split Data (see the connect elements near the end of this process), so it sees all rows, including those later placed in the 20% partition. -->
<operator activated="true" class="normalize" compatibility="5.3.008" expanded="true" height="94" name="Normalize" width="90" x="45" y="120"/>
<!-- Stratified split into two partitions with ratios 0.8 and 0.2. -->
<operator activated="true" class="split_data" compatibility="5.3.008" expanded="true" height="94" name="Split Data" width="90" x="179" y="255">
<enumeration key="partitions">
<parameter key="ratio" value="0.8"/>
<parameter key="ratio" value="0.2"/>
</enumeration>
<parameter key="sampling_type" value="stratified sampling"/>
</operator>
<!-- Grid search: C and kernel_gamma of the operator named "SVM" each take the values .25, 1 and 4. -->
<operator activated="true" class="optimize_parameters_grid" compatibility="5.3.008" expanded="true" height="112" name="Optimize Parameters (Grid)" width="90" x="246" y="30">
<list key="parameters">
<parameter key="SVM.C" value=".25,1,4"/>
<parameter key="SVM.kernel_gamma" value=".25,1,4"/>
</list>
<process expanded="true">
<!-- Cross-validation around the SVM. No fold count or sampling type is set here, so the operator's defaults are used. -->
<operator activated="true" class="x_validation" compatibility="5.3.008" expanded="true" height="112" name="Validation" width="90" x="45" y="30">
<!-- NOTE(review): the description below mentions a decision tree, but the learner inside this validation is the support_vector_machine operator; the text looks like a template leftover. -->
<description>A cross-validation evaluating a decision tree model.</description>
<!-- Training subprocess: fit the SVM on the training port. -->
<process expanded="true">
<!-- Radial kernel. kernel_gamma and C are set to 4 here; presumably the enclosing grid search overrides both on every run (TODO confirm). scale is false; presumably this turns off the operator's own scaling because Normalize already ran upstream (TODO confirm). -->
<operator activated="true" class="support_vector_machine" compatibility="5.3.008" expanded="true" height="112" name="SVM" width="90" x="112" y="30">
<parameter key="kernel_type" value="radial"/>
<parameter key="kernel_gamma" value="4"/>
<parameter key="C" value="4"/>
<parameter key="scale" value="false"/>
</operator>
<connect from_port="training" to_op="SVM" to_port="training set"/>
<connect from_op="SVM" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<!-- Testing subprocess: apply the fold model to the test set and report AUC (accuracy is switched off). -->
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance_binominal_classification" compatibility="5.3.008" expanded="true" height="76" name="Performance" width="90" x="226" y="30">
<parameter key="accuracy" value="false"/>
<parameter key="AUC" value="true"/>
</operator>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<!-- Inside the optimizer: input 1 feeds the validation; the validation's performance drives the search and its model is passed out on result 1. -->
<connect from_port="input 1" to_op="Validation" to_port="training"/>
<connect from_op="Validation" from_port="model" to_port="result 1"/>
<connect from_op="Validation" from_port="averagable 1" to_port="performance"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_performance" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
<!-- Hold-out evaluation: apply the optimizer's result 1 model to partition 2 and report AUC. -->
<operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model (2)" width="90" x="380" y="165">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance_binominal_classification" compatibility="5.3.008" expanded="true" height="76" name="Performance (2)" width="90" x="581" y="255">
<parameter key="accuracy" value="false"/>
<parameter key="AUC" value="true"/>
</operator>
<!-- Top-level wiring: Retrieve to Normalize to Split Data; partition 1 goes to the grid search, partition 2 to Apply Model (2). Results 1 to 4 expose the model, the hold-out performance, the scored example set and the chosen parameter set. -->
<connect from_op="Retrieve donation_sarah5" from_port="output" to_op="Normalize" to_port="example set input"/>
<connect from_op="Normalize" from_port="example set output" to_op="Split Data" to_port="example set"/>
<connect from_op="Split Data" from_port="partition 1" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
<connect from_op="Split Data" from_port="partition 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_port="result 4"/>
<connect from_op="Optimize Parameters (Grid)" from_port="result 1" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
<connect from_op="Apply Model (2)" from_port="model" to_port="result 1"/>
<connect from_op="Performance (2)" from_port="performance" to_port="result 2"/>
<connect from_op="Performance (2)" from_port="example set" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
And here is my Matlab code:
% Grid-search an RBF-kernel SVM on the donation data and report AUC.
%
% Steps:
%   1. Build a binary label (donation above the 80th percentile).
%   2. Randomly split the rows 80/20 into training and test data.
%   3. Score every (gamma, C) pair with ten-fold cross-validation on the
%      training data, using the same folds for every pair.
%   4. Retrain on all training data with the best pair and report the
%      AUC on the held-out test data.
%
% The grid is expressed in terms of gamma, i.e. a kernel of the form
% exp(-gamma*||u-v||^2), which is how RapidMiner's kernel_gamma is
% parameterized. svmtrain's 'RBF_Sigma' uses exp(-||u-v||^2/(2*sigma^2)),
% so each gamma is converted with sigma = 1/sqrt(2*gamma) before the call.
%
% NOTE(review): svmclassify returns hard class labels, so perfcurve only
% sees two distinct score values and the ROC curve has a single operating
% point. A tool that ranks by confidence values will report a different
% AUC for the same model.
% NOTE(review): per the svmtrain documentation a scalar 'BoxConstraint'
% is rescaled per class and 'autoscale' is on by default; confirm how
% this compares with the other tool's C and normalization.
clear all;
load donation;
Y = donation(:,40);
% NOTE(review): columns 2:36 are 35 predictors, while the accompanying
% description mentions 36 attributes - confirm the column range.
X = donation(:,2:36);
% Binary label as double: 1 when Y exceeds its 80th percentile, else 0.
B1 = double(Y > prctile(Y,80));
%check what percentage of donation are from the 20% people
ptg = sum(Y.*B1)/sum(Y)*100;
disp('percentage of donation that are from the top 20% people');
disp(ptg);
%randomly split the data into 80% and 20%
A = [X B1];
numA = size(A, 1);
numFeat = size(X, 2);          % the label sits in column numFeat + 1 of A
trainsize = floor(0.8 * numA);
ridx = randperm(numA);
traindata = A(ridx(1:trainsize),:);
testdata = A(ridx(trainsize + 1 : end),:);
Xtestdata = testdata(:,1:numFeat);
B1testdata = testdata(:,numFeat + 1);
Xtraindata = traindata(:,1:numFeat);
B1traindata = traindata(:,numFeat + 1);
n = size(B1traindata,1);
%cross-validation
%Gaussian Radial Basis Function kernel
gammaGrid = [1/4 1 4];
cGrid = [1/4 1 4];
numFolds = 10;
% Draw the folds once so every parameter pair is scored on identical
% partitions; redrawing per pair adds noise to the comparison.
indices = crossvalind('Kfold', n, numFolds);
% AUCgrid(g, c) is the pooled cross-validated AUC for gammaGrid(g), cGrid(c).
AUCgrid = zeros(numel(gammaGrid), numel(cGrid));
for g = 1:numel(gammaGrid)
    sigma = 1/sqrt(2*gammaGrid(g));   % gamma -> RBF_Sigma
    for c = 1:numel(cGrid)
        % Out-of-fold predictions, stored at the row they belong to.
        Bp = zeros(n,1);
        for i = 1:numFolds
            test = (indices == i); train = ~test;
            SVMStruct = svmtrain(Xtraindata(train,:), B1traindata(train,:), ...
                'kernel_function','rbf','RBF_Sigma', sigma, ...
                'BoxConstraint', cGrid(c));
            Bp(test) = svmclassify(SVMStruct, Xtraindata(test,:));
        end
        [X1,Y1,T,AUCgc] = perfcurve(B1traindata,Bp,1);
        AUCgrid(g,c) = AUCgc;
    end
end
disp ('SVM_C statistics on the training data with ten-fold cross validation');
% Row vector with gamma as the outer and C as the inner index.
disp (reshape(AUCgrid', 1, []));
% Pick the pair with the highest cross-validated AUC instead of
% hard-coding it (ties resolve to the first maximum found by max).
[bestAUC, bestIdx] = max(AUCgrid(:));
[bestG, bestC] = ind2sub(size(AUCgrid), bestIdx);
bestGamma = gammaGrid(bestG);
bestBox = cGrid(bestC);
disp ('selected gamma and C');
disp ([bestGamma bestBox]);
%use the optimal parameter for the testdata
SVMStruct = svmtrain(Xtraindata,B1traindata,'kernel_function','rbf', ...
    'RBF_Sigma', 1/sqrt(2*bestGamma), 'BoxConstraint', bestBox);
Group = svmclassify(SVMStruct,Xtestdata);
[X1,Y1,T,AUCtest] = perfcurve(B1testdata,Group,1);
disp ('SVM_C statistics on the test data after ten-fold cross validation');
disp (AUCtest);
Best,
Sarah
I'm using both Matlab and RapidMiner to do SVM classification with parameter optimization. The data set has 5000 observations, 36 integer attributes and one binomial label. I expected similar results, yet they turned out to be different: the C statistic (AUC) from Matlab is 0.672, while that from RapidMiner is 0.598. They also give different choices of optimal parameters for C and gamma: RapidMiner gives 0.25 and 0.25 respectively, and Matlab gives 4 and 0.25. I would greatly appreciate your help!
Below is the process code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process: Retrieve, Normalize, stratified 80/20 Split Data, grid search over SVM.C and SVM.kernel_gamma scored by cross-validated AUC, then AUC of the resulting model on the 20% partition. -->
<process version="5.3.008">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
<process expanded="true">
<!-- Load the example set from the repository entry ../data/donation_sarah5. -->
<operator activated="true" class="retrieve" compatibility="5.3.008" expanded="true" height="60" name="Retrieve donation_sarah5" width="90" x="45" y="30">
<parameter key="repository_entry" value="../data/donation_sarah5"/>
</operator>
<!-- Normalize has no parameters set, so the operator's defaults apply. It is wired in before Split Data (see the connect elements near the end of this process), so it sees all rows, including those later placed in the 20% partition. -->
<operator activated="true" class="normalize" compatibility="5.3.008" expanded="true" height="94" name="Normalize" width="90" x="45" y="120"/>
<!-- Stratified split into two partitions with ratios 0.8 and 0.2. -->
<operator activated="true" class="split_data" compatibility="5.3.008" expanded="true" height="94" name="Split Data" width="90" x="179" y="255">
<enumeration key="partitions">
<parameter key="ratio" value="0.8"/>
<parameter key="ratio" value="0.2"/>
</enumeration>
<parameter key="sampling_type" value="stratified sampling"/>
</operator>
<!-- Grid search: C and kernel_gamma of the operator named "SVM" each take the values .25, 1 and 4. -->
<operator activated="true" class="optimize_parameters_grid" compatibility="5.3.008" expanded="true" height="112" name="Optimize Parameters (Grid)" width="90" x="246" y="30">
<list key="parameters">
<parameter key="SVM.C" value=".25,1,4"/>
<parameter key="SVM.kernel_gamma" value=".25,1,4"/>
</list>
<process expanded="true">
<!-- Cross-validation around the SVM. No fold count or sampling type is set here, so the operator's defaults are used. -->
<operator activated="true" class="x_validation" compatibility="5.3.008" expanded="true" height="112" name="Validation" width="90" x="45" y="30">
<!-- NOTE(review): the description below mentions a decision tree, but the learner inside this validation is the support_vector_machine operator; the text looks like a template leftover. -->
<description>A cross-validation evaluating a decision tree model.</description>
<!-- Training subprocess: fit the SVM on the training port. -->
<process expanded="true">
<!-- Radial kernel. kernel_gamma and C are set to 4 here; presumably the enclosing grid search overrides both on every run (TODO confirm). scale is false; presumably this turns off the operator's own scaling because Normalize already ran upstream (TODO confirm). -->
<operator activated="true" class="support_vector_machine" compatibility="5.3.008" expanded="true" height="112" name="SVM" width="90" x="112" y="30">
<parameter key="kernel_type" value="radial"/>
<parameter key="kernel_gamma" value="4"/>
<parameter key="C" value="4"/>
<parameter key="scale" value="false"/>
</operator>
<connect from_port="training" to_op="SVM" to_port="training set"/>
<connect from_op="SVM" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<!-- Testing subprocess: apply the fold model to the test set and report AUC (accuracy is switched off). -->
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance_binominal_classification" compatibility="5.3.008" expanded="true" height="76" name="Performance" width="90" x="226" y="30">
<parameter key="accuracy" value="false"/>
<parameter key="AUC" value="true"/>
</operator>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<!-- Inside the optimizer: input 1 feeds the validation; the validation's performance drives the search and its model is passed out on result 1. -->
<connect from_port="input 1" to_op="Validation" to_port="training"/>
<connect from_op="Validation" from_port="model" to_port="result 1"/>
<connect from_op="Validation" from_port="averagable 1" to_port="performance"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_performance" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
<!-- Hold-out evaluation: apply the optimizer's result 1 model to partition 2 and report AUC. -->
<operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model (2)" width="90" x="380" y="165">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance_binominal_classification" compatibility="5.3.008" expanded="true" height="76" name="Performance (2)" width="90" x="581" y="255">
<parameter key="accuracy" value="false"/>
<parameter key="AUC" value="true"/>
</operator>
<!-- Top-level wiring: Retrieve to Normalize to Split Data; partition 1 goes to the grid search, partition 2 to Apply Model (2). Results 1 to 4 expose the model, the hold-out performance, the scored example set and the chosen parameter set. -->
<connect from_op="Retrieve donation_sarah5" from_port="output" to_op="Normalize" to_port="example set input"/>
<connect from_op="Normalize" from_port="example set output" to_op="Split Data" to_port="example set"/>
<connect from_op="Split Data" from_port="partition 1" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
<connect from_op="Split Data" from_port="partition 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_port="result 4"/>
<connect from_op="Optimize Parameters (Grid)" from_port="result 1" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
<connect from_op="Apply Model (2)" from_port="model" to_port="result 1"/>
<connect from_op="Performance (2)" from_port="performance" to_port="result 2"/>
<connect from_op="Performance (2)" from_port="example set" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
And here is my Matlab code:
% Grid-search an RBF-kernel SVM on the donation data and report AUC.
%
% Steps:
%   1. Build a binary label (donation above the 80th percentile).
%   2. Randomly split the rows 80/20 into training and test data.
%   3. Score every (gamma, C) pair with ten-fold cross-validation on the
%      training data, using the same folds for every pair.
%   4. Retrain on all training data with the best pair and report the
%      AUC on the held-out test data.
%
% The grid is expressed in terms of gamma, i.e. a kernel of the form
% exp(-gamma*||u-v||^2), which is how RapidMiner's kernel_gamma is
% parameterized. svmtrain's 'RBF_Sigma' uses exp(-||u-v||^2/(2*sigma^2)),
% so each gamma is converted with sigma = 1/sqrt(2*gamma) before the call.
%
% NOTE(review): svmclassify returns hard class labels, so perfcurve only
% sees two distinct score values and the ROC curve has a single operating
% point. A tool that ranks by confidence values will report a different
% AUC for the same model.
% NOTE(review): per the svmtrain documentation a scalar 'BoxConstraint'
% is rescaled per class and 'autoscale' is on by default; confirm how
% this compares with the other tool's C and normalization.
clear all;
load donation;
Y = donation(:,40);
% NOTE(review): columns 2:36 are 35 predictors, while the accompanying
% description mentions 36 attributes - confirm the column range.
X = donation(:,2:36);
% Binary label as double: 1 when Y exceeds its 80th percentile, else 0.
B1 = double(Y > prctile(Y,80));
%check what percentage of donation are from the 20% people
ptg = sum(Y.*B1)/sum(Y)*100;
disp('percentage of donation that are from the top 20% people');
disp(ptg);
%randomly split the data into 80% and 20%
A = [X B1];
numA = size(A, 1);
numFeat = size(X, 2);          % the label sits in column numFeat + 1 of A
trainsize = floor(0.8 * numA);
ridx = randperm(numA);
traindata = A(ridx(1:trainsize),:);
testdata = A(ridx(trainsize + 1 : end),:);
Xtestdata = testdata(:,1:numFeat);
B1testdata = testdata(:,numFeat + 1);
Xtraindata = traindata(:,1:numFeat);
B1traindata = traindata(:,numFeat + 1);
n = size(B1traindata,1);
%cross-validation
%Gaussian Radial Basis Function kernel
gammaGrid = [1/4 1 4];
cGrid = [1/4 1 4];
numFolds = 10;
% Draw the folds once so every parameter pair is scored on identical
% partitions; redrawing per pair adds noise to the comparison.
indices = crossvalind('Kfold', n, numFolds);
% AUCgrid(g, c) is the pooled cross-validated AUC for gammaGrid(g), cGrid(c).
AUCgrid = zeros(numel(gammaGrid), numel(cGrid));
for g = 1:numel(gammaGrid)
    sigma = 1/sqrt(2*gammaGrid(g));   % gamma -> RBF_Sigma
    for c = 1:numel(cGrid)
        % Out-of-fold predictions, stored at the row they belong to.
        Bp = zeros(n,1);
        for i = 1:numFolds
            test = (indices == i); train = ~test;
            SVMStruct = svmtrain(Xtraindata(train,:), B1traindata(train,:), ...
                'kernel_function','rbf','RBF_Sigma', sigma, ...
                'BoxConstraint', cGrid(c));
            Bp(test) = svmclassify(SVMStruct, Xtraindata(test,:));
        end
        [X1,Y1,T,AUCgc] = perfcurve(B1traindata,Bp,1);
        AUCgrid(g,c) = AUCgc;
    end
end
disp ('SVM_C statistics on the training data with ten-fold cross validation');
% Row vector with gamma as the outer and C as the inner index.
disp (reshape(AUCgrid', 1, []));
% Pick the pair with the highest cross-validated AUC instead of
% hard-coding it (ties resolve to the first maximum found by max).
[bestAUC, bestIdx] = max(AUCgrid(:));
[bestG, bestC] = ind2sub(size(AUCgrid), bestIdx);
bestGamma = gammaGrid(bestG);
bestBox = cGrid(bestC);
disp ('selected gamma and C');
disp ([bestGamma bestBox]);
%use the optimal parameter for the testdata
SVMStruct = svmtrain(Xtraindata,B1traindata,'kernel_function','rbf', ...
    'RBF_Sigma', 1/sqrt(2*bestGamma), 'BoxConstraint', bestBox);
Group = svmclassify(SVMStruct,Xtestdata);
[X1,Y1,T,AUCtest] = perfcurve(B1testdata,Group,1);
disp ('SVM_C statistics on the test data after ten-fold cross validation');
disp (AUCtest);
Best,
Sarah