Hello,
I'm doing some research for my bachelor's thesis and have some questions about RapidMiner 5 and AUC.
I'm trying to compare three classifiers (SVM, Linear Regression and Linear Discriminant Analysis) using 10-fold cross-validation with AUC as the main criterion.
For Linear Regression and Linear Discriminant Analysis I always get the same AUC (0.5), no matter how many variables I use or how I normalize the data beforehand. The specificity and sensitivity values, on the other hand, do change. For example, the Linear Regression model delivers a specificity of 78%, a sensitivity of 71% and an AUC of 0.5. Linear Discriminant Analysis delivers a specificity of 70.81%, a sensitivity of 64.5% and an AUC of 0.5.
The SVM values (specificity 81.5%, sensitivity 79.5%, AUC 0.904) seem to be OK.
Can this be correct, or is this a data problem? Does a linear model always have an AUC of 0.5, even if the specificity and sensitivity values are both high?
I couldn't find anything in the literature about this strange behavior. I always thought that a high specificity (e.g. 85%) together with a high sensitivity (e.g. 87%) would yield a high AUC value.
The whole process, including the training data, can be downloaded here:
http://www.myexperiment.org/packs/151/download
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
<!-- Process context: the input, output and macro sections are all empty. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="5.0.8" expanded="true" name="Root">
<process expanded="true" height="562" width="1007">
<!-- Data source: a read_aml operator whose "attributes" parameter points at the .aml file. -->
<!-- NOTE(review): the path is absolute and machine-specific; it must be repointed before
     the process can run on another computer. -->
<operator activated="true" class="read_aml" compatibility="5.0.10" expanded="true"
          name="TestSet" height="60" width="90" x="45" y="30">
  <parameter key="attributes" value="C:\Users\pawel\Documents\naruto1.aml"/>
</operator>
<!-- Fans the loaded data out to three branches: output 1 feeds "LR X-val", output 2 feeds
     "Min-Max" and output 3 feeds "Z-score" (see the root-level connections). -->
<operator activated="true" class="multiply" compatibility="5.0.8" expanded="true"
          name="Multiply" height="112" width="90" x="45" y="255"/>
<!-- Normalization for the SVM branch. No "method" parameter is set, so the operator default
     applies (presumably the Z-transformation, as the operator name suggests; confirm). -->
<!-- The operator sits in front of "SVM X-Val", so it sees the whole data set, not just the
     training folds. -->
<operator activated="true" class="normalize" compatibility="5.0.10" expanded="true"
          name="Z-score" height="94" width="90" x="246" y="75"/>
<!-- Cross-validation of the LibSVM learner. No sampling or fold parameters are set, so the
     operator defaults apply. -->
<operator activated="true" class="x_validation" compatibility="5.0.10" expanded="true"
          name="SVM X-Val" height="112" width="90" x="447" y="30">

  <!-- Training subprocess: receives the "training" port and delivers the learned model. -->
  <process expanded="true" height="385" width="330">
    <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.0.10"
              expanded="true" name="SVM" height="76" width="90" x="119" y="62">
      <list key="class_weights"/>
    </operator>
    <connect from_port="training"
             to_op="SVM" to_port="training set"/>
    <connect from_op="SVM" from_port="model"
             to_port="model"/>
    <portSpacing port="source_training" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
  </process>

  <!-- Testing subprocess: applies the model to the "test set" port and measures performance. -->
  <process expanded="true" height="385" width="330">
    <operator activated="true" class="apply_model" compatibility="5.0.10" expanded="true"
              name="Apply Model" height="76" width="90" x="45" y="30">
      <list key="application_parameters"/>
    </operator>
    <!-- AUC is the main criterion; classification error, sensitivity and specificity are
         reported alongside it. -->
    <operator activated="true" class="performance_binominal_classification"
              compatibility="5.0.10" expanded="true"
              name="SVM Perf" height="76" width="90" x="179" y="30">
      <parameter key="main_criterion" value="AUC"/>
      <parameter key="classification_error" value="true"/>
      <parameter key="AUC" value="true"/>
      <parameter key="sensitivity" value="true"/>
      <parameter key="specificity" value="true"/>
    </operator>
    <connect from_port="model"
             to_op="Apply Model" to_port="model"/>
    <connect from_port="test set"
             to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Apply Model" from_port="labelled data"
             to_op="SVM Perf" to_port="labelled data"/>
    <connect from_op="SVM Perf" from_port="performance"
             to_port="averagable 1"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_averagable 1" spacing="0"/>
    <portSpacing port="sink_averagable 2" spacing="0"/>
  </process>
</operator>
<!-- Normalization for the LDA branch, using the "range transformation" method. -->
<!-- The operator sits in front of "LDA X-val", so it sees the whole data set, not just the
     training folds. -->
<operator activated="true" class="normalize" compatibility="5.0.10" expanded="true"
          name="Min-Max" height="94" width="90" x="269" y="296">
  <parameter key="method" value="range transformation"/>
</operator>
<!-- Cross-validation of Linear Discriminant Analysis. -->
<!-- No "sampling_type" is set here, so the folds are built with the operator's default
     sampling, exactly as in "SVM X-Val" and "LR X-val". Using the same fold construction
     for all three validations keeps the performance values comparable. -->
<operator activated="true" class="x_validation" compatibility="5.0.8" expanded="true"
          name="LDA X-val" height="112" width="90" x="447" y="300">

  <!-- Training subprocess: receives the "training" port and delivers the learned model. -->
  <process expanded="true" height="405" width="347">
    <operator activated="true" class="linear_discriminant_analysis" compatibility="5.0.8"
              expanded="true" name="LDA (2)" height="76" width="90" x="133" y="30"/>
    <connect from_port="training"
             to_op="LDA (2)" to_port="training set"/>
    <connect from_op="LDA (2)" from_port="model"
             to_port="model"/>
    <portSpacing port="source_training" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
  </process>

  <!-- Testing subprocess: applies the model to the "test set" port and measures performance. -->
  <process expanded="true" height="405" width="347">
    <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true"
              name="Apply Model (7)" height="76" width="90" x="45" y="30">
      <list key="application_parameters"/>
    </operator>
    <!-- AUC is the main criterion; classification error, sensitivity and specificity are
         reported alongside it. -->
    <!-- NOTE(review): AUC is presumably derived from the confidence values in the labelled
         data. If this model only yields crisp (0 or 1) confidences, the ROC curve would
         degenerate and AUC could stay at 0.5 while sensitivity and specificity still vary.
         Inspect the confidence columns produced by "Apply Model (7)" to confirm. -->
    <operator activated="true" class="performance_binominal_classification"
              compatibility="5.0.8" expanded="true"
              name="LDA Perf" height="76" width="90" x="200" y="30">
      <parameter key="main_criterion" value="AUC"/>
      <parameter key="classification_error" value="true"/>
      <parameter key="AUC" value="true"/>
      <parameter key="sensitivity" value="true"/>
      <parameter key="specificity" value="true"/>
    </operator>
    <connect from_port="model"
             to_op="Apply Model (7)" to_port="model"/>
    <connect from_port="test set"
             to_op="Apply Model (7)" to_port="unlabelled data"/>
    <connect from_op="Apply Model (7)" from_port="labelled data"
             to_op="LDA Perf" to_port="labelled data"/>
    <connect from_op="LDA Perf" from_port="performance"
             to_port="averagable 1"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_averagable 1" spacing="0"/>
    <portSpacing port="sink_averagable 2" spacing="0"/>
  </process>
</operator>
<!-- Cross-validation of Linear Regression wrapped in "Classification by Regression".
     No sampling or fold parameters are set, so the operator defaults apply. -->
<!-- This branch is fed directly from "Multiply" output 1, i.e. without any normalization
     operator in front of it. -->
<operator activated="true" class="x_validation" compatibility="5.0.8" expanded="true"
          name="LR X-val" height="112" width="90" x="447" y="165">

  <!-- Training subprocess: receives the "training" port and delivers the learned model. -->
  <process expanded="true" height="405" width="347">
    <operator activated="true" class="classification_by_regression" compatibility="5.0.8"
              expanded="true" name="Classification by Regression (2)"
              height="76" width="90" x="133" y="30">
      <!-- Inner learner: linear regression with feature selection switched off. -->
      <process expanded="true" height="385" width="710">
        <operator activated="true" class="linear_regression" compatibility="5.0.8"
                  expanded="true" name="Linear Regression (2)"
                  height="94" width="90" x="319" y="30">
          <parameter key="feature_selection" value="none"/>
        </operator>
        <connect from_port="training set"
                 to_op="Linear Regression (2)" to_port="training set"/>
        <connect from_op="Linear Regression (2)" from_port="model"
                 to_port="model"/>
        <portSpacing port="source_training set" spacing="0"/>
        <portSpacing port="sink_model" spacing="0"/>
      </process>
    </operator>
    <connect from_port="training"
             to_op="Classification by Regression (2)" to_port="training set"/>
    <connect from_op="Classification by Regression (2)" from_port="model"
             to_port="model"/>
    <portSpacing port="source_training" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
  </process>

  <!-- Testing subprocess: applies the model to the "test set" port and measures performance. -->
  <process expanded="true" height="405" width="347">
    <operator activated="true" class="apply_model" compatibility="5.0.8" expanded="true"
              name="Apply Model (5)" height="76" width="90" x="45" y="30">
      <list key="application_parameters"/>
    </operator>
    <!-- AUC is the main criterion; classification error, sensitivity and specificity are
         reported alongside it. -->
    <!-- NOTE(review): AUC is presumably derived from the confidence values in the labelled
         data. If this model only yields crisp (0 or 1) confidences, the ROC curve would
         degenerate and AUC could stay at 0.5 while sensitivity and specificity still vary.
         Inspect the confidence columns produced by "Apply Model (5)" to confirm. -->
    <operator activated="true" class="performance_binominal_classification"
              compatibility="5.0.8" expanded="true"
              name="LR Perf" height="76" width="90" x="200" y="30">
      <parameter key="main_criterion" value="AUC"/>
      <parameter key="classification_error" value="true"/>
      <parameter key="AUC" value="true"/>
      <parameter key="sensitivity" value="true"/>
      <parameter key="specificity" value="true"/>
    </operator>
    <connect from_port="model"
             to_op="Apply Model (5)" to_port="model"/>
    <connect from_port="test set"
             to_op="Apply Model (5)" to_port="unlabelled data"/>
    <connect from_op="Apply Model (5)" from_port="labelled data"
             to_op="LR Perf" to_port="labelled data"/>
    <connect from_op="LR Perf" from_port="performance"
             to_port="averagable 1"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_averagable 1" spacing="0"/>
    <portSpacing port="sink_averagable 2" spacing="0"/>
  </process>
</operator>
<!-- Root-level wiring. The loaded data is copied by "Multiply" into three branches. -->
<connect from_op="TestSet" from_port="output"
         to_op="Multiply" to_port="input"/>

<!-- Branch inputs: LR gets the data as loaded, LDA goes through "Min-Max",
     SVM goes through "Z-score". -->
<connect from_op="Multiply" from_port="output 1"
         to_op="LR X-val" to_port="training"/>
<connect from_op="Multiply" from_port="output 2"
         to_op="Min-Max" to_port="example set input"/>
<connect from_op="Multiply" from_port="output 3"
         to_op="Z-score" to_port="example set input"/>

<!-- SVM branch: delivered as result 3. -->
<connect from_op="Z-score" from_port="example set output"
         to_op="SVM X-Val" to_port="training"/>
<connect from_op="SVM X-Val" from_port="averagable 1"
         to_port="result 3"/>

<!-- LDA branch: delivered as result 2. -->
<connect from_op="Min-Max" from_port="example set output"
         to_op="LDA X-val" to_port="training"/>
<connect from_op="LDA X-val" from_port="averagable 1"
         to_port="result 2"/>

<!-- LR branch: delivered as result 1. -->
<connect from_op="LR X-val" from_port="averagable 1"
         to_port="result 1"/>

<!-- GUI port layout. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>