How to plot train accuracy vs. test accuracy?
Hi,
I am currently testing different values of the parameters C and gamma for an SVM that sits inside an X-Validation, which in turn sits inside an Optimize Parameters (Grid) operator.
The process looks like this:
<?xml version="1.0" encoding="UTF-8"?><process version="7.2.000">
<!-- Process context: the input, output and macro sections are all empty. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="7.2.000" expanded="true" name="Process">
<process expanded="true">
<!-- Deactivated: loads the repository entry below; its output is wired to Split Data. -->
<operator activated="false" class="retrieve" compatibility="7.2.000" expanded="true"
          height="68" name="Retrieve" width="90" x="45" y="238">
  <parameter key="repository_entry" value="//RapidMiner_Nils/Nils/Master/Data/Master Excelliste_Gefügebezeichnung_3 klassen"/>
</operator>
<!-- Deactivated: splits the example set into two partitions with ratio 0.5 each,
     using stratified sampling and a local random seed. -->
<operator activated="false" class="split_data" compatibility="7.2.000" expanded="true"
          height="103" name="Split Data" width="90" x="179" y="238">
  <enumeration key="partitions">
    <parameter key="ratio" value="0.5"/>
    <parameter key="ratio" value="0.5"/>
  </enumeration>
  <parameter key="sampling_type" value="stratified sampling"/>
  <parameter key="use_local_random_seed" value="true"/>
</operator>
<!-- Deactivated: writes its input (partition 2 of Split Data) to testData.xlsx. -->
<operator activated="false" class="write_excel" compatibility="7.2.000" expanded="true"
          height="82" name="Write Excel (2)" width="90" x="45" y="442">
  <parameter key="excel_file" value="C:\Users\Admin\Desktop\testData.xlsx"/>
</operator>
<!-- Deactivated: writes its input (partition 1 of Split Data) to trainData.xlsx. -->
<operator activated="false" class="write_excel" compatibility="7.2.000" expanded="true"
          height="82" name="Write Excel" width="90" x="45" y="136">
  <parameter key="excel_file" value="C:\Users\Admin\Desktop\trainData.xlsx"/>
</operator>
<!-- Loads the test data set from the local repository. -->
<operator activated="true" class="retrieve" compatibility="7.2.000" expanded="true"
          height="68" name="Retrieve testData" width="90" x="179" y="391">
  <parameter key="repository_entry" value="//RapidMiner_Nils/repositories/Local Repository/data/test und training/testData"/>
</operator>
<!-- Normalizes the test data; no parameters are set, so the operator defaults apply.
     NOTE(review): this is a second, independent normalize operator, so the test data
     is presumably scaled with statistics of the test set rather than those of the
     training set. Confirm whether the preprocessing model of "Normalize" should be
     applied here instead. -->
<operator activated="true" class="normalize" compatibility="7.2.000" expanded="true"
          height="103" name="Normalize Test Data" width="90" x="313" y="391"/>
<!-- Loads the training data set from the local repository. -->
<operator activated="true" class="retrieve" compatibility="7.2.000" expanded="true"
          height="68" name="Retrieve trainData" width="90" x="179" y="136">
  <parameter key="repository_entry" value="//RapidMiner_Nils/repositories/Local Repository/data/test und training/trainData"/>
</operator>
<!-- Normalizes the training data; no parameters are set, so the operator defaults apply. -->
<operator activated="true" class="normalize" compatibility="7.2.000" expanded="true"
          height="103" name="Normalize" width="90" x="45" y="34"/>
<!-- Logs the "attributes" and "value_type" parameters of the Normalize operator.
     The operator name is spelled "Normaize"; it is referenced by name in the connect
     elements of this process, so a rename has to be done there as well. -->
<operator activated="true" class="log" compatibility="7.2.000" expanded="true"
          height="82" name="Log Normaize Parameter" width="90" x="179" y="34">
  <list key="log">
    <parameter key="attributes" value="operator.Normalize.parameter.attributes"/>
    <parameter key="value type" value="operator.Normalize.parameter.value_type"/>
  </list>
</operator>
<!-- Copies the normalized training data so it can be sent to several consumers. -->
<operator activated="true" class="multiply" compatibility="7.2.000" expanded="true"
          height="124" name="Multiply Trainings Data" width="90" x="313" y="34"/>
<!-- Grid search over the parameters C and gamma of the inner "SVM" operator.
     Each combination is evaluated by the 5-fold "Validation" subprocess, and the
     inner "Log" operator records one row per evaluated combination.
     NOTE(review): both ranges use the "linear" scale; for C and gamma, which span
     several orders of magnitude here, a logarithmic scale may be a better fit. -->
<operator activated="true" class="optimize_parameters_grid" compatibility="7.2.000" expanded="true" height="103" name="Optimize Parameters (Grid)" width="90" x="648" y="34">
  <list key="parameters">
    <parameter key="SVM.C" value="[1000;300000;10;linear]"/>
    <parameter key="SVM.gamma" value="[0.001;1;10;linear]"/>
  </list>
  <process expanded="true">
    <!-- Cross-validation of the SVM on the training data delivered at "input 1". -->
    <operator activated="true" class="x_validation" compatibility="7.2.000" expanded="true" height="124" name="Validation" width="90" x="313" y="34">
      <parameter key="number_of_validations" value="5"/>
      <!-- Training subprocess: learns the SVM on the training folds.
           The C and gamma values below are overwritten by the grid search. -->
      <process expanded="true">
        <operator activated="true" class="support_vector_machine_libsvm" compatibility="7.2.000" expanded="true" height="82" name="SVM" width="90" x="246" y="34">
          <parameter key="gamma" value="1.0"/>
          <parameter key="C" value="300000.0"/>
          <list key="class_weights"/>
        </operator>
        <connect from_port="training" to_op="SVM" to_port="training set"/>
        <connect from_op="SVM" from_port="model" to_port="model"/>
        <portSpacing port="source_training" spacing="0"/>
        <portSpacing port="sink_model" spacing="0"/>
        <portSpacing port="sink_through 1" spacing="0"/>
      </process>
      <!-- Testing subprocess: applies the model to the held-out fold and measures it. -->
      <process expanded="true">
        <operator activated="true" class="apply_model" compatibility="7.2.000" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
          <list key="application_parameters"/>
        </operator>
        <operator activated="true" class="performance_classification" compatibility="7.2.000" expanded="true" height="82" name="Performance" width="90" x="313" y="34">
          <list key="class_weights"/>
        </operator>
        <connect from_port="model" to_op="Apply Model" to_port="model"/>
        <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
        <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
        <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
        <portSpacing port="source_model" spacing="0"/>
        <portSpacing port="source_test set" spacing="0"/>
        <portSpacing port="source_through 1" spacing="0"/>
        <portSpacing port="sink_averagable 1" spacing="0"/>
        <portSpacing port="sink_averagable 2" spacing="0"/>
      </process>
    </operator>
    <!-- One log row per grid combination: the parameters tried and the
         cross-validation results.
         The former TESTPERF_* and TRAINPERF_* columns were removed: the operators
         TESTPERF and TRAINPERF are defined outside this subprocess and run only
         after the whole optimization has finished, so they cannot deliver a value
         for the combination that is being logged here. To log train and test
         performance per combination, that evaluation has to be moved inside this
         subprocess. -->
    <operator activated="true" class="log" compatibility="7.2.000" expanded="true" height="82" name="Log" width="90" x="648" y="85">
      <list key="log">
        <parameter key="C" value="operator.SVM.parameter.C"/>
        <parameter key="gamma" value="operator.SVM.parameter.gamma"/>
        <parameter key="XVAL_Performance" value="operator.Validation.value.performance"/>
        <parameter key="XVAL_iteration" value="operator.Validation.value.iteration"/>
        <parameter key="XVAL_time" value="operator.Validation.value.time"/>
        <parameter key="Perf_acc" value="operator.Performance.value.accuracy"/>
        <parameter key="Perf_kappa" value="operator.Performance.value.kappa"/>
        <parameter key="Perf_time" value="operator.Performance.value.cpu-time"/>
      </list>
    </operator>
    <connect from_port="input 1" to_op="Validation" to_port="training"/>
    <connect from_op="Validation" from_port="averagable 1" to_op="Log" to_port="through 1"/>
    <connect from_op="Log" from_port="through 1" to_port="performance"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_performance" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
  </process>
</operator>
<!-- Receives the parameter set from the grid search; the name map associates the
     optimized operator "SVM" with the operator "SVM Trainings Data". -->
<operator activated="true" class="set_parameters" compatibility="7.2.000" expanded="true"
          height="82" name="Set Parameters" width="90" x="849" y="85">
  <list key="name_map">
    <parameter key="SVM" value="SVM Trainings Data"/>
  </list>
</operator>
<!-- SVM learner outside the grid search. It is the target of the Set Parameters
     name map, so the stored gamma and C values are presumably replaced at run time. -->
<operator activated="true" class="support_vector_machine_libsvm" compatibility="7.2.000" expanded="true"
          height="82" name="SVM Trainings Data" width="90" x="581" y="187">
  <parameter key="gamma" value="0.001"/>
  <parameter key="C" value="270100.0"/>
  <list key="class_weights"/>
</operator>
<!-- Copies the trained model so it can be applied to more than one data set. -->
<operator activated="true" class="multiply" compatibility="7.2.000" expanded="true"
          height="103" name="Multiply Model" width="90" x="782" y="187"/>
<!-- Applies a model to an example set; its labelled output is wired to TRAINPERF. -->
<operator activated="true" class="apply_model" compatibility="7.2.000" expanded="true"
          height="82" name="Apply Model (2)" width="90" x="447" y="289">
  <list key="application_parameters"/>
</operator>
<!-- Classification performance of the labelled data it receives, with the
     classification error criterion switched on. -->
<operator activated="true" class="performance_classification" compatibility="7.2.000" expanded="true"
          height="82" name="TRAINPERF" width="90" x="715" y="340">
  <parameter key="classification_error" value="true"/>
  <list key="class_weights"/>
</operator>
<!-- Applies a model to an example set; its labelled output is wired to TESTPERF. -->
<operator activated="true" class="apply_model" compatibility="7.2.000" expanded="true"
          height="82" name="Apply Model (3)" width="90" x="581" y="442">
  <list key="application_parameters"/>
</operator>
<!-- Classification performance of the labelled data it receives, with the
     classification error criterion switched on. -->
<operator activated="true" class="performance_classification" compatibility="7.2.000" expanded="true"
          height="82" name="TESTPERF" width="90" x="715" y="442">
  <parameter key="classification_error" value="true"/>
  <list key="class_weights"/>
</operator>
<!-- Logs the train and test performance of the model built by "SVM Trainings Data",
     together with the C and gamma values that operator used.
     The log previously read its values from "Performance", which is the operator
     inside the cross-validation, not from the TRAINPERF and TESTPERF results that
     are actually routed into this operator. -->
<operator activated="true" class="log" compatibility="7.2.000" expanded="true" height="103" name="LOG ALL" width="90" x="849" y="340">
  <list key="log">
    <parameter key="C" value="operator.SVM Trainings Data.parameter.C"/>
    <parameter key="gamma" value="operator.SVM Trainings Data.parameter.gamma"/>
    <parameter key="train accuracy" value="operator.TRAINPERF.value.accuracy"/>
    <parameter key="train classification error" value="operator.TRAINPERF.value.classification_error"/>
    <parameter key="test accuracy" value="operator.TESTPERF.value.accuracy"/>
    <parameter key="test classification error" value="operator.TESTPERF.value.classification_error"/>
  </list>
</operator>
<!-- Wiring of the top-level process. -->
<!-- Data split and Excel export (these operators are deactivated). -->
<connect from_op="Retrieve" from_port="output" to_op="Split Data" to_port="example set"/>
<connect from_op="Split Data" from_port="partition 1" to_op="Write Excel" to_port="input"/>
<connect from_op="Split Data" from_port="partition 2" to_op="Write Excel (2)" to_port="input"/>
<!-- Test data: retrieve, normalize, then score with the model in Apply Model (3). -->
<connect from_op="Retrieve testData" from_port="output" to_op="Normalize Test Data" to_port="example set input"/>
<connect from_op="Normalize Test Data" from_port="example set output" to_op="Apply Model (3)" to_port="unlabelled data"/>
<!-- Training data: retrieve, normalize, log the normalize parameters, then fan out
     to the SVM learner, to Apply Model (2) and to the grid search. -->
<connect from_op="Retrieve trainData" from_port="output" to_op="Normalize" to_port="example set input"/>
<connect from_op="Normalize" from_port="example set output" to_op="Log Normaize Parameter" to_port="through 1"/>
<connect from_op="Log Normaize Parameter" from_port="through 1" to_op="Multiply Trainings Data" to_port="input"/>
<connect from_op="Multiply Trainings Data" from_port="output 1" to_op="SVM Trainings Data" to_port="training set"/>
<connect from_op="Multiply Trainings Data" from_port="output 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Multiply Trainings Data" from_port="output 3" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
<!-- Grid search outputs: performance goes to result 1, the parameter set goes
     through Set Parameters to result 3. -->
<connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 1"/>
<connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_op="Set Parameters" to_port="parameter set"/>
<connect from_op="Set Parameters" from_port="parameter set" to_port="result 3"/>
<!-- The model from SVM Trainings Data is copied and applied to the training data
     (Apply Model (2)) and to the test data (Apply Model (3)). -->
<connect from_op="SVM Trainings Data" from_port="model" to_op="Multiply Model" to_port="input"/>
<connect from_op="Multiply Model" from_port="output 1" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Multiply Model" from_port="output 2" to_op="Apply Model (3)" to_port="model"/>
<!-- Both performance results pass into LOG ALL; only its first through port
     (the TRAINPERF result) is forwarded, to result 2. -->
<connect from_op="Apply Model (2)" from_port="labelled data" to_op="TRAINPERF" to_port="labelled data"/>
<connect from_op="TRAINPERF" from_port="performance" to_op="LOG ALL" to_port="through 1"/>
<connect from_op="Apply Model (3)" from_port="labelled data" to_op="TESTPERF" to_port="labelled data"/>
<connect from_op="TESTPERF" from_port="performance" to_op="LOG ALL" to_port="through 2"/>
<connect from_op="LOG ALL" from_port="through 1" to_port="result 2"/>
<!-- Layout spacing of the process ports. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>
What I would really like to achieve is to step through the different C and gamma values and record the respective performance for BOTH the training and the test data, and then plot C, gamma and these performances together in a single graph. A series plot would probably be best.
The point is that I want to see when OVERFITTING occurs, i.e. at which C and gamma configuration the training accuracy keeps going up while the test performance starts to decrease.
If I could also plot the X-Validation performance together with the others in the same graph, that would be perfect.
Is this somehow possible in RapidMiner, maybe with the Series Multiple plotter? My problem is that in my current configuration only the best C and gamma are set for the training and test model. How can I apply every C and gamma combination of the grid to the train/test run, so that the train and test performance is also computed for each configuration?