Hi there,
I am quite new to RapidMiner and to the whole subject of classification. My problem, in short, is the following:
- text classification
- OHSUMED-91-dataset (~13 000 medical abstracts)
- I want to compare my results with those of someone else who measured his classification using the precision-recall breakeven point (the threshold at which precision equals recall)
To reach this breakeven point I tried:
- metaCost-Operator
- thresholdFinder / thresholdApplier
But by manually adjusting the costs I can't seem to find the breakeven point. Is there a way to find the right threshold automatically? Or maybe my whole approach to this is inappropriate?
any help would be appreciated.
thanks in advance
klaus
my process:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Root">
<!-- Operator description shown in the GUI. Its HTML is stored as entity-escaped text:
     raw <br> tags have no end tag and would make this file ill-formed XML. -->
<description>&lt;p&gt;This process demonstrates how a threshold can be obtained from a soft classifier and applied to an independent test set.&lt;/p&gt;&lt;ol&gt;&lt;li&gt;The learner used in this process makes soft predictions instead of crisp classifications. The prediction confidences delivered by all learners in RapidMiner which are able to handle nominal labels (classification) will be used as soft predictions. &lt;br&gt;&lt;icon&gt;groups/24/learner&lt;/icon&gt;&lt;/li&gt;&lt;li&gt;The ThresholdFinder is used to determine the best threshold with respect to class weights. In this case, a wrong classification of the first class (negative) will cause costs five times bigger than the other error. &lt;br&gt;&lt;icon&gt;groups/24/postprocessing&lt;/icon&gt;&lt;/li&gt;&lt;li&gt;Please note that a ModelApplier must be performed on the test set before a threshold can be found. Since this model must be applied again later, the model applier keeps the input model. &lt;br&gt;&lt;icon&gt;operators/24/model_applier&lt;/icon&gt;&lt;/li&gt;&lt;li&gt;The IOConsumer ensures that the prediction is made on the correct data set. &lt;br&gt;&lt;icon&gt;operators/24/io_consumer&lt;/icon&gt;&lt;/li&gt;&lt;li&gt;The last steps apply the model and the threshold on the data set at hand. &lt;br&gt;&lt;icon&gt;groups/24/validation&lt;/icon&gt;&lt;/li&gt;&lt;/ol&gt;</description>
<parameter key="logverbosity" value="status"/>
<parameter key="random_seed" value="1903"/>
<process expanded="true">
<!-- Training branch: reads documents from the two directories listed below and
     exposes an example set and a word list to the rest of the process. -->
<operator activated="true" class="text:process_document_from_file" compatibility="5.3.002"
          expanded="true" height="76" name="ProcessTrainingSet" width="90" x="45" y="30">
  <!-- One entry per directory; the key is presumably used as the class label. -->
  <list key="text_directories">
    <parameter key="C23_less"
               value="C:\stuff\Dokumentenklassifikation\Datasets\Ohsumed_91\ohsumed-first-20000-divided-C23_notC23\training\C23_less"/>
    <parameter key="notC23_less"
               value="C:\stuff\Dokumentenklassifikation\Datasets\Ohsumed_91\ohsumed-first-20000-divided-C23_notC23\training\notC23_less"/>
  </list>
  <!-- Per-document pipeline: Tokenize, then Stem (Snowball), then Transform Cases.
       "Filter Tokens (by Length)" is deactivated and not wired into the chain.
       NOTE(review): stemming runs before case transformation here; confirm that
       this order is intended. -->
  <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="5.3.002"
              expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
    <operator activated="false" class="text:filter_by_length" compatibility="5.3.002"
              expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="179" y="30"/>
    <operator activated="true" class="text:stem_snowball" compatibility="5.3.002"
              expanded="true" height="60" name="Stem (Snowball)" width="90" x="315" y="30"/>
    <operator activated="true" class="text:transform_cases" compatibility="5.3.002"
              expanded="true" height="60" name="Transform Cases" width="90" x="623" y="30"/>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
    <connect from_op="Stem (Snowball)" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- LibSVM learner. Only gamma, C and nu are set explicitly; every parameter not
     listed here is left at whatever default the operator uses. -->
<operator activated="true" class="support_vector_machine_libsvm" compatibility="5.3.015"
          expanded="true" height="76" name="SVM (2)" width="90" x="246" y="30">
  <parameter key="gamma" value="1.0"/>
  <parameter key="C" value="1.0"/>
  <parameter key="nu" value="0.4"/>
  <!-- The class weight list is empty. -->
  <list key="class_weights"/>
</operator>
<!-- Test branch: reads documents from the two test directories listed below.
     Its inner pipeline mirrors the training branch step for step. -->
<operator activated="true" class="text:process_document_from_file" compatibility="5.3.002"
          expanded="true" height="76" name="ProcessTestSet" width="90" x="112" y="165">
  <!-- One entry per directory; the key is presumably used as the class label. -->
  <list key="text_directories">
    <parameter key="C23_less"
               value="C:\stuff\Dokumentenklassifikation\Datasets\Ohsumed_91\ohsumed-first-20000-divided-C23_notC23\test\C23_less"/>
    <parameter key="notC23_less"
               value="C:\stuff\Dokumentenklassifikation\Datasets\Ohsumed_91\ohsumed-first-20000-divided-C23_notC23\test\notC23_less"/>
  </list>
  <!-- Per-document pipeline: Tokenize (2), then Stem (2), then Transform Cases (2).
       "Filter Tokens (2)" is deactivated and not wired into the chain.
       NOTE(review): stemming runs before case transformation here; confirm that
       this order is intended. -->
  <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="5.3.002"
              expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
    <operator activated="false" class="text:filter_by_length" compatibility="5.3.002"
              expanded="true" height="60" name="Filter Tokens (2)" width="90" x="179" y="30"/>
    <operator activated="true" class="text:stem_snowball" compatibility="5.3.002"
              expanded="true" height="60" name="Stem (2)" width="90" x="380" y="30"/>
    <operator activated="true" class="text:transform_cases" compatibility="5.3.002"
              expanded="true" height="60" name="Transform Cases (2)" width="90" x="648" y="30"/>
    <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
    <connect from_op="Stem (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
    <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Apply Model step; no extra application parameters are configured. -->
<operator activated="true" class="apply_model" compatibility="5.3.015"
          expanded="true" height="76" name="TestModelApplier" width="90" x="380" y="120">
  <list key="application_parameters"/>
</operator>
<!-- Find Threshold step. The cost for misclassifying the first class is set to
     1.335 and example weights are switched off; nothing else is configured here.
     NOTE(review): in this process the threshold is searched on the same scored
     test data that is evaluated afterwards; confirm this is acceptable for the
     intended comparison. -->
<operator activated="true" class="find_threshold" compatibility="5.3.015"
          expanded="true" height="76" name="ThresholdFinder" width="90" x="514" y="120">
  <parameter key="misclassification_costs_first" value="1.335"/>
  <parameter key="use_example_weights" value="false"/>
</operator>
<!-- Apply Threshold step; it has no parameters of its own in this process. -->
<operator activated="true" class="apply_threshold" compatibility="5.3.015"
          expanded="true" height="76" name="ThresholdApplier" width="90" x="648" y="120"/>
<!-- Performance (Classification) step with the criteria below switched on.
     NOTE(review): no per-class precision or recall criterion is among the enabled
     parameters (only the weighted means are); a breakeven comparison would need
     those two values, so confirm where they are read from. -->
<operator activated="true" class="performance_classification" compatibility="5.3.015"
          expanded="true" height="76" name="Performance (3)" width="90" x="782" y="120">
  <!-- Classification-oriented criteria. -->
  <parameter key="classification_error" value="true"/>
  <parameter key="kappa" value="true"/>
  <parameter key="weighted_mean_recall" value="true"/>
  <parameter key="weighted_mean_precision" value="true"/>
  <!-- Rank correlation criteria. -->
  <parameter key="spearman_rho" value="true"/>
  <parameter key="kendall_tau" value="true"/>
  <!-- Error and correlation criteria. -->
  <parameter key="absolute_error" value="true"/>
  <parameter key="relative_error" value="true"/>
  <parameter key="relative_error_lenient" value="true"/>
  <parameter key="relative_error_strict" value="true"/>
  <parameter key="normalized_absolute_error" value="true"/>
  <parameter key="root_mean_squared_error" value="true"/>
  <parameter key="root_relative_squared_error" value="true"/>
  <parameter key="squared_error" value="true"/>
  <parameter key="correlation" value="true"/>
  <parameter key="squared_correlation" value="true"/>
  <!-- Confidence-based loss criteria. -->
  <parameter key="cross-entropy" value="true"/>
  <parameter key="margin" value="true"/>
  <parameter key="soft_margin_loss" value="true"/>
  <parameter key="logistic_loss" value="true"/>
  <!-- The class weight list is empty. -->
  <list key="class_weights"/>
</operator>
<!-- Wiring of the top-level process. -->
<!-- Training: the training example set feeds the SVM; the training word list is
     handed to the test branch. -->
<connect from_op="ProcessTrainingSet" from_port="example set" to_op="SVM (2)" to_port="training set"/>
<connect from_op="ProcessTrainingSet" from_port="word list" to_op="ProcessTestSet" to_port="word list"/>
<!-- Scoring: the trained model and the test example set both go into TestModelApplier. -->
<connect from_op="SVM (2)" from_port="model" to_op="TestModelApplier" to_port="model"/>
<connect from_op="ProcessTestSet" from_port="example set" to_op="TestModelApplier" to_port="unlabelled data"/>
<!-- Thresholding: ThresholdFinder passes both the example set and the threshold it
     found to ThresholdApplier. -->
<connect from_op="TestModelApplier" from_port="labelled data" to_op="ThresholdFinder" to_port="example set"/>
<connect from_op="ThresholdFinder" from_port="example set" to_op="ThresholdApplier" to_port="example set"/>
<connect from_op="ThresholdFinder" from_port="threshold" to_op="ThresholdApplier" to_port="threshold"/>
<!-- Evaluation: the thresholded example set is measured and the performance is the
     only process result. -->
<connect from_op="ThresholdApplier" from_port="example set" to_op="Performance (3)" to_port="labelled data"/>
<connect from_op="Performance (3)" from_port="performance" to_port="result 1"/>
<!-- GUI layout hints for the process ports. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="36"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>