Hi - I've noticed a handful of related threads on this topic, but no great solutions that will work with the problem I'm facing...
I have used RapidMiner + WVTool to build a text-mining model. When I use ModelApplier to apply that model to a new data set, it seems to somehow misinterpret my original labels.
After several failed attempts to get a correct validation of my model using ModelApplier, I finally tested this out by building a model and then applying that same model to the *identical* data set, merely sorted in a different order. When I do this, it returns a file that has somehow flipped the values in the label field. So clearly something is going wrong either with my code or with the ModelApplier operator.
Here are some threads that sound related...
http://rapid-i.com/rapidforum/index.php/topic,776.0.html
http://rapid-i.com/rapidforum/index.php/topic,281.0.html
http://rapid-i.com/rapidforum/index.php/topic,319.0.html
Here's my code...
<?xml version="1.0" encoding="windows-1252"?>
<process version="4.4">
<operator name="Root" class="Process" expanded="yes">
<parameter key="logverbosity" value="init"/>
<parameter key="logfile" value="OUT_%{process_name}_RootLog0.log"/>
<parameter key="resultfile" value="OUT_%{process_name}_RootResults0.res"/>
<parameter key="random_seed" value="2001"/>
<parameter key="encoding" value="SYSTEM"/>
<!-- First step of the Root chain: a MemoryCleanUp operator with no parameters set. -->
<operator name="MemoryCleanUp_START" class="MemoryCleanUp"/>
<!--
  Model-development data source: reads sheet 1 of the Excel workbook named
  after the process, taking attribute names from the first row, the label
  from column 1 and the id from column 3.
-->
<operator name="ExcelExampleSource_ModDev" class="ExcelExampleSource">
  <parameter key="excel_file" value="%{process_name}.xls"/>
  <parameter key="sheet_number" value="1"/>
  <parameter key="row_offset" value="0"/>
  <parameter key="column_offset" value="0"/>
  <parameter key="first_row_as_names" value="true"/>
  <parameter key="create_label" value="true"/>
  <parameter key="label_column" value="1"/>
  <parameter key="create_id" value="true"/>
  <parameter key="id_column" value="3"/>
  <parameter key="decimal_point_character" value="."/>
  <parameter key="datamanagement" value="double_array"/>
</operator>
<!-- Nominal2String step on the model-development data; runs with no parameters set. -->
<operator name="Nominal2String_ModDev" class="Nominal2String"/>
<!--
  Text vectorisation for the model-development data.
  Produces TermFrequency vectors, uses prune_below=50, and saves the word
  list it used to OUT_%{process_name}_Words_ModDev.txt.
  Inner chain: tokenise, keep tokens of at least 4 characters, Lovins
  stemming, stopword filtering from _STOPWORDS.txt, then term n-grams up to
  length 4.
-->
<operator name="StringTextInput_ModDev" class="StringTextInput" expanded="no">
  <parameter key="filter_nominal_attributes" value="false"/>
  <parameter key="remove_original_attributes" value="true"/>
  <parameter key="default_content_type" value=""/>
  <parameter key="default_content_encoding" value=""/>
  <parameter key="default_content_language" value=""/>
  <parameter key="prune_below" value="50"/>
  <parameter key="prune_above" value="-1"/>
  <parameter key="vector_creation" value="TermFrequency"/>
  <parameter key="use_content_attributes" value="false"/>
  <parameter key="use_given_word_list" value="false"/>
  <parameter key="return_word_list" value="false"/>
  <parameter key="output_word_list" value="OUT_%{process_name}_Words_ModDev.txt"/>
  <parameter key="id_attribute_type" value="number"/>
  <list key="namespaces"/>
  <parameter key="create_text_visualizer" value="false"/>
  <parameter key="on_the_fly_pruning" value="-1"/>

  <!-- Token-processing chain applied to every text. -->
  <operator name="StringTokenizer" class="StringTokenizer"/>
  <operator name="TokenLengthFilter" class="TokenLengthFilter">
    <parameter key="min_chars" value="4"/>
    <parameter key="max_chars" value="2147483647"/>
  </operator>
  <operator name="LovinsStemmer" class="LovinsStemmer"/>
  <operator name="StopwordFilterFile" class="StopwordFilterFile">
    <parameter key="file" value="_STOPWORDS.txt"/>
    <parameter key="case_sensitive" value="false"/>
  </operator>
  <operator name="TermNGramGenerator" class="TermNGramGenerator">
    <parameter key="max_length" value="4"/>
  </operator>
</operator>
<!--
  Writes the vectorised model-development example set in dense format,
  together with its attribute description (.aml) file.
-->
<operator name="ExampleSetWriter_ModDev" class="ExampleSetWriter">
  <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModDevInput.dat"/>
  <parameter key="attribute_description_file" value="OUT_%{process_name}_AttDescFile_ModDevInput.aml"/>
  <parameter key="format" value="dense"/>
  <parameter key="fraction_digits" value="-1"/>
  <parameter key="quote_nominal_values" value="false"/>
  <parameter key="zipped" value="false"/>
  <parameter key="overwrite_mode" value="overwrite first, append then"/>
</operator>
<!-- Second MemoryCleanUp step, placed just before cross-validation; no parameters set. -->
<operator name="MemoryCleanUp_02" class="MemoryCleanUp"/>
<!--
  10-fold cross-validation with stratified sampling.
  create_complete_model=true and keep_example_set=true are set, so the
  operators after this one can rely on a model and the example set.
  First child: the learner. Second child: the evaluation chain.
-->
<operator name="XValidation" class="XValidation" expanded="yes">
  <parameter key="keep_example_set" value="true"/>
  <parameter key="create_complete_model" value="true"/>
  <parameter key="average_performances_only" value="true"/>
  <parameter key="leave_one_out" value="false"/>
  <parameter key="number_of_validations" value="10"/>
  <parameter key="sampling_type" value="stratified sampling"/>
  <parameter key="local_random_seed" value="-1"/>

  <!-- Learner: LibSVM C-SVC with a linear kernel, confidences enabled. -->
  <operator name="LibSVMLearner" class="LibSVMLearner">
    <parameter key="keep_example_set" value="true"/>
    <parameter key="svm_type" value="C-SVC"/>
    <parameter key="kernel_type" value="linear"/>
    <parameter key="degree" value="1"/>
    <parameter key="gamma" value="0.0"/>
    <parameter key="coef0" value="0.0"/>
    <parameter key="C" value="0.0"/>
    <parameter key="nu" value="0.5"/>
    <parameter key="cache_size" value="80"/>
    <parameter key="epsilon" value="0.0010"/>
    <parameter key="p" value="0.1"/>
    <list key="class_weights"/>
    <parameter key="shrinking" value="true"/>
    <parameter key="calculate_confidences" value="true"/>
    <parameter key="confidence_for_multiclass" value="true"/>
  </operator>

  <!--
    Evaluation chain: apply the model, compute binominal performance
    criteria (main criterion AUC), then write the scored examples, the
    performance, the results and the model to files.
  -->
  <operator name="OperatorChain" class="OperatorChain" expanded="no">
    <!-- keep_model=true: the model stays available for ModelWriter1 below. -->
    <operator name="ModelApplier" class="ModelApplier">
      <parameter key="keep_model" value="true"/>
      <list key="application_parameters"/>
      <parameter key="create_view" value="false"/>
    </operator>

    <operator name="BinominalClassificationPerformance" class="BinominalClassificationPerformance">
      <parameter key="keep_example_set" value="true"/>
      <parameter key="main_criterion" value="AUC"/>
      <parameter key="AUC" value="true"/>
      <parameter key="precision" value="true"/>
      <parameter key="recall" value="true"/>
      <parameter key="lift" value="true"/>
      <parameter key="fallout" value="true"/>
      <parameter key="f_measure" value="true"/>
      <parameter key="false_positive" value="true"/>
      <parameter key="false_negative" value="true"/>
      <parameter key="true_positive" value="true"/>
      <parameter key="true_negative" value="true"/>
      <parameter key="sensitivity" value="true"/>
      <parameter key="specificity" value="true"/>
      <parameter key="youden" value="true"/>
      <parameter key="positive_predictive_value" value="true"/>
      <parameter key="negative_predictive_value" value="true"/>
      <parameter key="psep" value="true"/>
      <parameter key="skip_undefined_labels" value="true"/>
      <parameter key="use_example_weights" value="true"/>
    </operator>

    <!-- Scored examples, written with the special format "$i $l $p $d". -->
    <operator name="ECS_ModelResults" class="ExampleSetWriter">
      <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModDevOutput_LiftCurve.dat"/>
      <parameter key="format" value="special_format"/>
      <parameter key="special_format" value="$i $l $p $d"/>
      <parameter key="fraction_digits" value="-1"/>
      <parameter key="quote_nominal_values" value="true"/>
      <parameter key="zipped" value="false"/>
      <parameter key="overwrite_mode" value="overwrite first, append then"/>
    </operator>

    <operator name="PerformanceWriter" class="PerformanceWriter">
      <parameter key="performance_file" value="OUT_%{process_name}_Perf_ModDevOutput.per"/>
    </operator>

    <operator name="ResultWriter" class="ResultWriter">
      <parameter key="result_file" value="OUT_%{process_name}_Results_ModDevOutput.res"/>
    </operator>

    <!-- NOTE(review): overwrite_existing_file=true, so presumably only the last model written survives; confirm. -->
    <operator name="ModelWriter1" class="ModelWriter">
      <parameter key="model_file" value="OUT_%{process_name}_Model_ModDevOutput1.mod"/>
      <parameter key="overwrite_existing_file" value="true"/>
      <parameter key="output_type" value="XML"/>
    </operator>
  </operator>
</operator>
<!-- Writes the model available after XValidation to an XML model file, overwriting any existing file. -->
<operator name="ModelWriter2" class="ModelWriter">
  <parameter key="model_file" value="OUT_%{process_name}_Model_ModDevOutput2.mod"/>
  <parameter key="overwrite_existing_file" value="true"/>
  <parameter key="output_type" value="XML"/>
</operator>
<!--
  Model-validation data source: reads sheet 1 of a workbook given by an
  absolute path, with the same layout settings as the model-development
  source (names in the first row, label column 1, id column 3).
-->
<operator name="ExcelExampleSource_ModVal" class="ExcelExampleSource">
  <parameter key="excel_file" value="C:\_20090403_NPSr_Dec08_KWA\_20090403_NPSr_Dec08.xls"/>
  <parameter key="sheet_number" value="1"/>
  <parameter key="row_offset" value="0"/>
  <parameter key="column_offset" value="0"/>
  <parameter key="first_row_as_names" value="true"/>
  <parameter key="create_label" value="true"/>
  <parameter key="label_column" value="1"/>
  <parameter key="create_id" value="true"/>
  <parameter key="id_column" value="3"/>
  <parameter key="decimal_point_character" value="."/>
  <parameter key="datamanagement" value="double_array"/>
</operator>
<!-- Nominal2String step on the model-validation data; runs with no parameters set. -->
<operator name="Nominal2String_ModVal" class="Nominal2String"/>
<!--
  Text vectorisation for the model-validation data.

  The token-processing chain matches the one used for model development
  (tokenise, min 4 chars, Lovins stemmer, stopword file, n-grams up to 4).

  Fix: this operator previously built its own word list from the validation
  texts (with prune_below=1, versus prune_below=50 during development), so
  the attributes handed to ModelApplier_ModVal were not the ones the model
  was trained on. It now loads the word list that StringTextInput_ModDev
  saves through its output_word_list parameter, so both example sets share
  one vocabulary.

  NOTE(review): input_word_list is the Text plugin's file-based word list
  parameter as far as I recall; confirm the name against the installed
  plugin version. With a loaded word list, prune_below / prune_above are
  presumably not applied; they are left unchanged.
-->
<operator name="StringTextInput_ModVal" class="StringTextInput" expanded="yes">
  <parameter key="filter_nominal_attributes" value="false"/>
  <parameter key="remove_original_attributes" value="true"/>
  <parameter key="default_content_type" value=""/>
  <parameter key="default_content_encoding" value=""/>
  <parameter key="default_content_language" value=""/>
  <parameter key="prune_below" value="1"/>
  <parameter key="prune_above" value="-1"/>
  <parameter key="vector_creation" value="TermFrequency"/>
  <parameter key="use_content_attributes" value="false"/>
  <parameter key="use_given_word_list" value="false"/>
  <!-- Reuse the vocabulary written by the model-development vectoriser. -->
  <parameter key="input_word_list" value="OUT_%{process_name}_Words_ModDev.txt"/>
  <parameter key="return_word_list" value="false"/>
  <parameter key="output_word_list" value="OUT_%{process_name}_Words_ModVal.txt"/>
  <parameter key="id_attribute_type" value="number"/>
  <list key="namespaces"/>
  <parameter key="create_text_visualizer" value="false"/>
  <parameter key="on_the_fly_pruning" value="-1"/>

  <!-- Token-processing chain applied to every text. -->
  <operator name="StringTokenizer (2)" class="StringTokenizer"/>
  <operator name="TokenLengthFilter (2)" class="TokenLengthFilter">
    <parameter key="min_chars" value="4"/>
    <parameter key="max_chars" value="2147483647"/>
  </operator>
  <operator name="LovinsStemmer (2)" class="LovinsStemmer"/>
  <operator name="StopwordFilterFile (2)" class="StopwordFilterFile">
    <parameter key="file" value="C:\_04_NPS_ModelVal\_STOPWORDS.txt"/>
    <parameter key="case_sensitive" value="false"/>
  </operator>
  <operator name="TermNGramGenerator (2)" class="TermNGramGenerator">
    <parameter key="max_length" value="4"/>
  </operator>
</operator>
<!--
  Writes the vectorised model-validation example set in dense format,
  together with its attribute description (.aml) file, before the model is
  applied.
-->
<operator name="ExampleSetWriter_ModValInput" class="ExampleSetWriter">
  <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModValInput.dat"/>
  <parameter key="attribute_description_file" value="OUT_%{process_name}_AttDescFile_ModVal.aml"/>
  <parameter key="format" value="dense"/>
  <parameter key="fraction_digits" value="-1"/>
  <parameter key="quote_nominal_values" value="false"/>
  <parameter key="zipped" value="false"/>
  <parameter key="overwrite_mode" value="overwrite first, append then"/>
</operator>
<!--
  Applies the model to the model-validation example set.
  keep_model=false here, unlike the applier inside XValidation.
-->
<operator name="ModelApplier_ModVal" class="ModelApplier">
  <parameter key="keep_model" value="false"/>
  <list key="application_parameters"/>
  <parameter key="create_view" value="false"/>
</operator>
<!--
  Writes the scored model-validation examples with the special format
  "$i $l $p $d", the same format string that ECS_ModelResults uses inside
  the cross-validation.
-->
<operator name="ExampleSetWriter_ModVal" class="ExampleSetWriter">
  <parameter key="example_set_file" value="OUT_%{process_name}_ExampleSetFile_ModValOutput_LiftCurve.dat"/>
  <parameter key="format" value="special_format"/>
  <parameter key="special_format" value="$i $l $p $d"/>
  <parameter key="fraction_digits" value="-1"/>
  <parameter key="quote_nominal_values" value="true"/>
  <parameter key="zipped" value="false"/>
  <parameter key="overwrite_mode" value="overwrite first, append then"/>
</operator>
</operator>
</process>