SVD produces strange results when Xvalidation folds greater than 2
dbrown
New Altair Community Member
Hi,
I am working on a text classification problem where I am using an SVD to perform dimensionality reduction. I am using a XValidation loop where I apply the SVD to the training data and select "return preprocessing model" in order to apply the same SVD model to the test data.
When I use XValidation with # of validations = 2, I obtain "good" results, with a binary classification performance of roughly 95% for both categories. However, whenever I increase # of validations beyond 2 the performance is terrible; with # of validations = 3 the performance drops to about 50%. The choice of number of folds shouldn't have this much of an effect, and when I perform other text processing tasks with RM I have never experienced this. It is only when I have attempted to use the SVD that I have had this problem.
Is my implementation below correct (i.e., is the test set being mapped to the same reduced-dimension space as the training set)?
If so, any ideas why the # of validations has such a huge impact here?
Thanks,
David
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process (version 5.0): text classification with SVD dimensionality
  reduction evaluated inside a cross-validation.
  Flow: Text Input -> Validation (training side: SVD + SVM; testing side:
  apply both models, then measure performance).
-->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <process expanded="true" height="415" width="415">
      <!-- Reads documents from the two directories keyed C1 and C2; the inner
           process tokenizes them and filters English stopwords. This operator
           runs once, before (and outside of) the validation. -->
      <operator activated="true" class="text:process_document_from_file" expanded="true" height="76" name="Text Input" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="C1" value="C:\My Documents\Research\Corpus1"/>
          <parameter key="C2" value="C:\My Documents\Research\Corpus2"/>
        </list>
        <process expanded="true" height="401" width="437">
          <operator activated="true" class="text:tokenize" expanded="true" height="60" name="Tokenize" width="90" x="78" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="246" y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Cross-validation with 2 folds and a fixed local random seed of 1. -->
      <operator activated="true" class="x_validation" expanded="true" height="112" name="Validation" width="90" x="179" y="30">
        <parameter key="number_of_validations" value="2"/>
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="1"/>
        <!-- Training side: the SVD (100 dimensions) receives the training
             data; its example set output trains the linear SVM, and its
             preprocessing model is handed to the testing side via
             "through 1". -->
        <process expanded="true" height="383" width="346">
          <operator activated="true" class="singular_value_decomposition" expanded="true" height="94" name="SVD" width="90" x="45" y="30">
            <parameter key="return_preprocessing_model" value="true"/>
            <parameter key="dimensions" value="100"/>
          </operator>
          <operator activated="true" class="support_vector_machine_libsvm" expanded="true" height="76" name="SVM" width="90" x="179" y="120">
            <parameter key="kernel_type" value="linear"/>
            <list key="class_weights"/>
            <parameter key="shrinking" value="false"/>
            <parameter key="confidence_for_multiclass" value="false"/>
          </operator>
          <connect from_port="training" to_op="SVD" to_port="example set input"/>
          <connect from_op="SVD" from_port="example set output" to_op="SVM" to_port="training set"/>
          <connect from_op="SVD" from_port="preprocessing model" to_port="through 1"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
          <portSpacing port="sink_through 2" spacing="0"/>
        </process>
        <!-- Testing side: "through 1" (the SVD preprocessing model) feeds the
             operator named "Apply SVM Model", so despite its name that
             operator applies the SVD model to the test set. Its output then
             goes to "Apply ML Model", which receives the SVM from the
             "model" port, and on to the performance measurement. -->
        <process expanded="true" height="415" width="212">
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply SVM Model" width="90" x="45" y="120">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply ML Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_binominal_classification" expanded="true" height="76" name="Performance" width="90" x="45" y="300">
            <parameter key="precision" value="true"/>
            <parameter key="recall" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply ML Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply SVM Model" to_port="unlabelled data"/>
          <connect from_port="through 1" to_op="Apply SVM Model" to_port="model"/>
          <connect from_op="Apply SVM Model" from_port="labelled data" to_op="Apply ML Model" to_port="unlabelled data"/>
          <connect from_op="Apply ML Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="source_through 2" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Text Input" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
I am working on a text classification problem where I am using an SVD to perform dimensionality reduction. I am using a XValidation loop where I apply the SVD to the training data and select "return preprocessing model" in order to apply the same SVD model to the test data.
When I use XValidation with # of validations = 2, I obtain "good" results, with a binary classification performance of roughly 95% for both categories. However, whenever I increase # of validations beyond 2 the performance is terrible; with # of validations = 3 the performance drops to about 50%. The choice of number of folds shouldn't have this much of an effect, and when I perform other text processing tasks with RM I have never experienced this. It is only when I have attempted to use the SVD that I have had this problem.
Is my implementation below correct (i.e., is the test set being mapped to the same reduced-dimension space as the training set)?
If so, any ideas why the # of validations has such a huge impact here?
Thanks,
David
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process (version 5.0): text classification with SVD dimensionality
  reduction evaluated inside a cross-validation.
  Flow: Text Input -> Validation (training side: SVD + SVM; testing side:
  apply both models, then measure performance).
-->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <process expanded="true" height="415" width="415">
      <!-- Reads documents from the two directories keyed C1 and C2; the inner
           process tokenizes them and filters English stopwords. This operator
           runs once, before (and outside of) the validation. -->
      <operator activated="true" class="text:process_document_from_file" expanded="true" height="76" name="Text Input" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="C1" value="C:\My Documents\Research\Corpus1"/>
          <parameter key="C2" value="C:\My Documents\Research\Corpus2"/>
        </list>
        <process expanded="true" height="401" width="437">
          <operator activated="true" class="text:tokenize" expanded="true" height="60" name="Tokenize" width="90" x="78" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="246" y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Cross-validation with 2 folds and a fixed local random seed of 1. -->
      <operator activated="true" class="x_validation" expanded="true" height="112" name="Validation" width="90" x="179" y="30">
        <parameter key="number_of_validations" value="2"/>
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="1"/>
        <!-- Training side: the SVD (100 dimensions) receives the training
             data; its example set output trains the linear SVM, and its
             preprocessing model is handed to the testing side via
             "through 1". -->
        <process expanded="true" height="383" width="346">
          <operator activated="true" class="singular_value_decomposition" expanded="true" height="94" name="SVD" width="90" x="45" y="30">
            <parameter key="return_preprocessing_model" value="true"/>
            <parameter key="dimensions" value="100"/>
          </operator>
          <operator activated="true" class="support_vector_machine_libsvm" expanded="true" height="76" name="SVM" width="90" x="179" y="120">
            <parameter key="kernel_type" value="linear"/>
            <list key="class_weights"/>
            <parameter key="shrinking" value="false"/>
            <parameter key="confidence_for_multiclass" value="false"/>
          </operator>
          <connect from_port="training" to_op="SVD" to_port="example set input"/>
          <connect from_op="SVD" from_port="example set output" to_op="SVM" to_port="training set"/>
          <connect from_op="SVD" from_port="preprocessing model" to_port="through 1"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
          <portSpacing port="sink_through 2" spacing="0"/>
        </process>
        <!-- Testing side: "through 1" (the SVD preprocessing model) feeds the
             operator named "Apply SVM Model", so despite its name that
             operator applies the SVD model to the test set. Its output then
             goes to "Apply ML Model", which receives the SVM from the
             "model" port, and on to the performance measurement. -->
        <process expanded="true" height="415" width="212">
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply SVM Model" width="90" x="45" y="120">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply ML Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_binominal_classification" expanded="true" height="76" name="Performance" width="90" x="45" y="300">
            <parameter key="precision" value="true"/>
            <parameter key="recall" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply ML Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply SVM Model" to_port="unlabelled data"/>
          <connect from_port="through 1" to_op="Apply SVM Model" to_port="model"/>
          <connect from_op="Apply SVM Model" from_port="labelled data" to_op="Apply ML Model" to_port="unlabelled data"/>
          <connect from_op="Apply ML Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="source_through 2" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Text Input" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:
0
Answers
-
Hi,
Apart from the fact that you mislabeled the first model applier as SVM instead of SVD, I don't see any obvious errors.
But to achieve realistic results, you must include the processing of the documents in the XValidation. Otherwise your learner will always have information about the complete text set through the tfidf representation and the number of attributes.
You might do this by building an example set first, containing only two attributes, one with the text, one with the label. I will modify the process accordingly.
Please vary the local random seed of the XValidation to test whether the good results on 2 folds are a random effect.
If this does not apply, then it appears to me that the performance of 50% is the realistic one, and not the good 95% estimate.
Greetings,
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
  <!--
    Revised process: the document processing (tokenizing, stopword filtering,
    word vector creation) is moved inside the validation, so it runs on the
    training data of each fold rather than on the complete text set.
  -->
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <process expanded="true" height="415" width="547">
      <!-- Only loads the documents here: word vector creation is switched
           off, the raw text is kept, and the inner process passes each
           document through unchanged. -->
      <operator activated="true" class="text:process_document_from_file" expanded="true" height="76" name="Text Input" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="C1" value="C:\My Documents\Research\Corpus1"/>
          <parameter key="C2" value="C:\My Documents\Research\Corpus2"/>
        </list>
        <parameter key="create_word_vector" value="false"/>
        <parameter key="keep_text" value="true"/>
        <process expanded="true" height="401" width="437">
          <connect from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Reduces the example set to the attribute subset "text|label". -->
      <operator activated="true" class="select_attributes" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="30">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="text|label"/>
      </operator>
      <!-- Cross-validation with 2 folds and a fixed local random seed of 1. -->
      <operator activated="true" class="x_validation" expanded="true" height="112" name="Validation" width="90" x="380" y="30">
        <parameter key="number_of_validations" value="2"/>
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="1"/>
        <!-- Training side: documents are processed from the training data,
             then SVD (100 dimensions) and a linear SVM follow. Two objects
             are handed to the testing side: the SVD preprocessing model via
             "through 1" and the word list via "through 2". -->
        <process expanded="true" height="610" width="435">
          <operator activated="true" class="text:process_document_from_data" expanded="true" height="76" name="Process Documents from Data" width="90" x="45" y="30">
            <list key="specify_weights"/>
            <process expanded="true" height="610" width="300">
              <operator activated="true" class="text:tokenize" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
              <operator activated="true" class="text:filter_stopwords_english" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="180" y="30"/>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="singular_value_decomposition" expanded="true" height="94" name="SVD" width="90" x="180" y="30">
            <parameter key="return_preprocessing_model" value="true"/>
            <parameter key="dimensions" value="100"/>
          </operator>
          <operator activated="true" class="support_vector_machine_libsvm" expanded="true" height="76" name="SVM" width="90" x="315" y="30">
            <parameter key="kernel_type" value="linear"/>
            <list key="class_weights"/>
            <parameter key="shrinking" value="false"/>
            <parameter key="confidence_for_multiclass" value="false"/>
          </operator>
          <connect from_port="training" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="SVD" to_port="example set input"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="through 2"/>
          <connect from_op="SVD" from_port="example set output" to_op="SVM" to_port="training set"/>
          <connect from_op="SVD" from_port="preprocessing model" to_port="through 1"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
          <portSpacing port="sink_through 2" spacing="0"/>
          <portSpacing port="sink_through 3" spacing="0"/>
        </process>
        <!-- Testing side: the test set is processed with the word list that
             arrives on "through 2"; the result goes through "Apply SVD Model"
             (model from "through 1") and then "Apply ML Model" (the SVM from
             the "model" port) before the performance is measured. -->
        <process expanded="true" height="610" width="435">
          <operator activated="true" class="text:process_document_from_data" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="45" y="30">
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" expanded="true" name="Tokenize (2)"/>
              <operator activated="true" class="text:filter_stopwords_english" expanded="true" name="Filter Stopwords (2)"/>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
              <connect from_op="Filter Stopwords (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply SVD Model" width="90" x="44" y="165">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply ML Model" width="90" x="179" y="165">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_binominal_classification" expanded="true" height="76" name="Performance" width="90" x="45" y="300">
            <parameter key="precision" value="true"/>
            <parameter key="recall" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply ML Model" to_port="model"/>
          <connect from_port="test set" to_op="Process Documents from Data (2)" to_port="example set"/>
          <connect from_port="through 1" to_op="Apply SVD Model" to_port="model"/>
          <connect from_port="through 2" to_op="Process Documents from Data (2)" to_port="word list"/>
          <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply SVD Model" to_port="unlabelled data"/>
          <connect from_op="Apply SVD Model" from_port="labelled data" to_op="Apply ML Model" to_port="unlabelled data"/>
          <connect from_op="Apply ML Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="source_through 2" spacing="0"/>
          <portSpacing port="source_through 3" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Text Input" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Sebastian
0 -
Thanks for the feedback Sebastian. I will try what you recommend and post if I have further questions. I agree that processing the tfidf outside of the XValidation biases the results somewhat and was just doing that to keep things simple and get a sanity check on the SVD before plunging in. But thanks for the modified code!
0