One-class SVM for text classification
I am classifying several examples (each example is a small text) using the one-class SVM. I get an error that I cannot fix: "Needs a nominal label with 2 or more values". I tried setting the role for the label, changing the label to numerical, and using the meta operator "Polynomial to binominal". Nothing worked. In the last case I got a message that there is no obvious mistake, but the program still does not give me any results. There were some posts here related to this topic, but there is still no correct answer. I would appreciate any help. Here is my original XML; it works for LibSVM in the multi-class case:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
    <process expanded="true">
      <!-- Reads the documents of the single directory entry "En1" and keeps the raw text. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Categories" width="90" x="112" y="75">
        <list key="text_directories">
          <parameter key="En1" value="D:\Categories\En"/>
        </list>
        <parameter key="keep_text" value="true"/>
        <!-- Per-document chain: tokenize, drop English stopwords, Porter stemming, term n-grams with max_length 4. -->
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" name="Tokenize"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" name="Filter Stopwords (English)"/>
          <operator activated="true" class="text:stem_porter" compatibility="5.3.002" expanded="true" name="Stem (Porter)"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.3.002" expanded="true" name="Generate n-Grams (Terms)">
            <parameter key="max_length" value="4"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Cross-validation: first subprocess trains, second subprocess applies the model and measures performance. -->
      <operator activated="true" class="x_validation" compatibility="5.3.013" expanded="true" height="112" name="Validation" width="90" x="313" y="75">
        <!-- Training: LibSVM with svm_type set to one-class. -->
        <process expanded="true">
          <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.3.013" expanded="true" height="76" name="SVM" width="90" x="110" y="30">
            <parameter key="svm_type" value="one-class"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="training" to_op="SVM" to_port="training set"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing: apply the model to the test set, then compute classification performance. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance_classification" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="198" y="30">
            <parameter key="classification_error" value="true"/>
            <parameter key="weighted_mean_recall" value="true"/>
            <parameter key="weighted_mean_precision" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Categories" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="training" to_port="result 2"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
Best Answer
-
Hi
I've checked it again. It is the other way around: you need to have a nominal column with only one class as the label. See the attached process.
~Martin
Spoiler
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.1.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process">
  <!-- Minimal one-class SVM example on the Golf sample data: replace the original label
       with a single-valued nominal label, then train LibSVM in one-class mode. -->
  <process expanded="true">
    <operator activated="true" class="retrieve" compatibility="7.1.001" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="34">
      <parameter key="repository_entry" value="//Samples/data/Golf"/>
    </operator>
    <operator activated="true" class="nominal_to_numerical" compatibility="7.1.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="179" y="34">
      <list key="comparison_groups"/>
    </operator>
    <!-- Inverted single-attribute selection: drops "Play" (special attributes included). -->
    <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="34">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="Play"/>
      <parameter key="invert_selection" value="true"/>
      <parameter key="include_special_attributes" value="true"/>
    </operator>
    <!-- Creates the attribute "label" from a constant string expression. The double quotes of
         the string literal must be written as &quot; inside the attribute value; raw quotes
         there end the attribute early and make the XML not well-formed. -->
    <operator activated="true" class="generate_attributes" compatibility="7.1.001" expanded="true" height="82" name="Generate Attributes" width="90" x="447" y="34">
      <list key="function_descriptions">
        <parameter key="label" value="&quot;a label&quot;"/>
      </list>
    </operator>
    <!-- Gives the generated attribute the label role. -->
    <operator activated="true" class="set_role" compatibility="7.1.001" expanded="true" height="82" name="Set Role" width="90" x="581" y="34">
      <parameter key="attribute_name" value="label"/>
      <parameter key="target_role" value="label"/>
      <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="support_vector_machine_libsvm" compatibility="7.1.001" expanded="true" height="82" name="SVM" width="90" x="782" y="34">
      <parameter key="svm_type" value="one-class"/>
      <list key="class_weights"/>
    </operator>
    <connect from_op="Retrieve Golf" from_port="output" to_op="Nominal to Numerical" to_port="example set input"/>
    <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
    <connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="SVM" to_port="training set"/>
    <connect from_op="SVM" from_port="model" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
  </process>
</operator>
</process>
0
Answers
-
hello @In777
Not sure if this solves your issue, but a typical problem when doing type conversion on a column that has a special role (e.g. columns like labels, weights, etc.) is that folks do not check the "include special attributes" checkbox.
Can you confirm whether that was the case?
0 -
Dear @In777,
Thanks for reaching out. The LibSVM one-class SVM is sometimes a bit confusing. You actually need to have a binary label to do it. The second class needs to have 0 examples.
~Martin
0 -
I do not think that the problem is with the transformation. I modified my workflow, but it still does not work. I got a message about a bug report. Here is my XML:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.1.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
  <!-- Text classification workflow: build a term vector from two document directories,
       then cross-validate a one-class LibSVM trained on the "Energy" examples only. -->
  <process expanded="true">
    <operator activated="true" class="text:process_document_from_file" compatibility="7.1.001" expanded="true" height="82" name="Process Categories" width="90" x="112" y="85">
      <list key="text_directories">
        <parameter key="Energy" value="D:\Categories\Energy"/>
        <parameter key="Education" value="D:\Categories\Education"/>
      </list>
      <parameter key="keep_text" value="true"/>
      <!-- Per-document chain: tokenize, drop English stopwords, Porter stemming, term n-grams with max_length 4. -->
      <process expanded="true">
        <operator activated="true" class="text:tokenize" compatibility="7.1.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
        <operator activated="true" class="text:filter_stopwords_english" compatibility="7.1.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="179" y="34"/>
        <operator activated="true" class="text:stem_porter" compatibility="7.1.001" expanded="true" height="68" name="Stem (Porter)" width="90" x="313" y="34"/>
        <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.1.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="447" y="34">
          <parameter key="max_length" value="4"/>
        </operator>
        <connect from_port="document" to_op="Tokenize" to_port="document"/>
        <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
        <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
        <connect from_op="Stem (Porter)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
        <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
        <portSpacing port="source_document" spacing="0"/>
        <portSpacing port="sink_document 1" spacing="0"/>
        <portSpacing port="sink_document 2" spacing="0"/>
      </process>
    </operator>
    <operator activated="true" class="x_validation" compatibility="7.1.001" expanded="true" height="124" name="Validation" width="90" x="380" y="85">
      <parameter key="number_of_validations" value="3"/>
      <!-- Training subprocess: keep label=Energy rows, drop the old label, regenerate a
           constant "Energy" label, assign the label role, then train the one-class SVM.
           NOTE(review): this filtering and relabeling happens only here; the test set reaches
           Apply Model unchanged. Confirm that this is intended. -->
      <process expanded="true">
        <operator activated="true" class="filter_examples" compatibility="6.4.000" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="34">
          <parameter key="parameter_string" value="label=Energy"/>
          <parameter key="condition_class" value="attribute_value_filter"/>
          <list key="filters_list"/>
        </operator>
        <!-- Inverted single-attribute selection: drops "label" (special attributes included). -->
        <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="180" y="30">
          <parameter key="attribute_filter_type" value="single"/>
          <parameter key="attribute" value="label"/>
          <parameter key="invert_selection" value="true"/>
          <parameter key="include_special_attributes" value="true"/>
        </operator>
        <!-- Constant string expression for the new label. The double quotes of the string
             literal must be written as &quot; inside the attribute value; raw quotes there
             end the attribute early and make the XML not well-formed. -->
        <operator activated="true" class="generate_attributes" compatibility="6.4.000" expanded="true" height="82" name="Generate Attributes" width="90" x="45" y="187">
          <list key="function_descriptions">
            <parameter key="label" value="&quot;Energy&quot;"/>
          </list>
        </operator>
        <operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="82" name="Set Role" width="90" x="179" y="187">
          <parameter key="attribute_name" value="label"/>
          <parameter key="target_role" value="label"/>
          <list key="set_additional_roles"/>
        </operator>
        <operator activated="true" class="support_vector_machine_libsvm" compatibility="7.1.001" expanded="true" height="82" name="SVM" width="90" x="313" y="34">
          <parameter key="svm_type" value="one-class"/>
          <list key="class_weights"/>
        </operator>
        <connect from_port="training" to_op="Filter Examples" to_port="example set input"/>
        <connect from_op="Filter Examples" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
        <connect from_op="Select Attributes" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
        <connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
        <connect from_op="Set Role" from_port="example set output" to_op="SVM" to_port="training set"/>
        <connect from_op="SVM" from_port="model" to_port="model"/>
        <portSpacing port="source_training" spacing="0"/>
        <portSpacing port="sink_model" spacing="0"/>
        <portSpacing port="sink_through 1" spacing="0"/>
      </process>
      <!-- Testing subprocess: apply the model, convert prediction(label) to binominal, measure performance. -->
      <process expanded="true">
        <operator activated="true" class="apply_model" compatibility="7.1.001" expanded="true" height="82" name="Apply Model" width="90" x="45" y="34">
          <list key="application_parameters"/>
        </operator>
        <operator activated="true" class="nominal_to_binominal" compatibility="7.1.001" expanded="true" height="103" name="Nominal to Binominal" width="90" x="179" y="34">
          <parameter key="attribute_filter_type" value="single"/>
          <parameter key="attribute" value="prediction(label)"/>
          <parameter key="include_special_attributes" value="true"/>
        </operator>
        <operator activated="true" class="performance" compatibility="7.1.001" expanded="true" height="82" name="Performance" width="90" x="313" y="34">
          <parameter key="use_example_weights" value="false"/>
        </operator>
        <connect from_port="model" to_op="Apply Model" to_port="model"/>
        <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
        <connect from_op="Apply Model" from_port="labelled data" to_op="Nominal to Binominal" to_port="example set input"/>
        <connect from_op="Nominal to Binominal" from_port="example set output" to_op="Performance" to_port="labelled data"/>
        <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
        <portSpacing port="source_model" spacing="0"/>
        <portSpacing port="source_test set" spacing="0"/>
        <portSpacing port="source_through 1" spacing="0"/>
        <portSpacing port="sink_averagable 1" spacing="0"/>
        <portSpacing port="sink_averagable 2" spacing="0"/>
      </process>
    </operator>
    <connect from_op="Process Categories" from_port="example set" to_op="Validation" to_port="training"/>
    <connect from_op="Validation" from_port="model" to_port="result 1"/>
    <connect from_op="Validation" from_port="training" to_port="result 2"/>
    <connect from_op="Validation" from_port="averagable 1" to_port="result 3"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
  </process>
</operator>
</process>
0 -
Thank you for your suggestion. I've tried to leave one of the directories with the examples empty, unfortunately that did not help. I still get a bug report (see my previous post).
0 -
Hi
I've checked it again. It is the other way around: you need to have a nominal column with only one class as the label. See the attached process.
~Martin
Spoiler
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.1.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process">
  <!-- Minimal one-class SVM example on the Golf sample data: replace the original label
       with a single-valued nominal label, then train LibSVM in one-class mode. -->
  <process expanded="true">
    <operator activated="true" class="retrieve" compatibility="7.1.001" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="34">
      <parameter key="repository_entry" value="//Samples/data/Golf"/>
    </operator>
    <operator activated="true" class="nominal_to_numerical" compatibility="7.1.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="179" y="34">
      <list key="comparison_groups"/>
    </operator>
    <!-- Inverted single-attribute selection: drops "Play" (special attributes included). -->
    <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="34">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="Play"/>
      <parameter key="invert_selection" value="true"/>
      <parameter key="include_special_attributes" value="true"/>
    </operator>
    <!-- Creates the attribute "label" from a constant string expression. The double quotes of
         the string literal must be written as &quot; inside the attribute value; raw quotes
         there end the attribute early and make the XML not well-formed. -->
    <operator activated="true" class="generate_attributes" compatibility="7.1.001" expanded="true" height="82" name="Generate Attributes" width="90" x="447" y="34">
      <list key="function_descriptions">
        <parameter key="label" value="&quot;a label&quot;"/>
      </list>
    </operator>
    <!-- Gives the generated attribute the label role. -->
    <operator activated="true" class="set_role" compatibility="7.1.001" expanded="true" height="82" name="Set Role" width="90" x="581" y="34">
      <parameter key="attribute_name" value="label"/>
      <parameter key="target_role" value="label"/>
      <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="support_vector_machine_libsvm" compatibility="7.1.001" expanded="true" height="82" name="SVM" width="90" x="782" y="34">
      <parameter key="svm_type" value="one-class"/>
      <list key="class_weights"/>
    </operator>
    <connect from_op="Retrieve Golf" from_port="output" to_op="Nominal to Numerical" to_port="example set input"/>
    <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
    <connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="SVM" to_port="training set"/>
    <connect from_op="SVM" from_port="model" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
  </process>
</operator>
</process>
0