Classification with LibSVM
ArmMiner
New Altair Community Member
Hi
I face a problem with LibSVM operator. My problem in general is the classification of the customers' reviews in the online shop (in german). So, what I did so far:
1. Collected the reviews (200) in the excel sheet.
2. I built a text processing model only for editing my training data (tokenization, stemming, filtering, etc.) and saved the result in another Excel sheet. So, it is fewer than 200 reviews, and I gave them labels. To begin with, I want to do 2-class classification (fast delivery and xxx).
3. I built another model, which is for the classification with LibSVM. Here LibSVM wants numeric values, so before this operator I use NominalToNumerical operator.
4. When I apply test data, the result is bad. A lot of examples are misclassified.
I also put a breakpoint before LibSVM in order to see what exactly the NominalToNumerical operator does. It just assigns a unique number to each training example, so it can't work.
Now I'm wondering how to solve this problem, or whether I have to use another operator. By the way, the whole data set is in German.
Thanks in advance.
Best regards
Armen
Classification Model
Text Processing Model
The part of test data after text processing
I face a problem with LibSVM operator. My problem in general is the classification of the customers' reviews in the online shop (in german). So, what I did so far:
1. Collected the reviews (200) in the excel sheet.
2. I built a text processing model only for editing my training data (tokenization, stemming, filtering, etc.) and saved the result in another Excel sheet. So, it is fewer than 200 reviews, and I gave them labels. To begin with, I want to do 2-class classification (fast delivery and xxx).
3. I built another model, which is for the classification with LibSVM. Here LibSVM wants numeric values, so before this operator I use NominalToNumerical operator.
4. When I apply test data, the result is bad. A lot of examples are misclassified.
I also put a breakpoint before LibSVM in order to see what exactly the NominalToNumerical operator does. It just assigns a unique number to each training example, so it can't work.
Now I'm wondering how to solve this problem, or whether I have to use another operator. By the way, the whole data set is in German.
Thanks in advance.
Best regards
Armen
Classification Model
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Classification process: trains a LibSVM model on an Excel sheet and applies it to rows read from a database. -->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="386" width="815">
      <!-- Data to classify: all rows of table test_schnell, fetched through the stored connection "sqlserver". -->
      <operator activated="true" class="read_database" compatibility="5.2.008" expanded="true" height="60" name="Read Database (2)" width="90" x="45" y="165">
        <parameter key="connection" value="sqlserver"/>
        <parameter key="query" value="SELECT * FROM `test_schnell`"/>
        <enumeration key="parameters"/>
      </operator>
      <!-- Training data: cells A1:B123 of Klein.xls. Column 0 is declared as "Bewertung" (text, regular attribute),
           column 1 as "Label" (text, label role). -->
      <operator activated="true" class="read_excel" compatibility="5.2.008" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="C:\Users\MP-TEST\Desktop\Rapid_Test\Klein.xls"/>
        <parameter key="imported_cell_range" value="A1:B123"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations"/>
        <parameter key="locale" value="German (Germany)"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Bewertung.true.text.attribute"/>
          <parameter key="1" value="Label.true.text.label"/>
        </list>
      </operator>
      <!-- Names "Bewertung" as the primary attribute (no target role is given here, so the operator default applies)
           and additionally assigns the label role to "Label". -->
      <operator activated="true" class="set_role" compatibility="5.2.008" expanded="true" height="76" name="Set Role" width="90" x="313" y="30">
        <parameter key="name" value="Bewertung"/>
        <list key="set_additional_roles">
          <parameter key="Label" value="label"/>
        </list>
      </operator>
      <!-- Nominal-to-numerical conversion, placed on the training path only (see the connections below). -->
      <operator activated="true" class="nominal_to_numerical" compatibility="5.2.008" expanded="true" height="94" name="Nominal to Numerical" width="90" x="447" y="75">
        <list key="comparison_groups"/>
      </operator>
      <!-- LibSVM learner; no parameters are overridden apart from an empty class_weights list. -->
      <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.2.008" expanded="true" height="76" name="SVM" width="90" x="648" y="30">
        <list key="class_weights"/>
      </operator>
      <!-- Applies the trained model to the database rows. -->
      <operator activated="true" class="apply_model" compatibility="5.2.008" expanded="true" height="76" name="Apply Model" width="90" x="648" y="165">
        <list key="application_parameters"/>
      </operator>
      <!-- Wiring: the database output goes straight into Apply Model as unlabelled data, without passing through
           Nominal to Numerical; the Excel data flows Set Role, Nominal to Numerical, SVM. -->
      <connect from_op="Read Database (2)" from_port="output" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Read Excel" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="SVM" to_port="training set"/>
      <connect from_op="SVM" from_port="model" to_op="Apply Model" to_port="model"/>
      <!-- Results: labelled data as result 1, the model as result 2. -->
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Text Processing Model
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Text-processing process: reads the labelled reviews from Excel, runs tokenizing, filtering and stemming
     on them, and writes the resulting word list to Klein.xls. -->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <process expanded="true" height="375" width="756">
      <!-- Source sheet: cells A1:B201; row 0 is annotated as "Name". Column 0 is declared as "Bewertung"
           (text, regular attribute), column 1 as "Label" (text, label role). -->
      <operator activated="true" class="read_excel" compatibility="5.2.008" expanded="true" height="60" name="Read Excel" width="90" x="45" y="75">
        <parameter key="excel_file" value="C:\Users\MP-TEST\Desktop\Rapid_Test\Training Data - Schnell.xls"/>
        <parameter key="imported_cell_range" value="A1:B201"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Bewertung.true.text.attribute"/>
          <parameter key="1" value="Label.true.text.label"/>
        </list>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="5.2.008" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="120"/>
      <!-- NOTE(review): the key "prunde_below_percent" is reproduced exactly as exported; check the Text Processing
           extension before changing its spelling. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.004" expanded="true" height="76" name="Process Documents from Data" width="90" x="313" y="30">
        <parameter key="prunde_below_percent" value="5.0"/>
        <parameter key="prune_above_percent" value="100.0"/>
        <list key="specify_weights"/>
        <process expanded="true" height="386" width="774">
          <!-- Tokenizes in "specify characters" mode using the character list below. -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30">
            <parameter key="mode" value="specify characters"/>
            <parameter key="characters" value=".:,:;:!:?:|:+-="/>
          </operator>
          <!-- Only the upper length bound is set here. -->
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.004" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="45" y="120">
            <parameter key="max_chars" value="9999"/>
          </operator>
          <!-- Dictionary-based stemming driven by a local word file. -->
          <operator activated="true" class="text:stem_dictionary" compatibility="5.2.004" expanded="true" height="76" name="Stem (Dictionary)" width="90" x="45" y="210">
            <parameter key="file" value="C:\Users\MP-TEST\Desktop\Rapid_Test\Wörterbuch.TXT"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.004" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="246" y="120"/>
          <!-- Deactivated and not connected to anything below. -->
          <operator activated="false" class="text:stem_german" compatibility="5.2.004" expanded="true" height="60" name="Stem (German)" width="90" x="313" y="30"/>
          <!-- NOTE(review): the regular expression ends with "|", i.e. an empty alternative. With the
               "contains match" condition this presumably lets every token through; confirm whether that is intended. -->
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="5.2.004" expanded="true" height="60" name="Filter Tokens (by Content)" width="90" x="380" y="210">
            <parameter key="condition" value="contains match"/>
            <parameter key="string" value="schnell "/>
            <parameter key="regular_expression" value=".*schnell.*|.*liefer.*|.*gern.*|.*wieder.*|.*versand.*|.*ware.*|.*ordnung.*|"/>
          </operator>
          <!-- Document flow: Tokenize, Filter Tokens (by Length), Stem (Dictionary), Filter Stopwords (German),
               Filter Tokens (by Content). -->
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Dictionary)" to_port="document"/>
          <connect from_op="Stem (Dictionary)" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="5.2.004" expanded="true" height="76" name="WordList to Data" width="90" x="313" y="210"/>
      <operator activated="true" class="write_excel" compatibility="5.2.008" expanded="true" height="76" name="Write Excel" width="90" x="514" y="165">
        <parameter key="excel_file" value="C:\Users\MP-TEST\Desktop\Rapid_Test\Klein.xls"/>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <!-- The processed example set is only delivered as result 2; what is written to Klein.xls is the word list
           converted by WordList to Data. -->
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 2"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
A part of my training Data after text processing
alles in ordnung alles perfekt und schnell nur zu empfehlen alles schnell und problemlos verlaufen anfragen werden sehr schnell beantwortet auch bei retouren sehr schnell die lieferung ging schnell und die verpackung ist im sehr guten zustand die ware wurde schnell und ordentlich verpackt geliefert einwandfreie ware gerne bei gelegenheit wieder mal gerne wieder gerne wieder gut verpackt und gute ware guter versand heute geliefert ich würde jederzeit wieder bei "mein paket" bestellen immer wieder gerne jederzeit wieder kontaktanfragen waren nicht nötig lieferung und service perfekt lieferzeit war 1 woche mit dem service zufrieden hätte auf ne schnellere e preis voll und ganz in ordnung preiswerte ware (terra 1 rasche lieferung einwandfreier ware schnell versandt schnelle lieferung schnelle lieferung und guter rucksack schnelle versendung der ware schneller versand sehr gerne wieder sehr schnell geliefert sehr schnelle lieferung sehr schnelle versand sichere lieferung mit dhl sogar schneller als versandbenachrichtigung super schneller versand super schneller versand (schneller geht es kaum) und schneller versand vom schnellen versand bis hin zum super leckerem getränk ware einwandfrei ware i ware ist wie beschrieben ware kam innerhalb weniger tage ware ok ware orginalverpackt ware und versand 1 ware wie beschrieben werde wieder hier kaufen wir waren mit dem anbieter sehr zufrieden alles in ordnung alles super immer wieder gerne alles waren in ordnung bestellung wurde sofort bearbeitet und wir hatten die lieferung innerhalb von 3 tagen bei uns blitzlieferung der artikel wurde zufriedenstellend und schnell versendet der preis für die ware ist ok die lieferung auch die ware ist schnell geliefert wurden |
bin sehr zufrieden danke das zelt ist einfach spitze dass das produkt eklig ist gern wieder gerne wieder gerne wieder jede meiner vielen anfragen vor der kauf wurden durch kuhnshop schnell und zu meiner zufriedenheit beantwortet jederzeit wieder kann ich weiterempfehlen lieferzeit top schnelle lieferung sehr schnelle lieferung sowohl bei der lieferung als auch bei der rücknahme super top preis verpackung versand war gut und angemessen schnell versand war sehr schnell vielen dank vorbildlicher service ware ok alles bestens alles hat hervorragend geklappt alles ohne probleme alles super alles super bin zufrieden alles war bestens alles wunderbar geklappt bin sehr zufrieden der anbieter kann ja nix dafür für den preis kann mann nichts falsch machen hat alles super geklappt keine bemängelung nur positiv zu meiner zufriedenheit kuhnshop zum zweiten mal bereits spitze liefert gute ergebnisse |
Tagged:
0
Answers
-
Actually, I don't want anybody to provide the solution. I'm just asking for some hints.
In my opinion, the problem is in the type converter.
Please help!
Best regards
Armen
0 -
Hey,
The "Nominal To Numerical" operator is the wrong one. As you already noticed it maps a string to a unique number for this string. Try to learn and validate a model on the same data via a cross-validation first.
Below this posting you can find a process which does a simple cross-validation. In this example I read a CSV file, but you can use your Excel operator as well. But please note: the output of the reading operator has to have 2 attributes. One where the unprocessed text is stored (regular attribute with the value type "text") and the other one a binominal label (special attribute "label" with the value type "binominal"). Please use the wizard of the reading operator to output this kind of data. The "Process Documents" operator does the tokenization, stemming, and filtering of stop words and creates an ExampleSet which already has the correct format to learn from (i.e. it has numbers). This data will be the input for the cross-validation, whose output tells you how well your learner (for instance the LibSVM) performs. If your performance is bad, you should play with the parameters of the learner and/or the pre-processing steps (the operators inside the "Process Documents" operator) or look at your data. Maybe you do not provide enough data. Or your labeling is not very good. In the data snippet which you have posted I can see that the first examples are talking about fast delivery (in a way) but they are not classified as such.
If your cross-validation indicates a good performance, you can apply your model to new data. But I would test my process with a cross-validation first.
Good luck
Marcin
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- Cross-validation process: reads labelled text from a CSV file, processes it with Process Documents from Data
     and cross-validates a LibSVM learner on the result. -->
<process version="5.3.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
    <process expanded="true" height="520" width="620">
      <!-- Input: column 0 is declared as "text" (text, regular attribute), column 1 as "label" (binominal, label role). -->
      <operator activated="true" class="read_csv" compatibility="5.3.000" expanded="true" height="60" name="Training" width="90" x="45" y="30">
        <parameter key="csv_file" value="/home/marcin/temp/training.csv"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations"/>
        <parameter key="encoding" value="UTF-8"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="text.true.text.attribute"/>
          <parameter key="1" value="label.true.binominal.label"/>
        </list>
      </operator>
      <!-- Per-document pipeline: Tokenize, Filter Stopwords (German), Stem (German). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.005" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="30">
        <list key="specify_weights"/>
        <process expanded="true" height="538" width="620">
          <operator activated="true" class="text:tokenize" compatibility="5.2.005" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.005" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="179" y="30"/>
          <operator activated="true" class="text:stem_german" compatibility="5.2.005" expanded="true" height="60" name="Stem (German)" width="90" x="313" y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Stem (German)" to_port="document"/>
          <connect from_op="Stem (German)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Validation holds two subprocesses: the first trains the model, the second applies it and measures performance. -->
      <operator activated="true" class="x_validation" compatibility="5.3.000" expanded="true" height="112" name="Validation" width="90" x="380" y="30">
        <!-- Training subprocess: LibSVM with no parameters overridden apart from an empty class_weights list. -->
        <process expanded="true" height="538" width="351">
          <operator activated="true" class="support_vector_machine_libsvm" compatibility="5.3.000" expanded="true" height="76" name="SVM" width="90" x="112" y="30">
            <list key="class_weights"/>
          </operator>
          <connect from_port="training" to_op="SVM" to_port="training set"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: applies the model to the test set and passes the labelled data to Performance. -->
        <process expanded="true" height="538" width="351">
          <operator activated="true" class="apply_model" compatibility="5.3.000" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.000" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Top-level wiring; results are the model (result 1) and the performance (result 2). -->
      <connect from_op="Training" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_port="result 1"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
0 -
Hi
Thank you for the help!
I will do so and try my best.
Thanks again.
Best regards
Armen
0 -
Hey
It works normally I think - 92 % accuracy.
Thanks a lot.
Best regards
Armen
0