Community & Support
Learn
Marketplace
Discussions
Categories
Discussions
General
Platform
Academic
Partner
Regional
User Groups
Documentation
Events
Altair Exchange
Share or Download Projects
Resources
News & Instructions
Programs
YouTube
Employee Resources
This tab can be seen by employees only. Please do not share these resources externally.
Groups
Join a User Group
Support
Altair RISE
A program to recognize and reward our most engaged community members
Nominate Yourself Now!
Home
Discussions
Community Q&A
k-nn clasifier
ArnoG
I am using a K-nn clasifier to perform a sentiment analysis on review texts. I export the results to an excel sheet. For every month (cumlative) I create a bar graph showing the results, Basically the bar graph shows the # of positive and negative reviews.
But till my suprise the results of the sentiment analysis difference per month. For the sentiment analysis I am using exacrly the same model, and exactly the same data & training set. When I run the analyses for january a certain review is labeled positive, when I run the same analyses till february, the same review in january is now labeled as negative.
Is there a way to prefend this?
Best regards,
Arno
Find more posts tagged with
AI Studio
Accepted answers
All comments
RalfKlinkenberg
Hi Arno,
Are you sure that you use exactly the same training data set for both applications of k-NN in January and February?
If you share your RapidMiner process, we could better check for the reason for this process behaviour.
Best regards,
Ralf
ArnoG
Hi Ralf,
Thanks for your response!
I am completely sure that I didn't change the training set.
As an example I used following process:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="6.0.003">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.003" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_excel" compatibility="6.0.003" expanded="true" height="60" name="Results" width="90" x="45" y="210">
<parameter key="excel_file" value="C:\Improve Your Business\Qing\Rapidminer\testing knn.xls"/>
<parameter key="sheet_number" value="2"/>
<parameter key="imported_cell_range" value="A1:A18"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="Test reviews.true.text.attribute"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="6.0.003" expanded="true" height="76" name="Select Attributes" width="90" x="313" y="210">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="text|sent|Review|Prijs|Test reviews"/>
</operator>
<operator activated="true" class="set_role" compatibility="6.0.003" expanded="true" height="76" name="Set Role" width="90" x="514" y="210">
<parameter key="attribute_name" value="Test reviews"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="648" y="210">
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="absolute"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="999"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="112" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (4)" width="90" x="246" y="30"/>
<operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.3.002" expanded="true" height="76" name="Filter Stopwords (4)" width="90" x="380" y="30">
<parameter key="file" value="C:\Improve Your Business\Qing\Rapidminer\nederlandse stopwoordenlijst.txt"/>
</operator>
<operator activated="false" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (4)" width="90" x="514" y="120">
<parameter key="language" value="Dutch"/>
</operator>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (3)" width="90" x="648" y="30">
<parameter key="min_chars" value="2"/>
</operator>
<operator activated="false" class="text:generate_n_grams_terms" compatibility="5.3.002" expanded="true" height="60" name="Generate n-Grams (2)" width="90" x="782" y="120"/>
<connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
<connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (4)" to_port="document"/>
<connect from_op="Transform Cases (4)" from_port="document" to_op="Filter Stopwords (4)" to_port="document"/>
<connect from_op="Filter Stopwords (4)" from_port="document" to_op="Filter Tokens (3)" to_port="document"/>
<connect from_op="Filter Tokens (3)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="read_excel" compatibility="6.0.003" expanded="true" height="60" name="Training set" width="90" x="45" y="30">
<parameter key="excel_file" value="C:\Improve Your Business\Qing\Rapidminer\testing knn.xls"/>
<parameter key="imported_cell_range" value="A1:B243"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="Prijs.true.text.attribute"/>
<parameter key="1" value="B.true.polynominal.label"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="6.0.003" expanded="true" height="76" name="Select Attributes (2)" width="90" x="313" y="30">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="sent|text|training set|sen|Prijs"/>
</operator>
<operator activated="true" class="set_role" compatibility="6.0.003" expanded="true" height="76" name="Set Role (2)" width="90" x="514" y="30">
<parameter key="attribute_name" value="Prijs"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="648" y="30">
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="absolute"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="999"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30">
<parameter key="characters" value=".:?!"/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="30"/>
<operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.3.002" expanded="true" height="76" name="Filter Stopwords (3)" width="90" x="313" y="30">
<parameter key="file" value="C:\Improve Your Business\Qing\Rapidminer\nederlandse stopwoordenlijst.txt"/>
</operator>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="581" y="30">
<parameter key="min_chars" value="2"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
<connect from_op="Filter Stopwords (3)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="k_nn" compatibility="6.0.003" expanded="true" height="76" name="k-NN (2)" width="90" x="782" y="30">
<parameter key="k" value="3"/>
<parameter key="weighted_vote" value="true"/>
<parameter key="measure_types" value="NumericalMeasures"/>
<parameter key="numerical_measure" value="CosineSimilarity"/>
</operator>
<operator activated="true" class="apply_model" compatibility="6.0.003" expanded="true" height="76" name="Apply Model (2)" width="90" x="916" y="120">
<list key="application_parameters"/>
</operator>
<connect from_op="Results" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
<connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Training set" from_port="output" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="k-NN (2)" to_port="training set"/>
<connect from_op="k-NN (2)" from_port="model" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
I run the process for data till april and one till may.
Images below shows the results from running the two datasets
Till April
Till May:
As you can see is the review "goede prijs" in the run till april predicted neg and in the run till may pos.
I used exactly the same RM process and exactly the same training data.
How can I get different results?
Best Regards,
Arno
Quick Links
All Categories
Recent Discussions
Activity
Unanswered
日本語 (Japanese)
한국어(Korean)
Groups