"sentiment analysis / text classification problem"
mmoehring
New Altair Community Member
Hello,
I am trying to implement a simple sentiment analysis (following some videos and tutorials).
But it doesn't work and I don't know why.
Could you please help me?
______________________________________________________________________
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (version 5.3.008).
  Two branches meet in "Apply Model (2)":
    1. "Process Documents from Files" turns text files into an example set.
    2. "Retrieve 424242" -> "Set Role" -> "Validation" produces a k-NN model.
-->
<process version="5.3.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
    <process expanded="true">
      <!--
        Document branch: a single entry ("bsp") is registered under text_directories.
        Presumably the key is the class name given to the documents in that directory; confirm.
      -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Files" width="90" x="313" y="255">
        <list key="text_directories">
          <parameter key="bsp" value="C:\Users\michaelmoehring\Documents\St-Gallen\rapidminer\test"/>
        </list>
        <process expanded="true">
          <!-- Per-document pipeline: Tokenize (language parameter set to German), then Transform Cases with no parameters overridden. -->
          <operator activated="true" class="text:tokenize" compatibility="5.3.000" expanded="true" height="60" name="Tokenize" width="90" x="112" y="30">
            <parameter key="language" value="German"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="5.3.000" expanded="true" height="60" name="Transform Cases" width="90" x="246" y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Training branch: load repository entry "424242" and mark the attribute "wertung" as the label. -->
      <operator activated="true" class="retrieve" compatibility="5.3.008" expanded="true" height="60" name="Retrieve 424242" width="90" x="45" y="75">
        <parameter key="repository_entry" value="424242"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.3.008" expanded="true" height="76" name="Set Role" width="90" x="179" y="30">
        <parameter key="attribute_name" value="wertung"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Cross-validation with number_of_validations = 5; the first subprocess trains, the second one tests. -->
      <operator activated="true" class="x_validation" compatibility="5.3.008" expanded="true" height="112" name="Validation" width="90" x="313" y="30">
        <parameter key="number_of_validations" value="5"/>
        <process expanded="true">
          <!-- Training side: k-NN with k = 20, weighted vote enabled, CosineSimilarity as numerical measure. -->
          <operator activated="true" class="k_nn" compatibility="5.3.008" expanded="true" height="76" name="k-NN" width="90" x="112" y="210">
            <parameter key="k" value="20"/>
            <parameter key="weighted_vote" value="true"/>
            <parameter key="numerical_measure" value="CosineSimilarity"/>
          </operator>
          <connect from_port="training" to_op="k-NN" to_port="training set"/>
          <connect from_op="k-NN" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <!-- Testing side: apply the fold model to the test set and pass the result to Performance. -->
          <operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.008" expanded="true" height="76" name="Performance" width="90" x="179" y="165"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!--
        Final scoring: the model coming out of Validation is applied to the example set
        produced by "Process Documents from Files".
        NOTE(review): the model is trained on the repository data while it is applied to the
        document example set; both presumably need the same attributes. Verify.
      -->
      <operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model (2)" width="90" x="514" y="165">
        <list key="application_parameters"/>
      </operator>
      <!-- NOTE(review): none of the connections below routes a performance output of Validation to a result port. -->
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Retrieve 424242" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model (2)" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
_______________________________________________________________________
The positive and negative words are stored in the repository, based on a CSV file with two columns (word; positive/negative).
Thank you so much!
Greetings
I am trying to implement a simple sentiment analysis (following some videos and tutorials).
But it doesn't work and I don't know why.
Could you please help me?
______________________________________________________________________
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (version 5.3.008).
  Two branches meet in "Apply Model (2)":
    1. "Process Documents from Files" turns text files into an example set.
    2. "Retrieve 424242" -> "Set Role" -> "Validation" produces a k-NN model.
-->
<process version="5.3.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.008" expanded="true" name="Process">
    <process expanded="true">
      <!--
        Document branch: a single entry ("bsp") is registered under text_directories.
        Presumably the key is the class name given to the documents in that directory; confirm.
      -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.3.000" expanded="true" height="76" name="Process Documents from Files" width="90" x="313" y="255">
        <list key="text_directories">
          <parameter key="bsp" value="C:\Users\michaelmoehring\Documents\St-Gallen\rapidminer\test"/>
        </list>
        <process expanded="true">
          <!-- Per-document pipeline: Tokenize (language parameter set to German), then Transform Cases with no parameters overridden. -->
          <operator activated="true" class="text:tokenize" compatibility="5.3.000" expanded="true" height="60" name="Tokenize" width="90" x="112" y="30">
            <parameter key="language" value="German"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="5.3.000" expanded="true" height="60" name="Transform Cases" width="90" x="246" y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Training branch: load repository entry "424242" and mark the attribute "wertung" as the label. -->
      <operator activated="true" class="retrieve" compatibility="5.3.008" expanded="true" height="60" name="Retrieve 424242" width="90" x="45" y="75">
        <parameter key="repository_entry" value="424242"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.3.008" expanded="true" height="76" name="Set Role" width="90" x="179" y="30">
        <parameter key="attribute_name" value="wertung"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>
      <!-- Cross-validation with number_of_validations = 5; the first subprocess trains, the second one tests. -->
      <operator activated="true" class="x_validation" compatibility="5.3.008" expanded="true" height="112" name="Validation" width="90" x="313" y="30">
        <parameter key="number_of_validations" value="5"/>
        <process expanded="true">
          <!-- Training side: k-NN with k = 20, weighted vote enabled, CosineSimilarity as numerical measure. -->
          <operator activated="true" class="k_nn" compatibility="5.3.008" expanded="true" height="76" name="k-NN" width="90" x="112" y="210">
            <parameter key="k" value="20"/>
            <parameter key="weighted_vote" value="true"/>
            <parameter key="numerical_measure" value="CosineSimilarity"/>
          </operator>
          <connect from_port="training" to_op="k-NN" to_port="training set"/>
          <connect from_op="k-NN" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <!-- Testing side: apply the fold model to the test set and pass the result to Performance. -->
          <operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.3.008" expanded="true" height="76" name="Performance" width="90" x="179" y="165"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!--
        Final scoring: the model coming out of Validation is applied to the example set
        produced by "Process Documents from Files".
        NOTE(review): the model is trained on the repository data while it is applied to the
        document example set; both presumably need the same attributes. Verify.
      -->
      <operator activated="true" class="apply_model" compatibility="5.3.008" expanded="true" height="76" name="Apply Model (2)" width="90" x="514" y="165">
        <list key="application_parameters"/>
      </operator>
      <!-- NOTE(review): none of the connections below routes a performance output of Validation to a result port. -->
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Retrieve 424242" from_port="output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Validation" to_port="training"/>
      <connect from_op="Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model (2)" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
_______________________________________________________________________
The positive and negative words are stored in the repository, based on a CSV file with two columns (word; positive/negative).
Thank you so much!
Greetings
0
Answers
-
Hello,
I always recommend providing a very small sample of your data, e.g. 10 data points.
Based on the limited information so far, here is my guess as to what goes wrong:
The "Process Documents from Files" produces different data compared to "Retrieve 424242".
The "Process Documents from Files" typically requires two folders.
One folder with negative messages.
One folder with positive messages.
So each message is stored in a different text file.
You should then get something like this:
http://i.snag.gy/haglm.jpg
You have now transformed a text mining problem into a standard classification problem.
Hope this helps,
Best regards,
Wessel0