Sentiment analysis (positive/negative words) of txt-files with other dictionary
mikesolvay
New Altair Community Member
Hello
I am conducting some research that involves text mining of a few .txt files I have stored on my computer. I have successfully managed to count the words and n-grams used in all the .txt documents, which was the first part of my work. Now, I would like to make a table of positively and negatively connoted words from the same documents (resulting in, for example, "overall, the documents include 55% positive words and 45% negative words"). I also want to use the sentiment word list made by Loughran and McDonald (2018).
I was not able to successfully paste my XML code, so here is a screenshot of my process so far. In "Process Documents" I tokenize, filter stopwords, transform cases and generate n-grams.
I have little experience with RapidMiner, and I am eager to get a better understanding of it. Help is much appreciated.
I am conducting some research that involves text mining of a few .txt files I have stored on my computer. I have successfully managed to count the words and n-grams used in all the .txt documents, which was the first part of my work. Now, I would like to make a table of positively and negatively connoted words from the same documents (resulting in, for example, "overall, the documents include 55% positive words and 45% negative words"). I also want to use the sentiment word list made by Loughran and McDonald (2018).
I was not able to successfully paste my XML code, so here is a screenshot of my process so far. In "Process Documents" I tokenize, filter stopwords, transform cases and generate n-grams.
I have little experience with RapidMiner, and I am eager to get a better understanding of it. Help is much appreciated.
1
Best Answer
-
Hi @mikesolvay, this is roughly what you want:
<?xml version="1.0" encoding="UTF-8"?><process version="9.6.000">
<!-- Process context: no input/output locations and no macros are configured. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<!-- Root operator. It builds a Word/Score dictionary from two Excel sheets,
     learns a dictionary-based sentiment model from it, and applies that model
     to two tokenized, lower-cased test documents. -->
<operator activated="true" class="process" compatibility="9.6.000" expanded="true" name="Process">
  <parameter key="logverbosity" value="init"/>
  <parameter key="random_seed" value="2001"/>
  <parameter key="send_mail" value="never"/>
  <parameter key="notification_email" value=""/>
  <parameter key="process_duration_for_mail" value="30"/>
  <parameter key="encoding" value="SYSTEM"/>
  <process expanded="true">
    <!-- Dictionary branch 1: read column A from sheet_number 2 of the workbook.
         The excel_file path is specific to the author's machine and has to be
         adapted, as the description on the operator says. -->
    <operator activated="true" class="read_excel" compatibility="9.6.000" expanded="true" height="68" name="Read Excel" width="90" x="112" y="493">
      <parameter key="excel_file" value="C:\Users\MartinSchmitz\Downloads\LoughranMcDonald_SentimentWordLists_2018.xlsx"/>
      <parameter key="sheet_selection" value="sheet number"/>
      <parameter key="sheet_number" value="2"/>
      <parameter key="imported_cell_range" value="A1"/>
      <parameter key="encoding" value="SYSTEM"/>
      <parameter key="first_row_as_names" value="false"/>
      <list key="annotations"/>
      <parameter key="date_format" value=""/>
      <parameter key="time_zone" value="SYSTEM"/>
      <parameter key="locale" value="English (United States)"/>
      <parameter key="read_all_values_as_polynominal" value="false"/>
      <list key="data_set_meta_data_information">
        <parameter key="0" value="A.true.polynominal.attribute"/>
      </list>
      <parameter key="read_not_matching_values_as_missings" value="false"/>
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <description align="center" color="transparent" colored="false" width="126">Adapt location please</description>
    </operator>
    <!-- Give every word of this sheet a Score of -1 and lower-case the word in A.
         NOTE(review): presumably this sheet is the negative word list; confirm
         against the workbook. -->
    <operator activated="true" class="generate_attributes" compatibility="9.6.000" expanded="true" height="82" name="Generate Attributes" width="90" x="246" y="493">
      <list key="function_descriptions">
        <parameter key="Score" value="-1"/>
        <parameter key="A" value="lower(A)"/>
      </list>
      <parameter key="keep_all" value="true"/>
    </operator>
    <!-- Rename A to Word, the name used as key_attribute by the sentiment learner below. -->
    <operator activated="true" class="rename" compatibility="9.6.000" expanded="true" height="82" name="Rename" width="90" x="380" y="493">
      <parameter key="old_name" value="A"/>
      <parameter key="new_name" value="Word"/>
      <list key="rename_additional_attributes"/>
    </operator>
    <!-- Dictionary branch 2: same workbook and settings, but sheet_number 3. -->
    <operator activated="true" class="read_excel" compatibility="9.6.000" expanded="true" height="68" name="Read Excel (2)" width="90" x="112" y="646">
      <parameter key="excel_file" value="C:\Users\MartinSchmitz\Downloads\LoughranMcDonald_SentimentWordLists_2018.xlsx"/>
      <parameter key="sheet_selection" value="sheet number"/>
      <parameter key="sheet_number" value="3"/>
      <parameter key="imported_cell_range" value="A1"/>
      <parameter key="encoding" value="SYSTEM"/>
      <parameter key="first_row_as_names" value="false"/>
      <list key="annotations"/>
      <parameter key="date_format" value=""/>
      <parameter key="time_zone" value="SYSTEM"/>
      <parameter key="locale" value="English (United States)"/>
      <parameter key="read_all_values_as_polynominal" value="false"/>
      <list key="data_set_meta_data_information">
        <parameter key="0" value="A.true.polynominal.attribute"/>
      </list>
      <parameter key="read_not_matching_values_as_missings" value="false"/>
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <description align="center" color="transparent" colored="false" width="126">Adapt location please</description>
    </operator>
    <!-- Give every word of this sheet a Score of +1 and lower-case the word in A.
         NOTE(review): presumably this sheet is the positive word list; confirm
         against the workbook. -->
    <operator activated="true" class="generate_attributes" compatibility="9.6.000" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="246" y="646">
      <list key="function_descriptions">
        <parameter key="Score" value="+1"/>
        <parameter key="A" value="lower(A)"/>
      </list>
      <parameter key="keep_all" value="true"/>
    </operator>
    <!-- Rename A to Word so both branches share the same attribute names. -->
    <operator activated="true" class="rename" compatibility="9.6.000" expanded="true" height="82" name="Rename (2)" width="90" x="380" y="646">
      <parameter key="old_name" value="A"/>
      <parameter key="new_name" value="Word"/>
      <list key="rename_additional_attributes"/>
    </operator>
    <!-- Combine both scored word lists into a single dictionary example set. -->
    <operator activated="true" class="append" compatibility="9.6.000" expanded="true" height="103" name="Append" width="90" x="514" y="544">
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <parameter key="merge_type" value="all"/>
    </operator>
    <!-- Learn the dictionary model: Word is the key attribute, Score the value
         attribute; no negation attribute is set. -->
    <operator activated="true" class="operator_toolbox:dictionary_sentiment_learner" compatibility="2.4.000-SNAPSHOT" expanded="true" height="82" name="Dictionary-Based Sentiment (Documents)" width="90" x="648" y="544">
      <parameter key="value_attribute" value="Score"/>
      <parameter key="key_attribute" value="Word"/>
      <parameter key="negation_attribute" value=""/>
      <parameter key="negation_window_size" value="1"/>
      <parameter key="use_symmetric_negation_window" value="false"/>
    </operator>
    <!-- Two hard-coded test documents; replace these with your own document source. -->
    <operator activated="true" class="text:create_document" compatibility="8.2.000" expanded="true" height="68" name="Create Document" width="90" x="648" y="187">
      <parameter key="text" value="This is a great document which should be scored rather positive!"/>
      <parameter key="add label" value="false"/>
      <parameter key="label_type" value="nominal"/>
    </operator>
    <operator activated="true" class="text:create_document" compatibility="8.2.000" expanded="true" height="68" name="Create Document (2)" width="90" x="648" y="289">
      <parameter key="text" value="This is a shit document."/>
      <parameter key="add label" value="false"/>
      <parameter key="label_type" value="nominal"/>
    </operator>
    <!-- Bundle both documents into one collection. -->
    <operator activated="true" class="collect" compatibility="9.6.000" expanded="true" height="103" name="Collect" width="90" x="782" y="187">
      <parameter key="unfold" value="false"/>
    </operator>
    <!-- Preprocess each document of the collection: tokenize on non letters,
         then transform to lower case (the dictionary words were lower-cased too). -->
    <operator activated="true" class="loop_collection" compatibility="9.6.000" expanded="true" height="82" name="Loop Collection" width="90" x="916" y="187">
      <parameter key="set_iteration_macro" value="false"/>
      <parameter key="macro_name" value="iteration"/>
      <parameter key="macro_start_value" value="1"/>
      <parameter key="unfold" value="false"/>
      <process expanded="true">
        <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize" width="90" x="380" y="34">
          <parameter key="mode" value="non letters"/>
          <parameter key="characters" value=".:"/>
          <parameter key="language" value="English"/>
          <parameter key="max_token_length" value="3"/>
        </operator>
        <operator activated="true" class="text:transform_cases" compatibility="8.2.000" expanded="true" height="68" name="Transform Cases" width="90" x="581" y="34">
          <parameter key="transform_to" value="lower case"/>
        </operator>
        <!-- Inner wiring: single document, Tokenize, Transform Cases, output 1. -->
        <connect from_port="single" to_op="Tokenize" to_port="document"/>
        <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
        <connect from_op="Transform Cases" from_port="document" to_port="output 1"/>
        <portSpacing port="source_single" spacing="0"/>
        <portSpacing port="sink_output 1" spacing="0"/>
        <portSpacing port="sink_output 2" spacing="0"/>
      </process>
    </operator>
    <!-- Score the preprocessed documents with the learned dictionary model. -->
    <operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="2.4.000-SNAPSHOT" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="1117" y="391">
      <list key="application_parameters"/>
    </operator>
    <!-- Wiring of the dictionary branches into Append and the sentiment learner. -->
    <connect from_op="Read Excel" from_port="output" to_op="Generate Attributes" to_port="example set input"/>
    <connect from_op="Generate Attributes" from_port="example set output" to_op="Rename" to_port="example set input"/>
    <connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Read Excel (2)" from_port="output" to_op="Generate Attributes (2)" to_port="example set input"/>
    <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
    <connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Append" from_port="merged set" to_op="Dictionary-Based Sentiment (Documents)" to_port="exa"/>
    <connect from_op="Dictionary-Based Sentiment (Documents)" from_port="mod" to_op="Apply Model (Documents)" to_port="mod"/>
    <!-- Wiring of the test documents through preprocessing into model application. -->
    <connect from_op="Create Document" from_port="output" to_op="Collect" to_port="input 1"/>
    <connect from_op="Create Document (2)" from_port="output" to_op="Collect" to_port="input 2"/>
    <connect from_op="Collect" from_port="collection" to_op="Loop Collection" to_port="collection"/>
    <connect from_op="Loop Collection" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>
    <!-- The example set produced by Apply Model is the only process result. -->
    <connect from_op="Apply Model (Documents)" from_port="exa" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="168"/>
    <description align="center" color="yellow" colored="false" height="294" resized="true" width="599" x="26" y="454">This generates the dictionary as needed in the &quot;Dict based Sentiment&quot; operator</description>
    <description align="center" color="yellow" colored="false" height="251" resized="true" width="518" x="559" y="119">This creates two test documents. It also does the preprocessing of it. Note that you need to tokenize your documents before applying it! This is done in &quot;loop collection&quot;</description>
  </process>
</operator>
</process>
4
Answers
-
Hi @mikesolvay, the operator Dictionary-Based Sentiment Analysis is what you are looking for. Actually, 'Extract Sentiment' bundles models like these for easier use. Can you maybe elaborate on what the advantage of the other dictionary is, and where I can find it? Maybe it's easy to just add it to the operator. Best, Martin
3
-
Thank you for your reply, @mschmitz!
I had a look at the operator you mentioned, but I am confused by the parameters I have to set. How does the operator know which words are considered negative and positive just from entering numerical values for the parameters?
I am sorry for my lack of knowledge. As I said, my experience with RapidMiner is very limited so far.
As for my preferred dictionary, it is only because it is the basis of the methodology I am basing my research on. If it is troublesome to use a personal dictionary, I would just use a standard one from RapidMiner.
1 -
Hi, did you have a look at the tutorial processes in the help panel? Those should help. Can you maybe post a link to the dictionary? That would allow me to create an example process for you based on your dictionary. Best, Martin
1
-
I had a brief look at it, as well as some YouTube videos, but I still struggle a bit.
Link: https://sraf.nd.edu/textual-analysis/resources/#LM%20Sentiment%20Word%20Lists
You will find it here as an .xlsx-file.
Thank you so much for your help so far, and for taking the time.1 -
hi @mikesolvay welcome to the community. I just "boosted" your rank so you can now post hyperlinks.1
-
@mschmitz Would it be possible for you to share an example process of what I am trying to do? It would also be okay to try it with wordnet dictionary or something else. Thank you in advance!1
-
Hi @mikesolvay, this is roughly what you want:
<?xml version="1.0" encoding="UTF-8"?><process version="9.6.000">
<!-- Process context: no input/output locations and no macros are configured. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<!-- Root operator. It builds a Word/Score dictionary from two Excel sheets,
     learns a dictionary-based sentiment model from it, and applies that model
     to two tokenized, lower-cased test documents. -->
<operator activated="true" class="process" compatibility="9.6.000" expanded="true" name="Process">
  <parameter key="logverbosity" value="init"/>
  <parameter key="random_seed" value="2001"/>
  <parameter key="send_mail" value="never"/>
  <parameter key="notification_email" value=""/>
  <parameter key="process_duration_for_mail" value="30"/>
  <parameter key="encoding" value="SYSTEM"/>
  <process expanded="true">
    <!-- Dictionary branch 1: read column A from sheet_number 2 of the workbook.
         The excel_file path is specific to the author's machine and has to be
         adapted, as the description on the operator says. -->
    <operator activated="true" class="read_excel" compatibility="9.6.000" expanded="true" height="68" name="Read Excel" width="90" x="112" y="493">
      <parameter key="excel_file" value="C:\Users\MartinSchmitz\Downloads\LoughranMcDonald_SentimentWordLists_2018.xlsx"/>
      <parameter key="sheet_selection" value="sheet number"/>
      <parameter key="sheet_number" value="2"/>
      <parameter key="imported_cell_range" value="A1"/>
      <parameter key="encoding" value="SYSTEM"/>
      <parameter key="first_row_as_names" value="false"/>
      <list key="annotations"/>
      <parameter key="date_format" value=""/>
      <parameter key="time_zone" value="SYSTEM"/>
      <parameter key="locale" value="English (United States)"/>
      <parameter key="read_all_values_as_polynominal" value="false"/>
      <list key="data_set_meta_data_information">
        <parameter key="0" value="A.true.polynominal.attribute"/>
      </list>
      <parameter key="read_not_matching_values_as_missings" value="false"/>
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <description align="center" color="transparent" colored="false" width="126">Adapt location please</description>
    </operator>
    <!-- Give every word of this sheet a Score of -1 and lower-case the word in A.
         NOTE(review): presumably this sheet is the negative word list; confirm
         against the workbook. -->
    <operator activated="true" class="generate_attributes" compatibility="9.6.000" expanded="true" height="82" name="Generate Attributes" width="90" x="246" y="493">
      <list key="function_descriptions">
        <parameter key="Score" value="-1"/>
        <parameter key="A" value="lower(A)"/>
      </list>
      <parameter key="keep_all" value="true"/>
    </operator>
    <!-- Rename A to Word, the name used as key_attribute by the sentiment learner below. -->
    <operator activated="true" class="rename" compatibility="9.6.000" expanded="true" height="82" name="Rename" width="90" x="380" y="493">
      <parameter key="old_name" value="A"/>
      <parameter key="new_name" value="Word"/>
      <list key="rename_additional_attributes"/>
    </operator>
    <!-- Dictionary branch 2: same workbook and settings, but sheet_number 3. -->
    <operator activated="true" class="read_excel" compatibility="9.6.000" expanded="true" height="68" name="Read Excel (2)" width="90" x="112" y="646">
      <parameter key="excel_file" value="C:\Users\MartinSchmitz\Downloads\LoughranMcDonald_SentimentWordLists_2018.xlsx"/>
      <parameter key="sheet_selection" value="sheet number"/>
      <parameter key="sheet_number" value="3"/>
      <parameter key="imported_cell_range" value="A1"/>
      <parameter key="encoding" value="SYSTEM"/>
      <parameter key="first_row_as_names" value="false"/>
      <list key="annotations"/>
      <parameter key="date_format" value=""/>
      <parameter key="time_zone" value="SYSTEM"/>
      <parameter key="locale" value="English (United States)"/>
      <parameter key="read_all_values_as_polynominal" value="false"/>
      <list key="data_set_meta_data_information">
        <parameter key="0" value="A.true.polynominal.attribute"/>
      </list>
      <parameter key="read_not_matching_values_as_missings" value="false"/>
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <description align="center" color="transparent" colored="false" width="126">Adapt location please</description>
    </operator>
    <!-- Give every word of this sheet a Score of +1 and lower-case the word in A.
         NOTE(review): presumably this sheet is the positive word list; confirm
         against the workbook. -->
    <operator activated="true" class="generate_attributes" compatibility="9.6.000" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="246" y="646">
      <list key="function_descriptions">
        <parameter key="Score" value="+1"/>
        <parameter key="A" value="lower(A)"/>
      </list>
      <parameter key="keep_all" value="true"/>
    </operator>
    <!-- Rename A to Word so both branches share the same attribute names. -->
    <operator activated="true" class="rename" compatibility="9.6.000" expanded="true" height="82" name="Rename (2)" width="90" x="380" y="646">
      <parameter key="old_name" value="A"/>
      <parameter key="new_name" value="Word"/>
      <list key="rename_additional_attributes"/>
    </operator>
    <!-- Combine both scored word lists into a single dictionary example set. -->
    <operator activated="true" class="append" compatibility="9.6.000" expanded="true" height="103" name="Append" width="90" x="514" y="544">
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <parameter key="merge_type" value="all"/>
    </operator>
    <!-- Learn the dictionary model: Word is the key attribute, Score the value
         attribute; no negation attribute is set. -->
    <operator activated="true" class="operator_toolbox:dictionary_sentiment_learner" compatibility="2.4.000-SNAPSHOT" expanded="true" height="82" name="Dictionary-Based Sentiment (Documents)" width="90" x="648" y="544">
      <parameter key="value_attribute" value="Score"/>
      <parameter key="key_attribute" value="Word"/>
      <parameter key="negation_attribute" value=""/>
      <parameter key="negation_window_size" value="1"/>
      <parameter key="use_symmetric_negation_window" value="false"/>
    </operator>
    <!-- Two hard-coded test documents; replace these with your own document source. -->
    <operator activated="true" class="text:create_document" compatibility="8.2.000" expanded="true" height="68" name="Create Document" width="90" x="648" y="187">
      <parameter key="text" value="This is a great document which should be scored rather positive!"/>
      <parameter key="add label" value="false"/>
      <parameter key="label_type" value="nominal"/>
    </operator>
    <operator activated="true" class="text:create_document" compatibility="8.2.000" expanded="true" height="68" name="Create Document (2)" width="90" x="648" y="289">
      <parameter key="text" value="This is a **** document."/>
      <parameter key="add label" value="false"/>
      <parameter key="label_type" value="nominal"/>
    </operator>
    <!-- Bundle both documents into one collection. -->
    <operator activated="true" class="collect" compatibility="9.6.000" expanded="true" height="103" name="Collect" width="90" x="782" y="187">
      <parameter key="unfold" value="false"/>
    </operator>
    <!-- Preprocess each document of the collection: tokenize on non letters,
         then transform to lower case (the dictionary words were lower-cased too). -->
    <operator activated="true" class="loop_collection" compatibility="9.6.000" expanded="true" height="82" name="Loop Collection" width="90" x="916" y="187">
      <parameter key="set_iteration_macro" value="false"/>
      <parameter key="macro_name" value="iteration"/>
      <parameter key="macro_start_value" value="1"/>
      <parameter key="unfold" value="false"/>
      <process expanded="true">
        <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize" width="90" x="380" y="34">
          <parameter key="mode" value="non letters"/>
          <parameter key="characters" value=".:"/>
          <parameter key="language" value="English"/>
          <parameter key="max_token_length" value="3"/>
        </operator>
        <operator activated="true" class="text:transform_cases" compatibility="8.2.000" expanded="true" height="68" name="Transform Cases" width="90" x="581" y="34">
          <parameter key="transform_to" value="lower case"/>
        </operator>
        <!-- Inner wiring: single document, Tokenize, Transform Cases, output 1. -->
        <connect from_port="single" to_op="Tokenize" to_port="document"/>
        <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
        <connect from_op="Transform Cases" from_port="document" to_port="output 1"/>
        <portSpacing port="source_single" spacing="0"/>
        <portSpacing port="sink_output 1" spacing="0"/>
        <portSpacing port="sink_output 2" spacing="0"/>
      </process>
    </operator>
    <!-- Score the preprocessed documents with the learned dictionary model. -->
    <operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="2.4.000-SNAPSHOT" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="1117" y="391">
      <list key="application_parameters"/>
    </operator>
    <!-- Wiring of the dictionary branches into Append and the sentiment learner. -->
    <connect from_op="Read Excel" from_port="output" to_op="Generate Attributes" to_port="example set input"/>
    <connect from_op="Generate Attributes" from_port="example set output" to_op="Rename" to_port="example set input"/>
    <connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Read Excel (2)" from_port="output" to_op="Generate Attributes (2)" to_port="example set input"/>
    <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
    <connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Append" from_port="merged set" to_op="Dictionary-Based Sentiment (Documents)" to_port="exa"/>
    <connect from_op="Dictionary-Based Sentiment (Documents)" from_port="mod" to_op="Apply Model (Documents)" to_port="mod"/>
    <!-- Wiring of the test documents through preprocessing into model application. -->
    <connect from_op="Create Document" from_port="output" to_op="Collect" to_port="input 1"/>
    <connect from_op="Create Document (2)" from_port="output" to_op="Collect" to_port="input 2"/>
    <connect from_op="Collect" from_port="collection" to_op="Loop Collection" to_port="collection"/>
    <connect from_op="Loop Collection" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>
    <!-- The example set produced by Apply Model is the only process result. -->
    <connect from_op="Apply Model (Documents)" from_port="exa" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="168"/>
    <description align="center" color="yellow" colored="false" height="294" resized="true" width="599" x="26" y="454">This generates the dictionary as needed in the &quot;Dict based Sentiment&quot; operator</description>
    <description align="center" color="yellow" colored="false" height="251" resized="true" width="518" x="559" y="119">This creates two test documents. It also does the preprocessing of it. Note that you need to tokenize your documents before applying it! This is done in &quot;loop collection&quot;</description>
  </process>
</operator>
</process>
4