Process Document from Data
Hello, Everyone!
I am very beginner in rapid miner and doing a sentiment analysis on tweets. I have a problem at a basic level. I am using a tool process document data to generate tf-idf vector and word counts after cleaning the tweets. I have opened an excel file which containing 2000 tweets with reading excel operator and passes these data to nominal to text operator and select the text attribute for generating the matrix. when I am running my process the process document data not showing the result which I am expected a term frequency matrix instead it showing simple data as I loaded in excel sheet.
I am using rapid miner 8.1.001 version
Best Answer
-
@waqaskhan343 I see you have toggled on 'select attributes and weights' on the Process Doc operator and not entered anything. That's probably your problem. Toggle that off and run again.
2
Answers
-
@waqaskhan343 you should post your XML of the process to help us. Did you put a Tokenize operator inside the Process Documents operator?
0 -
Yes, I am using the tokenizer operator in process document data. please see the attachment
thank you in advance0 -
@Thomas_Ott this xml of my process which I am running on rapid minor
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
<context>
<input/>
<output/>
<macros/>
</context>
<!--
  Root operator of the tweet-processing workflow.
  Chain: Read Excel, then Nominal to Text (on the "text" attribute), then
  Process Documents from Data, whose example set output is delivered to "result 1".
-->
<operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
  <process expanded="true">
    <!-- Loads cells A1:B2760 of the workbook; column 0 is imported as the regular
         attribute "text" and column 1 as the id attribute "screenName". -->
    <operator activated="true" class="read_excel" compatibility="8.1.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="187">
      <parameter key="excel_file" value="C:\Users\Waqas Khan\Desktop\NewResarchWorkAfghanVsTaliban\AfterRemovingDuplicatesText.xlsx"/>
      <parameter key="imported_cell_range" value="A1:B2760"/>
      <parameter key="first_row_as_names" value="false"/>
      <list key="annotations">
        <parameter key="0" value="Name"/>
      </list>
      <list key="data_set_meta_data_information">
        <parameter key="0" value="text.true.polynominal.attribute"/>
        <parameter key="1" value="screenName.true.polynominal.id"/>
      </list>
    </operator>
    <!-- Converts only the single attribute "text" from nominal to text. -->
    <operator activated="true" class="nominal_to_text" compatibility="8.1.001" expanded="true" height="82" name="Nominal to Text" width="90" x="246" y="187">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="text"/>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="187">
      <!-- FIX: this was "true" while the specify_weights list below is empty.
           With the option enabled and no attribute entered, the operator returned
           the input data as loaded instead of a term matrix. Switched off so the
           text attribute is processed. -->
      <parameter key="select_attributes_and_weights" value="false"/>
      <list key="specify_weights"/>
      <!-- Per-document pipeline: Transform Cases, Tokenize, Filter Stopwords (English),
           Filter Tokens (by Length, min 3 characters), Stem (Porter). -->
      <process expanded="true">
        <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="34"/>
        <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34"/>
        <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="34"/>
        <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="447" y="34">
          <parameter key="min_chars" value="3"/>
        </operator>
        <operator activated="true" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="581" y="34"/>
        <connect from_port="document" to_op="Transform Cases" to_port="document"/>
        <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
        <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
        <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
        <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
        <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
        <portSpacing port="source_document" spacing="0"/>
        <portSpacing port="sink_document 1" spacing="0"/>
        <portSpacing port="sink_document 2" spacing="0"/>
      </process>
    </operator>
    <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
    <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
  </process>
</operator>
</process>0 -
@waqaskhan343 please edit your previous post and use the </> code format button. This ensures proper rendering of your XML code.
Also, it's RapidMiner, not Rapid minor.
2 -
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
<context>
<input/>
<output/>
<macros/>
</context>
<!--
  Root operator of the tweet-processing workflow.
  Chain: Read Excel, then Nominal to Text (on the "text" attribute), then
  Process Documents from Data, whose example set output is delivered to "result 1".
-->
<operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
  <process expanded="true">
    <!-- Loads cells A1:B2760 of the workbook; column 0 is imported as the regular
         attribute "text" and column 1 as the id attribute "screenName". -->
    <operator activated="true" class="read_excel" compatibility="8.1.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="187">
      <parameter key="excel_file" value="C:\Users\Waqas Khan\Desktop\NewResarchWorkAfghanVsTaliban\AfterRemovingDuplicatesText.xlsx"/>
      <parameter key="imported_cell_range" value="A1:B2760"/>
      <parameter key="first_row_as_names" value="false"/>
      <list key="annotations">
        <parameter key="0" value="Name"/>
      </list>
      <list key="data_set_meta_data_information">
        <parameter key="0" value="text.true.polynominal.attribute"/>
        <parameter key="1" value="screenName.true.polynominal.id"/>
      </list>
    </operator>
    <!-- Converts only the single attribute "text" from nominal to text. -->
    <operator activated="true" class="nominal_to_text" compatibility="8.1.001" expanded="true" height="82" name="Nominal to Text" width="90" x="246" y="187">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="text"/>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="187">
      <!-- FIX: this was "true" while the specify_weights list below is empty.
           With the option enabled and no attribute entered, the operator returned
           the input data as loaded instead of a term matrix. Switched off so the
           text attribute is processed. -->
      <parameter key="select_attributes_and_weights" value="false"/>
      <list key="specify_weights"/>
      <!-- Per-document pipeline: Transform Cases, Tokenize, Filter Stopwords (English),
           Filter Tokens (by Length, min 3 characters), Stem (Porter). -->
      <process expanded="true">
        <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="34"/>
        <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34"/>
        <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="34"/>
        <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="447" y="34">
          <parameter key="min_chars" value="3"/>
        </operator>
        <operator activated="true" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="581" y="34"/>
        <connect from_port="document" to_op="Transform Cases" to_port="document"/>
        <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
        <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
        <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
        <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
        <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
        <portSpacing port="source_document" spacing="0"/>
        <portSpacing port="sink_document 1" spacing="0"/>
        <portSpacing port="sink_document 2" spacing="0"/>
      </process>
    </operator>
    <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
    <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
  </process>
</operator>
</process>1 -
1
-
@waqaskhan343 I see you have toggled on 'select attributes and weights' on the Process Doc operator and not entered anything. That's probably your problem. Toggle that off and run again.
2 -
@Thomas_Ott oh yes its working now so the problem is that check on attributee thank you so much you save my life. thank you again and again, sir @Thomas_Ott
2 -
wow - well done @waqaskhan343 for learning how to post XML and getting your question answered by resident unicorn @Thomas_Ott!
I almost want to send you some RapidMiner swag gifts for this...
Scott
1 -
@sgenzer Rofl, :smileyvery-happy: =D
1