text mining filter stopwords (dictionary)
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (format version 5.2.003).
  Flow: Read Excel, then Process Documents from Data, then Correlation Matrix.
  The two file paths below are absolute Windows paths from the author's machine
  and must be adjusted before the process is run elsewhere.
-->
<process version="5.2.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.003" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true" height="415" width="547">
      <!-- Imports cell range A1:A748 of the spreadsheet; the first row is NOT used as attribute names. -->
      <operator activated="true" class="read_excel" compatibility="5.2.003" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="E:\R-2.14.0\bin\i386\tickets.xls"/>
        <parameter key="imported_cell_range" value="A1:A748"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <!-- NOTE(review): presumably declares column 0 as a text attribute named "Rationale"; confirm in the import wizard. -->
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Rationale.true.text.attribute"/>
        </list>
      </operator>
      <!--
        Builds the word vector from the imported text. Pruning is absolute with
        bounds 10 (below) and 700 (above); the original text is kept in the result.
      -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="30">
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_absolute" value="10"/>
        <parameter key="prune_above_absolute" value="700"/>
        <list key="specify_weights"/>
        <!--
          Token pipeline, in execution order:
            Transform Cases, Tokenize, Filter Stopwords (English),
            Filter Stopwords (Dictionary), Stem (Snowball),
            Filter Tokens (by Length), Generate n-Grams (Terms).
          Stemming is wired AFTER Tokenize: before tokenization the document has
          not yet been split into words, so a token-level stemmer has nothing
          meaningful to work on. It also runs after both stopword filters so the
          dictionary entries are compared against unstemmed tokens.
        -->
        <process expanded="true" height="434" width="557">
          <operator activated="true" class="text:transform_cases" compatibility="5.2.001" expanded="true" height="60" name="Transform Cases" width="90" x="35" y="45"/>
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="40" y="144"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="45" y="255"/>
          <!-- Custom stopword list read from a plain text file given by the "file" parameter. -->
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.2.001" expanded="true" height="60" name="Filter Stopwords (Dictionary)" width="90" x="179" y="300">
            <parameter key="file" value="E:\Rapid Miner\RapidMiner5\Stop_Dictionary05082012.txt"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <operator activated="true" class="text:stem_snowball" compatibility="5.2.001" expanded="true" height="60" name="Stem (Snowball)" width="90" x="154" y="49"/>
          <!-- Keeps tokens whose length is between 2 and 9999 characters. -->
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="270" y="225">
            <parameter key="min_chars" value="2"/>
            <parameter key="max_chars" value="9999"/>
          </operator>
          <!-- NOTE(review): max_length is 1, so presumably no multi-word terms are generated; confirm whether this operator is needed. -->
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.2.001" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="380" y="165">
            <parameter key="max_length" value="1"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
          <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Its three outputs (example set, matrix, weights) are delivered as process results 1 to 3. -->
      <operator activated="true" class="correlation_matrix" compatibility="5.2.003" expanded="true" height="94" name="Correlation Matrix" width="90" x="54" y="198"/>
      <connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Correlation Matrix" to_port="example set"/>
      <connect from_op="Correlation Matrix" from_port="example set" to_port="result 1"/>
      <connect from_op="Correlation Matrix" from_port="matrix" to_port="result 2"/>
      <connect from_op="Correlation Matrix" from_port="weights" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
Answers
-
Hi,
your process seems to be okay. What does your dictionary file look like?
Best,
Nils0 -
Hi Nils,
My dictionary has about 52 words, one per line, all in one column. Do I need a column header or any other formatting? Thanks for responding.0 -
Hi Nils,
I retyped all the words into a new txt file and it now runs fine. Thanks for your efforts.0 -
Hello, I am having the same problem as described above. I followed all the steps written in the comments and I also have a dictionary of words, but the process returns all the words instead of picking only the words specified in my dictionary, which is what I actually want.
Regards.
0 -
Did you create a new text file, like the previous poster?
0 -
Thanks, T-Bone. Yes, I created a text file for the vocabulary.
Regards,
0 -
My input is a folder containing plain-text files.
regards,
0 -
Hello. Is there a video tutorial on text mining using the Filter Stopwords (Dictionary) operator? Please let me know; it's really urgent.
Regards,
0 -
The process you provided does not include an operator that loads the TXT file into the Filter Stopwords (Dictionary) operator. You will need an Open File operator, connected to the "file" input port of Filter Stopwords (Dictionary).
0