text mining breaks text into symbols
Hello,
I'm trying to do text mining with a large excel table with many text entrys (many words in a cell). Unfortunately my "Process Documents from Files" breaks my text into a mixture of symbols and letters.
I aktually do not know why it is doing that, but also my word list looks like that.
Can you tell why this happens?
Thanks a lot
Imke
Answers
-
Hi,
can you make sure that you tried the right encoding? it looks like this was stored with UTF-8 (Mac/Linux Standard) but read with a Windows Encoding.
Br,
Martin
1 -
Hello,
underneath you can see my process. Maybe you can tell, what is wrong, and why it crashes rapid miner, too.
Thank you
Imke
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.5.003" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="text:process_document_from_file" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="380" y="34">
<list key="text_directories">
<parameter key="q-star" value="\\ADS.DLH.DE\LHuser$\LHT\HAM42\U555221\Documents\02_T_AL1Q\06_Q-star_Events\q-star-imread"/>
</list>
<parameter key="encoding" value="ISO-8859-1"/>
<process expanded="true">
<operator activated="true" breakpoints="after" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="34"/>
<operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34"/>
<operator activated="true" class="text:filter_stopwords_german" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="380" y="34"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="514" y="34"/>
<operator activated="true" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="648" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
<connect from_op="Filter Stopwords (German)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
<connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
<connect from_op="Process Documents from Files" from_port="word list" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>0 -
For Reference,
the issue was that the files in the folder were Excel-Files. Read Document from Files is only able to handle pure text files. The attached process soled the issue.
Best,
Martin
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_excel" compatibility="9.0.002" expanded="true" height="68" name="Read Excel" width="90" x="45" y="187">
<list key="annotations"/>
<list key="data_set_meta_data_information"/>
<description align="center" color="transparent" colored="false" width="126">Use Import Wizard to read your file</description>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="9.0.002" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="187">
<description align="center" color="transparent" colored="false" width="126">Make sure the text is tagged as text</description>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="187">
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" breakpoints="after" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
<operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
<operator activated="true" class="text:filter_stopwords_german" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="313" y="34"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="8.1.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="447" y="34"/>
<operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="581" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
<connect from_op="Filter Stopwords (German)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>0