🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

text mining breaks text into symbols

User: "imke"
New Altair Community Member
Updated by Jocelyn

Hello,

I'm trying to do text mining on a large Excel table with many text entries (many words in a cell). Unfortunately, my "Process Documents from Files" operator breaks my text into a mixture of symbols and letters. unbenannt_2.png

I actually do not know why it is doing that, but my word list looks like that, too.

unbenannt.png

Can you tell why this happens?

Thanks a lot

Imke

Sort by:
1 - 3 of 31

    Hi,

     

    Can you make sure that you tried the right encoding? It looks like this was stored with UTF-8 (Mac/Linux standard) but read with a Windows encoding.

     

    Br,

    Martin

    User: "imke"
    New Altair Community Member
    OP

    Hello,

    Below you can see my process. Maybe you can tell what is wrong, and also why it crashes RapidMiner.

    Thank you

    Imke

    <?xml version="1.0" encoding="UTF-8"?>
    <process version="7.5.003">
      <!-- Original process from the question: a single "Process Documents from Files"
           operator wrapping a five-step text pipeline. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.5.003" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="text:process_document_from_file" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="380" y="34">
            <!-- One directory entry, keyed "q-star", pointing at a network share. -->
            <list key="text_directories">
              <parameter key="q-star" value="\\ADS.DLH.DE\LHuser$\LHT\HAM42\U555221\Documents\02_T_AL1Q\06_Q-star_Events\q-star-imread"/>
            </list>
            <!-- Files are read as ISO-8859-1. NOTE(review): the follow-up reply in this
                 thread reports that the folder held Excel files, not plain text. -->
            <parameter key="encoding" value="ISO-8859-1"/>
            <process expanded="true">
              <!-- Operators, in the order they are wired together below. -->
              <operator activated="true" breakpoints="after" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="34"/>
              <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34"/>
              <operator activated="true" class="text:filter_stopwords_german" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="380" y="34"/>
              <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="514" y="34"/>
              <operator activated="true" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="648" y="34"/>
              <!-- Linear chain from the incoming "document" port to "document 1". -->
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
              <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
              <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Top-level wiring: "input 1" feeds the operator's "word list" port; its
               "example set" and "word list" outputs go to "result 1" and "result 2". -->
          <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
          <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
          <connect from_op="Process Documents from Files" from_port="word list" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>

    For Reference,

    the issue was that the files in the folder were Excel files. Process Documents from Files is only able to handle pure text files. The attached process solved the issue.

     

    Best,

    Martin

     

    <?xml version="1.0" encoding="UTF-8"?>
    <process version="9.0.002">
      <!-- Three operators wired in sequence: "Read Excel", then "Nominal to Text",
           then "Process Documents from Data", whose "example set" output goes to
           "result 1". -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process">
        <process expanded="true">
          <!-- No file is configured here; the operator's own note says to use the
               Import Wizard. -->
          <operator activated="true" class="read_excel" compatibility="9.0.002" expanded="true" height="68" name="Read Excel" width="90" x="45" y="187">
            <list key="annotations"/>
            <list key="data_set_meta_data_information"/>
            <description align="center" color="transparent" colored="false" width="126">Use Import Wizard to read your file</description>
          </operator>
          <operator activated="true" class="nominal_to_text" compatibility="9.0.002" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="187">
            <description align="center" color="transparent" colored="false" width="126">Make sure the text is tagged as text</description>
          </operator>
          <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="187">
            <list key="specify_weights"/>
            <process expanded="true">
              <!-- Operators, in the order they are wired together below. -->
              <operator activated="true" breakpoints="after" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
              <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
              <operator activated="true" class="text:filter_stopwords_german" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="313" y="34"/>
              <operator activated="true" class="text:generate_n_grams_terms" compatibility="8.1.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="447" y="34"/>
              <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="581" y="34"/>
              <!-- Linear chain from the incoming "document" port to "document 1". -->
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
              <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
              <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!-- Top-level wiring between the three operators and the process result. -->
          <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>