text mining breaks text into symbols

New Altair Community Member

Sep 11, 2018

Updated Nov 5, 2024 by Jocelyn

Hello,

I'm trying to do text mining with a large excel table with many text entrys (many words in a cell). Unfortunately my "Process Documents from Files" breaks my text into a mixture of symbols and letters.

I aktually do not know why it is doing that, but also my word list looks like that.

Can you tell why this happens?

Thanks a lot

Imke

unbenannt_2.png

unbenannt.png

Find more posts tagged with

AI Studio

Text Mining + NLP

Getting Started

Sort by:

1 - 3 of 31

MartinLiebig

Altair Employee

Sep 11, 2018

Hi,

can you make sure that you tried the right encoding? it looks like this was stored with UTF-8 (Mac/Linux Standard) but read with a Windows Encoding.

Br,

Martin

imke

New Altair Community Member

Sep 12, 2018

Hello,

underneath you can see my process. Maybe you can tell, what is wrong, and why it crashes rapid miner, too.

Thank you

Imke

<?xml version="1.0" encoding="UTF-8"?><process version="7.5.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.003" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="text:process_document_from_file" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="380" y="34">
        <list key="text_directories">
          <parameter key="q-star" value="\\ADS.DLH.DE\LHuser$\LHT\HAM42\U555221\Documents\02_T_AL1Q\06_Q-star_Events\q-star-imread"/>
        </list>
        <parameter key="encoding" value="ISO-8859-1"/>
        <process expanded="true">
          <operator activated="true" breakpoints="after" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34"/>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="380" y="34"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="514" y="34"/>
          <operator activated="true" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="648" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
      <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Files" from_port="word list" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

MartinLiebig

Altair Employee

Sep 12, 2018

For Reference,

the issue was that the files in the folder were Excel-Files. Read Document from Files is only able to handle pure text files. The attached process soled the issue.

Best,

Martin

<?xml version="1.0" encoding="UTF-8"?><process version="9.0.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="read_excel" compatibility="9.0.002" expanded="true" height="68" name="Read Excel" width="90" x="45" y="187">
        <list key="annotations"/>
        <list key="data_set_meta_data_information"/>
        <description align="center" color="transparent" colored="false" width="126">Use Import Wizard to read your file</description>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="9.0.002" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="187">
        <description align="center" color="transparent" colored="false" width="126">Make sure the text is tagged as text</description>
      </operator>
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="187">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" breakpoints="after" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="313" y="34"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="8.1.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="447" y="34"/>
          <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="581" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

🎉Community Raffle - Win $25

text mining breaks text into symbols

Find more posts tagged with

Quick Links