Extracting the Tokenized Result Data
maxfax
New Altair Community Member
Hi,
First I read a txt file, and afterwards I would like to do some simple text-processing steps and then export the newly stemmed and processed data to a txt file.
I got all the steps working, but I just can't export the results. I don't know if it's clear, but I would like to have a txt file which contains the processed text as it is shown in the result table.
First I read a txt file, and afterwards I would like to do some simple text-processing steps and then export the newly stemmed and processed data to a txt file.
I got all the steps working, but I just can't export the results. I don't know if it's clear, but I would like to have a txt file which contains the processed text as it is shown in the result table.
<!-- Read the raw text file from disk into a single document object -->
<operator activated="true" class="text:read_document" compatibility="5.2.004" expanded="true" height="60" name="Read Document" width="90" x="45" y="120">
<parameter key="file" value="C:\mystring.txt"/>
<parameter key="extract_text_only" value="true"/>
<parameter key="use_file_extension_as_type" value="true"/>
<parameter key="content_type" value="txt"/>
<parameter key="encoding" value="SYSTEM"/>
</operator>
<!-- Split the document into tokens.
     NOTE(review): with mode="non letters" the "characters", "language" and
     "max_token_length" parameters are presumably ignored (they belong to the
     other tokenization modes) - confirm against the Tokenize operator docs. -->
<operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize" width="90" x="196" y="138">
<parameter key="mode" value="non letters"/>
<parameter key="characters" value=".:"/>
<parameter key="language" value="English"/>
<parameter key="max_token_length" value="3"/>
</operator>
<!-- Normalize all tokens to lower case -->
<operator activated="true" class="text:transform_cases" compatibility="5.2.004" expanded="true" height="60" name="Transform Cases" width="90" x="311" y="69">
<parameter key="transform_to" value="lower case"/>
</operator>
<!-- Remove German stopwords using the extension's standard list -->
<operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.004" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="313" y="210">
<parameter key="stop_word_list" value="Standard"/>
</operator>
<!-- Reduce each token to its German stem -->
<operator activated="true" class="text:stem_german" compatibility="5.2.004" expanded="true" height="60" name="Stem (German)" width="90" x="447" y="30"/>
<!-- Keep only tokens between 2 and 25 characters long -->
<operator activated="true" class="text:filter_by_length" compatibility="5.2.004" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="210">
<parameter key="min_chars" value="2"/>
<parameter key="max_chars" value="25"/>
</operator>
<!-- Linear pipeline wiring:
     Read Document -> Tokenize -> Transform Cases -> Filter Stopwords (German)
     -> Stem (German) -> Filter Tokens (by Length) -> result 1 -->
<connect from_op="Read Document" from_port="output" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
<connect from_op="Filter Stopwords (German)" from_port="document" to_op="Stem (German)" to_port="document"/>
<connect from_op="Stem (German)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="result 1"/>
<!-- Layout hints for the process canvas only; no effect on execution -->
<portSpacing port="source_input 1" spacing="108"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Tagged:
0
Answers
-
Indeed, this is not very intuitive. Of course you need the "Write Document" operator, but unfortunately this operator does not write the processed tokens. To do this you have to add the "Combine Documents" operator, which not only concatenates multiple documents but also creates a new document with the tokens instead of the original text. Thus, adding this operator before the write operator yields your desired output.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.3 process: read a text file, tokenize/normalize/stem it,
     then write the PROCESSED tokens back to a text file. The key trick is
     "Combine Documents", which rebuilds a plain document from the token
     sequence so that "Write Document" emits the processed tokens rather
     than the original, untouched text. -->
<process version="5.3.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
<process expanded="true" height="431" width="815">
<!-- Read the source text file as one document -->
<operator activated="true" class="text:read_document" compatibility="5.2.005" expanded="true" height="60" name="Read Document" width="90" x="45" y="120">
<parameter key="file" value="/home/marcin/mystring.txt"/>
<parameter key="encoding" value="SYSTEM"/>
</operator>
<!-- Default tokenization (split on non-letter characters) -->
<operator activated="true" class="text:tokenize" compatibility="5.2.005" expanded="true" height="60" name="Tokenize" width="90" x="196" y="138"/>
<!-- Lower-case all tokens -->
<operator activated="true" class="text:transform_cases" compatibility="5.2.005" expanded="true" height="60" name="Transform Cases" width="90" x="311" y="69"/>
<!-- Drop German stopwords -->
<operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.005" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="313" y="210"/>
<!-- Reduce tokens to their German stems -->
<operator activated="true" class="text:stem_german" compatibility="5.2.005" expanded="true" height="60" name="Stem (German)" width="90" x="447" y="30"/>
<!-- Keep tokens of at least 2 characters -->
<operator activated="true" class="text:filter_by_length" compatibility="5.2.005" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="210">
<parameter key="min_chars" value="2"/>
</operator>
<!-- Rebuild a plain-text document from the processed token sequence;
     without this step, Write Document would output the original text -->
<operator activated="true" class="text:combine_documents" compatibility="5.2.005" expanded="true" height="76" name="Combine Documents" width="90" x="581" y="210"/>
<!-- Write the combined (processed) document to disk -->
<operator activated="true" class="text:write_document" compatibility="5.2.005" expanded="true" height="76" name="Write Document" width="90" x="581" y="30">
<parameter key="file" value="/home/marcin/mystring-doc.txt"/>
</operator>
<connect from_op="Read Document" from_port="output" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
<connect from_op="Filter Stopwords (German)" from_port="document" to_op="Stem (German)" to_port="document"/>
<connect from_op="Stem (German)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Combine Documents" to_port="documents 1"/>
<connect from_op="Combine Documents" from_port="document" to_op="Write Document" to_port="document"/>
<connect from_op="Write Document" from_port="document" to_port="result 1"/>
<!-- Canvas layout hints only; no effect on execution -->
<portSpacing port="source_input 1" spacing="108"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Thank you very much
Marcin wrote:
Indeed, this is not very intuitive. Of course you need the "Write Document" operator, but unfortunately this operator does not write the processed tokens. To do this you have to add the "Combine Documents" operator, which not only concatenates multiple documents but also creates a new document with the tokens instead of the original text. Thus, adding this operator before the write operator yields your desired output.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.3 process: read a text file, tokenize/normalize/stem it,
     then write the PROCESSED tokens back to a text file. "Combine Documents"
     rebuilds a plain document from the token sequence so that
     "Write Document" emits the processed tokens, not the original text. -->
<process version="5.3.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
<process expanded="true" height="431" width="815">
<!-- Read the source text file as one document -->
<operator activated="true" class="text:read_document" compatibility="5.2.005" expanded="true" height="60" name="Read Document" width="90" x="45" y="120">
<parameter key="file" value="/home/marcin/mystring.txt"/>
<parameter key="encoding" value="SYSTEM"/>
</operator>
<!-- Default tokenization (split on non-letter characters) -->
<operator activated="true" class="text:tokenize" compatibility="5.2.005" expanded="true" height="60" name="Tokenize" width="90" x="196" y="138"/>
<!-- Lower-case all tokens -->
<operator activated="true" class="text:transform_cases" compatibility="5.2.005" expanded="true" height="60" name="Transform Cases" width="90" x="311" y="69"/>
<!-- Drop German stopwords -->
<operator activated="true" class="text:filter_stopwords_german" compatibility="5.2.005" expanded="true" height="60" name="Filter Stopwords (German)" width="90" x="313" y="210"/>
<!-- Reduce tokens to their German stems -->
<operator activated="true" class="text:stem_german" compatibility="5.2.005" expanded="true" height="60" name="Stem (German)" width="90" x="447" y="30"/>
<!-- Keep tokens of at least 2 characters -->
<operator activated="true" class="text:filter_by_length" compatibility="5.2.005" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="210">
<parameter key="min_chars" value="2"/>
</operator>
<!-- Rebuild a plain-text document from the processed token sequence;
     without this step, Write Document would output the original text -->
<operator activated="true" class="text:combine_documents" compatibility="5.2.005" expanded="true" height="76" name="Combine Documents" width="90" x="581" y="210"/>
<!-- Write the combined (processed) document to disk -->
<operator activated="true" class="text:write_document" compatibility="5.2.005" expanded="true" height="76" name="Write Document" width="90" x="581" y="30">
<parameter key="file" value="/home/marcin/mystring-doc.txt"/>
</operator>
<connect from_op="Read Document" from_port="output" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
<connect from_op="Filter Stopwords (German)" from_port="document" to_op="Stem (German)" to_port="document"/>
<connect from_op="Stem (German)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Combine Documents" to_port="documents 1"/>
<connect from_op="Combine Documents" from_port="document" to_op="Write Document" to_port="document"/>
<connect from_op="Write Document" from_port="document" to_port="result 1"/>
<!-- Canvas layout hints only; no effect on execution -->
<portSpacing port="source_input 1" spacing="108"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>