/* Primary navigation landmark: remove the default padding. */
nav[aria-label="Primary Navigation"] {
  padding: 0;

  /* Lists inside the nav: no bullets, full width, laid out as a
     horizontal flex row with items packed and aligned to the start. */
  & ul {
    list-style: none;
    width: 100%;
    display: flex;
    flex-direction: row;
    justify-content: start;
    align-items: start;
    gap: 30px;
    padding: 0;

    /* List items nested inside those lists carry no margin. */
    & li {
      margin: 0;
    }

    /* Items of a list nested inside another list also show no bullets. */
    & ul li {
      list-style: none;
    }
  }
}

Siemens Community Catalyst Program

The Siemens Community Catalyst program was co-created with our community to acknowledge technology leaders who consistently contribute to the Siemens Community. Nominations are accepted on a rolling basis.

Nominate Now

Reading multiple text files from a folder and converting them to an ExampleSet?

curious95

I have a folder that contains multiple text files. How can we read the content of each file and use it as a row in the ExampleSet?

For example:

For a folder with 5 text files, the ExampleSet would look like this:

| Row | TextContent       |
| --- | ----------------- |
| 1   | contents of file1 |
| 2   | contents of file2 |
| 3   | contents of file3 |
| 4   | contents of file4 |
| 5   | contents of file5 |

Find more posts tagged with

AI Studio

Text Mining + NLP

Accepted answers

greg_lorincz79

I'm having trouble with the above process since it only picks up the first line of every document. The files are emails in .txt format. How can I extract the whole body of the texts?

<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
  <!-- RapidMiner process as posted in the question: Loop Files feeds each file
       into Read CSV, whose output is passed to Store. -->
  <!-- Loop Files: walks the given directory (glob filter, non-recursive) with
       parallel execution enabled. -->
  <operator activated="true" class="concurrency:loop_files" compatibility="8.2.000" expanded="true" height="82" name="Loop Files" width="90" x="199" y="119">
    <parameter key="directory" value="/Users/alkopop79/Datasets/enron_sample"/>
    <parameter key="filter_type" value="glob"/>
    <parameter key="recursive" value="false"/>
    <parameter key="enable_macros" value="false"/>
    <parameter key="macro_for_file_name" value="file_name"/>
    <parameter key="macro_for_file_type" value="file_type"/>
    <parameter key="macro_for_folder_name" value="folder_name"/>
    <parameter key="reuse_results" value="false"/>
    <parameter key="enable_parallel_execution" value="true"/>
    <process expanded="true">
      <!-- Read CSV: receives the loop's "file object" on its "file" port (see
           the connect elements below). NOTE(review): csv_file still points at
           a .DS_Store path; presumably the connected port overrides it - verify. -->
      <!-- NOTE(review): this operator parses the input as delimited tabular
           data (";" separator, quotes enabled), which is likely why free-form
           email text is not captured as one value per file. A document reader
           from the Text Processing extension, as suggested elsewhere in this
           thread, may be the better fit - confirm. -->
      <operator activated="true" class="read_csv" compatibility="8.2.000" expanded="true" height="68" name="Read CSV" width="90" x="243" y="262">
        <parameter key="csv_file" value="/Users/alkopop79/Datasets/enron_sample/.DS_Store"/>
        <parameter key="column_separators" value=";"/>
        <parameter key="trim_lines" value="false"/>
        <parameter key="use_quotes" value="true"/>
        <parameter key="quotes_character" value="&quot;"/>
        <parameter key="escape_character" value="\"/>
        <parameter key="skip_comments" value="false"/>
        <parameter key="comment_characters" value="#"/>
        <parameter key="parse_numbers" value="true"/>
        <parameter key="decimal_character" value="."/>
        <parameter key="grouped_digits" value="false"/>
        <parameter key="grouping_character" value=","/>
        <parameter key="date_format" value=""/>
        <parameter key="first_row_as_names" value="false"/>
        <!-- Row 0 carries the "Name" annotation. NOTE(review): presumably this
             makes the first line of each file the attribute name rather than
             data - confirm against the Read CSV documentation. -->
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="encoding" value="UTF-8"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information"/>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
        <!-- Both "datamanagement" and "data_management" keys are present, as exported. -->
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- Store: writes to the fixed repository entry "enron_sample" on every
           iteration. NOTE(review): with one entry name for all files, later
           iterations presumably overwrite earlier ones - verify. -->
      <operator activated="true" class="store" compatibility="8.2.000" expanded="true" height="68" name="Store" width="90" x="476" y="268">
        <parameter key="repository_entry" value="enron_sample"/>
      </operator>
      <!-- Wiring: file object -> Read CSV -> Store -> loop output 1. -->
      <connect from_port="file object" to_op="Read CSV" to_port="file"/>
      <connect from_op="Read CSV" from_port="output" to_op="Store" to_port="input"/>
      <connect from_op="Store" from_port="through" to_port="output 1"/>
      <portSpacing port="source_file object" spacing="0"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_output 1" spacing="0"/>
      <portSpacing port="sink_output 2" spacing="0"/>
    </process>
  </operator>
</process>

All comments

kayman

If you have the Text Processing extension you could do as follows :

Loop Files operator -> inside the operator, connect the "fil" (file object) port -> Read Document -> Documents to Data -> "out" (output) port -> then, outside the loop, the Append operator

Something like this :

<?xml version="1.0" encoding="UTF-8"?>
<process version="7.5.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <!-- Root operator: Loop Files produces one example set per file, and Append
       merges them into a single result. -->
  <operator activated="true" class="process" compatibility="7.5.001"
            expanded="true" name="Process">
    <process expanded="true">
      <!-- Iterate over the files under "myDirectory" (recursive, parallel
           execution disabled). -->
      <operator activated="true" class="concurrency:loop_files"
                compatibility="7.5.001" expanded="true" height="82"
                name="Loop Files" width="90" x="112" y="34">
        <parameter key="directory" value="myDirectory"/>
        <parameter key="recursive" value="true"/>
        <parameter key="enable_parallel_execution" value="false"/>
        <process expanded="true">
          <!-- Per file: read it as a document, then convert the document to
               data with the text attribute named "mytext". -->
          <operator activated="true" class="text:read_document"
                    compatibility="7.5.000" expanded="true" height="68"
                    name="Read Document" width="90" x="112" y="34"/>
          <operator activated="true" class="text:documents_to_data"
                    compatibility="7.5.000" expanded="true" height="82"
                    name="Documents to Data" width="90" x="246" y="34">
            <parameter key="text_attribute" value="mytext"/>
          </operator>
          <!-- Wiring inside the loop: file object -> Read Document ->
               Documents to Data -> loop output 1. -->
          <connect from_port="file object" to_op="Read Document" to_port="file"/>
          <connect from_op="Read Document" from_port="output"
                   to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set"
                   to_port="output 1"/>
          <portSpacing port="source_file object" spacing="0"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Merge the loop's output into one set and deliver it as result 1. -->
      <operator activated="true" class="append" compatibility="7.5.001"
                expanded="true" height="82" name="Append" width="90"
                x="246" y="34"/>
      <connect from_op="Loop Files" from_port="output 1"
               to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

curious95

This technique works, but it picks up only the first line of each document instead of the complete content.

Thomas_Ott

You would use the Loop Files operator with a Read CSV (set to read any file) and Store operator inside.

greg_lorincz79

I'm having trouble with the above process since it only picks up the first line of every document. The files are emails in .txt format. How can I extract the whole body of the texts?

<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
  <!-- RapidMiner process as posted in this comment: Loop Files feeds each file
       into Read CSV, whose output is passed to Store. -->
  <!-- Loop Files: walks the given directory (glob filter, non-recursive) with
       parallel execution enabled. -->
  <operator activated="true" class="concurrency:loop_files" compatibility="8.2.000" expanded="true" height="82" name="Loop Files" width="90" x="199" y="119">
    <parameter key="directory" value="/Users/alkopop79/Datasets/enron_sample"/>
    <parameter key="filter_type" value="glob"/>
    <parameter key="recursive" value="false"/>
    <parameter key="enable_macros" value="false"/>
    <parameter key="macro_for_file_name" value="file_name"/>
    <parameter key="macro_for_file_type" value="file_type"/>
    <parameter key="macro_for_folder_name" value="folder_name"/>
    <parameter key="reuse_results" value="false"/>
    <parameter key="enable_parallel_execution" value="true"/>
    <process expanded="true">
      <!-- Read CSV: receives the loop's "file object" on its "file" port (see
           the connect elements below). NOTE(review): csv_file still points at
           a .DS_Store path; presumably the connected port overrides it - verify. -->
      <!-- NOTE(review): this operator parses the input as delimited tabular
           data (";" separator, quotes enabled), which is likely why free-form
           email text is not captured as one value per file. A document reader
           from the Text Processing extension, as suggested elsewhere in this
           thread, may be the better fit - confirm. -->
      <operator activated="true" class="read_csv" compatibility="8.2.000" expanded="true" height="68" name="Read CSV" width="90" x="243" y="262">
        <parameter key="csv_file" value="/Users/alkopop79/Datasets/enron_sample/.DS_Store"/>
        <parameter key="column_separators" value=";"/>
        <parameter key="trim_lines" value="false"/>
        <parameter key="use_quotes" value="true"/>
        <parameter key="quotes_character" value="&quot;"/>
        <parameter key="escape_character" value="\"/>
        <parameter key="skip_comments" value="false"/>
        <parameter key="comment_characters" value="#"/>
        <parameter key="parse_numbers" value="true"/>
        <parameter key="decimal_character" value="."/>
        <parameter key="grouped_digits" value="false"/>
        <parameter key="grouping_character" value=","/>
        <parameter key="date_format" value=""/>
        <parameter key="first_row_as_names" value="false"/>
        <!-- Row 0 carries the "Name" annotation. NOTE(review): presumably this
             makes the first line of each file the attribute name rather than
             data - confirm against the Read CSV documentation. -->
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="encoding" value="UTF-8"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information"/>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
        <!-- Both "datamanagement" and "data_management" keys are present, as exported. -->
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- Store: writes to the fixed repository entry "enron_sample" on every
           iteration. NOTE(review): with one entry name for all files, later
           iterations presumably overwrite earlier ones - verify. -->
      <operator activated="true" class="store" compatibility="8.2.000" expanded="true" height="68" name="Store" width="90" x="476" y="268">
        <parameter key="repository_entry" value="enron_sample"/>
      </operator>
      <!-- Wiring: file object -> Read CSV -> Store -> loop output 1. -->
      <connect from_port="file object" to_op="Read CSV" to_port="file"/>
      <connect from_op="Read CSV" from_port="output" to_op="Store" to_port="input"/>
      <connect from_op="Store" from_port="through" to_port="output 1"/>
      <portSpacing port="source_file object" spacing="0"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_output 1" spacing="0"/>
      <portSpacing port="sink_output 2" spacing="0"/>
    </process>
  </operator>
</process>