🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Reading multiple text files from a folder and converting them to an ExampleSet?

User: "curious95"
New Altair Community Member
Updated by Jocelyn

I have a folder that contains multiple text files. How can I read the content of each file and use it as one row in an ExampleSet?

 

For example:

For a folder with 5 text files, the ExampleSet would look like this:

| Row | TextContent       |
| --- | ----------------- |
| 1   | contents of file1 |
| 2   | contents of file2 |
| 3   | contents of file3 |
| 4   | contents of file4 |
| 5   | contents of file5 |

Find more posts tagged with

Sort by:
1 - 4 of 41
    User: "kayman"
    New Altair Community Member

    If you have the Text Processing extension, you could do it as follows:

     

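    Loop Files operator -> inside the operator, connect the "file object" port (shown as "fil") -> Read Document -> Documents to Data -> the "output 1" port (shown as "out") -> then, outside the loop, the Append operator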

     

    Something like this :

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Process layout, as wired by the connect elements below:
        Loop Files (file object) to Read Document to Documents to Data to loop output 1,
        then Loop Files output 1 to Append to result 1.
      Replace the "myDirectory" placeholder with the folder that holds the text files.
    -->
    <process version="7.5.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.5.001" expanded="true" name="Process">
        <process expanded="true">

          <!-- Loop Files: directory "myDirectory", recursive, parallel execution switched off. -->
          <operator activated="true"
                    class="concurrency:loop_files"
                    compatibility="7.5.001"
                    expanded="true"
                    height="82"
                    name="Loop Files"
                    width="90"
                    x="112"
                    y="34">
            <parameter key="directory" value="myDirectory"/>
            <parameter key="recursive" value="true"/>
            <parameter key="enable_parallel_execution" value="false"/>

            <!-- Subprocess nested inside Loop Files. -->
            <process expanded="true">
              <!-- Receives the loop's "file object" on its "file" port. -->
              <operator activated="true"
                        class="text:read_document"
                        compatibility="7.5.000"
                        expanded="true"
                        height="68"
                        name="Read Document"
                        width="90"
                        x="112"
                        y="34"/>
              <!-- Takes the document and emits an example set; text_attribute is set to "mytext". -->
              <operator activated="true"
                        class="text:documents_to_data"
                        compatibility="7.5.000"
                        expanded="true"
                        height="82"
                        name="Documents to Data"
                        width="90"
                        x="246"
                        y="34">
                <parameter key="text_attribute" value="mytext"/>
              </operator>

              <!-- Inner wiring: file object, Read Document, Documents to Data, output 1. -->
              <connect from_port="file object" to_op="Read Document" to_port="file"/>
              <connect from_op="Read Document" from_port="output" to_op="Documents to Data" to_port="documents 1"/>
              <connect from_op="Documents to Data" from_port="example set" to_port="output 1"/>
              <portSpacing port="source_file object" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>

          <!-- Append: fed from Loop Files "output 1"; its "merged set" is the process result. -->
          <operator activated="true"
                    class="append"
                    compatibility="7.5.001"
                    expanded="true"
                    height="82"
                    name="Append"
                    width="90"
                    x="246"
                    y="34"/>

          <!-- Outer wiring: Loop Files, Append, result 1. -->
          <connect from_op="Loop Files" from_port="output 1" to_op="Append" to_port="example set 1"/>
          <connect from_op="Append" from_port="merged set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

     

    User: "curious95"
    New Altair Community Member
    OP

    This technique works, but it picks up only the first line of each document instead of the complete content.

    User: "Thomas_Ott"
    New Altair Community Member

    You would use the Loop Files operator with a Read CSV (set to read any file) and Store operator inside. 

    User: "greg_lorincz79"
    New Altair Community Member
    Accepted Answer

    I'm having trouble with the above process, since it only picks up the first line of every document. The files are emails in .txt format. How can I extract the whole body of each text?

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      Process layout, as wired by the connect elements below:
        Loop Files (file object) to Read CSV to Store to loop output 1.
      The Loop Files output is not connected to anything at the top level.
    -->
    <process version="8.2.000">

      <!-- Loop Files over the enron_sample folder: glob filter, non-recursive, parallel execution on. -->
      <operator activated="true"
                class="concurrency:loop_files"
                compatibility="8.2.000"
                expanded="true"
                height="82"
                name="Loop Files"
                width="90"
                x="199"
                y="119">
        <parameter key="directory" value="/Users/alkopop79/Datasets/enron_sample"/>
        <parameter key="filter_type" value="glob"/>
        <parameter key="recursive" value="false"/>
        <parameter key="enable_macros" value="false"/>
        <parameter key="macro_for_file_name" value="file_name"/>
        <parameter key="macro_for_file_type" value="file_type"/>
        <parameter key="macro_for_folder_name" value="folder_name"/>
        <parameter key="reuse_results" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>

        <!-- Subprocess nested inside Loop Files. -->
        <process expanded="true">

          <!--
            Read CSV gets the loop's "file object" on its "file" port.
            NOTE(review): csv_file still points at a ".DS_Store" path; presumably the
            connected port takes precedence over this parameter. Confirm.
            NOTE(review): the operator is configured with ";" as column separator and
            quote/escape handling, i.e. for delimited data; the thread reports that only
            the first line of each email is picked up. Confirm whether this operator
            suits free-form text.
          -->
          <operator activated="true"
                    class="read_csv"
                    compatibility="8.2.000"
                    expanded="true"
                    height="68"
                    name="Read CSV"
                    width="90"
                    x="243"
                    y="262">
            <parameter key="csv_file" value="/Users/alkopop79/Datasets/enron_sample/.DS_Store"/>
            <parameter key="column_separators" value=";"/>
            <parameter key="trim_lines" value="false"/>
            <parameter key="use_quotes" value="true"/>
            <parameter key="quotes_character" value="&quot;"/>
            <parameter key="escape_character" value="\"/>
            <parameter key="skip_comments" value="false"/>
            <parameter key="comment_characters" value="#"/>
            <parameter key="parse_numbers" value="true"/>
            <parameter key="decimal_character" value="."/>
            <parameter key="grouped_digits" value="false"/>
            <parameter key="grouping_character" value=","/>
            <parameter key="date_format" value=""/>
            <parameter key="first_row_as_names" value="false"/>
            <!-- Row 0 is annotated as "Name". -->
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="encoding" value="UTF-8"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information"/>
            <parameter key="read_not_matching_values_as_missings" value="true"/>
            <!-- Both "datamanagement" and "data_management" keys are present in the original export. -->
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>

          <!--
            NOTE(review): every loop iteration stores to the same repository entry
            "enron_sample" while parallel execution is enabled; presumably later
            iterations replace earlier ones. Confirm.
          -->
          <operator activated="true"
                    class="store"
                    compatibility="8.2.000"
                    expanded="true"
                    height="68"
                    name="Store"
                    width="90"
                    x="476"
                    y="268">
            <parameter key="repository_entry" value="enron_sample"/>
          </operator>

          <!-- Inner wiring: file object, Read CSV, Store, output 1. -->
          <connect from_port="file object" to_op="Read CSV" to_port="file"/>
          <connect from_op="Read CSV" from_port="output" to_op="Store" to_port="input"/>
          <connect from_op="Store" from_port="through" to_port="output 1"/>
          <portSpacing port="source_file object" spacing="0"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
    </process>