🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Text Mining of multiple PDF files with separate key word counts

bazi66User: "bazi66"
New Altair Community Member
Updated by Jocelyn

Hello all,

 

I am new to this community and hope that somebody can help me. I already searched the forum a lot and found very good topics, but I couldn't find a proper solution for my task. Here's what I want to do:

I have about 500 PDF files and want to text mine them and compare the results to key words I already have in Excel.

The problem is that I want a word count and a keyword comparison for each PDF file (not overall), with a column for the results in an Excel sheet. When I run my process using "Process Documents from Files" with a "Tokenize" operator inside it, I only get back the sum over all documents, not the counts for each PDF file.

I already tried a different approach: a "Loop Files" operator whose subprocess starts with the "Read Document" operator. I got no results out of that.

I attached my approaches (I use RapidMiner Studio). Can someone maybe help me with the right approach and the correct process map?

 

Thank you very much for your help in advance!

 

1st approach:

<?xml version="1.0" encoding="UTF-8"?>
<process version="9.0.001">
  <!-- 1st approach: tokenize the files of one directory in a single
       "Process Documents from Files" run, convert the resulting word list
       to an example set and write that example set to an Excel file. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.0.001" expanded="true" name="Process">
    <parameter key="resultfile" value="C:\Users\User\Desktop\hello.xls"/>
    <process expanded="true">
      <operator activated="true" class="text:process_document_from_file" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="45" y="136">
        <list key="text_directories">
          <parameter key="Berichte" value="C:\Users\USER\Dropbox\Dropbox\Masterarbeit\"/>
        </list>
        <!-- Select the PDF files of the directory. The previous value "true"
             was a boolean literal, not a file-name pattern.
             NOTE(review): confirm the pattern syntax against the operator
             documentation; widen to "*" if other file types must be read. -->
        <parameter key="file_pattern" value="*.pdf"/>
        <parameter key="extract_text_only" value="false"/>
        <parameter key="keep_text" value="true"/>
        <!-- Per-document subprocess: tokenization only. -->
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="313" y="187"/>
      <operator activated="true" class="write_excel" compatibility="9.0.001" expanded="true" height="82" name="Write Excel" width="90" x="514" y="238">
        <parameter key="excel_file" value="C:\Users\User\Desktop\Test1234.xlsx"/>
      </operator>
      <!-- result 1: data written to Excel, result 2: example set of the
           document operator, result 3: the word list itself. -->
      <connect from_op="Process Documents from Files" from_port="example set" to_port="result 2"/>
      <connect from_op="Process Documents from Files" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="word list" to_port="result 3"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

2nd approach:

<?xml version="1.0" encoding="UTF-8"?>
<process version="9.0.001">
  <!-- 2nd approach: loop over the files of a directory; for each file read
       it as a PDF document, tokenize it, convert the word list to an example
       set and write that example set to an Excel file. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="loop_files" compatibility="6.4.000" expanded="true" height="82" name="Loop Files" width="90" x="45" y="34">
        <parameter key="directory" value="C:\Users\USER\Dropbox\Dropbox\Masterarbeit\"/>
        <!-- Name of the macro used for the current file name; it is
             referenced below as %{file_name_TEST}. -->
        <parameter key="file_name_macro" value="file_name_TEST"/>
        <process expanded="true">
          <operator activated="true" class="text:read_document" compatibility="8.1.000" expanded="true" height="68" name="Read Document" width="90" x="45" y="30">
            <parameter key="content_type" value="pdf"/>
          </operator>
          <operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents" width="90" x="45" y="136">
            <parameter key="add_meta_information" value="false"/>
            <parameter key="prune_below_rank" value="5.0"/>
            <parameter key="prune_above_rank" value="5.0"/>
            <!-- Per-document subprocess: tokenization only. -->
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="120"/>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="45" y="255"/>
          <operator activated="true" class="write_excel" compatibility="9.0.001" expanded="true" height="82" name="Write Excel" width="90" x="380" y="289">
            <!-- One workbook per input file: the file-name macro is part of
                 the path. With the former fixed path (Test1234.xlsx) every
                 loop iteration wrote to the same file.
                 NOTE(review): the macro presumably includes the ".pdf"
                 extension, giving names like "report.pdf.xlsx"; confirm. -->
            <parameter key="excel_file" value="C:\Users\USER\Desktop\%{file_name_TEST}.xlsx"/>
          </operator>
          <connect from_port="file object" to_op="Read Document" to_port="file"/>
          <connect from_op="Read Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
          <connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
          <connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
          <connect from_op="Write Excel" from_port="file" to_port="out 1"/>
          <portSpacing port="source_file object" spacing="0"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Loop Files" from_port="out 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>


 

Sort by:
1 - 1 of 11
    sgenzerUser: "sgenzer"
    Altair Employee
    Accepted Answer

    hi @bazi66 - does this help? I disabled the Write Excel but you can obviously re-enable it if you want.

     

    <?xml version="1.0" encoding="UTF-8"?>
    <process version="9.0.001">
      <!-- For every PDF in a directory: read it, tokenize it, turn its word
           list into an example set and rename the count columns after the
           file. The per-file example sets are then unioned into one. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="concurrency:loop_files" compatibility="9.0.001" expanded="true" height="82" name="Loop Files (2)" width="90" x="45" y="34">
            <parameter key="directory" value="/Users/genzerconsulting/Desktop/loop files"/>
            <parameter key="filter_type" value="regex"/>
            <!-- The dot before "pdf" is escaped so that only a literal
                 ".pdf" suffix matches; the unescaped form ".*.pdf" also
                 accepted names such as "notapdf". -->
            <parameter key="filter_by_regex" value=".*\.pdf"/>
            <parameter key="enable_macros" value="true"/>
            <process expanded="true">
              <operator activated="true" class="text:read_document" compatibility="8.1.000" expanded="true" height="68" name="Read Document" width="90" x="45" y="34">
                <parameter key="content_type" value="pdf"/>
              </operator>
              <operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents" width="90" x="179" y="34">
                <parameter key="prune_below_rank" value="5.0"/>
                <parameter key="prune_above_rank" value="5.0"/>
                <!-- Per-document subprocess: tokenization only. -->
                <process expanded="true">
                  <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="120"/>
                  <connect from_port="document" to_op="Tokenize" to_port="document"/>
                  <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="313" y="34"/>
              <!-- Suffix the "in documents" and "total" columns with the
                   current file name so each file keeps its own columns. -->
              <operator activated="true" class="rename" compatibility="9.0.001" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
                <parameter key="old_name" value="in documents"/>
                <parameter key="new_name" value="in %{file_name}"/>
                <list key="rename_additional_attributes">
                  <parameter key="total" value="total %{file_name}"/>
                </list>
              </operator>
              <connect from_port="file object" to_op="Read Document" to_port="file"/>
              <connect from_op="Read Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
              <connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
              <connect from_op="WordList to Data" from_port="example set" to_op="Rename" to_port="example set input"/>
              <connect from_op="Rename" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_file object" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!-- Deactivated and not connected; re-enable and wire it up to
               export the result. -->
          <operator activated="false" class="write_excel" compatibility="9.0.001" expanded="true" height="82" name="Write Excel" width="90" x="313" y="34">
            <parameter key="excel_file" value="C:\Users\USER\Desktop\Test123456.xlsx"/>
          </operator>
          <!-- Folds the collection coming from the file loop into one
               example set: iteration 1 passes its element through, later
               iterations union the element with the set stored as LoopData;
               the result of each iteration is stored as LoopData again. -->
          <operator activated="true" class="subprocess" compatibility="9.0.001" expanded="true" height="82" name="Union Append" width="90" x="179" y="34">
            <process expanded="true">
              <operator activated="true" class="loop_collection" compatibility="9.0.001" expanded="true" height="82" name="Output (4)" width="90" x="45" y="34">
                <parameter key="set_iteration_macro" value="true"/>
                <process expanded="true">
                  <!-- Deactivated operator with a breakpoint; not connected. -->
                  <operator activated="false" breakpoints="after" class="select" compatibility="9.0.001" expanded="true" height="68" name="Select (5)" width="90" x="112" y="34">
                    <parameter key="index" value="%{iteration}"/>
                  </operator>
                  <operator activated="true" class="branch" compatibility="9.0.001" expanded="true" height="82" name="Branch (2)" width="90" x="313" y="34">
                    <parameter key="condition_type" value="expression"/>
                    <parameter key="expression" value="%{iteration}==1"/>
                    <!-- First subprocess: pass the element through unchanged. -->
                    <process expanded="true">
                      <connect from_port="condition" to_port="input 1"/>
                      <portSpacing port="source_condition" spacing="0"/>
                      <portSpacing port="source_input 1" spacing="0"/>
                      <portSpacing port="sink_input 1" spacing="0"/>
                      <portSpacing port="sink_input 2" spacing="0"/>
                    </process>
                    <!-- Second subprocess: union the element with LoopData. -->
                    <process expanded="true">
                      <operator activated="true" class="recall" compatibility="9.0.001" expanded="true" height="68" name="Recall (5)" width="90" x="45" y="187">
                        <parameter key="name" value="LoopData"/>
                      </operator>
                      <operator activated="true" class="union" compatibility="9.0.001" expanded="true" height="82" name="Union (2)" width="90" x="179" y="34"/>
                      <connect from_port="condition" to_op="Union (2)" to_port="example set 1"/>
                      <connect from_op="Recall (5)" from_port="result" to_op="Union (2)" to_port="example set 2"/>
                      <connect from_op="Union (2)" from_port="union" to_port="input 1"/>
                      <portSpacing port="source_condition" spacing="0"/>
                      <portSpacing port="source_input 1" spacing="0"/>
                      <portSpacing port="sink_input 1" spacing="0"/>
                      <portSpacing port="sink_input 2" spacing="0"/>
                    </process>
                  </operator>
                  <operator activated="true" class="remember" compatibility="9.0.001" expanded="true" height="68" name="Remember (5)" width="90" x="581" y="34">
                    <parameter key="name" value="LoopData"/>
                  </operator>
                  <connect from_port="single" to_op="Branch (2)" to_port="condition"/>
                  <connect from_op="Branch (2)" from_port="input 1" to_op="Remember (5)" to_port="store"/>
                  <connect from_op="Remember (5)" from_port="stored" to_port="output 1"/>
                  <portSpacing port="source_single" spacing="0"/>
                  <portSpacing port="sink_output 1" spacing="0"/>
                  <portSpacing port="sink_output 2" spacing="0"/>
                </process>
              </operator>
              <!-- Picks the collection entry at index %{iteration}.
                   NOTE(review): this appears to rely on the iteration macro
                   still holding its last value after the loop, so that the
                   final (complete) union is selected; confirm. -->
              <operator activated="true" class="select" compatibility="9.0.001" expanded="true" height="68" name="Select (6)" width="90" x="179" y="34">
                <parameter key="index" value="%{iteration}"/>
              </operator>
              <connect from_port="in 1" to_op="Output (4)" to_port="collection"/>
              <connect from_op="Output (4)" from_port="output 1" to_op="Select (6)" to_port="collection"/>
              <connect from_op="Select (6)" from_port="selected" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Loop Files (2)" from_port="output 1" to_op="Union Append" to_port="in 1"/>
          <connect from_op="Union Append" from_port="out 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    Scott

     

    [EDIT - ok I think you probably want to aggregate by word. I did this in this next process and also added case transformation and some stemming. Just made sense to me.]

     

    <?xml version="1.0" encoding="UTF-8"?>
    <process version="9.0.001">
      <!-- For every PDF in a directory: read it, apply case transformation,
           tokenization and Porter stemming, turn its word list into an
           example set and rename the count columns after the file. The
           per-file example sets are unioned and then aggregated by word. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="concurrency:loop_files" compatibility="9.0.001" expanded="true" height="82" name="Loop Files (2)" width="90" x="45" y="34">
            <parameter key="directory" value="/Users/genzerconsulting/Desktop/loop files"/>
            <parameter key="filter_type" value="regex"/>
            <!-- The dot before "pdf" is escaped so that only a literal
                 ".pdf" suffix matches; the unescaped form ".*.pdf" also
                 accepted names such as "notapdf". -->
            <parameter key="filter_by_regex" value=".*\.pdf"/>
            <parameter key="enable_macros" value="true"/>
            <process expanded="true">
              <operator activated="true" class="text:read_document" compatibility="8.1.000" expanded="true" height="68" name="Read Document" width="90" x="45" y="34">
                <parameter key="content_type" value="pdf"/>
              </operator>
              <operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents" width="90" x="179" y="34">
                <parameter key="prune_below_rank" value="5.0"/>
                <parameter key="prune_above_rank" value="5.0"/>
                <!-- Per-document subprocess:
                     Transform Cases, then Tokenize, then Stem (Porter). -->
                <process expanded="true">
                  <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="34"/>
                  <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34"/>
                  <operator activated="true" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="313" y="34"/>
                  <connect from_port="document" to_op="Transform Cases" to_port="document"/>
                  <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
                  <connect from_op="Tokenize" from_port="document" to_op="Stem (Porter)" to_port="document"/>
                  <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="313" y="34"/>
              <!-- Suffix the "in documents" and "total" columns with the
                   current file name so each file keeps its own columns. -->
              <operator activated="true" class="rename" compatibility="9.0.001" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
                <parameter key="old_name" value="in documents"/>
                <parameter key="new_name" value="in %{file_name}"/>
                <list key="rename_additional_attributes">
                  <parameter key="total" value="total %{file_name}"/>
                </list>
              </operator>
              <connect from_port="file object" to_op="Read Document" to_port="file"/>
              <connect from_op="Read Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
              <connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
              <connect from_op="WordList to Data" from_port="example set" to_op="Rename" to_port="example set input"/>
              <connect from_op="Rename" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_file object" spacing="0"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!-- Deactivated and not connected; re-enable and wire it up to
               export the result. -->
          <operator activated="false" class="write_excel" compatibility="9.0.001" expanded="true" height="82" name="Write Excel" width="90" x="313" y="238">
            <parameter key="excel_file" value="C:\Users\USER\Desktop\Test123456.xlsx"/>
          </operator>
          <!-- Folds the collection coming from the file loop into one
               example set: iteration 1 passes its element through, later
               iterations union the element with the set stored as LoopData;
               the result of each iteration is stored as LoopData again. -->
          <operator activated="true" class="subprocess" compatibility="9.0.001" expanded="true" height="82" name="Union Append" width="90" x="179" y="34">
            <process expanded="true">
              <operator activated="true" class="loop_collection" compatibility="9.0.001" expanded="true" height="82" name="Output (4)" width="90" x="45" y="34">
                <parameter key="set_iteration_macro" value="true"/>
                <process expanded="true">
                  <!-- Deactivated operator with a breakpoint; not connected. -->
                  <operator activated="false" breakpoints="after" class="select" compatibility="9.0.001" expanded="true" height="68" name="Select (5)" width="90" x="112" y="34">
                    <parameter key="index" value="%{iteration}"/>
                  </operator>
                  <operator activated="true" class="branch" compatibility="9.0.001" expanded="true" height="82" name="Branch (2)" width="90" x="313" y="34">
                    <parameter key="condition_type" value="expression"/>
                    <parameter key="expression" value="%{iteration}==1"/>
                    <!-- First subprocess: pass the element through unchanged. -->
                    <process expanded="true">
                      <connect from_port="condition" to_port="input 1"/>
                      <portSpacing port="source_condition" spacing="0"/>
                      <portSpacing port="source_input 1" spacing="0"/>
                      <portSpacing port="sink_input 1" spacing="0"/>
                      <portSpacing port="sink_input 2" spacing="0"/>
                    </process>
                    <!-- Second subprocess: union the element with LoopData. -->
                    <process expanded="true">
                      <operator activated="true" class="recall" compatibility="9.0.001" expanded="true" height="68" name="Recall (5)" width="90" x="45" y="187">
                        <parameter key="name" value="LoopData"/>
                      </operator>
                      <operator activated="true" class="union" compatibility="9.0.001" expanded="true" height="82" name="Union (2)" width="90" x="179" y="34"/>
                      <connect from_port="condition" to_op="Union (2)" to_port="example set 1"/>
                      <connect from_op="Recall (5)" from_port="result" to_op="Union (2)" to_port="example set 2"/>
                      <connect from_op="Union (2)" from_port="union" to_port="input 1"/>
                      <portSpacing port="source_condition" spacing="0"/>
                      <portSpacing port="source_input 1" spacing="0"/>
                      <portSpacing port="sink_input 1" spacing="0"/>
                      <portSpacing port="sink_input 2" spacing="0"/>
                    </process>
                  </operator>
                  <operator activated="true" class="remember" compatibility="9.0.001" expanded="true" height="68" name="Remember (5)" width="90" x="581" y="34">
                    <parameter key="name" value="LoopData"/>
                  </operator>
                  <connect from_port="single" to_op="Branch (2)" to_port="condition"/>
                  <connect from_op="Branch (2)" from_port="input 1" to_op="Remember (5)" to_port="store"/>
                  <connect from_op="Remember (5)" from_port="stored" to_port="output 1"/>
                  <portSpacing port="source_single" spacing="0"/>
                  <portSpacing port="sink_output 1" spacing="0"/>
                  <portSpacing port="sink_output 2" spacing="0"/>
                </process>
              </operator>
              <!-- Picks the collection entry at index %{iteration}.
                   NOTE(review): this appears to rely on the iteration macro
                   still holding its last value after the loop, so that the
                   final (complete) union is selected; confirm. -->
              <operator activated="true" class="select" compatibility="9.0.001" expanded="true" height="68" name="Select (6)" width="90" x="179" y="34">
                <parameter key="index" value="%{iteration}"/>
              </operator>
              <connect from_port="in 1" to_op="Output (4)" to_port="collection"/>
              <connect from_op="Output (4)" from_port="output 1" to_op="Select (6)" to_port="collection"/>
              <connect from_op="Select (6)" from_port="selected" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <!-- Groups the unioned rows by the "word" attribute and applies
               the default aggregation function (sum). -->
          <operator activated="true" class="aggregate" compatibility="9.0.001" expanded="true" height="82" name="Aggregate" width="90" x="313" y="34">
            <parameter key="use_default_aggregation" value="true"/>
            <parameter key="default_aggregation_function" value="sum"/>
            <list key="aggregation_attributes"/>
            <parameter key="group_by_attributes" value="word"/>
          </operator>
          <connect from_op="Loop Files (2)" from_port="output 1" to_op="Union Append" to_port="in 1"/>
          <connect from_op="Union Append" from_port="out 1" to_op="Aggregate" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>