Hello all,
I am new to this community and hope that somebody can help me. I already searched the forum a lot and found very good topics, but I couldn't find a proper solution for my task. Here's what I want to do:
I have about 500 PDF files and want to text mine them and compare the results to key words I already have in Excel.
The problem is, that I want to get a word count and a comparison for each PDF file (not overall) and a column for the results in an Excel Sheet. When I start my process with the "Process from Files" with a "Tokenize" operator in it, I only get back the sum over all documents, but not for each PDF file.
I already tried it with a different approach: A "Loop" operator, starting with the "Read from document" Process. I got no results out of that.
I attached my approaches (I use RapidMiner Studio). Can someone maybe help me with the right approach and the correct process map?
Thank you very much for your help in advance!
1st approach::
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.0.001" expanded="true" name="Process">
<parameter key="resultfile" value="C:\Users\User\Desktop\hello.xls"/>
<process expanded="true">
<operator activated="true" class="text:process_document_from_file" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="45" y="136">
<list key="text_directories">
<parameter key="Berichte" value="C:\Users\USER\Dropbox\Dropbox\Masterarbeit\"/>
</list>
<parameter key="file_pattern" value="true"/>
<parameter key="extract_text_only" value="false"/>
<parameter key="keep_text" value="true"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="313" y="187"/>
<operator activated="true" class="write_excel" compatibility="9.0.001" expanded="true" height="82" name="Write Excel" width="90" x="514" y="238">
<parameter key="excel_file" value="C:\Users\User\Desktop\Test1234.xlsx"/>
</operator>
<connect from_op="Process Documents from Files" from_port="example set" to_port="result 2"/>
<connect from_op="Process Documents from Files" from_port="word list" to_op="WordList to Data" to_port="word list"/>
<connect from_op="WordList to Data" from_port="word list" to_port="result 3"/>
<connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
<connect from_op="Write Excel" from_port="through" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>
2nd approach:
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="loop_files" compatibility="6.4.000" expanded="true" height="82" name="Loop Files" width="90" x="45" y="34">
<parameter key="directory" value="C:\Users\USER\Dropbox\Dropbox\Masterarbeit\"/>
<parameter key="file_name_macro" value="file_name_TEST"/>
<process expanded="true">
<operator activated="true" class="text:read_document" compatibility="8.1.000" expanded="true" height="68" name="Read Document" width="90" x="45" y="30">
<parameter key="content_type" value="pdf"/>
</operator>
<operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents" width="90" x="45" y="136">
<parameter key="add_meta_information" value="false"/>
<parameter key="prune_below_rank" value="5.0"/>
<parameter key="prune_above_rank" value="5.0"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="120"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="45" y="255"/>
<operator activated="true" class="write_excel" compatibility="9.0.001" expanded="true" height="82" name="Write Excel" width="90" x="380" y="289">
<parameter key="excel_file" value="C:\Users\USER\Desktop\Test1234.xlsx"/>
</operator>
<connect from_port="file object" to_op="Read Document" to_port="file"/>
<connect from_op="Read Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
<connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
<connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
<connect from_op="Write Excel" from_port="file" to_port="out 1"/>
<portSpacing port="source_file object" spacing="0"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
<connect from_op="Loop Files" from_port="out 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>