Include filename in Excel output
Hi,
I'm new to RM.
I'm analysing a bunch of text files in a folder, getting some great results using n-grams, much excitement.
I really want to include the filename in the results that I'm sending out to Excel. I see that the filename is included in the meatada of the ExampleSet but I am converting the WordList to data and outputting that to Excel, how can I get the filenames as the last column in this dataset?
I have searched high and low and found several ideas but I think my skill level is still too low to properly use them.
I'll post the xml below as I have see other posters do in the past, I'd really appreciate any assistance.
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="text:process_document_from_file" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="112" y="85">
<list key="text_directories">
<parameter key="WFS call text" value="/Users/mark/Downloads/voicetext"/>
</list>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="187"/>
<operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="187"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="187"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="447" y="187">
<parameter key="max_length" value="7"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data" width="90" x="313" y="238"/>
<operator activated="true" class="write_excel" compatibility="8.0.001" expanded="true" height="82" name="Write Excel" width="90" x="514" y="238">
<parameter key="excel_file" value="/Users/mark/Downloads/WFS patterns.xlsx"/>
</operator>
<connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
<connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
<connect from_op="Process Documents from Files" from_port="word list" to_op="WordList to Data" to_port="word list"/>
<connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Answers
-
hello @Markw - welcome to the community. So the filename should be part of the metadata that comes out of Process Documents from Files. I cannot see that column because I don't have your files, but it should be there.
So if that's the case, I would put an "Extract Macro" operator on that wire that will save the filename. Then you can create an attribute with that macro's information. Something like this:
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="text:process_document_from_file" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="112" y="85">
<list key="text_directories">
<parameter key="WFS call text" value="/Users/mark/Downloads/voicetext"/>
</list>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="187"/>
<operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="187"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="313" y="187"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="447" y="187">
<parameter key="max_length" value="7"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="extract_macro" compatibility="8.0.001" expanded="true" height="68" name="Extract Macro" width="90" x="380" y="85">
<parameter key="macro" value="filename"/>
<parameter key="macro_type" value="data_value"/>
<parameter key="attribute_name" value="filename"/>
<parameter key="example_index" value="1"/>
<list key="additional_macros"/>
</operator>
<operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data" width="90" x="313" y="238"/>
<operator activated="true" class="generate_attributes" compatibility="8.0.001" expanded="true" height="82" name="Generate Attributes" width="90" x="447" y="238">
<list key="function_descriptions">
<parameter key="filename" value="%{filename}"/>
</list>
</operator>
<operator activated="true" class="write_excel" compatibility="8.0.001" expanded="true" height="82" name="Write Excel" width="90" x="581" y="238">
<parameter key="excel_file" value="/Users/mark/Downloads/WFS patterns.xlsx"/>
</operator>
<connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
<connect from_op="Process Documents from Files" from_port="example set" to_op="Extract Macro" to_port="example set"/>
<connect from_op="Process Documents from Files" from_port="word list" to_op="WordList to Data" to_port="word list"/>
<connect from_op="Extract Macro" from_port="example set" to_port="result 1"/>
<connect from_op="WordList to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>Scott
1