I have a workflow to perform document processing for a collection of plain text files. This workflow consists of three sub-processes
1) Tokenize
2) TransformCase
3) Filter token by length
I am trying to log the following information
1) The total computational time for the whole document processing workflow
2) The computational time for each three sub-processes
Here is the process I created. For illustration purposes, I just use 4 text files as input. In this process, I capture “CPU-Time” and “Time” in the log.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.008">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
<process expanded="true" height="341" width="681">
<operator activated="true" class="select_subprocess" compatibility="5.2.008" expanded="true" height="94" name="Select Subprocess" width="90" x="112" y="30">
<process expanded="true" height="406" width="346">
<operator activated="true" class="text:process_document_from_file" compatibility="5.2.004" expanded="true" height="76" name="Process Documents from Files (3)" width="90" x="246" y="75">
<list key="text_directories">
<parameter key="Responsive" value="C:\Desktop\testdata"/>
</list>
<parameter key="vector_creation" value="Binary Term Occurrences"/>
<process expanded="true" height="431" width="697">
<operator activated="true" class="select_subprocess" compatibility="5.2.008" expanded="true" height="94" name="subprocess1" width="90" x="45" y="30">
<process expanded="true" height="406" width="413">
<operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize" width="90" x="246" y="165"/>
<connect from_port="input 1" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="output 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
<process expanded="true">
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
</operator>
<operator activated="true" class="select_subprocess" compatibility="5.2.008" expanded="true" height="94" name="subprocess2" width="90" x="246" y="75">
<process expanded="true" height="388" width="314">
<operator activated="true" class="text:transform_cases" compatibility="5.2.004" expanded="true" height="60" name="Transform Cases" width="90" x="151" y="71"/>
<connect from_port="input 1" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_port="output 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
<process expanded="true">
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
</operator>
<operator activated="true" class="select_subprocess" compatibility="5.2.008" expanded="true" height="94" name="subprocess3" width="90" x="447" y="30">
<process expanded="true" height="388" width="346">
<operator activated="true" class="text:filter_by_length" compatibility="5.2.004" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="239" y="58">
<parameter key="min_chars" value="1"/>
<parameter key="max_chars" value="200"/>
</operator>
<connect from_port="input 1" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="output 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
<process expanded="true">
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
</operator>
<operator activated="true" class="log" compatibility="5.2.008" expanded="true" height="112" name="Log" width="90" x="313" y="255">
<list key="log">
<parameter key="tokenCPU" value="operator.Tokenize.value.cpu-time"/>
<parameter key="transformCPU" value="operator.Transform Cases.value.cpu-time"/>
<parameter key="filterCPU" value="operator.Filter Tokens (by Length).value.cpu-time"/>
<parameter key="tokenTime" value="operator.Tokenize.value.time"/>
<parameter key="transformTime" value="operator.Transform Cases.value.time"/>
<parameter key="filterTime" value="operator.Filter Tokens (by Length).value.time"/>
</list>
</operator>
<connect from_port="document" to_op="subprocess1" to_port="input 1"/>
<connect from_op="subprocess1" from_port="output 1" to_op="subprocess2" to_port="input 1"/>
<connect from_op="subprocess1" from_port="output 2" to_op="Log" to_port="through 1"/>
<connect from_op="subprocess2" from_port="output 1" to_op="subprocess3" to_port="input 1"/>
<connect from_op="subprocess2" from_port="output 2" to_op="Log" to_port="through 2"/>
<connect from_op="subprocess3" from_port="output 1" to_port="document 1"/>
<connect from_op="subprocess3" from_port="output 2" to_op="Log" to_port="through 3"/>
<connect from_op="Log" from_port="through 1" to_port="document 2"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
<portSpacing port="sink_document 3" spacing="0"/>
</process>
</operator>
<connect from_port="input 1" to_op="Process Documents from Files (3)" to_port="word list"/>
<connect from_op="Process Documents from Files (3)" from_port="example set" to_port="output 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
</operator>
<operator activated="true" class="log" compatibility="5.2.008" expanded="true" height="76" name="Log-DP" width="90" x="380" y="210">
<list key="log">
<parameter key="pdCPUTime" value="operator.Process Documents from Files (3).value.cpu-time"/>
<parameter key="pdTime" value="operator.Process Documents from Files (3).value.time"/>
</list>
</operator>
<connect from_op="Select Subprocess" from_port="output 1" to_port="result 2"/>
<connect from_op="Select Subprocess" from_port="output 2" to_op="Log-DP" to_port="through 1"/>
<connect from_op="Log-DP" from_port="through 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
Based on this result, I have several questions:
1) Why the logged information are so different for different runs? And the first experiment run even has zero value for all three subprocesses.
2) I have four files, why the log information for three sub-processes has eight rows?
3) In Rapidminer, the CPU-time is measured in millisecond; What is measures in “Time” from log?
Thanks a lot.