Hi,
I tried to analyse a log file (using Read Document), extract some information from it, and group that information.
The logfile looks like this:
[22:18:48.421] log.channel.name1: INFO: class#method: message bla bla bla
[22:19:48.421] log.channel.name2: ERROR: class#method: message2 bla bla bla
I tokenized the timestamp, the log channel name, the log level (INFO, WARN, ERROR), the class/method the message comes from, the message itself, and the whole line.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: reads a log file as a document, duplicates it, and sends one copy
  to an Extract Information operator and the other to a Process Documents operator that
  contains a chain of regular-expression tokenizers.

  Every regular expression that uses a lookbehind is stored with its "<" written as
  "&lt;": a literal "<" is not allowed inside an XML attribute value, and the parser
  turns "&lt;" back into "<" before the expression reaches the operator.
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <parameter key="logverbosity" value="all"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true" height="756" width="748">

      <!-- Source: the log file to analyse. -->
      <operator activated="true" class="text:read_document" compatibility="5.2.004" expanded="true" height="60" name="Read Document" width="90" x="45" y="75">
        <parameter key="file" value="C:\Temp\bla.log"/>
        <parameter key="use_file_extension_as_type" value="false"/>
      </operator>

      <!-- Fans the document out to the two branches wired up in the connect elements below. -->
      <operator activated="true" class="multiply" compatibility="5.2.008" expanded="true" height="94" name="Multiply" width="90" x="45" y="255"/>

      <!-- Branch 1: regular-expression queries; each parameter key names the query, the value is its pattern. -->
      <operator activated="true" class="text:extract_information" compatibility="5.2.004" expanded="true" height="60" name="Extract Information" width="90" x="246" y="210">
        <parameter key="query_type" value="Regular Expression"/>
        <list key="string_machting_queries"/>
        <!-- NOTE(review): the queries below extract text such as INFO or class#method; confirm that "Numerical" is the intended attribute type. -->
        <parameter key="attribute_type" value="Numerical"/>
        <list key="regular_expression_queries">
          <!-- The only capturing group sits inside the lookbehind and holds the level keyword. -->
          <!-- NOTE(review): the pattern matches WARNING; confirm the log does not write WARN instead. -->
          <parameter key="LogLevel" value="(?&lt;=\s(\bINFO\b|\bWARNING\b|\bERROR\b):\s)"/>
          <parameter key="Class#Method" value="(\w{1,40}#\w{1,50}:\s)"/>
        </list>
        <list key="regular_region_queries"/>
        <list key="xpath_queries"/>
        <list key="namespaces"/>
        <list key="index_queries"/>
      </operator>

      <!-- Branch 2: tokenizer chain; delivers an example set and a word list (see the connect elements below). -->
      <operator activated="true" class="text:process_documents" compatibility="5.2.004" expanded="true" height="94" name="Process Documents" width="90" x="246" y="300">
        <parameter key="vector_creation" value="Binary Term Occurrences"/>
        <process expanded="true" height="774" width="731">

          <!-- Step 1: split the document at the pattern \n (one token per log line). -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize" width="90" x="112" y="75">
            <parameter key="mode" value="regular expression"/>
            <parameter key="characters" value="\n"/>
            <parameter key="expression" value="\n"/>
          </operator>

          <!-- Step 2: split at the whitespace that follows a bracketed run of digits, colons and dots. -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize| timestamp" width="90" x="318" y="212">
            <parameter key="mode" value="regular expression"/>
            <parameter key="expression" value="(?&lt;=\[[\d:\.]{1,20}\])\s"/>
          </operator>

          <!-- Step 3: zero-width split directly after "INFO: ", "WARNING: " or "ERROR: ". -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize| LogLevel" width="90" x="313" y="120">
            <parameter key="mode" value="regular expression"/>
            <parameter key="expression" value="(?&lt;=\s(\bINFO\b|\bWARNING\b|\bERROR\b):\s)"/>
          </operator>

          <!-- Step 4: split at the whitespace that follows a three-part dotted name ending in ":". -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize|Channel" width="90" x="313" y="30">
            <parameter key="mode" value="regular expression"/>
            <parameter key="expression" value="(?&lt;=\w{1,6}\.\w{1,30}\.\w{1,30}:)\s"/>
          </operator>

          <!-- Step 5: zero-width split directly after "word#word: ". -->
          <operator activated="true" class="text:tokenize" compatibility="5.2.004" expanded="true" height="60" name="Tokenize|ClassMethod" width="90" x="447" y="30">
            <parameter key="mode" value="regular expression"/>
            <parameter key="expression" value="(?&lt;=\w{1,40}#\w{1,50}:\s)"/>
          </operator>

          <!-- Wiring: the tokenizers run strictly one after another in the order of the steps above. -->
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Tokenize| timestamp" to_port="document"/>
          <connect from_op="Tokenize| timestamp" from_port="document" to_op="Tokenize| LogLevel" to_port="document"/>
          <connect from_op="Tokenize| LogLevel" from_port="document" to_op="Tokenize|Channel" to_port="document"/>
          <connect from_op="Tokenize|Channel" from_port="document" to_op="Tokenize|ClassMethod" to_port="document"/>
          <connect from_op="Tokenize|ClassMethod" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Top-level wiring: result 1 = Extract Information output, result 2 = example set, result 3 = word list. -->
      <connect from_op="Read Document" from_port="output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Extract Information" to_port="document"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Extract Information" from_port="document" to_port="result 1"/>
      <connect from_op="Process Documents" from_port="example set" to_port="result 2"/>
      <connect from_op="Process Documents" from_port="word list" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="198"/>
      <portSpacing port="sink_result 2" spacing="54"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="18"/>
    </process>
  </operator>
</process>
But how can I group these tokens? I want to create one attribute per field, for example an attribute called timestamp that contains all timestamps matched by the regular expression, e.g.
attribute Values
timestamp [18:18:48.421],[19:20:48.421],[22:38:43.421],[22:44:44.421]
line [22:18:48.421] log.channel.name1: INFO: class#method: message bla bla bla,[22:19:48.421] log.channel.name2: ERROR: class#method: message2 bla bla bla
Is RapidMiner the right tool for this?
Maybe the "Text: Extract Information" operator could solve the problem (I can specify an attribute name and the matching regular expression), but I don't know how to proceed from there.
It was already tricky to find out that you have to wrap the regular expression in parentheses ( ), i.e. give it a capturing group; otherwise I got a "process failed: no group 1" exception.
Another option was to use "Web: Read Server Log" and describe the file format in a config file. The problem is that the log file also contains stack traces and other information (but that's another matter altogether).
So far...