EXTRACTING DATA FROM UNSTRUCTURED TEXT FILE

floyd1233
New Altair Community Member
Hi
I have a TEXT file with unstructured data
-------------------------------------------------------------------********-------------------------------------------------------
ADD UCELLMEAS:CELLID=29335, LOGICRNCID=900, INTERFREQINTERRATMEASIND=INTER_FREQ_AND_INTER_RAT, RPTIND=NO_REPORT, MAXNUMRPTCELLS=CURRENT_CELL_AND_2BEST_NEIGHBOUR, FACHMEASIND=INTER_FREQ_AND_INTER_RAT, FACHMEASOCCACYCLELENCOEF=6, RPTINDIND=REQUIRE, MAXNUMRPTCELLSIND=REQUIRE, INTRAFREQMEASIND=REQUIRE, DEFERMCREADIND=FALSE;
ADD UCELLMEAS:CELLID=29336, LOGICRNCID=900, INTERFREQINTERRATMEASIND=INTER_FREQ_AND_INTER_RAT, RPTIND=NO_REPORT, MAXNUMRPTCELLS=CURRENT_CELL_AND_2BEST_NEIGHBOUR, FACHMEASIND=INTER_FREQ_AND_INTER_RAT, FACHMEASOCCACYCLELENCOEF=6, RPTINDIND=REQUIRE, MAXNUMRPTCELLSIND=REQUIRE, INTRAFREQMEASIND=REQUIRE, DEFERMCREADIND=FALSE;
ADD UCHPWROFFSET:CELLID=21711, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7;
ADD UCHPWROFFSET:CELLID=34051, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7;
ADD UCHPWROFFSET:CELLID=34052, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7;
ADD UCHPWROFFSET:CELLID=34053, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7;
-------------------------------------------------------------------********-------------------------------------------------------
I am only interested in extracting rows that contain "UCHPWROFFSET" and would like to convert it to the below format (Tabular)
CELLID LOGICRNCID AICHPOWEROFFSET PICHPOWEROFFSET
21711 900 -6 -7
34051 900 -6 -7
34052 900 -6 -7
34053 900 -6 -7
Any idea how this can be done using operators within RapidMiner?
Floyd
I have a TEXT file with unstructured data
-------------------------------------------------------------------********-------------------------------------------------------
ADD UCELLMEAS:CELLID=29335, LOGICRNCID=900, INTERFREQINTERRATMEASIND=INTER_FREQ_AND_INTER_RAT, RPTIND=NO_REPORT, MAXNUMRPTCELLS=CURRENT_CELL_AND_2BEST_NEIGHBOUR, FACHMEASIND=INTER_FREQ_AND_INTER_RAT, FACHMEASOCCACYCLELENCOEF=6, RPTINDIND=REQUIRE, MAXNUMRPTCELLSIND=REQUIRE, INTRAFREQMEASIND=REQUIRE, DEFERMCREADIND=FALSE;
ADD UCELLMEAS:CELLID=29336, LOGICRNCID=900, INTERFREQINTERRATMEASIND=INTER_FREQ_AND_INTER_RAT, RPTIND=NO_REPORT, MAXNUMRPTCELLS=CURRENT_CELL_AND_2BEST_NEIGHBOUR, FACHMEASIND=INTER_FREQ_AND_INTER_RAT, FACHMEASOCCACYCLELENCOEF=6, RPTINDIND=REQUIRE, MAXNUMRPTCELLSIND=REQUIRE, INTRAFREQMEASIND=REQUIRE, DEFERMCREADIND=FALSE;
ADD UCHPWROFFSET:CELLID=21711, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7;
ADD UCHPWROFFSET:CELLID=34051, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7;
ADD UCHPWROFFSET:CELLID=34052, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7;
ADD UCHPWROFFSET:CELLID=34053, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7;
-------------------------------------------------------------------********-------------------------------------------------------
I am only interested in extracting rows that contain "UCHPWROFFSET" and would like to convert it to the below format (Tabular)
CELLID LOGICRNCID AICHPOWEROFFSET PICHPOWEROFFSET
21711 900 -6 -7
34051 900 -6 -7
34052 900 -6 -7
34053 900 -6 -7
Any idea how this can be done using operators within RapidMiner?
Floyd
Tagged:
0
Answers
-
A regular expression using Cut Document would do the trick.
I haven't the time to create the exact ones you need, but the basics would be a lookbehind for each field, for example:
(?<=.*UCHPWROFFSET.*CELLID\=)[0-9]* (or something like that)
For ease of demonstration, here is a quick demo I knocked up. I doubt you'd want to use this in production, though, and you will need to tweak it a bit.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.0.001">
  <!--
    Demo process: pull the UCHPWROFFSET rows out of a block of text and turn
    their key=value fields into a table.

    Flow: Create Document, then the outer Cut Document named "UCHPWROFFSET"
    (with a nested sub-process), then Documents to Data.

    Special characters inside attribute values are written as entities
    (&lt; and &quot;) because a raw less-than sign or a raw double quote inside
    a double-quoted attribute value makes the file ill-formed XML.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.0.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Inline sample input; the text is kept on one line, so line breaks of the original file appear as spaces here. -->
      <operator activated="true" class="text:create_document" compatibility="7.0.000" expanded="true" height="68" name="Create Document" width="90" x="45" y="34">
        <parameter key="text" value="-------------------------------------------------------------------********------------------------------------------------------- ADD UCELLMEAS:CELLID=29335, LOGICRNCID=900, INTERFREQINTERRATMEASIND=INTER_FREQ_AND_INTER_RAT, RPTIND=NO_REPORT, MAXNUMRPTCELLS=CURRENT_CELL_AND_2BEST_NEIGHBOUR, FACHMEASIND=INTER_FREQ_AND_INTER_RAT, FACHMEASOCCACYCLELENCOEF=6, RPTINDIND=REQUIRE, MAXNUMRPTCELLSIND=REQUIRE, INTRAFREQMEASIND=REQUIRE, DEFERMCREADIND=FALSE; ADD UCELLMEAS:CELLID=29336, LOGICRNCID=900, INTERFREQINTERRATMEASIND=INTER_FREQ_AND_INTER_RAT, RPTIND=NO_REPORT, MAXNUMRPTCELLS=CURRENT_CELL_AND_2BEST_NEIGHBOUR, FACHMEASIND=INTER_FREQ_AND_INTER_RAT, FACHMEASOCCACYCLELENCOEF=6, RPTINDIND=REQUIRE, MAXNUMRPTCELLSIND=REQUIRE, INTRAFREQMEASIND=REQUIRE, DEFERMCREADIND=FALSE; ADD UCHPWROFFSET:CELLID=21711, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7; ADD UCHPWROFFSET:CELLID=34051, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7; ADD UCHPWROFFSET:CELLID=34052, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7; ADD UCHPWROFFSET:CELLID=34053, LOGICRNCID=900, AICHPOWEROFFSET=-6, PICHPOWEROFFSET=-7; -------------------------------------------------------------------********------------------------------------------------------- I am only interested in extracting rows that contain &quot;UCHPWROFFSET&quot; and would like to convert it to the below format (Tabular) CELLID LOGICRNCID AICHPOWEROFFSET PICHPOWEROFFSET 21711 900 -6 -7 34051 900 -6 -7 34052 900 -6 -7 34053 900 -6 -7 "/>
      </operator>
      <!--
        Outer cut: one segment per UCHPWROFFSET row. No query_type is set on
        this operator, so which of the two query lists below is actually used
        depends on the operator's default. NOTE(review): confirm in RapidMiner.
      -->
      <operator activated="true" class="text:cut_document" compatibility="7.0.000" expanded="true" height="68" name="UCHPWROFFSET" width="90" x="179" y="34">
        <list key="string_machting_queries">
          <parameter key="test" value="UCHPWROFFSET.;"/>
        </list>
        <list key="regular_expression_queries">
          <parameter key="UCHPWROFFSET" value="(?m)UCHPWROFFSET.*$"/>
        </list>
        <list key="regular_region_queries"/>
        <list key="xpath_queries"/>
        <list key="namespaces"/>
        <list key="index_queries"/>
        <list key="jsonpath_queries"/>
        <process expanded="true">
          <!--
            Inner cut: one regular-expression query per field. Each pattern is
            a lookbehind for "NAME=" followed by a run of digits and minus
            signs, so only the value after the equals sign is captured.
          -->
          <operator activated="true" class="text:cut_document" compatibility="7.0.000" expanded="true" height="68" name="Extract Fields" width="90" x="112" y="34">
            <parameter key="query_type" value="Regular Expression"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries">
              <parameter key="CELLID" value="(?&lt;=CELLID\=)[\-0-9]*"/>
              <parameter key="LOGICRNCID" value="(?&lt;=LOGICRNCID\=)[\-0-9]*"/>
              <parameter key="AICHPOWEROFFSET" value="(?&lt;=AICHPOWEROFFSET\=)[\-0-9]*"/>
              <parameter key="PICHPOWEROFFSET" value="(?&lt;=PICHPOWEROFFSET\=)[\-0-9]*"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
            <process expanded="true">
              <connect from_port="segment" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <!--
            Reshaping chain: documents to data, reorder attributes, transpose,
            rename by example values, then back to documents. The operator name
            marks it as optional tidy-up.
          -->
          <operator activated="true" class="subprocess" compatibility="7.0.001" expanded="true" height="82" name="Just some cleaning, not needed" width="90" x="246" y="34">
            <process expanded="true">
              <operator activated="true" class="text:documents_to_data" compatibility="7.0.000" expanded="true" height="82" name="Convert to Data to Tidy Up" width="90" x="45" y="34">
                <parameter key="text_attribute" value="UCHPWROFFSET"/>
              </operator>
              <operator activated="true" class="order_attributes" compatibility="7.0.001" expanded="true" height="82" name="Reorder Attributes" width="90" x="45" y="136">
                <parameter key="attribute_ordering" value="query_key"/>
              </operator>
              <operator activated="true" class="transpose" compatibility="7.0.001" expanded="true" height="82" name="Transpose" width="90" x="179" y="34"/>
              <operator activated="true" class="rename_by_example_values" compatibility="7.0.001" expanded="true" height="82" name="Rename by Example Values" width="90" x="112" y="289"/>
              <operator activated="true" class="text:data_to_documents" compatibility="7.0.000" expanded="true" height="68" name="Data to Documents" width="90" x="246" y="136">
                <list key="specify_weights"/>
              </operator>
              <connect from_port="in 1" to_op="Convert to Data to Tidy Up" to_port="documents 1"/>
              <connect from_op="Convert to Data to Tidy Up" from_port="example set" to_op="Reorder Attributes" to_port="example set input"/>
              <connect from_op="Reorder Attributes" from_port="example set output" to_op="Transpose" to_port="example set input"/>
              <connect from_op="Transpose" from_port="example set output" to_op="Rename by Example Values" to_port="example set input"/>
              <connect from_op="Rename by Example Values" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
              <connect from_op="Data to Documents" from_port="documents" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="segment" to_op="Extract Fields" to_port="document"/>
          <connect from_op="Extract Fields" from_port="documents" to_op="Just some cleaning, not needed" to_port="in 1"/>
          <connect from_op="Just some cleaning, not needed" from_port="out 1" to_port="document 1"/>
          <portSpacing port="source_segment" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Note: this is a nested operator. Go inside</description>
      </operator>
      <!-- Final step: convert the documents from the outer cut into an example set delivered on result 1. -->
      <operator activated="true" class="text:documents_to_data" compatibility="7.0.000" expanded="true" height="82" name="Documents to Data" width="90" x="313" y="34">
        <parameter key="text_attribute" value="NA"/>
      </operator>
      <connect from_op="Create Document" from_port="output" to_op="UCHPWROFFSET" to_port="document"/>
      <connect from_op="UCHPWROFFSET" from_port="documents" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>