Merging data sets
GregM
New Altair Community Member
Let me outline the problem I am working with. I have three different data aquisition systems that record various things. Each one is logged at different timesteps throughout the day. I would like to pull them into rapidminer individually and merge them into one dataset. the problem I run into is that with the different spacing on the timestamp and different start times of each file. What I want to do is take the higher speed data and average it over the timestep of the slowest speed data, and match the timesteps (or as close as possible) and then merge the three different sets of data into one. I've been doing it outside of rapidminer in excel but I would prefer to do that in rapidminer and cut excel out of the process entirely..
any help is appreciated
thanks
Greg
any help is appreciated
thanks
Greg
Tagged:
0
Answers
-
Hi Greg,
for merging by timestamps you could use the Join Operator. If you uncheck the use id attributes as key box you can set the timestamp attribute as key. This will merge all examples where the timestamps are the same, discarding the rest.
I hope this helps you with your problem.
Best regards,
David0 -
Thanks!. So far I've figured out how to get the datasets merged, however that brings me to the next problem. DAQ_A records data every second, DAQ B every 2 seconds, and DAQ C every minute... which leaves a lot of missing values when merged. I have not found a good way to deal with that... I would like to do one of the following (first is preferrable) average daq A and B over 1 minute intervals and then join them with C, or fill all missing values in with the most recent preceeding good value0
-
code is split between this post and next, due to 20k forum character limit....
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_csv" compatibility="5.3.013" expanded="true" height="60" name="PTA LOG" width="90" x="45" y="30">
<parameter key="csv_file" value="C:\Users\greg.mcfadden\Desktop\Allen\SystemModeling\node_37_08202013_pta_log.csv"/>
<parameter key="column_separators" value=","/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="Date/Time.true.text.attribute"/>
<parameter key="1" value="Drive Voltage (VDC).true.real.attribute"/>
<parameter key="2" value="Current (A_rms).true.real.attribute"/>
<parameter key="3" value="Power (W).true.real.attribute"/>
<parameter key="4" value="Heater Head Temp (degC).true.real.attribute"/>
<parameter key="5" value="Heater Head Temp Setpoint (degC).true.real.attribute"/>
<parameter key="6" value="Generator Voltage (ADC Counts).true.real.attribute"/>
<parameter key="7" value="DC Voltage (VDC).true.real.attribute"/>
<parameter key="8" value="Hall Sensor Counter.true.real.attribute"/>
<parameter key="9" value="Generator State.false.binominal.attribute"/>
<parameter key="10" value="Pump Set Speed.true.real.attribute"/>
<parameter key="11" value="Pump Actual Speed.true.real.attribute"/>
<parameter key="12" value="Pump Voltage.true.real.attribute"/>
<parameter key="13" value="Pump Current.true.real.attribute"/>
<parameter key="14" value="Pump Coolant.true.real.attribute"/>
<parameter key="15" value="Pump Error Byte 1.true.real.attribute"/>
<parameter key="16" value="Pump Error Byte 2.true.real.attribute"/>
<parameter key="17" value="Pump Error Byte 3.true.real.attribute"/>
<parameter key="18" value="Steering Temperature Top.true.real.attribute"/>
<parameter key="19" value="Steering Temperature Right.true.real.attribute"/>
<parameter key="20" value="Steering Temperature Bottom.true.real.attribute"/>
<parameter key="21" value="Steering Temperature Left.true.real.attribute"/>
<parameter key="22" value="Azimuth Offset (deg).true.real.attribute"/>
<parameter key="23" value="Zenith Offset (deg).true.real.attribute"/>
<parameter key="24" value="Azimuth Position (deg).true.real.attribute"/>
<parameter key="25" value="Zenith Position (deg).true.real.attribute"/>
<parameter key="26" value="Status Data.true.real.attribute"/>
<parameter key="27" value="Fault Data.true.real.attribute"/>
<parameter key="28" value="HCMOffset.true.real.attribute"/>
<parameter key="29" value="Debug Value 2.true.real.attribute"/>
<parameter key="30" value="Debug Value 3.true.real.attribute"/>
<parameter key="31" value="Debug Value 4.true.real.attribute"/>
<parameter key="32" value="Version Number.true.real.attribute"/>
<parameter key="33" value="Update Counter.true.real.attribute"/>
<parameter key="34" value="Fault Details.true.real.attribute"/>
<parameter key="35" value="att36.false.attribute_value.attribute"/>
</list>
</operator>
<operator activated="true" class="generate_attributes" compatibility="5.3.013" expanded="true" height="76" name="Generate Attributes" width="90" x="179" y="30">
<list key="function_descriptions">
<parameter key="HCMOFFSET2" value="if(HCMOffset>1000,HCMOffset-65536,HCMOffset)"/>
</list>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="76" name="Set Role" width="90" x="313" y="30">
<parameter key="attribute_name" value="Date/Time"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="read_csv" compatibility="5.3.013" expanded="true" height="60" name="DAQ LOG" width="90" x="45" y="120">
<parameter key="csv_file" value="C:\Users\greg.mcfadden\Desktop\Allen\SystemModeling\GH37-2013-08-20.CSV"/>
<parameter key="column_separators" value=","/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="Date (UTC).true.text.attribute"/>
<parameter key="1" value="Time (UTC).true.text.attribute"/>
<parameter key="2" value="Raw Timestamp.true.real.attribute"/>
<parameter key="3" value="Date (Local).true.binominal.attribute"/>
<parameter key="4" value="Time (Local).true.polynominal.attribute"/>
<parameter key="5" value="PowerDish Unit ID.true.binominal.attribute"/>
<parameter key="6" value="KN Q1.true.real.attribute"/>
<parameter key="7" value="KN Q12.true.real.attribute"/>
<parameter key="8" value="KN Q2.true.real.attribute"/>
<parameter key="9" value="KN Q23.true.real.attribute"/>
<parameter key="10" value="KN Q3.true.real.attribute"/>
<parameter key="11" value="KN Q34.true.real.attribute"/>
<parameter key="12" value="KN Q4.true.real.attribute"/>
<parameter key="13" value="KN Q41.true.real.attribute"/>
<parameter key="14" value="HHX Q12.true.real.attribute"/>
<parameter key="15" value="HHX Q23.true.real.attribute"/>
<parameter key="16" value="HHX Q34.true.real.attribute"/>
<parameter key="17" value="HHX Q41.true.real.attribute"/>
<parameter key="18" value="Center.true.real.attribute"/>
<parameter key="19" value="Open M3C1.true.real.attribute"/>
<parameter key="20" value="Open M3C2.true.real.attribute"/>
<parameter key="21" value="IR.true.real.attribute"/>
</list>
</operator>
<operator activated="true" class="generate_concatenation" compatibility="5.3.013" expanded="true" height="76" name="Generate Concatenation" width="90" x="179" y="120">
<parameter key="first_attribute" value="Date (UTC)"/>
<parameter key="second_attribute" value="Time (UTC)"/>
<parameter key="separator" value=" "/>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="5.3.013" expanded="true" height="76" name="Nominal to Text" width="90" x="313" y="120">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Date (UTC) Time (UTC)"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes (2)" width="90" x="447" y="120">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="|Date (UTC) Time (UTC)|Center|PowerDish Unit ID|Open M3C2|Open M3C1|KN Q41|KN Q4|KN Q34|KN Q3|KN Q23|KN Q2|KN Q12|KN Q1|IR|HHX Q41|HHX Q34|HHX Q23|HHX Q12"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="76" name="Set Role (2)" width="90" x="581" y="120">
<parameter key="attribute_name" value="Date (UTC) Time (UTC)"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="join" compatibility="5.3.013" expanded="true" height="76" name="Join" width="90" x="782" y="75">
<list key="key_attributes"/>
</operator>
<operator activated="true" class="read_csv" compatibility="5.3.013" expanded="true" height="60" name="WEATHER LOG" width="90" x="45" y="255">
<description>Must preprocess weather file using excel weatherfileadjuster and save as csv</description>
<parameter key="csv_file" value="C:\Users\greg.mcfadden\Desktop\Allen\SystemModeling\WeatherFileAdjuster.csv"/>
<parameter key="column_separators" value=","/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="encoding" value="windows-1252"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="Date/Time.true.text.attribute"/>
<parameter key="1" value="Wind Spd\. (Med).true.real.attribute"/>
<parameter key="2" value="Wind Dir\. (Med).true.real.attribute"/>
<parameter key="3" value="Air\.Temp\. (Med).true.real.attribute"/>
<parameter key="4" value="Rel\. Hum\. (Med).true.real.attribute"/>
<parameter key="5" value="Atm\.Pressu (Med).true.real.attribute"/>
<parameter key="6" value="DirectRad1 (Med).true.real.attribute"/>
<parameter key="7" value="DirectRad1 (Max).true.real.attribute"/>
<parameter key="8" value="DirectRad1 (Min).true.real.attribute"/>
<parameter key="9" value="Solar Elev (Med).false.integer.attribute"/>
<parameter key="10" value="Solar Elev (Max).false.integer.attribute"/>
<parameter key="11" value="Solar Elev (Min).false.integer.attribute"/>
<parameter key="12" value="Solar Az\. (Med).false.real.attribute"/>
<parameter key="13" value="Solar Az\. (Max).false.real.attribute"/>
<parameter key="14" value="Solar Az\. (Min).false.real.attribute"/>
<parameter key="15" value="Rain (Acu).false.integer.attribute"/>
<parameter key="16" value="Battery (Med).false.real.attribute"/>
<parameter key="17" value="Battery (Max).false.real.attribute"/>
<parameter key="18" value="Battery (Min).false.real.attribute"/>
</list>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="76" name="Set Role (3)" width="90" x="112" y="435">
<parameter key="attribute_name" value="Date/Time"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.3.013" expanded="true" height="76" name="Multiply (5)" width="90" x="246" y="255"/>
<operator activated="true" class="join" compatibility="5.3.013" expanded="true" height="76" name="Join (2)" width="90" x="380" y="390">
<parameter key="join_type" value="outer"/>
<list key="key_attributes"/>
</operator>
<operator activated="true" class="replace_missing_values" compatibility="5.3.013" expanded="true" height="94" name="Replace Missing Values" width="90" x="514" y="390">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="|Zenith Position (deg)|Zenith Offset (deg)|Wind Spd. (Med)|Wind Dir. (Med)|Rel. Hum. (Med)|DirectRad1 (Min)|DirectRad1 (Med)|DirectRad1 (Max)|Azimuth Position (deg)|Azimuth Offset (deg)|Atm.Pressu (Med)|Air.Temp. (Med)"/>
<parameter key="default" value="value"/>
<list key="columns"/>
<parameter key="replenishment_value" value="0"/>
</operator>
<operator activated="true" class="rename_by_replacing" compatibility="5.3.013" expanded="true" height="76" name="Rename by Replacing (2)" width="90" x="648" y="390"/>
<operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="76" name="Set Role (5)" width="90" x="782" y="390">
<parameter key="attribute_name" value="Date/Time"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="generate_id" compatibility="5.3.013" expanded="true" height="76" name="Generate ID (2)" width="90" x="916" y="390"/>
<operator activated="true" class="multiply" compatibility="5.3.013" expanded="true" height="94" name="Multiply (6)" width="90" x="179" y="570"/>
<operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes (4)" width="90" x="313" y="660">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="HHXQ12|HHXQ23|HHXQ34|HHXQ41|KNQ1|KNQ12|KNQ2|KNQ23|KNQ3|KNQ34|KNQ4|KNQ41||id"/>
</operator>
<operator activated="true" class="replace_missing_values" compatibility="5.3.013" expanded="true" height="94" name="Replace Missing Values (2)" width="90" x="447" y="660">
<parameter key="default" value="value"/>
<list key="columns"/>
<parameter key="replenishment_value" value="0"/>
</operator>
<operator activated="true" class="generate_attributes" compatibility="5.3.013" expanded="true" height="76" name="Generate Attributes (2)" width="90" x="581" y="660">
<list key="function_descriptions">
<parameter key="MAXHHX" value="max(HHXQ12,HHXQ23,HHXQ34,HHXQ41)"/>
<parameter key="MAXKNUCK" value="max(KNQ1,KNQ12,KNQ2,KNQ23,KNQ3,KNQ34,KNQ4,KNQ41)"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes (5)" width="90" x="715" y="705">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="|id|MAXKNUCK|MAXHHX"/>
</operator>
<operator activated="true" class="declare_missing_value" compatibility="5.3.013" expanded="true" height="76" name="Declare Missing Value" width="90" x="313" y="570">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="|KNQ41|KNQ4|KNQ34|KNQ3|KNQ23|KNQ2|KNQ12|KNQ1"/>
<parameter key="numeric_value" value="0.0"/>
</operator>
<operator activated="true" class="filter_examples" compatibility="5.3.013" expanded="true" height="76" name="Filter Examples (2)" width="90" x="447" y="570">
<parameter key="condition_class" value="attribute_value_filter"/>
<parameter key="parameter_string" value="PowerW>100"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes (3)" width="90" x="581" y="570">
<parameter key="attribute_filter_type" value="no_missing_values"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.3.013" expanded="true" height="94" name="Filtered Dataset" width="90" x="715" y="570"/>
<operator activated="true" class="join" compatibility="5.3.013" expanded="true" height="76" name="Join (3)" width="90" x="849" y="660">
<parameter key="join_type" value="left"/>
<list key="key_attributes"/>0 -
rest of code
</operator>
<connect from_op="PTA LOG" from_port="output" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Join" to_port="left"/>
<connect from_op="DAQ LOG" from_port="output" to_op="Generate Concatenation" to_port="example set input"/>
<connect from_op="Generate Concatenation" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Join" to_port="right"/>
<connect from_op="Join" from_port="join" to_op="Multiply (5)" to_port="input"/>
<connect from_op="WEATHER LOG" from_port="output" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Join (2)" to_port="right"/>
<connect from_op="Multiply (5)" from_port="output 1" to_op="Join (2)" to_port="left"/>
<connect from_op="Join (2)" from_port="join" to_op="Replace Missing Values" to_port="example set input"/>
<connect from_op="Replace Missing Values" from_port="example set output" to_op="Rename by Replacing (2)" to_port="example set input"/>
<connect from_op="Rename by Replacing (2)" from_port="example set output" to_op="Set Role (5)" to_port="example set input"/>
<connect from_op="Set Role (5)" from_port="example set output" to_op="Generate ID (2)" to_port="example set input"/>
<connect from_op="Generate ID (2)" from_port="example set output" to_op="Multiply (6)" to_port="input"/>
<connect from_op="Multiply (6)" from_port="output 1" to_op="Declare Missing Value" to_port="example set input"/>
<connect from_op="Multiply (6)" from_port="output 2" to_op="Select Attributes (4)" to_port="example set input"/>
<connect from_op="Select Attributes (4)" from_port="example set output" to_op="Replace Missing Values (2)" to_port="example set input"/>
<connect from_op="Replace Missing Values (2)" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
<connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Select Attributes (5)" to_port="example set input"/>
<connect from_op="Select Attributes (5)" from_port="example set output" to_op="Join (3)" to_port="right"/>
<connect from_op="Declare Missing Value" from_port="example set output" to_op="Filter Examples (2)" to_port="example set input"/>
<connect from_op="Filter Examples (2)" from_port="example set output" to_op="Select Attributes (3)" to_port="example set input"/>
<connect from_op="Select Attributes (3)" from_port="example set output" to_op="Filtered Dataset" to_port="input"/>
<connect from_op="Filtered Dataset" from_port="output 1" to_port="result 1"/>
<connect from_op="Filtered Dataset" from_port="output 2" to_op="Join (3)" to_port="left"/>
<connect from_op="Join (3)" from_port="join" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>0 -
For calculating the average every minute, there is the Moving Average Operator in the Time Series Extension which might help you. For replacing the missing values with something usefull th Replace Missing Values Operator might be your best option.0