Denormalization of data
hore
New Altair Community Member
I have a data set with a list of transactions with the following attributes:
CustomerID, ProductID, ProductCount
I am trying to transform it into a format where each line corresponds to a CustomerID
and where there is an attribute for each product, like
CustomerID, Product_A_Total_Count, Product_B_Total_Count, Product_C_Total_Count
with counts summed up over all transactions the customer has made with the same product.
I have tried to use Pivot which produces a dataset with the desired structure,
but it does not aggregate the counts of multiple transactions of the same customer with the same product;
rather it picks the count of the last transaction.
Using the ProductCount as WEIGHT makes all the Product_X_COunt attributes disappear entirely from the result.
Am I using the wrong operator?
CustomerID, ProductID, ProductCount
I am trying to transform it into a format where each line corresponds to a CustomerID
and where there is an attribute for each product, like
CustomerID, Product_A_Total_Count, Product_B_Total_Count, Product_C_Total_Count
with counts summed up over all transactions the customer has made with the same product.
I have tried to use Pivot which produces a dataset with the desired structure,
but it does not aggregate the counts of multiple transactions of the same customer with the same product;
rather it picks the count of the last transaction.
Using the ProductCount as WEIGHT makes all the Product_X_COunt attributes disappear entirely from the result.
Am I using the wrong operator?
Tagged:
0
Answers
-
Hi hore,
could you post you process please?
Ciao Sebastian
P.S. Copy and past the xml code of the process from rapidminer. Use the "insert code" button (#) in the forum editor to post code.0 -
Here is my process.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" expanded="true" name="Process">
<process expanded="true" height="325" width="212">
<operator activated="true" class="retrieve" expanded="true" height="60" name="Retrieve" width="90" x="45" y="75">
<parameter key="repository_entry" value="transact"/>
</operator>
<operator activated="true" class="pivot" expanded="true" height="76" name="Pivot" width="90" x="164" y="73">
<parameter key="group_attribute" value="CustomerID"/>
<parameter key="index_attribute" value="ProductID"/>
</operator>
<connect from_op="Retrieve" from_port="output" to_op="Pivot" to_port="example set input"/>
<connect from_op="Pivot" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
The Input table looks like
CustomerID ProductID ProductCount
1 A 4
2 B 5
1 A 6
1 B 7
The resulting Table is
RowNo CustomerID ProductCount_A ProductCount_B
1 1 6 7
2 2 ? 5
I need a 10 (6+4) instead of the 6...0 -
Hi hore,
you need to aggregate first. Take a look at this process, it shows you how to do this:<?xml version="1.0" encoding="UTF-8" standalone="no"?>
Ciao Sebastian
<process version="5.0">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" expanded="true" name="Process">
<process expanded="true" height="476" width="681">
<operator activated="true" class="generate_data" expanded="true" height="60" name="Generate Data" width="90" x="45" y="30">
<parameter key="number_examples" value="30"/>
<parameter key="number_of_attributes" value="3"/>
<parameter key="attributes_lower_bound" value="1.0"/>
<parameter key="attributes_upper_bound" value="4.0"/>
<parameter key="datamanagement" value="int_array"/>
</operator>
<operator activated="true" class="select_attributes" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="30">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="att1|att2|att3"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="rename" expanded="true" height="76" name="Rename" width="90" x="313" y="30">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="customerId"/>
</operator>
<operator activated="true" class="rename" expanded="true" height="76" name="Rename (2)" width="90" x="447" y="30">
<parameter key="old_name" value="att2"/>
<parameter key="new_name" value="ProductID"/>
</operator>
<operator activated="true" class="rename" expanded="true" height="76" name="Rename (3)" width="90" x="581" y="30">
<parameter key="old_name" value="att3"/>
<parameter key="new_name" value="amount"/>
</operator>
<operator activated="true" breakpoints="after" class="multiply" expanded="true" height="76" name="Multiply" width="90" x="45" y="210"/>
<operator activated="true" breakpoints="after" class="aggregate" expanded="true" height="76" name="Aggregate" width="90" x="179" y="210">
<list key="aggregation_attributes">
<parameter key="amount" value="sum"/>
</list>
<parameter key="group_by_attributes" value="customerId|ProductID"/>
</operator>
<operator activated="true" class="pivot" expanded="true" height="76" name="Pivot (2)" width="90" x="313" y="210">
<parameter key="group_attribute" value="customerId"/>
<parameter key="index_attribute" value="ProductID"/>
</operator>
<operator activated="true" class="replace_missing_values" expanded="true" height="94" name="Replace Missing Values" width="90" x="514" y="210">
<parameter key="default" value="zero"/>
<list key="columns"/>
</operator>
<connect from_op="Generate Data" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
<connect from_op="Rename (2)" from_port="example set output" to_op="Rename (3)" to_port="example set input"/>
<connect from_op="Rename (3)" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Aggregate" to_port="example set input"/>
<connect from_op="Aggregate" from_port="example set output" to_op="Pivot (2)" to_port="example set input"/>
<connect from_op="Pivot (2)" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
<connect from_op="Replace Missing Values" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="180"/>
<portSpacing port="sink_result 2" spacing="108"/>
</process>
</operator>
</process>0