Replacing missing values with subgroup averages not total column averages
Hi all,
I would like to replace the missing values of attributes with their respective group average instead of
their column average, e.g. (expected group averages: north = 8.5, south = 3, east = 9, west = 8):
state region widgets
florida south 5
california west 10
new york north 15
delaware east 8
georgia south 1
texas south ?
nevada west 6
arizona west ?
maryland east 10
mass. east ?
maine north ?
vermont north 2
Answers
-
Hello parisave,
You can aggregate by region and use the result to join back to the original where there are missing values.
Relatively complex - here's an example
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: fill missing "widget" values with the average of the
  row's region rather than the overall column average.

  Flow: build the sample data, average widget per region, split the rows into
  complete and incomplete ones, left-join the incomplete rows to the regional
  averages on "region", append both parts and sort by id.
-->
<process version="7.6.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Sample data: twelve one-row example sets (state, region, widget), appended and given an id. -->
      <operator activated="true" class="subprocess" compatibility="7.6.001" expanded="true" height="82" name="Subprocess" width="90" x="112" y="85">
        <process expanded="true">
          <!-- String values are quoted expressions (&quot;...&quot;); log(-1) stands in for the "?" entries of the question's table. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
            <list key="attribute_values">
              <parameter key="state" value="&quot;florida&quot;"/>
              <parameter key="region" value="&quot;south&quot;"/>
              <parameter key="widget" value="5"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="136">
            <list key="attribute_values">
              <parameter key="state" value="&quot;california&quot;"/>
              <parameter key="region" value="&quot;west&quot;"/>
              <parameter key="widget" value="10"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (3)" width="90" x="45" y="238">
            <list key="attribute_values">
              <parameter key="state" value="&quot;new york&quot;"/>
              <parameter key="region" value="&quot;north&quot;"/>
              <parameter key="widget" value="15"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (4)" width="90" x="45" y="340">
            <list key="attribute_values">
              <parameter key="state" value="&quot;delaware&quot;"/>
              <parameter key="region" value="&quot;east&quot;"/>
              <parameter key="widget" value="8"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (5)" width="90" x="45" y="442">
            <list key="attribute_values">
              <parameter key="state" value="&quot;georgia&quot;"/>
              <parameter key="region" value="&quot;south&quot;"/>
              <parameter key="widget" value="1"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (6)" width="90" x="45" y="544">
            <list key="attribute_values">
              <parameter key="state" value="&quot;texas&quot;"/>
              <parameter key="region" value="&quot;south&quot;"/>
              <parameter key="widget" value="log(-1)"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (7)" width="90" x="45" y="646">
            <list key="attribute_values">
              <parameter key="state" value="&quot;nevada&quot;"/>
              <parameter key="region" value="&quot;west&quot;"/>
              <parameter key="widget" value="6"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (8)" width="90" x="45" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;arizona&quot;"/>
              <parameter key="region" value="&quot;west&quot;"/>
              <parameter key="widget" value="log(-1)"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (9)" width="90" x="313" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;maryland&quot;"/>
              <parameter key="region" value="&quot;east&quot;"/>
              <parameter key="widget" value="10"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (10)" width="90" x="514" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;mass.&quot;"/>
              <parameter key="region" value="&quot;east&quot;"/>
              <parameter key="widget" value="log(-1)"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (11)" width="90" x="715" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;maine&quot;"/>
              <parameter key="region" value="&quot;north&quot;"/>
              <parameter key="widget" value="log(-1)"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (12)" width="90" x="849" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;vermont&quot;"/>
              <parameter key="region" value="&quot;north&quot;"/>
              <parameter key="widget" value="2"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="313" name="Append" width="90" x="514" y="34"/>
          <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID" width="90" x="648" y="34"/>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
          <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append" to_port="example set 4"/>
          <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append" to_port="example set 5"/>
          <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append" to_port="example set 6"/>
          <connect from_op="Generate Data by User Specification (7)" from_port="output" to_op="Append" to_port="example set 7"/>
          <connect from_op="Generate Data by User Specification (8)" from_port="output" to_op="Append" to_port="example set 8"/>
          <connect from_op="Generate Data by User Specification (9)" from_port="output" to_op="Append" to_port="example set 9"/>
          <connect from_op="Generate Data by User Specification (10)" from_port="output" to_op="Append" to_port="example set 10"/>
          <connect from_op="Generate Data by User Specification (11)" from_port="output" to_op="Append" to_port="example set 11"/>
          <connect from_op="Generate Data by User Specification (12)" from_port="output" to_op="Append" to_port="example set 12"/>
          <connect from_op="Append" from_port="merged set" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Average of widget per region; the original data is passed on through the "original" port. -->
      <operator activated="true" class="aggregate" compatibility="7.6.001" expanded="true" height="82" name="Aggregate" width="90" x="313" y="85">
        <list key="aggregation_attributes">
          <parameter key="widget" value="average"/>
        </list>
        <parameter key="group_by_attributes" value="region"/>
      </operator>
      <!-- Rename average(widget) back to widget so the joined rows line up with the untouched rows when appended. -->
      <operator activated="true" class="rename" compatibility="7.6.001" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
        <parameter key="old_name" value="average(widget)"/>
        <parameter key="new_name" value="widget"/>
        <list key="rename_additional_attributes"/>
      </operator>
      <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID (2)" width="90" x="581" y="34"/>
      <!-- Split the original rows: matched output = no missing attributes, unmatched output = the rest. -->
      <operator activated="true" class="filter_examples" compatibility="7.6.001" expanded="true" height="103" name="Filter Examples" width="90" x="447" y="238">
        <parameter key="condition_class" value="no_missing_attributes"/>
        <list key="filters_list"/>
      </operator>
      <!-- Keep only region and state on the unmatched rows; widget is supplied by the join below. -->
      <operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="136">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="region|state"/>
      </operator>
      <!-- Left join on region: unmatched rows (left) pick up the regional average (right). -->
      <operator activated="true" class="join" compatibility="7.6.001" expanded="true" height="82" name="Join" width="90" x="782" y="34">
        <parameter key="join_type" value="left"/>
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="region" value="region"/>
        </list>
      </operator>
      <!-- Recombine the joined rows with the matched rows, then sort by id. -->
      <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="103" name="Append (2)" width="90" x="916" y="85"/>
      <operator activated="true" class="sort" compatibility="7.6.001" expanded="true" height="82" name="Sort" width="90" x="1050" y="85">
        <parameter key="attribute_name" value="id"/>
      </operator>
      <connect from_op="Subprocess" from_port="out 1" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Rename" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="original" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Rename" from_port="example set output" to_op="Generate ID (2)" to_port="example set input"/>
      <connect from_op="Generate ID (2)" from_port="example set output" to_op="Join" to_port="right"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Append (2)" to_port="example set 2"/>
      <connect from_op="Filter Examples" from_port="unmatched example set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="left"/>
      <connect from_op="Join" from_port="join" to_op="Append (2)" to_port="example set 1"/>
      <connect from_op="Append (2)" from_port="merged set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
regards,
Andrew
1 -
Hi,
an easier way to do it is to use Group Into Collection from the Operator Toolbox extension. This enables you to do any subprocess with a group_by statement. An example is attached.
Cheers,
Martin
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: fill missing "widget" values per region using
  Group Into Collection (Operator Toolbox extension).

  Flow: build the sample data, split it into one example set per region,
  run Replace Missing Values on each group, then append the groups again.
-->
<process version="7.6.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Sample data: twelve one-row example sets (state, region, widget), appended and given an id. -->
      <operator activated="true" class="subprocess" compatibility="7.6.001" expanded="true" height="82" name="Subprocess" width="90" x="45" y="85">
        <process expanded="true">
          <!-- String values are quoted expressions (&quot;...&quot;); log(-1) stands in for the "?" entries of the question's table. -->
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
            <list key="attribute_values">
              <parameter key="state" value="&quot;florida&quot;"/>
              <parameter key="region" value="&quot;south&quot;"/>
              <parameter key="widget" value="5"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="136">
            <list key="attribute_values">
              <parameter key="state" value="&quot;california&quot;"/>
              <parameter key="region" value="&quot;west&quot;"/>
              <parameter key="widget" value="10"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (3)" width="90" x="45" y="238">
            <list key="attribute_values">
              <parameter key="state" value="&quot;new york&quot;"/>
              <parameter key="region" value="&quot;north&quot;"/>
              <parameter key="widget" value="15"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (4)" width="90" x="45" y="340">
            <list key="attribute_values">
              <parameter key="state" value="&quot;delaware&quot;"/>
              <parameter key="region" value="&quot;east&quot;"/>
              <parameter key="widget" value="8"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (5)" width="90" x="45" y="442">
            <list key="attribute_values">
              <parameter key="state" value="&quot;georgia&quot;"/>
              <parameter key="region" value="&quot;south&quot;"/>
              <parameter key="widget" value="1"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (6)" width="90" x="45" y="544">
            <list key="attribute_values">
              <parameter key="state" value="&quot;texas&quot;"/>
              <parameter key="region" value="&quot;south&quot;"/>
              <parameter key="widget" value="log(-1)"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (7)" width="90" x="45" y="646">
            <list key="attribute_values">
              <parameter key="state" value="&quot;nevada&quot;"/>
              <parameter key="region" value="&quot;west&quot;"/>
              <parameter key="widget" value="6"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (8)" width="90" x="45" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;arizona&quot;"/>
              <parameter key="region" value="&quot;west&quot;"/>
              <parameter key="widget" value="log(-1)"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (9)" width="90" x="313" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;maryland&quot;"/>
              <parameter key="region" value="&quot;east&quot;"/>
              <parameter key="widget" value="10"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (10)" width="90" x="514" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;mass.&quot;"/>
              <parameter key="region" value="&quot;east&quot;"/>
              <parameter key="widget" value="log(-1)"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (11)" width="90" x="715" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;maine&quot;"/>
              <parameter key="region" value="&quot;north&quot;"/>
              <parameter key="widget" value="log(-1)"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.6.001" expanded="true" height="68" name="Generate Data by User Specification (12)" width="90" x="849" y="748">
            <list key="attribute_values">
              <parameter key="state" value="&quot;vermont&quot;"/>
              <parameter key="region" value="&quot;north&quot;"/>
              <parameter key="widget" value="2"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="313" name="Append" width="90" x="514" y="34"/>
          <operator activated="true" class="generate_id" compatibility="7.6.001" expanded="true" height="82" name="Generate ID" width="90" x="648" y="34"/>
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
          <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append" to_port="example set 4"/>
          <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append" to_port="example set 5"/>
          <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append" to_port="example set 6"/>
          <connect from_op="Generate Data by User Specification (7)" from_port="output" to_op="Append" to_port="example set 7"/>
          <connect from_op="Generate Data by User Specification (8)" from_port="output" to_op="Append" to_port="example set 8"/>
          <connect from_op="Generate Data by User Specification (9)" from_port="output" to_op="Append" to_port="example set 9"/>
          <connect from_op="Generate Data by User Specification (10)" from_port="output" to_op="Append" to_port="example set 10"/>
          <connect from_op="Generate Data by User Specification (11)" from_port="output" to_op="Append" to_port="example set 11"/>
          <connect from_op="Generate Data by User Specification (12)" from_port="output" to_op="Append" to_port="example set 12"/>
          <connect from_op="Append" from_port="merged set" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Split the data into a collection with one example set per region (requires the Operator Toolbox extension). -->
      <operator activated="true" class="operator_toolbox:group_into_collection" compatibility="0.7.000" expanded="true" height="82" name="Group Into Collection" width="90" x="246" y="85">
        <parameter key="group_by_attribute" value="region"/>
      </operator>
      <!-- Run Replace Missing Values on each region's example set separately. -->
      <operator activated="true" class="loop_collection" compatibility="7.6.001" expanded="true" height="82" name="Loop Collection" width="90" x="447" y="85">
        <process expanded="true">
          <!-- No per-column overrides; the replacement function is the operator default (presumably average; confirm in the operator settings). -->
          <operator activated="true" class="replace_missing_values" compatibility="7.6.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="34">
            <list key="columns"/>
          </operator>
          <connect from_port="single" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Merge the per-region results back into a single example set. -->
      <operator activated="true" class="append" compatibility="7.6.001" expanded="true" height="82" name="Append (2)" width="90" x="648" y="85"/>
      <connect from_op="Subprocess" from_port="out 1" to_op="Group Into Collection" to_port="exa"/>
      <connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_op="Append (2)" to_port="example set 1"/>
      <connect from_op="Append (2)" from_port="merged set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
1