count total occurrence and sort by date
tahsin
New Altair Community Member
I have rapidminer example set like this,
ID Issue Exp 100 9/8/2020 11/8/2020 100 8/5/2019 9/5/2019 101 6/3/2020 10/1/2020 102 8/15/2020 12/12/2020
I want to add a new column which will count the occurrence of the ID by adding the numbers and sort by the earliest date so we know at what date how many count I had.
Output like this,
ID Issue Exp Count 100 8/5/2019 9/5/2019 1 100 9/8/2020 11/8/2020 2 101 6/3/2020 10/1/2020 1 102 8/15/2020 12/12/2020 1
But when I aggregate by ID and do a count it will just count the total instead and show them for the same ID. So, for ID 100 it shows me 2 both the times because it is just adding the numbers both the times.
For example, for ID 100 in 2019 we had only 1 issue date hence count is 1, when we find ID 100 again at 2020 the count will be 2. So, the sort by date is also important because it will help us find the ID occurrence in correct order.
0
Best Answers
-
Yes, if you look at the ID column, for ID 100 the count is 2 both the times. It should be 1 when there was an issue date first time(2019) and 2 when there was another issue date (2020). So, we are incrementing the numbers based on occurrence instead of just counting 2 both times.
Thanks.0 -
Hi,have a look at the attached process, it should do the trick.Best,Martin<?xml version="1.0" encoding="UTF-8"?><process version="9.9.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.9.000" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="utility:create_exampleset" compatibility="9.9.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
<parameter key="generator_type" value="comma separated text"/>
<parameter key="number_of_examples" value="100"/>
<parameter key="use_stepsize" value="false"/>
<list key="function_descriptions"/>
<parameter key="add_id_attribute" value="false"/>
<list key="numeric_series_configuration"/>
<list key="date_series_configuration"/>
<list key="date_series_configuration (interval)"/>
<parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
<parameter key="time_zone" value="SYSTEM"/>
<parameter key="input_csv_text" value="ID , Issue , Exp 100 , 9/8/2020 , 11/8/2020 100 , 8/5/2019 , 9/5/2019 101 , 6/3/2020 , 10/1/2020 102 , 8/15/2020 , 12/12/2020"/>
<parameter key="column_separator" value=","/>
<parameter key="parse_all_as_nominal" value="false"/>
<parameter key="decimal_point_character" value="."/>
<parameter key="trim_attribute_names" value="true"/>
</operator>
<operator activated="true" class="operator_toolbox:group_into_collection" compatibility="3.0.000-SNAPSHOT" expanded="true" height="82" name="Group Into Collection" width="90" x="179" y="85">
<parameter key="group_by_attribute" value="ID"/>
<parameter key="group_by_attribute (numerical)" value=""/>
<parameter key="sorting_order" value="none"/>
</operator>
<operator activated="true" class="loop_collection" compatibility="9.9.000" expanded="true" height="82" name="Loop Collection" width="90" x="313" y="85">
<parameter key="set_iteration_macro" value="false"/>
<parameter key="macro_name" value="iteration"/>
<parameter key="macro_start_value" value="1"/>
<parameter key="unfold" value="false"/>
<process expanded="true">
<operator activated="true" class="blending:sort" compatibility="9.9.000" expanded="true" height="82" name="Sort" width="90" x="112" y="34">
<list key="sort_by">
<parameter key="Issue" value="ascending"/>
</list>
</operator>
<operator activated="true" class="generate_id" compatibility="9.9.000" expanded="true" height="82" name="Generate ID" width="90" x="313" y="34">
<parameter key="create_nominal_ids" value="false"/>
<parameter key="offset" value="0"/>
</operator>
<operator activated="true" class="blending:rename" compatibility="9.9.000" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
<list key="rename attributes">
<parameter key="id" value="count"/>
</list>
<parameter key="from_attribute" value=""/>
<parameter key="to_attribute" value=""/>
</operator>
<connect from_port="single" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_port="output 1"/>
<portSpacing port="source_single" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="operator_toolbox:advanced_append" compatibility="3.0.000-SNAPSHOT" expanded="true" height="82" name="Append (Superset)" width="90" x="447" y="85"/>
<connect from_op="Create ExampleSet" from_port="output" to_op="Group Into Collection" to_port="exa"/>
<connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
<connect from_op="Loop Collection" from_port="output 1" to_op="Append (Superset)" to_port="example set 1"/>
<connect from_op="Append (Superset)" from_port="merged set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
1
Answers
-
Hi,sounds like you want to aggregate and then join again? Attached is a process on your data.Bet,Martin<?xml version="1.0" encoding="UTF-8"?><process version="9.9.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.9.000" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="utility:create_exampleset" compatibility="9.9.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
<parameter key="generator_type" value="comma separated text"/>
<parameter key="number_of_examples" value="100"/>
<parameter key="use_stepsize" value="false"/>
<list key="function_descriptions"/>
<parameter key="add_id_attribute" value="false"/>
<list key="numeric_series_configuration"/>
<list key="date_series_configuration"/>
<list key="date_series_configuration (interval)"/>
<parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
<parameter key="time_zone" value="SYSTEM"/>
<parameter key="input_csv_text" value="ID , Issue , Exp 100 , 9/8/2020 , 11/8/2020 100 , 8/5/2019 , 9/5/2019 101 , 6/3/2020 , 10/1/2020 102 , 8/15/2020 , 12/12/2020"/>
<parameter key="column_separator" value=","/>
<parameter key="parse_all_as_nominal" value="false"/>
<parameter key="decimal_point_character" value="."/>
<parameter key="trim_attribute_names" value="true"/>
</operator>
<operator activated="true" class="multiply" compatibility="9.9.000" expanded="true" height="103" name="Multiply" width="90" x="179" y="136"/>
<operator activated="true" class="aggregate" compatibility="9.9.000" expanded="true" height="82" name="Aggregate" width="90" x="313" y="34">
<parameter key="use_default_aggregation" value="false"/>
<parameter key="attribute_filter_type" value="all"/>
<parameter key="attribute" value=""/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
<parameter key="default_aggregation_function" value="average"/>
<list key="aggregation_attributes">
<parameter key="ID" value="count"/>
</list>
<parameter key="group_by_attributes" value="ID"/>
<parameter key="count_all_combinations" value="false"/>
<parameter key="only_distinct" value="false"/>
<parameter key="ignore_missings" value="true"/>
</operator>
<operator activated="true" class="concurrency:join" compatibility="9.9.000" expanded="true" height="82" name="Join" width="90" x="447" y="136">
<parameter key="remove_double_attributes" value="true"/>
<parameter key="join_type" value="inner"/>
<parameter key="use_id_attribute_as_key" value="false"/>
<list key="key_attributes">
<parameter key="ID" value="ID"/>
</list>
<parameter key="keep_both_join_attributes" value="false"/>
</operator>
<operator activated="true" class="blending:sort" compatibility="9.9.000" expanded="true" height="82" name="Sort" width="90" x="581" y="136">
<list key="sort_by">
<parameter key="Exp" value="ascending"/>
</list>
</operator>
<connect from_op="Create ExampleSet" from_port="output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Aggregate" to_port="example set input"/>
<connect from_op="Multiply" from_port="output 2" to_op="Join" to_port="right"/>
<connect from_op="Aggregate" from_port="example set output" to_op="Join" to_port="left"/>
<connect from_op="Join" from_port="join" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
1 -
Hi,
Thanks for your reply. I ran your process and this is the result I get. It looks like its the same result I got. Showing 2 for count both the times for ID 100.
0 -
Hi,so why would this be wrong? You want more a cumulative sum?Best,Martin0
-
Yes, if you look at the ID column, for ID 100 the count is 2 both the times. It should be 1 when there was an issue date first time(2019) and 2 when there was another issue date (2020). So, we are incrementing the numbers based on occurrence instead of just counting 2 both times.
Thanks.0 -
Hi,have a look at the attached process, it should do the trick.Best,Martin<?xml version="1.0" encoding="UTF-8"?><process version="9.9.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.9.000" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="utility:create_exampleset" compatibility="9.9.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
<parameter key="generator_type" value="comma separated text"/>
<parameter key="number_of_examples" value="100"/>
<parameter key="use_stepsize" value="false"/>
<list key="function_descriptions"/>
<parameter key="add_id_attribute" value="false"/>
<list key="numeric_series_configuration"/>
<list key="date_series_configuration"/>
<list key="date_series_configuration (interval)"/>
<parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
<parameter key="time_zone" value="SYSTEM"/>
<parameter key="input_csv_text" value="ID , Issue , Exp 100 , 9/8/2020 , 11/8/2020 100 , 8/5/2019 , 9/5/2019 101 , 6/3/2020 , 10/1/2020 102 , 8/15/2020 , 12/12/2020"/>
<parameter key="column_separator" value=","/>
<parameter key="parse_all_as_nominal" value="false"/>
<parameter key="decimal_point_character" value="."/>
<parameter key="trim_attribute_names" value="true"/>
</operator>
<operator activated="true" class="operator_toolbox:group_into_collection" compatibility="3.0.000-SNAPSHOT" expanded="true" height="82" name="Group Into Collection" width="90" x="179" y="85">
<parameter key="group_by_attribute" value="ID"/>
<parameter key="group_by_attribute (numerical)" value=""/>
<parameter key="sorting_order" value="none"/>
</operator>
<operator activated="true" class="loop_collection" compatibility="9.9.000" expanded="true" height="82" name="Loop Collection" width="90" x="313" y="85">
<parameter key="set_iteration_macro" value="false"/>
<parameter key="macro_name" value="iteration"/>
<parameter key="macro_start_value" value="1"/>
<parameter key="unfold" value="false"/>
<process expanded="true">
<operator activated="true" class="blending:sort" compatibility="9.9.000" expanded="true" height="82" name="Sort" width="90" x="112" y="34">
<list key="sort_by">
<parameter key="Issue" value="ascending"/>
</list>
</operator>
<operator activated="true" class="generate_id" compatibility="9.9.000" expanded="true" height="82" name="Generate ID" width="90" x="313" y="34">
<parameter key="create_nominal_ids" value="false"/>
<parameter key="offset" value="0"/>
</operator>
<operator activated="true" class="blending:rename" compatibility="9.9.000" expanded="true" height="82" name="Rename" width="90" x="447" y="34">
<list key="rename attributes">
<parameter key="id" value="count"/>
</list>
<parameter key="from_attribute" value=""/>
<parameter key="to_attribute" value=""/>
</operator>
<connect from_port="single" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_port="output 1"/>
<portSpacing port="source_single" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="operator_toolbox:advanced_append" compatibility="3.0.000-SNAPSHOT" expanded="true" height="82" name="Append (Superset)" width="90" x="447" y="85"/>
<connect from_op="Create ExampleSet" from_port="output" to_op="Group Into Collection" to_port="exa"/>
<connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
<connect from_op="Loop Collection" from_port="output 1" to_op="Append (Superset)" to_port="example set 1"/>
<connect from_op="Append (Superset)" from_port="merged set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
1 -
Great!! Exactly how I expected. Thanks so much for your help.1