Altair RISE

A program to recognize and reward our most engaged community members

Nominate Yourself Now!

Creating an attribute with reference values from another ExampleSet

Hi, everyone.

I'm stucked some days with the creation of an attriibute that must be filled with the values from another example set, result from an aggregation operation. There is a "CNPJ" attribute on the main example set, that has it's values repeated over the 25.000+ rows. The aggregation set is made of 700+ rows of unique CNPJ values and a second collumn with it's counts ([count(CNPJ)]). What I need is to create a collumn with the count(CNPJ) value on every time the sameCNPJ appears on the main set.

The better way I found until now is Python Scripting, that is fully working. I made them be read through:

<div>import pandas</div><div><br></div><div>def rm_main(cnpj, data): # cnpj is the aggregation set and data is the sain set</div>    # code<span>
    return data</span>

But already tried with these codes and all fail:

import pandas

def rm_main(cnpj, data):

    data["CNPJ_count"] = [cnpj["count(CNPJ)"] for

data["CNPJ"] = cnpj["CNPJ"] in data["CNPJ"]] return data

import pandas

def rm_main(cnpj, data):

    CNPJ_count = []
    for count in data["CNPJ"]:
        if data["CNPJ"] == cnpj["CNPJ"]:
            CNPJ_count.append(cnpj["count(CNPJ)"])
    data["CNPJ_count"] = CNPJ_count
            
    return data

Anyone could help?

Find more posts tagged with

AI Studio

Python

Transformations

Aggregate

Data Sets

Execute Script

Accepted answers

kayman

You are trying to compare 2 different shaped tables with each other, and this doesn't work. Oversimplified the process will look at row 1 of set 1 and compare with row 1 on set 2, and iterates through the whole set this way. But that means your table need to have the same amount of records, which is not the case here.

I'm wondering if a simple join isn't doing what you are e

Attached example shows the same mapping you like to get (if I am understanding it correctly :-))

<?xml version="1.0" encoding="UTF-8"?><process version="9.3.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process" origin="EXPORTED_TURBOPREP">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
      <operator activated="true" class="aggregate" compatibility="9.3.001" expanded="true" height="82" name="Aggregate (2)" width="90" x="112" y="34">
        <parameter key="use_default_aggregation" value="false"/>
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="CNPJ"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="default_aggregation_function" value="count"/>
        <list key="aggregation_attributes">
          <parameter key="CNPJ" value="count"/>
        </list>
        <parameter key="group_by_attributes" value="CNPJ"/>
        <parameter key="count_all_combinations" value="false"/>
        <parameter key="only_distinct" value="false"/>
        <parameter key="ignore_missings" value="true"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="9.3.001" expanded="true" height="82" name="Select Attributes (9)" width="90" x="246" y="136">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="coduasg|ipgCod_diff|ippIndAceito|ippIndAdjudicado|lanClassif_diff|numprp|prpCod|lanData_milis|lanData_milis_diff"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="false"/>
        <description align="center" color="transparent" colored="false" width="126">Exclus&amp;#227;o final de atributos</description>
      </operator>
      <operator activated="true" class="order_attributes" compatibility="9.3.001" expanded="true" height="82" name="Reorder Attributes" width="90" x="380" y="136">
        <parameter key="sort_mode" value="user specified"/>
        <parameter key="attribute_ordering" value="coduasg numprp|prpCod|ipgCod|ippIndAceito|ippIndAdjudicado|ippIndHabilitado|lanClassif|lanData|lanData_milis|lanData_milis_diff|ipgCod_diff"/>
        <parameter key="use_regular_expressions" value="false"/>
        <parameter key="handle_unmatched" value="append"/>
        <parameter key="sort_direction" value="ascending"/>
        <description align="center" color="transparent" colored="false" width="126">Ordena&amp;#231;&amp;#227;o dos atributos</description>
      </operator>
      <operator activated="true" class="concurrency:join" compatibility="9.3.001" expanded="true" height="82" name="Join" width="90" x="514" y="34">
        <parameter key="remove_double_attributes" value="true"/>
        <parameter key="join_type" value="right"/>
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="CNPJ" value="CNPJ"/>
        </list>
        <parameter key="keep_both_join_attributes" value="false"/>
      </operator>
      <connect from_port="input 1" to_op="Aggregate (2)" to_port="example set input"/>
      <connect from_op="Aggregate (2)" from_port="example set output" to_op="Join" to_port="left"/>
      <connect from_op="Aggregate (2)" from_port="original" to_op="Select Attributes (9)" to_port="example set input"/>
      <connect from_op="Select Attributes (9)" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
      <connect from_op="Reorder Attributes" from_port="example set output" to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

All comments

MarlaBot

Hi @hugomsouto - this is MarlaBot. I found these great videos on our RapidMiner Academy that you may find helpful:

Instructional Video: Python and PyCharm in RapidMiner (Viewing time: ~5m)

Instructional Video: Fast Fourier Transformation (Viewing time: ~4m)

Instructional Video: Generate Aggregations intro (Viewing time: ~2m)

Instructional Video: Generate Aggregations demo (Viewing time: ~3m)

Instructional Video: Pivot and Aggregations demo (Viewing time: ~10m)

Please LIKE my comment if it helps! 👇

MarlaBot

MartinLiebig

Hi,

don;t you just want to use a join operation?

Best,

Martin

kayman

<?xml version="1.0" encoding="UTF-8"?><process version="9.3.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process" origin="EXPORTED_TURBOPREP">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true">
      <operator activated="true" class="aggregate" compatibility="9.3.001" expanded="true" height="82" name="Aggregate (2)" width="90" x="112" y="34">
        <parameter key="use_default_aggregation" value="false"/>
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="CNPJ"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
        <parameter key="default_aggregation_function" value="count"/>
        <list key="aggregation_attributes">
          <parameter key="CNPJ" value="count"/>
        </list>
        <parameter key="group_by_attributes" value="CNPJ"/>
        <parameter key="count_all_combinations" value="false"/>
        <parameter key="only_distinct" value="false"/>
        <parameter key="ignore_missings" value="true"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="9.3.001" expanded="true" height="82" name="Select Attributes (9)" width="90" x="246" y="136">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="coduasg|ipgCod_diff|ippIndAceito|ippIndAdjudicado|lanClassif_diff|numprp|prpCod|lanData_milis|lanData_milis_diff"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="false"/>
        <description align="center" color="transparent" colored="false" width="126">Exclus&amp;#227;o final de atributos</description>
      </operator>
      <operator activated="true" class="order_attributes" compatibility="9.3.001" expanded="true" height="82" name="Reorder Attributes" width="90" x="380" y="136">
        <parameter key="sort_mode" value="user specified"/>
        <parameter key="attribute_ordering" value="coduasg numprp|prpCod|ipgCod|ippIndAceito|ippIndAdjudicado|ippIndHabilitado|lanClassif|lanData|lanData_milis|lanData_milis_diff|ipgCod_diff"/>
        <parameter key="use_regular_expressions" value="false"/>
        <parameter key="handle_unmatched" value="append"/>
        <parameter key="sort_direction" value="ascending"/>
        <description align="center" color="transparent" colored="false" width="126">Ordena&amp;#231;&amp;#227;o dos atributos</description>
      </operator>
      <operator activated="true" class="concurrency:join" compatibility="9.3.001" expanded="true" height="82" name="Join" width="90" x="514" y="34">
        <parameter key="remove_double_attributes" value="true"/>
        <parameter key="join_type" value="right"/>
        <parameter key="use_id_attribute_as_key" value="false"/>
        <list key="key_attributes">
          <parameter key="CNPJ" value="CNPJ"/>
        </list>
        <parameter key="keep_both_join_attributes" value="false"/>
      </operator>
      <connect from_port="input 1" to_op="Aggregate (2)" to_port="example set input"/>
      <connect from_op="Aggregate (2)" from_port="example set output" to_op="Join" to_port="left"/>
      <connect from_op="Aggregate (2)" from_port="original" to_op="Select Attributes (9)" to_port="example set input"/>
      <connect from_op="Select Attributes (9)" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
      <connect from_op="Reorder Attributes" from_port="example set output" to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

hugomsouto

Incredibly simple solution. Thank you @kayman for a such detailed answer, helped a lot.

Thanks @mschmitz, I have tried using join, but I didn't set the role for both columns and it didn't work.