[Solved] Add clustering label to dataset

Unknown
edited November 5 in Community Q&A
Hi everybody ,

I am doing clustering in RapidMiner. My original dataset doesn't have any attribute for the cluster id, but I want the clustering attribute to be added to my dataset.

How can I do that ? because after clustering , I have access to the model not the dataset ,

Thanks in advance
Tagged:

Answers

  • Skirzynski
    Skirzynski New Altair Community Member
    Hey,

    For most of our clustering operators you should see two output ports, which offer the clustering model and the clustered set, i.e. your input example set + the cluster id.

    If you just have the clustering model use the "Apply Model" operator to apply the model on a dataset and generate your desired clustering attribute.

    Best Regards
      Marcin
  • Thanks for the reply ,

    Another question: when I extract the cluster prototypes, my label attribute vanishes. What is the reason, and how can I preserve it?
  • Skirzynski
    Skirzynski New Altair Community Member
    Do you mean the cluster attribute? In my case it is preserved. Please post a process for your problem so I can take a look.
  • No , I mean label attribute, actually , I have a label attribute before & after clustering module , but I lose it after extracting clustering prototypes,
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Process posted to reproduce the vanishing label attribute.
         Active chain, as wired by the connect elements below:
         Retrieve, Nominal to Numerical (single attribute "sex"), Normalize, Multiply,
         Filter Examples (event=f), Clustering (2) (k_medoids, k=572),
         Extract Cluster Prototypes, result 1.
         Only the "cluster model" port of Clustering (2) is connected onward; no
         clustered-set output is wired. Operators marked activated="false" are
         leftovers and have no connections. -->
    <process version="5.2.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
        <process expanded="true" height="539" width="1957">
          <operator activated="true" class="retrieve" compatibility="5.2.008" expanded="true" height="60" name="Retrieve" width="90" x="112" y="75">
            <parameter key="repository_entry" value="//NewLocalRepository/temp_5000sampled_MI4"/>
          </operator>
          <operator activated="true" class="nominal_to_numerical" compatibility="5.2.008" expanded="true" height="94" name="Nominal to Numerical" width="90" x="246" y="75">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="sex"/>
            <list key="comparison_groups"/>
          </operator>
          <operator activated="true" class="normalize" compatibility="5.2.008" expanded="true" height="94" name="Normalize" width="90" x="380" y="75"/>
          <operator activated="false" class="select_attributes" compatibility="5.2.008" expanded="true" height="76" name="Select Attributes" width="90" x="715" y="300">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="event"/>
            <parameter key="invert_selection" value="true"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.2.008" expanded="true" height="76" name="Multiply" width="90" x="514" y="75"/>
          <operator activated="false" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Filter Examples (2)" width="90" x="849" y="255">
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="parameter_string" value="event=t"/>
          </operator>
          <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Filter Examples" width="90" x="849" y="30">
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="parameter_string" value="event=f"/>
          </operator>
          <operator activated="false" class="k_means" compatibility="5.2.008" expanded="true" height="76" name="Clustering" width="90" x="1050" y="165">
            <parameter key="k" value="572"/>
            <parameter key="measure_types" value="MixedMeasures"/>
          </operator>
          <operator activated="true" class="k_medoids" compatibility="5.2.008" expanded="true" height="76" name="Clustering (2)" width="90" x="1117" y="30">
            <parameter key="k" value="572"/>
          </operator>
          <operator activated="true" class="extract_prototypes" compatibility="5.2.008" expanded="true" height="76" name="Extract Cluster Prototypes" width="90" x="1318" y="30"/>
          <operator activated="false" class="select_attributes" compatibility="5.2.008" expanded="true" height="76" name="Select Attributes (2)" width="90" x="1050" y="300">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="cluster"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <operator activated="false" class="union" compatibility="5.2.008" expanded="true" height="76" name="Union" width="90" x="1452" y="165"/>
          <operator activated="false" class="select_attributes" compatibility="5.2.008" expanded="true" height="76" name="Select Attributes (3)" width="90" x="1586" y="165">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="id"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <operator activated="false" class="shuffle" compatibility="5.2.008" expanded="true" height="76" name="Shuffle" width="90" x="1720" y="165"/>
          <connect from_op="Retrieve" from_port="output" to_op="Nominal to Numerical" to_port="example set input"/>
          <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Normalize" to_port="example set input"/>
          <connect from_op="Normalize" from_port="example set output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Clustering (2)" to_port="example set"/>
          <connect from_op="Clustering (2)" from_port="cluster model" to_op="Extract Cluster Prototypes" to_port="model"/>
          <connect from_op="Extract Cluster Prototypes" from_port="example set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • MariusHelf
    MariusHelf New Altair Community Member
    When you extract the cluster prototypes you create a completely new example set - so why should it have a label?

    If you want the clustered data (including the label, if it has been in the data before the clustering), just use the second output of the Clustering operator.

    Best, Marius
  • I am down-sampling by choosing the medoids of the clustering, so I need the label attribute. Is there any way to preserve the label attribute too?
  • MariusHelf
    MariusHelf New Altair Community Member
    Yes, you have two options:

    1. Join the extracted prototypes with the original data, use all attributes as key attributes:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Option 1 example: Generate Data (3 attributes, "random classification") feeds
         Clustering (k_medoids). The cluster model goes to Extract Cluster Prototypes,
         whose example set is the LEFT input of Join; the clustered set from
         Clustering is the RIGHT input. Join matches on att1, att2 and att3
         (use_id_attribute_as_key is false), and its output is delivered to result 1. -->
    <process version="5.3.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
        <process expanded="true" height="391" width="748">
          <operator activated="true" class="generate_data" compatibility="5.3.000" expanded="true" height="60" name="Generate Data" width="90" x="45" y="75">
            <parameter key="target_function" value="random classification"/>
            <parameter key="number_of_attributes" value="3"/>
          </operator>
          <operator activated="true" class="k_medoids" compatibility="5.3.000" expanded="true" height="76" name="Clustering" width="90" x="246" y="75"/>
          <operator activated="true" class="extract_prototypes" compatibility="5.3.000" expanded="true" height="76" name="Extract Cluster Prototypes" width="90" x="447" y="30"/>
          <operator activated="true" class="join" compatibility="5.3.000" expanded="true" height="76" name="Join" width="90" x="581" y="120">
            <parameter key="use_id_attribute_as_key" value="false"/>
            <list key="key_attributes">
              <parameter key="att1" value="att1"/>
              <parameter key="att2" value="att2"/>
              <parameter key="att3" value="att3"/>
            </list>
          </operator>
          <connect from_op="Generate Data" from_port="output" to_op="Clustering" to_port="example set"/>
          <connect from_op="Clustering" from_port="cluster model" to_op="Extract Cluster Prototypes" to_port="model"/>
          <connect from_op="Clustering" from_port="clustered set" to_op="Join" to_port="right"/>
          <connect from_op="Extract Cluster Prototypes" from_port="example set" to_op="Join" to_port="left"/>
          <connect from_op="Join" from_port="join" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="90"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    2. Set the role of the label to regular before clustering. Warning: that way the label will be considered for clustering. This is not always what you want.
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Option 2 example: Generate Data, Set Role, Clustering (k_medoids),
         Extract Cluster Prototypes, Set Role (2), result 1.
         The first Set Role names the attribute "label" without a target_role
         parameter; per the accompanying post this makes it regular (presumably the
         operator default; confirm against the Set Role documentation).
         Set Role (2) explicitly sets target_role back to "label" on the prototypes. -->
    <process version="5.3.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
        <process expanded="true" height="391" width="748">
          <operator activated="true" class="generate_data" compatibility="5.3.000" expanded="true" height="60" name="Generate Data" width="90" x="45" y="75">
            <parameter key="target_function" value="random classification"/>
            <parameter key="number_of_attributes" value="3"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="5.3.000" expanded="true" height="76" name="Set Role" width="90" x="179" y="75">
            <parameter key="name" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="k_medoids" compatibility="5.3.000" expanded="true" height="76" name="Clustering" width="90" x="313" y="75"/>
          <operator activated="true" class="extract_prototypes" compatibility="5.3.000" expanded="true" height="76" name="Extract Cluster Prototypes" width="90" x="447" y="75"/>
          <operator activated="true" class="set_role" compatibility="5.3.000" expanded="true" height="76" name="Set Role (2)" width="90" x="581" y="75">
            <parameter key="name" value="label"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <connect from_op="Generate Data" from_port="output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Clustering" to_port="example set"/>
          <connect from_op="Clustering" from_port="cluster model" to_op="Extract Cluster Prototypes" to_port="model"/>
          <connect from_op="Extract Cluster Prototypes" from_port="example set" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="36"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>

    Happy Mining!
    ~Marius
  • Thanks for your help,

    But there's a problem: the dataset is very large (550K examples with more than 700 attributes), so I guess joining is not applicable.

    Do you have any idea how I can down sample such a large dataset ? maybe another way not extracting prototypes
  • MariusHelf
    MariusHelf New Altair Community Member
    You could mis-use k-NN with k=1 for it. Won't be exactly what you call "fast", but it will work:
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- k-NN workaround example: Generate Data is duplicated by Multiply.
         Output 1 goes to Clustering (k_medoids), whose cluster model feeds
         Extract Cluster Prototypes. Output 2 is the training set of k-NN.
         Apply Model takes the k-NN model and the extracted prototypes as its
         "unlabelled data" input and sends its "labelled data" output to result 1.
         No k parameter is set on the k-NN operator here; the post says k=1
         (presumably the operator default; confirm). -->
    <process version="5.3.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.000" expanded="true" name="Process">
        <process expanded="true" height="391" width="748">
          <operator activated="true" class="generate_data" compatibility="5.3.000" expanded="true" height="60" name="Generate Data" width="90" x="45" y="75">
            <parameter key="target_function" value="random classification"/>
            <parameter key="number_of_attributes" value="3"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.3.000" expanded="true" height="94" name="Multiply" width="90" x="179" y="75"/>
          <operator activated="true" class="k_medoids" compatibility="5.3.000" expanded="true" height="76" name="Clustering" width="90" x="313" y="75"/>
          <operator activated="true" class="extract_prototypes" compatibility="5.3.000" expanded="true" height="76" name="Extract Cluster Prototypes" width="90" x="447" y="75"/>
          <operator activated="true" class="k_nn" compatibility="5.3.000" expanded="true" height="76" name="k-NN" width="90" x="313" y="210"/>
          <operator activated="true" class="apply_model" compatibility="5.3.000" expanded="true" height="76" name="Apply Model" width="90" x="581" y="165">
            <list key="application_parameters"/>
          </operator>
          <connect from_op="Generate Data" from_port="output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Clustering" to_port="example set"/>
          <connect from_op="Multiply" from_port="output 2" to_op="k-NN" to_port="training set"/>
          <connect from_op="Clustering" from_port="cluster model" to_op="Extract Cluster Prototypes" to_port="model"/>
          <connect from_op="Extract Cluster Prototypes" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="k-NN" from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="36"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • I used the following model, but there's a problem: after applying the cluster model, I want to invert-select the cluster attribute (i.e. remove it), but there is no such attribute in the list, even though the result shows this attribute.
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Process posted with the follow-up question about removing the cluster attribute.
         Wiring, as given by the connect elements below:
         Retrieve, Sample (Stratified) (sample=relative), Nominal to Numerical
         (single attribute "sex"), Normalize, Multiply.
         Multiply output 1: Filter Examples (event=f), then Multiply (2), which feeds
         both Clustering (k_means, k=5, max_runs=100, MixedMeasures) and the
         "unlabelled data" input of Apply Model; the cluster model is applied there
         and the result enters Union as example set 1.
         Multiply output 2: Filter Examples (2) (event=t), entering Union as example set 2.
         Union output: Select Attributes (3) (inverted single selection of "id",
         special attributes included), then Shuffle, then result 1.
         The operator named "Select Attributes" is deactivated and unconnected. -->
    <process version="5.2.008">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
        <process expanded="true" height="539" width="1957">
          <operator activated="true" class="retrieve" compatibility="5.2.008" expanded="true" height="60" name="Retrieve" width="90" x="45" y="75">
            <parameter key="repository_entry" value="//NewLocalRepository/temp_5000sampled_MI4"/>
          </operator>
          <operator activated="true" class="sample_stratified" compatibility="5.2.008" expanded="true" height="76" name="Sample (Stratified)" width="90" x="179" y="255">
            <parameter key="sample" value="relative"/>
          </operator>
          <operator activated="true" class="nominal_to_numerical" compatibility="5.2.008" expanded="true" height="94" name="Nominal to Numerical" width="90" x="246" y="75">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="sex"/>
            <list key="comparison_groups"/>
          </operator>
          <operator activated="true" class="normalize" compatibility="5.2.008" expanded="true" height="94" name="Normalize" width="90" x="380" y="75"/>
          <operator activated="false" class="select_attributes" compatibility="5.2.008" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="345">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="event"/>
            <parameter key="invert_selection" value="true"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.2.008" expanded="true" height="94" name="Multiply" width="90" x="581" y="75"/>
          <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Filter Examples (2)" width="90" x="782" y="255">
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="parameter_string" value="event=t"/>
          </operator>
          <operator activated="true" class="filter_examples" compatibility="5.2.008" expanded="true" height="76" name="Filter Examples" width="90" x="782" y="30">
            <parameter key="condition_class" value="attribute_value_filter"/>
            <parameter key="parameter_string" value="event=f"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="5.2.008" expanded="true" height="94" name="Multiply (2)" width="90" x="916" y="30"/>
          <operator activated="true" class="k_means" compatibility="5.2.008" expanded="true" height="76" name="Clustering" width="90" x="1117" y="30">
            <parameter key="k" value="5"/>
            <parameter key="max_runs" value="100"/>
            <parameter key="measure_types" value="MixedMeasures"/>
          </operator>
          <operator activated="true" class="apply_model" compatibility="5.2.008" expanded="true" height="76" name="Apply Model" width="90" x="1117" y="120">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="union" compatibility="5.2.008" expanded="true" height="76" name="Union" width="90" x="1385" y="210"/>
          <operator activated="true" class="select_attributes" compatibility="5.2.008" expanded="true" height="76" name="Select Attributes (3)" width="90" x="1519" y="210">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="id"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <operator activated="true" class="shuffle" compatibility="5.2.008" expanded="true" height="76" name="Shuffle" width="90" x="1720" y="210"/>
          <connect from_op="Retrieve" from_port="output" to_op="Sample (Stratified)" to_port="example set input"/>
          <connect from_op="Sample (Stratified)" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
          <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Normalize" to_port="example set input"/>
          <connect from_op="Normalize" from_port="example set output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Filter Examples (2)" to_port="example set input"/>
          <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Union" to_port="example set 2"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Multiply (2)" to_port="input"/>
          <connect from_op="Multiply (2)" from_port="output 1" to_op="Clustering" to_port="example set"/>
          <connect from_op="Multiply (2)" from_port="output 2" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Clustering" from_port="cluster model" to_op="Apply Model" to_port="model"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Union" to_port="example set 1"/>
          <connect from_op="Union" from_port="union" to_op="Select Attributes (3)" to_port="example set input"/>
          <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Shuffle" to_port="example set input"/>
          <connect from_op="Shuffle" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • MariusHelf
    MariusHelf New Altair Community Member
    I did not look at the process, but if an attribute is not shown in the list, you can simply enter it manually, even though it's not in the drop-down-list.
  • it works fine now
    I appreciate your help,
    Thanks