Consensus Cluster / Cluster Ensemble: How to realize this

Smerg
Smerg New Altair Community Member
edited November 5 in Community Q&A
Hi guys,

In my master's thesis I have to implement a consensus clustering (cc), also called a cluster ensemble. My simple test example is to create a cc (k-medoids and k-means) on the Iris data set... Do not ask about the sense of this combination  ;)

Unfortunately, my search turned up no answer on how to create a cc in RapidMiner 5. Simple tinkering has not gotten me any further. Neither the Group Model operator nor the Model Combiner operator seems to be the right thing.


My problem is combining the cluster outputs into one example set, so that I can feed them into a classification operator. This applies both to my attempts with the example sets themselves and to the models from the cluster operators. Maybe I have misunderstood something? Maybe I need to build a classification for each clustering and combine these results?  ???

My main strategy is the following:
  • Pre-processing
  • Split dataset
  • Use different cluster algorithms
  • Create a cc
  • Create a classification model based on the cc
  • ...
The image shows the strategy; you can download it here: https://www.dropbox.com/s/857x6r5asw3m86b/strategie.png.

I briefly considered solving it with the R integration, but there must be a better way. I would be happy about any ideas, suggestions or comments. Thanks in advance.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.008">
  <!--
    Asker's work-in-progress process.
    It retrieves the Iris sample data, duplicates it with Multiply and feeds one
    copy each into k-means and k-medoids (k = 2 for both).
    As written, the outputs of the two clustering operators are not connected to
    anything, and no connection targets the SVM's input; only the SVM's "model"
    and "weights" outputs are wired to the process results.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="parallelize_main_process" value="false"/>
    <process expanded="true" height="431" width="748">
      <!-- Data source: the Iris sample from the repository. -->
      <operator activated="true" class="retrieve" compatibility="5.2.008" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <!-- Provides one copy of the data per clustering operator (see the connect elements below). -->
      <operator activated="true" class="multiply" compatibility="5.2.008" expanded="true" height="94" name="Multiply" width="90" x="179" y="30"/>
      <!--
        k-medoids clustering with k = 2, using MixedMeasures / MixedEuclideanDistance.
        The operator name is spelled "k-medois" and is referenced under that exact
        name by a connect element below, so both must be changed together if renamed.
      -->
      <operator activated="true" class="k_medoids" compatibility="5.2.008" expanded="true" height="76" name="k-medois" width="90" x="313" y="120">
        <parameter key="add_cluster_attribute" value="true"/>
        <parameter key="add_as_label" value="false"/>
        <parameter key="remove_unlabeled" value="false"/>
        <parameter key="k" value="2"/>
        <parameter key="max_runs" value="10"/>
        <parameter key="max_optimization_steps" value="100"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="measure_types" value="MixedMeasures"/>
        <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
        <parameter key="nominal_measure" value="NominalDistance"/>
        <parameter key="numerical_measure" value="EuclideanDistance"/>
        <parameter key="divergence" value="GeneralizedIDivergence"/>
        <parameter key="kernel_type" value="radial"/>
        <parameter key="kernel_gamma" value="1.0"/>
        <parameter key="kernel_sigma1" value="1.0"/>
        <parameter key="kernel_sigma2" value="0.0"/>
        <parameter key="kernel_sigma3" value="2.0"/>
        <parameter key="kernel_degree" value="3.0"/>
        <parameter key="kernel_shift" value="1.0"/>
        <parameter key="kernel_a" value="1.0"/>
        <parameter key="kernel_b" value="0.0"/>
      </operator>
      <!--
        k-means clustering with k = 2. Unlike the k-medoids operator above, this one
        is configured with BregmanDivergences / SquaredEuclideanDistance.
      -->
      <operator activated="true" class="k_means" compatibility="5.2.008" expanded="true" height="76" name="k-means" width="90" x="313" y="30">
        <parameter key="add_cluster_attribute" value="true"/>
        <parameter key="add_as_label" value="false"/>
        <parameter key="remove_unlabeled" value="false"/>
        <parameter key="k" value="2"/>
        <parameter key="max_runs" value="10"/>
        <parameter key="determine_good_start_values" value="false"/>
        <parameter key="measure_types" value="BregmanDivergences"/>
        <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
        <parameter key="nominal_measure" value="NominalDistance"/>
        <parameter key="numerical_measure" value="EuclideanDistance"/>
        <parameter key="divergence" value="SquaredEuclideanDistance"/>
        <parameter key="kernel_type" value="radial"/>
        <parameter key="kernel_gamma" value="1.0"/>
        <parameter key="kernel_sigma1" value="1.0"/>
        <parameter key="kernel_sigma2" value="0.0"/>
        <parameter key="kernel_sigma3" value="2.0"/>
        <parameter key="kernel_degree" value="3.0"/>
        <parameter key="kernel_shift" value="1.0"/>
        <parameter key="kernel_a" value="1.0"/>
        <parameter key="kernel_b" value="0.0"/>
        <parameter key="max_optimization_steps" value="100"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!--
        Intended classifier (dot kernel). No connect element in this process targets
        this operator's input, so it receives no training data as written.
        NOTE(review): C is set to 0.0 here; confirm this is the intended value.
      -->
      <operator activated="true" class="support_vector_machine" compatibility="5.2.008" expanded="true" height="112" name="SVM" width="90" x="514" y="30">
        <parameter key="kernel_type" value="dot"/>
        <parameter key="kernel_gamma" value="1.0"/>
        <parameter key="kernel_sigma1" value="1.0"/>
        <parameter key="kernel_sigma2" value="0.0"/>
        <parameter key="kernel_sigma3" value="2.0"/>
        <parameter key="kernel_shift" value="1.0"/>
        <parameter key="kernel_degree" value="2.0"/>
        <parameter key="kernel_a" value="1.0"/>
        <parameter key="kernel_b" value="0.0"/>
        <parameter key="kernel_cache" value="200"/>
        <parameter key="C" value="0.0"/>
        <parameter key="convergence_epsilon" value="0.0010"/>
        <parameter key="max_iterations" value="100000"/>
        <parameter key="scale" value="true"/>
        <parameter key="calculate_weights" value="true"/>
        <parameter key="return_optimization_performance" value="true"/>
        <parameter key="L_pos" value="1.0"/>
        <parameter key="L_neg" value="1.0"/>
        <parameter key="epsilon" value="0.0"/>
        <parameter key="epsilon_plus" value="0.0"/>
        <parameter key="epsilon_minus" value="0.0"/>
        <parameter key="balance_cost" value="false"/>
        <parameter key="quadratic_loss_pos" value="false"/>
        <parameter key="quadratic_loss_neg" value="false"/>
        <parameter key="estimate_performance" value="false"/>
      </operator>
      <!-- Wiring: Retrieve to Multiply, then one copy to each clustering operator. -->
      <connect from_op="Retrieve" from_port="output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="k-means" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="k-medois" to_port="example set"/>
      <!-- Open gap of the question: nothing links the clustering results to the SVM. -->
      <connect from_op="SVM" from_port="model" to_port="result 1"/>
      <connect from_op="SVM" from_port="weights" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Tagged:

Answers

  • Hello

    You could use the rename and set role operators followed by the join operator to make a combined example set with the result of the various clusterings. From there use a suitable classifier - I used Naive Bayes because SVM doesn't like polynominal attributes.

    Here's an example...
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.3.005">
      <!--
        Example answer: combine the results of two clusterings into one example set
        and train a classifier on it.
        Flow: Retrieve Iris, Multiply, k-means and k-medoids (k = 3 each), rename each
        "cluster" attribute to a unique name, Set Role, Join the two branches, then
        run a 3-fold cross-validation with Naive Bayes.
      -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Process">
        <process expanded="true">
          <!-- Data source: the Iris sample from the repository. -->
          <operator activated="true" class="retrieve" compatibility="5.3.005" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30">
            <parameter key="repository_entry" value="//Samples/data/Iris"/>
          </operator>
          <!-- One copy of the data per clustering branch. -->
          <operator activated="true" class="multiply" compatibility="5.3.005" expanded="true" height="94" name="Multiply" width="90" x="179" y="30"/>
          <operator activated="true" class="k_medoids" compatibility="5.3.005" expanded="true" height="76" name="k-medoids" width="90" x="313" y="120">
            <parameter key="k" value="3"/>
          </operator>
          <operator activated="true" class="k_means" compatibility="5.3.005" expanded="true" height="76" name="k-means" width="90" x="313" y="30">
            <parameter key="k" value="3"/>
            <parameter key="measure_types" value="MixedMeasures"/>
          </operator>
          <!--
            k-means branch: give the "cluster" attribute a unique name so it does not
            clash with the k-medoids result when the two branches are joined.
          -->
          <operator activated="true" class="rename" compatibility="5.3.005" expanded="true" height="76" name="Rename" width="90" x="447" y="30">
            <parameter key="old_name" value="cluster"/>
            <parameter key="new_name" value="kMeansCluster"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!--
            No target role is given here, so the operator's default applies.
            NOTE(review): presumably the default makes the attribute a regular one,
            so the classifier can use it as an input feature; confirm.
          -->
          <operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role" width="90" x="581" y="30">
            <parameter key="name" value="kMeansCluster"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- k-medoids branch: same rename / set-role treatment as the k-means branch. -->
          <operator activated="true" class="rename" compatibility="5.3.005" expanded="true" height="76" name="Rename (2)" width="90" x="447" y="120">
            <parameter key="old_name" value="cluster"/>
            <parameter key="new_name" value="kMedoidsCluster"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="5.3.005" expanded="true" height="76" name="Set Role (2)" width="90" x="581" y="120">
            <parameter key="name" value="kMedoidsCluster"/>
            <list key="set_additional_roles"/>
          </operator>
          <!--
            Merges both branches into one example set. No key attributes are listed.
            NOTE(review): presumably the join then matches rows on the id attribute;
            confirm against the Join operator documentation.
          -->
          <operator activated="true" class="join" compatibility="5.3.005" expanded="true" height="76" name="Join" width="90" x="715" y="30">
            <list key="key_attributes"/>
          </operator>
          <!-- 3-fold cross-validation; the inner learner is Naive Bayes. -->
          <operator activated="true" class="x_validation" compatibility="5.0.000" expanded="true" height="112" name="Validation" width="90" x="849" y="30">
            <description>A cross-validation evaluating a Naive Bayes model.</description>
            <parameter key="number_of_validations" value="3"/>
            <!-- Training subprocess: learn a Naive Bayes model on the training fold. -->
            <process expanded="true">
              <operator activated="true" class="naive_bayes" compatibility="5.3.005" expanded="true" height="76" name="Naive Bayes" width="90" x="231" y="30"/>
              <connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
              <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <!-- Testing subprocess: apply the model to the test fold and measure performance. -->
            <process expanded="true">
              <operator activated="true" class="apply_model" compatibility="5.0.000" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <operator activated="true" class="performance" compatibility="5.0.000" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
              <connect from_port="model" to_op="Apply Model" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
              <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <!-- Wiring of the two clustering branches into Join and on to Validation. -->
          <connect from_op="Retrieve" from_port="output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="k-means" to_port="example set"/>
          <connect from_op="Multiply" from_port="output 2" to_op="k-medoids" to_port="example set"/>
          <connect from_op="k-medoids" from_port="clustered set" to_op="Rename (2)" to_port="example set input"/>
          <connect from_op="k-means" from_port="clustered set" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Rename (2)" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_op="Join" to_port="right"/>
          <connect from_op="Join" from_port="join" to_op="Validation" to_port="training"/>
          <!-- Process results: performance on result 1, the validation's "training" output on result 2. -->
          <connect from_op="Validation" from_port="training" to_port="result 2"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
    regards

    Andrew