🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

If statement

User: "mskh"
New Altair Community Member
Updated by Jocelyn
I want to use the condition below to create a new attribute, but its result is always true.
If(cluster==[least(cluster)],false,true)

Find more posts tagged with

Sort by:
1 - 7 of 71
    User: "lionelderkrikor"
    New Altair Community Member
    Hi @mina_s_kh,

    Do you want to select the cluster with the lowest number of elements?
    Can you share your process and your dataset(s) so that we can better understand the problem?


    Regards,

    Lionel
    User: "mskh"
    New Altair Community Member
    OP
    Hi
    I use DBSCAN to cluster my dataset. I want to treat the cluster with the lowest number of elements as outliers. I want to label the outliers as false and the others as true and use them in the k-NN algorithm.
    My problem is that the outlier cluster may change when I use a different dataset. I want to find a way to dynamically determine the outlier cluster and use it in the if statement.
    ​<?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="7.1.001">
      <!-- Process posted with the question: clusters the data with DBSCAN, then tries to flag
           the least-populated cluster through an Aggregate plus Generate Attributes step.
           NOTE(review): the question reports that the generated flag is always true, and a later
           reply in the same thread states that the "least" aggregation does not return the
           cluster with the fewest examples. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Load the repository entry RDG_Day(Test). -->
          <operator activated="true" class="retrieve" compatibility="7.1.001" expanded="true" height="68" name="Retrieve RDG_Day(Test)" width="90" x="45" y="34">
            <parameter key="repository_entry" value="RDG_Day(Test)"/>
          </operator>
          <!-- Keep only the dstIP and cnt attributes. -->
          <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="dstIP|cnt"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Add an id attribute (non-nominal ids, offset 0); the Join below is keyed on the id attribute. -->
          <operator activated="true" class="generate_id" compatibility="7.1.001" expanded="true" height="82" name="Generate ID" width="90" x="313" y="34">
            <parameter key="create_nominal_ids" value="false"/>
            <parameter key="offset" value="0"/>
          </operator>
          <!-- Duplicate the example set: output 1 feeds Clustering, output 2 feeds the left Join input. -->
          <operator activated="true" class="multiply" compatibility="7.1.001" expanded="true" height="103" name="Multiply" width="90" x="45" y="289"/>
          <!-- DBSCAN with epsilon 30.5 and min_points 2; adds a cluster attribute that is not set as label. -->
          <operator activated="true" class="dbscan" compatibility="7.1.001" expanded="true" height="82" name="Clustering" width="90" x="179" y="238">
            <parameter key="epsilon" value="30.5"/>
            <parameter key="min_points" value="2"/>
            <parameter key="add_cluster_attribute" value="true"/>
            <parameter key="add_as_label" value="false"/>
            <parameter key="remove_unlabeled" value="false"/>
            <parameter key="measure_types" value="MixedMeasures"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="EuclideanDistance"/>
            <parameter key="divergence" value="GeneralizedIDivergence"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
          </operator>
          <!-- Apply the "least" aggregation to cluster, grouped by cluster and id.
               NOTE(review): this is the step the thread identifies as the problem; a reply suggests
               aggregating a count per cluster instead. -->
          <operator activated="true" class="aggregate" compatibility="7.1.001" expanded="true" height="82" name="Aggregate" width="90" x="514" y="34">
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="default_aggregation_function" value="average"/>
            <list key="aggregation_attributes">
              <parameter key="cluster" value="least"/>
            </list>
            <parameter key="group_by_attributes" value="cluster|id"/>
            <parameter key="count_all_combinations" value="false"/>
            <parameter key="only_distinct" value="false"/>
            <parameter key="ignore_missings" value="true"/>
          </operator>
          <!-- Create "allow": false where cluster equals the attribute referenced as [least(cluster)], true otherwise. -->
          <operator activated="true" class="generate_attributes" compatibility="7.1.001" expanded="true" height="82" name="Generate Attributes" width="90" x="648" y="34">
            <list key="function_descriptions">
              <parameter key="allow" value="if(cluster==[least(cluster)],false,true)"/>
            </list>
            <parameter key="keep_all" value="true"/>
          </operator>
          <!-- Keep cluster, allow, dstIP and id from the flagged set. -->
          <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="447" y="187">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="cluster|allow|dstIP|id"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Give the id attribute the id role before it enters the right Join input. -->
          <operator activated="true" class="set_role" compatibility="7.1.001" expanded="true" height="82" name="Set Role" width="90" x="581" y="187">
            <parameter key="attribute_name" value="id"/>
            <parameter key="target_role" value="id"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Inner join on the id attribute: left is the unclustered copy from Multiply, right is the flagged set. -->
          <operator activated="true" class="join" compatibility="7.1.001" expanded="true" height="82" name="Join" width="90" x="581" y="391">
            <parameter key="remove_double_attributes" value="true"/>
            <parameter key="join_type" value="inner"/>
            <parameter key="use_id_attribute_as_key" value="true"/>
            <list key="key_attributes"/>
            <parameter key="keep_both_join_attributes" value="false"/>
          </operator>
          <!-- Use the generated allow attribute as the label. -->
          <operator activated="true" class="set_role" compatibility="7.1.001" expanded="true" height="82" name="Set Role (2)" width="90" x="715" y="289">
            <parameter key="attribute_name" value="allow"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Final output columns: allow, id, cnt and dstIP. -->
          <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes (3)" width="90" x="715" y="187">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="allow|id|cnt|dstIP"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Wiring between the operators above; only result 1 is connected. -->
          <connect from_op="Retrieve RDG_Day(Test)" from_port="output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Clustering" to_port="example set"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Join" to_port="left"/>
          <connect from_op="Clustering" from_port="clustered set" to_op="Aggregate" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Join" to_port="right"/>
          <connect from_op="Join" from_port="join" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_op="Select Attributes (3)" to_port="example set input"/>
          <connect from_op="Select Attributes (3)" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    

    I use rapidminer v 7.1.1
    Thanks
    User: "lionelderkrikor"
    New Altair Community Member
    Hi @mina_s_kh,

    Can you share your dataset (the file RDG_Day(Test)) so that I can run your process?

    Regards,
    Lionel
    User: "mskh"
    New Altair Community Member
    OP
    Unfortunately, I don't have access to my dataset right now. It consists of 2 attributes: IP address and connection count.
    User: "YYH"
    Altair Employee
    Hi @mina_s_kh
    Thanks very much for the information. If you have only two attributes, ip and counts, you can basically ignore ip or translate ip to country/city names with geo-location functions.
    1. example code to integrate python scripts for geo locating from ip address
    import pandas
    import pygeoip
    import requests
    # Use a Python script to convert IP addresses to country names/codes; check out
    # http://pygeoip.readthedocs.io/en/latest/getting-started.html
    # GeoIP data can be found from maxmind API or github https://github.com/gsmlg/GeoIP.dat
    # rm_main is a mandatory function, 
    # the number of arguments has to be the number of input ports (can be none)
    def rm_main(data):
        """Add a country-name column derived from the IP addresses in ``data['ip']``.

        Entry point for the RapidMiner Python scripting operator: it takes one
        argument per input port and returns the data to deliver downstream.

        Args:
            data: table-like object with an ``'ip'`` column of addresses.
                NOTE(review): presumably a pandas DataFrame handed over by the
                operator; confirm against the calling process.

        Returns:
            The same ``data`` object with a new ``'country_name_from_IP'``
            column holding one country name per input row, in row order.
        """
        # Placeholder path: point this at a local copy of the GeoIP.dat database.
        geoip = pygeoip.GeoIP("path/to/your/GeoIP data/GeoIP.dat", pygeoip.MEMORY_CACHE)
        country = []
        for address in data['ip']:
            # Resolve each address once and reuse the result for both the
            # printed trace and the output column.
            name = geoip.country_name_by_addr(address)
            print(name)
            country.append(name)
        data['country_name_from_IP'] = country
        return data
    2. The least() function in your Aggregate operator will not return the cluster name with the lowest count. You will need to aggregate count() by cluster and then label the "minorities" as you described above. My example process uses X-Means (much faster) and returns 3 clusters. From the bar charts, cluster_2 has the least number of examples and will be labeled as the outlier.



    3. You will need the Python extension from the marketplace to test the process from my git repository https://github.com/sunnyuan/geoIP-clustering, but you can skip the geo-locating with Python by using sampleSet_country_names.csv directly with clustering and k-NN.



    User: "Marco_Barradas"
    Altair Employee
    Accepted Answer
    Hi @mina_s_kh Assuming you already solved the clustering process you may use this logic. 

    I built on top of what @yyhuang said and sorted the count, extracted a macro with the name of the least cluster and created the outlier attribute. Hope it helps. 
    <?xml version="1.0" encoding="UTF-8"?>
    <process version="9.1.000">
      <!-- Flags the members of the smallest cluster as outliers:
           cluster, count examples per cluster, sort the counts ascending, store the first
           row's cluster name in the macro MIN_CLUSTER, then compare every example against it. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.1.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Demo data: the Iris sample set; replace with your own clustered data. -->
          <operator activated="true" class="retrieve" compatibility="9.1.000" expanded="true" height="68" name="Retrieve Iris" origin="GENERATED_TUTORIAL" width="90" x="45" y="34">
            <parameter key="repository_entry" value="//Samples/data/Iris"/>
          </operator>
          <!-- k-means with k = 3 and a fixed local random seed; adds a cluster attribute. -->
          <operator activated="true" class="concurrency:k_means" compatibility="9.0.001" expanded="true" height="82" name="Clustering" origin="GENERATED_TUTORIAL" width="90" x="246" y="34">
            <parameter key="add_cluster_attribute" value="true"/>
            <parameter key="add_as_label" value="false"/>
            <parameter key="remove_unlabeled" value="false"/>
            <parameter key="k" value="3"/>
            <parameter key="max_runs" value="10"/>
            <parameter key="determine_good_start_values" value="false"/>
            <parameter key="measure_types" value="BregmanDivergences"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="EuclideanDistance"/>
            <parameter key="divergence" value="SquaredEuclideanDistance"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
            <parameter key="max_optimization_steps" value="100"/>
            <parameter key="use_local_random_seed" value="true"/>
            <parameter key="local_random_seed" value="1992"/>
          </operator>
          <!-- Count examples per cluster (count of attribute a1).
               Group by cluster only: grouping by label as well would count (cluster, label)
               pairs, so the smallest row would not necessarily belong to the smallest cluster,
               and it would require a label attribute that the data in the question does not have. -->
          <operator activated="true" class="aggregate" compatibility="9.1.000" expanded="true" height="82" name="Aggregate" origin="GENERATED_TUTORIAL" width="90" x="380" y="34">
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="default_aggregation_function" value="average"/>
            <list key="aggregation_attributes">
              <parameter key="a1" value="count"/>
            </list>
            <parameter key="group_by_attributes" value="cluster"/>
            <parameter key="count_all_combinations" value="false"/>
            <parameter key="only_distinct" value="false"/>
            <parameter key="ignore_missings" value="true"/>
          </operator>
          <!-- Sort the per-cluster counts ascending so the smallest cluster is the first row. -->
          <operator activated="true" class="sort" compatibility="9.1.000" expanded="true" height="82" name="Sort" width="90" x="514" y="34">
            <parameter key="attribute_name" value="count(a1)"/>
            <parameter key="sorting_direction" value="increasing"/>
          </operator>
          <!-- Store the cluster value of example 1 (the smallest count) in the macro MIN_CLUSTER. -->
          <operator activated="true" class="extract_macro" compatibility="9.1.000" expanded="true" height="68" name="Extract Macro" width="90" x="648" y="34">
            <parameter key="macro" value="MIN_CLUSTER"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="statistics" value="average"/>
            <parameter key="attribute_name" value="cluster"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Label each original example "Outlier" when its cluster matches MIN_CLUSTER, else "Normal".
               NOTE(review): this operator reads the macro but has no connection from Extract Macro;
               it presumably relies on Extract Macro being executed first. Confirm the execution order. -->
          <operator activated="true" class="generate_attributes" compatibility="9.1.000" expanded="true" height="82" name="Generate Attributes" width="90" x="514" y="187">
            <list key="function_descriptions">
              <parameter key="Outlier" value="if(cluster==%{MIN_CLUSTER},&quot;Outlier&quot;,&quot;Normal&quot;)"/>
            </list>
            <parameter key="keep_all" value="true"/>
          </operator>
          <!-- Wiring: the aggregated counts go through Sort and Extract Macro to result 1;
               the Aggregate "original" output goes through Generate Attributes to result 2. -->
          <connect from_op="Retrieve Iris" from_port="output" to_op="Clustering" to_port="example set"/>
          <connect from_op="Clustering" from_port="clustered set" to_op="Aggregate" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="example set output" to_op="Sort" to_port="example set input"/>
          <connect from_op="Aggregate" from_port="original" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Extract Macro" from_port="example set" to_port="result 1"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>
    


    User: "mskh"
    New Altair Community Member
    OP
    Thanks a million @yyhuang, @MarcoBarradas