Numbers similarity

ZAM
ZAM New Altair Community Member
edited November 2024 in Community Q&A
Hello

I tried using nominal distance and tried cosine similarity for the numbers in the attached file but I don't get any distance other than 1 and 0 and no similarity other than 1 and 0.

I want a process that calculates the similarity or the distance more accurately; for example, the similarity between "7799" and "7788" should be 0.5.

Any help would be appreciated.
Thanks and best regards,
ZAM
Tagged:

Best Answer

  • Telcontar120
    Telcontar120 New Altair Community Member
    Answer ✓
    I think what you are looking for is a variation of the Levenshtein distance, which counts the minimum number of single-character edits (insertions, deletions, or substitutions) needed to turn one string into the other. This is an operator available in the free Operator Toolbox extension. You could then express this as a fraction of the length of one of the input strings using Generate Attributes; for example, "7799" and "7788" differ by 2 edits out of 4 characters, giving 2/4 = 0.5.
    See the attached process for an example.
    <?xml version="1.0" encoding="UTF-8"?><process version="9.5.001">
      <!-- Example RapidMiner process: builds a one-row example set (att1, att2),
           converts the attributes to nominal, and runs the Operator Toolbox
           Levenshtein operator on the two columns. It stops at the distance
           operator; no step in this process turns the distance into a
           fraction or similarity. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.5.001" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Step 1: inline test data. generator_type is "comma separated text",
               so the data comes from input_csv_text below: a header line
               "att1,att2" followed by the single row "7799,7788". -->
          <operator activated="true" class="utility:create_exampleset" compatibility="9.5.001" expanded="true" height="68" name="Create ExampleSet" width="90" x="112" y="85">
            <parameter key="generator_type" value="comma separated text"/>
            <parameter key="number_of_examples" value="100"/>
            <parameter key="use_stepsize" value="false"/>
            <list key="function_descriptions"/>
            <parameter key="add_id_attribute" value="false"/>
            <list key="numeric_series_configuration"/>
            <list key="date_series_configuration"/>
            <list key="date_series_configuration (interval)"/>
            <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="input_csv_text" value="att1,att2&#10;7799,7788&#10;"/>
            <parameter key="column_separator" value=","/>
            <!-- Values are not forced to nominal here. NOTE(review): presumably
                 att1/att2 are therefore parsed as numbers, which is why the
                 next operator converts them; confirm in RapidMiner. -->
            <parameter key="parse_all_as_nominal" value="false"/>
            <parameter key="decimal_point_character" value="."/>
            <parameter key="trim_attribute_names" value="true"/>
          </operator>
          <!-- Step 2: Numerical to Polynominal with the attribute filter set to
               "all". NOTE(review): presumably this turns the numeric att1/att2
               into nominal strings so an edit distance can be computed on
               them; confirm the resulting string form (e.g. no trailing
               decimals). -->
          <operator activated="true" class="numerical_to_polynominal" compatibility="9.5.001" expanded="true" height="82" name="Numerical to Polynominal" width="90" x="246" y="85">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="real"/>
            <parameter key="block_type" value="value_series"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_series_end"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Step 3: Levenshtein distance operator from the operator_toolbox
               extension (compatibility 2.3.000), configured to compare att1
               against att2. -->
          <operator activated="true" class="operator_toolbox:levenshtein_distance" compatibility="2.3.000" expanded="true" height="82" name="Generate Levenshtein Distance" width="90" x="380" y="85">
            <parameter key="first_attribute_for_distance_calculation" value="att1"/>
            <parameter key="second_attribute_for_distance_calculation" value="att2"/>
          </operator>
          <!-- Wiring: Create ExampleSet, then Numerical to Polynominal, then
               Generate Levenshtein Distance, whose "out" port feeds process
               result 1. -->
          <connect from_op="Create ExampleSet" from_port="output" to_op="Numerical to Polynominal" to_port="example set input"/>
          <connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Generate Levenshtein Distance" to_port="exa"/>
          <connect from_op="Generate Levenshtein Distance" from_port="out" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    


Answers

  • Telcontar120
    Telcontar120 New Altair Community Member
    Answer ✓
    I think what you are looking for is a variation of the Levenshtein distance, which counts the minimum number of single-character edits (insertions, deletions, or substitutions) needed to turn one string into the other. This is an operator available in the free Operator Toolbox extension. You could then express this as a fraction of the length of one of the input strings using Generate Attributes; for example, "7799" and "7788" differ by 2 edits out of 4 characters, giving 2/4 = 0.5.
    See the attached process for an example.
    <?xml version="1.0" encoding="UTF-8"?><process version="9.5.001">
      <!-- Example RapidMiner process: builds a one-row example set (att1, att2),
           converts the attributes to nominal, and runs the Operator Toolbox
           Levenshtein operator on the two columns. It stops at the distance
           operator; no step in this process turns the distance into a
           fraction or similarity. -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.5.001" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Step 1: inline test data. generator_type is "comma separated text",
               so the data comes from input_csv_text below: a header line
               "att1,att2" followed by the single row "7799,7788". -->
          <operator activated="true" class="utility:create_exampleset" compatibility="9.5.001" expanded="true" height="68" name="Create ExampleSet" width="90" x="112" y="85">
            <parameter key="generator_type" value="comma separated text"/>
            <parameter key="number_of_examples" value="100"/>
            <parameter key="use_stepsize" value="false"/>
            <list key="function_descriptions"/>
            <parameter key="add_id_attribute" value="false"/>
            <list key="numeric_series_configuration"/>
            <list key="date_series_configuration"/>
            <list key="date_series_configuration (interval)"/>
            <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="input_csv_text" value="att1,att2&#10;7799,7788&#10;"/>
            <parameter key="column_separator" value=","/>
            <!-- Values are not forced to nominal here. NOTE(review): presumably
                 att1/att2 are therefore parsed as numbers, which is why the
                 next operator converts them; confirm in RapidMiner. -->
            <parameter key="parse_all_as_nominal" value="false"/>
            <parameter key="decimal_point_character" value="."/>
            <parameter key="trim_attribute_names" value="true"/>
          </operator>
          <!-- Step 2: Numerical to Polynominal with the attribute filter set to
               "all". NOTE(review): presumably this turns the numeric att1/att2
               into nominal strings so an edit distance can be computed on
               them; confirm the resulting string form (e.g. no trailing
               decimals). -->
          <operator activated="true" class="numerical_to_polynominal" compatibility="9.5.001" expanded="true" height="82" name="Numerical to Polynominal" width="90" x="246" y="85">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="real"/>
            <parameter key="block_type" value="value_series"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_series_end"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Step 3: Levenshtein distance operator from the operator_toolbox
               extension (compatibility 2.3.000), configured to compare att1
               against att2. -->
          <operator activated="true" class="operator_toolbox:levenshtein_distance" compatibility="2.3.000" expanded="true" height="82" name="Generate Levenshtein Distance" width="90" x="380" y="85">
            <parameter key="first_attribute_for_distance_calculation" value="att1"/>
            <parameter key="second_attribute_for_distance_calculation" value="att2"/>
          </operator>
          <!-- Wiring: Create ExampleSet, then Numerical to Polynominal, then
               Generate Levenshtein Distance, whose "out" port feeds process
               result 1. -->
          <connect from_op="Create ExampleSet" from_port="output" to_op="Numerical to Polynominal" to_port="example set input"/>
          <connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Generate Levenshtein Distance" to_port="exa"/>
          <connect from_op="Generate Levenshtein Distance" from_port="out" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>