🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Count of the non-zero values

User: "Kathi"
New Altair Community Member
Updated by Jocelyn
Hi everyone,
I am working with a n-Gram process and want RapidMiner to count just the non-zero values. I tried the 'generate aggregation' operator with the numeric condition > 0. But the output is still the count of all the values (including the zero values). Do I have the wrong parameters? Or do I need a different operator?

Regards and thanks for the help!

Find more posts tagged with

Sort by:
1 - 1 of 11
    User: "kayman"
    New Altair Community Member
    Accepted Answer
    Updated by kayman
    Hi @Kathi, I wanted to leave some of the fun for you :-)

    If you only need to know the unique values per document you can change the vector creation to binary, than it just checks if it exists or not so it will state 1 even if the same n-gram appears 20 times.

    If you need both total and unique try this : 
    <?xml version="1.0" encoding="UTF-8"?><process version="9.4.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="retrieve" compatibility="9.4.001" expanded="true" height="68" name="Retrieve Input Analyse" width="90" x="45" y="34">
            <parameter key="repository_entry" value="Input Analyse"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.4.001" expanded="true" height="82" name="Multiply" width="90" x="179" y="34"/>
          <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="Term Occurrences"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="50"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="8.2.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:generate_n_grams_terms" compatibility="8.2.000" expanded="true" height="68" name="Generate n-Grams (Terms) (2)" width="90" x="313" y="34">
                <parameter key="max_length" value="4"/>
              </operator>
              <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.2.000" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="514" y="34">
                <parameter key="condition" value="matches"/>
                <parameter key="regular_expression" value="^\w+_\w+_\w+_\w+"/>
                <parameter key="case_sensitive" value="false"/>
                <parameter key="invert condition" value="false"/>
                <description align="center" color="transparent" colored="false" width="126">only keep 4-gram</description>
              </operator>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_op="Generate n-Grams (Terms) (2)" to_port="document"/>
              <connect from_op="Generate n-Grams (Terms) (2)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
              <connect from_op="Filter Tokens (by Content)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="concurrency:loop_attributes" compatibility="9.4.001" expanded="true" height="82" name="Loop Attributes" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="attribute_name_macro" value="la"/>
            <parameter key="reuse_results" value="false"/>
            <parameter key="enable_parallel_execution" value="false"/>
            <process expanded="true">
              <operator activated="true" class="select_attributes" compatibility="9.4.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{la}"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="attribute_value"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="time"/>
                <parameter key="block_type" value="attribute_block"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_matrix_row_start"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
              </operator>
              <operator activated="true" class="rename" compatibility="9.4.001" expanded="true" height="82" name="Rename" width="90" x="179" y="34">
                <parameter key="old_name" value="%{la}"/>
                <parameter key="new_name" value="ngram"/>
                <list key="rename_additional_attributes"/>
              </operator>
              <operator activated="true" class="generate_attributes" compatibility="9.4.001" expanded="true" height="82" name="Generate Attributes" width="90" x="313" y="34">
                <list key="function_descriptions">
                  <parameter key="value" value="%{la}"/>
                </list>
                <parameter key="keep_all" value="true"/>
              </operator>
              <connect from_port="input 1" to_op="Select Attributes" to_port="example set input"/>
              <connect from_op="Select Attributes" from_port="example set output" to_op="Rename" to_port="example set input"/>
              <connect from_op="Rename" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
              <connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="operator_toolbox:advanced_append" compatibility="2.2.000" expanded="true" height="82" name="Append (Superset)" width="90" x="581" y="34"/>
          <operator activated="true" class="blending:pivot" compatibility="9.4.001" expanded="true" height="82" name="Pivot (3)" origin="GENERATED_TURBOPREP" width="90" x="715" y="34">
            <parameter key="group_by_attributes" value="value"/>
            <parameter key="column_grouping_attribute" value="ID"/>
            <list key="aggregation_attributes">
              <parameter key="ngram" value="sum"/>
            </list>
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="default_aggregation_function" value="first"/>
          </operator>
          <operator activated="true" class="rename_by_replacing" compatibility="9.4.001" expanded="true" height="82" name="Rename by Replacing" width="90" x="849" y="34">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="replace_what" value="sum\(ngram\)_(.*)$"/>
            <parameter key="replace_by" value="$1"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="9.4.001" expanded="true" height="82" name="Set Role" width="90" x="983" y="34">
            <parameter key="attribute_name" value="value"/>
            <parameter key="target_role" value="id"/>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_aggregation" compatibility="9.4.001" expanded="true" height="82" name="Generate Aggregation" width="90" x="1117" y="34">
            <parameter key="attribute_name" value="sum_of_all"/>
            <parameter key="attribute_filter_type" value="value_type"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="aggregation_function" value="sum"/>
            <parameter key="concatenation_separator" value="|"/>
            <parameter key="keep_all" value="true"/>
            <parameter key="ignore_missings" value="true"/>
            <parameter key="ignore_missing_attributes" value="false"/>
          </operator>
          <operator activated="true" class="concurrency:loop_attributes" compatibility="9.4.001" expanded="true" height="82" name="Loop Attributes (2)" width="90" x="1251" y="187">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value="%{la}"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="attribute_name_macro" value="la"/>
            <parameter key="reuse_results" value="true"/>
            <parameter key="enable_parallel_execution" value="true"/>
            <process expanded="true">
              <operator activated="true" class="select_attributes" compatibility="9.4.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="45" y="34">
                <parameter key="attribute_filter_type" value="all"/>
                <parameter key="attribute" value=""/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="attribute_value"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="time"/>
                <parameter key="block_type" value="attribute_block"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_matrix_row_start"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
              </operator>
              <operator activated="true" class="numerical_to_polynominal" compatibility="9.4.001" expanded="true" height="82" name="Numerical to Polynominal" width="90" x="179" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{la}"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="numeric"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="real"/>
                <parameter key="block_type" value="value_series"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="value_series_end"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
              </operator>
              <operator activated="true" class="replace" compatibility="9.4.001" expanded="true" height="82" name="Replace" width="90" x="313" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{la}"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="nominal"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="file_path"/>
                <parameter key="block_type" value="single_value"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="single_value"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
                <parameter key="replace_what" value="^[^0].*$"/>
                <parameter key="replace_by" value="1"/>
              </operator>
              <operator activated="true" class="parse_numbers" compatibility="9.4.001" expanded="true" height="82" name="Parse Numbers" width="90" x="447" y="34">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="%{la}"/>
                <parameter key="attributes" value=""/>
                <parameter key="use_except_expression" value="false"/>
                <parameter key="value_type" value="nominal"/>
                <parameter key="use_value_type_exception" value="false"/>
                <parameter key="except_value_type" value="file_path"/>
                <parameter key="block_type" value="single_value"/>
                <parameter key="use_block_type_exception" value="false"/>
                <parameter key="except_block_type" value="single_value"/>
                <parameter key="invert_selection" value="false"/>
                <parameter key="include_special_attributes" value="false"/>
                <parameter key="decimal_character" value="."/>
                <parameter key="grouped_digits" value="false"/>
                <parameter key="grouping_character" value=","/>
                <parameter key="infinity_representation" value=""/>
                <parameter key="unparsable_value_handling" value="fail"/>
              </operator>
              <connect from_port="input 1" to_op="Select Attributes (2)" to_port="example set input"/>
              <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Numerical to Polynominal" to_port="example set input"/>
              <connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Replace" to_port="example set input"/>
              <connect from_op="Replace" from_port="example set output" to_op="Parse Numbers" to_port="example set input"/>
              <connect from_op="Parse Numbers" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="generate_aggregation" compatibility="9.4.001" expanded="true" height="82" name="Generate Aggregation (2)" width="90" x="1385" y="187">
            <parameter key="attribute_name" value="sum_of_unique"/>
            <parameter key="attribute_filter_type" value="value_type"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="numeric"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="aggregation_function" value="sum"/>
            <parameter key="concatenation_separator" value="|"/>
            <parameter key="keep_all" value="true"/>
            <parameter key="ignore_missings" value="true"/>
            <parameter key="ignore_missing_attributes" value="false"/>
          </operator>
          <operator activated="true" class="concurrency:join" compatibility="9.4.001" expanded="true" height="82" name="Join" width="90" x="1519" y="34">
            <parameter key="remove_double_attributes" value="true"/>
            <parameter key="join_type" value="inner"/>
            <parameter key="use_id_attribute_as_key" value="true"/>
            <list key="key_attributes">
              <parameter key="value" value="value"/>
            </list>
            <parameter key="keep_both_join_attributes" value="false"/>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="9.4.001" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="1653" y="34">
            <list key="function_descriptions">
              <parameter key="delta" value="[sum_of_all]-[sum_of_unique]"/>
            </list>
            <parameter key="keep_all" value="true"/>
          </operator>
          <connect from_op="Retrieve Input Analyse" from_port="output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="example set" to_op="Loop Attributes" to_port="input 1"/>
          <connect from_op="Loop Attributes" from_port="output 1" to_op="Append (Superset)" to_port="example set 1"/>
          <connect from_op="Append (Superset)" from_port="merged set" to_op="Pivot (3)" to_port="input"/>
          <connect from_op="Pivot (3)" from_port="output" to_op="Rename by Replacing" to_port="example set input"/>
          <connect from_op="Rename by Replacing" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Generate Aggregation" to_port="example set input"/>
          <connect from_op="Generate Aggregation" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Generate Aggregation" from_port="original" to_op="Loop Attributes (2)" to_port="input 1"/>
          <connect from_op="Loop Attributes (2)" from_port="output 1" to_op="Generate Aggregation (2)" to_port="example set input"/>
          <connect from_op="Generate Aggregation (2)" from_port="example set output" to_op="Join" to_port="right"/>
          <connect from_op="Join" from_port="join" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>