Altair RISE

A program to recognize and reward our most engaged community members

Nominate Yourself Now!

TF-IDF in data values

olgakulesza2

Hello,

I have such data:

I want to compute TF-IDF to find out which values are the most frequent and the most discriminating.

Do you know how I can apply TF-IDF to these data?

Find more posts tagged with

AI Studio

Accepted answers

All comments

lionelderkrikor

Hi @olgakulesza2,

Here is a general process template for computing TF-IDF (adapt it to your own data):

<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
  <!--
    RapidMiner process template: reads a tag table from Excel, gathers the tags of
    each (Author, Title, Id) group, turns them into text attributes and builds a
    word vector from them with Process Documents from Data.
    Delivers the word-vector example set on result 1 and the word list on result 2.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Read Excel: imports cells A1:D11 of a local workbook; the path is machine-specific
           and must be replaced with your own file. Declared columns: Id (integer),
           Author, Title and Tag name (polynominal). -->
      <operator activated="true" class="read_excel" compatibility="8.2.000" expanded="true" height="68" name="Read Excel" width="90" x="112" y="34">
        <parameter key="excel_file" value="C:\Users\Lionel\Documents\Formations_DataScience\Rapidminer\Tests_Rapidminer\Tag_Name\Tag_name.xlsx"/>
        <parameter key="imported_cell_range" value="A1:D11"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Id.true.integer.attribute"/>
          <parameter key="1" value="Author.true.polynominal.attribute"/>
          <parameter key="2" value="Title.true.polynominal.attribute"/>
          <parameter key="3" value="Tag name.true.polynominal.attribute"/>
        </list>
      </operator>
      <!-- Aggregate: groups by Author, Title and Id and applies the "concatenation"
           aggregation to "Tag name". -->
      <operator activated="true" class="aggregate" compatibility="8.2.000" expanded="true" height="82" name="Aggregate" width="90" x="246" y="34">
        <list key="aggregation_attributes">
          <parameter key="Tag name" value="concatenation"/>
        </list>
        <parameter key="group_by_attributes" value="Author|Title|Id"/>
      </operator>
      <!-- Split: splits the single attribute "concat(Tag name)" on the pattern [|]. -->
      <operator activated="true" class="split" compatibility="8.2.000" expanded="true" height="82" name="Split" width="90" x="380" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="concat(Tag name)"/>
        <parameter key="split_pattern" value="[|]"/>
      </operator>
      <!-- Loop: runs its inner process 10 times with reuse_results enabled. Each pass applies
           Rename by Generic Names to the attributes matching concat.* using the stem "tag".
           NOTE(review): a single rename pass looks sufficient; confirm why 10 iterations are used. -->
      <operator activated="true" class="concurrency:loop" compatibility="8.2.000" expanded="true" height="82" name="Loop" width="90" x="514" y="34">
        <parameter key="number_of_iterations" value="10"/>
        <parameter key="reuse_results" value="true"/>
        <process expanded="true">
          <operator activated="true" class="rename_by_generic_names" compatibility="8.2.000" expanded="true" height="82" name="Rename by Generic Names" width="90" x="313" y="85">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value="concat.*"/>
            <parameter key="generic_name_stem" value="tag"/>
          </operator>
          <connect from_port="input 1" to_op="Rename by Generic Names" to_port="example set input"/>
          <connect from_op="Rename by Generic Names" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Nominal to Text: converts the attributes matching tag.* to the text type. -->
      <operator activated="true" class="nominal_to_text" compatibility="8.2.000" expanded="true" height="82" name="Nominal to Text" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value="tag.*"/>
      </operator>
      <!-- Process Documents from Data: the inner process only tokenizes each document.
           No vector_creation parameter is set, so the operator default applies
           (presumably TF-IDF; confirm in the Text Processing extension documentation). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="782" y="34">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Wiring: Read Excel, Aggregate, Split, Loop, Nominal to Text and
           Process Documents from Data are chained in that order. -->
      <connect from_op="Read Excel" from_port="output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Split" to_port="example set input"/>
      <connect from_op="Split" from_port="example set output" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <!-- Process results: the example set goes to result 1, the word list to result 2. -->
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

I hope it helps,

Regards,

Lionel

olgakulesza2

Thank you @lionelderkrikor!

And do you know how I can now keep only the values with the highest TF-IDF?

lionelderkrikor

Hi again @olgakulesza2,

Yes, simply add the WordList to Data and Sort operators at the end of the process:

<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
  <!--
    RapidMiner process: reads a tag table from Excel, gathers the tags of each
    (Author, Title, Id) group, builds a word vector with Process Documents from Data,
    then converts the word list to an example set and sorts it by "total", descending.
    Delivers the word-vector example set on result 1 and the sorted word table on result 2.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Read Excel: imports cells A1:D11 of a local workbook; the path is machine-specific
           and must be replaced with your own file. Declared columns: Id (integer),
           Author, Title and Tag name (polynominal). -->
      <operator activated="true" class="read_excel" compatibility="8.2.000" expanded="true" height="68" name="Read Excel" width="90" x="112" y="34">
        <parameter key="excel_file" value="C:\Users\Lionel\Documents\Formations_DataScience\Rapidminer\Tests_Rapidminer\Tag_Name\Tag_name.xlsx"/>
        <parameter key="imported_cell_range" value="A1:D11"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Id.true.integer.attribute"/>
          <parameter key="1" value="Author.true.polynominal.attribute"/>
          <parameter key="2" value="Title.true.polynominal.attribute"/>
          <parameter key="3" value="Tag name.true.polynominal.attribute"/>
        </list>
      </operator>
      <!-- Aggregate: groups by Author, Title and Id and applies the "concatenation"
           aggregation to "Tag name". -->
      <operator activated="true" class="aggregate" compatibility="8.2.000" expanded="true" height="82" name="Aggregate" width="90" x="246" y="34">
        <list key="aggregation_attributes">
          <parameter key="Tag name" value="concatenation"/>
        </list>
        <parameter key="group_by_attributes" value="Author|Title|Id"/>
      </operator>
      <!-- Split: splits the single attribute "concat(Tag name)" on the pattern [|]. -->
      <operator activated="true" class="split" compatibility="8.2.000" expanded="true" height="82" name="Split" width="90" x="380" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="concat(Tag name)"/>
        <parameter key="split_pattern" value="[|]"/>
      </operator>
      <!-- Loop: runs its inner process 10 times with reuse_results enabled. Each pass applies
           Rename by Generic Names to the attributes matching concat.* using the stem "tag".
           NOTE(review): a single rename pass looks sufficient; confirm why 10 iterations are used. -->
      <operator activated="true" class="concurrency:loop" compatibility="8.2.000" expanded="true" height="82" name="Loop" width="90" x="514" y="34">
        <parameter key="number_of_iterations" value="10"/>
        <parameter key="reuse_results" value="true"/>
        <process expanded="true">
          <operator activated="true" class="rename_by_generic_names" compatibility="8.2.000" expanded="true" height="82" name="Rename by Generic Names" width="90" x="313" y="85">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value="concat.*"/>
            <parameter key="generic_name_stem" value="tag"/>
          </operator>
          <connect from_port="input 1" to_op="Rename by Generic Names" to_port="example set input"/>
          <connect from_op="Rename by Generic Names" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Nominal to Text: converts the attributes matching tag.* to the text type. -->
      <operator activated="true" class="nominal_to_text" compatibility="8.2.000" expanded="true" height="82" name="Nominal to Text" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value="tag.*"/>
      </operator>
      <!-- Process Documents from Data: the inner process only tokenizes each document.
           No vector_creation parameter is set, so the operator default applies
           (presumably TF-IDF; confirm in the Text Processing extension documentation). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="782" y="34">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- WordList to Data: turns the word list into an example set so that it can be sorted. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="782" y="187"/>
      <!-- Sort: orders the word table by the attribute "total" in decreasing order, so the
           highest totals come first. -->
      <operator activated="true" class="sort" compatibility="8.2.000" expanded="true" height="82" name="Sort" width="90" x="916" y="187">
        <parameter key="attribute_name" value="total"/>
        <parameter key="sorting_direction" value="decreasing"/>
      </operator>
      <!-- Wiring: Read Excel, Aggregate, Split, Loop, Nominal to Text and
           Process Documents from Data are chained in that order. -->
      <connect from_op="Read Excel" from_port="output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Split" to_port="example set input"/>
      <connect from_op="Split" from_port="example set output" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <!-- Process results: the word-vector example set goes to result 1; the word list is
           routed through WordList to Data and Sort and delivered on result 2. -->
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Regards,

Lionel

olgakulesza2

Dear @lionelderkrikor

I have some problems with it. When I apply the Sort or Filter Examples operators, I get this:

Attribute name is empty.

Here is my code:

<?xml version="1.0" encoding="UTF-8"?><process version="8.1.003">
  <!--
    RapidMiner process: retrieves a stored example set, keeps the tag1..tag100 attributes
    plus "rating", converts nominal attributes to text, builds a word vector with
    Process Documents from Data, converts the word list to an example set and sorts it.
    Results: word-vector example set on result 1, sorted word table on result 2,
    pass-through word list on result 3.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
      <!-- Retrieve: loads the example set from the local repository entry below. -->
      <operator activated="true" class="retrieve" compatibility="8.1.003" expanded="true" height="68" name="Retrieve Books_Ratings_Tags_forUser10" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//NewLocalRepositoryOlga/Books_Ratings_Tags_forUser10"/>
      </operator>
      <!-- Select Attributes: keeps the listed subset (tag1 to tag100 and rating). -->
      <operator activated="true" class="select_attributes" compatibility="8.1.003" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="136">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="tag21|tag20|tag2|tag19|tag18|tag17|tag16|tag15|tag14|tag13|tag12|tag11|tag100|tag10|tag1|tag35|tag34|tag33|tag32|tag31|tag30|tag3|tag29|tag28|tag27|tag26|tag25|tag24|tag23|tag22|tag37|tag36|tag5|tag49|tag48|tag47|tag46|tag45|tag44|tag43|tag42|tag41|tag40|tag4|tag39|tag38|tag55|tag53|tag52|tag51|tag50|tag54|tag68|tag67|tag66|tag65|tag64|tag63|tag62|tag61|tag60|tag6|tag59|tag58|tag57|tag56|tag80|tag8|tag79|tag78|tag77|tag76|tag75|tag74|tag73|tag72|tag71|tag70|tag7|tag69|tag93|tag92|tag91|tag90|tag9|tag89|tag88|tag87|tag86|tag85|tag84|tag83|tag82|tag81|tag99|tag98|tag97|tag96|tag95|tag94|rating"/>
      </operator>
      <!-- Nominal to Text: no attribute filter is set, so the operator's default selection applies. -->
      <operator activated="true" class="nominal_to_text" compatibility="8.1.003" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="187"/>
      <!-- Process Documents from Data: vector_creation is "Term Occurrences", so the word vector
           holds occurrence counts rather than TF-IDF weights.
           NOTE(review): switch vector_creation to TF-IDF if TF-IDF weights are what is wanted.
           NOTE(review): the inner process passes the document straight through with no Tokenize
           operator; presumably each document is then treated as one token. Confirm this is intended. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="136">
        <parameter key="vector_creation" value="Term Occurrences"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <connect from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- WordList to Data: turns the word list into an example set so that it can be sorted. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="447" y="187"/>
      <!-- Sort: attribute_name is set explicitly. When it is left empty the operator fails with
           "Attribute name is empty." because the attribute list is not always propagated to the
           parameter drop-down. "total" is presumably the total-occurrences column produced by
           WordList to Data; confirm the name in the operator's output. Decreasing order puts the
           highest values first. -->
      <operator activated="true" class="sort" compatibility="8.1.003" expanded="true" height="82" name="Sort" width="90" x="648" y="238">
        <parameter key="attribute_name" value="total"/>
        <parameter key="sorting_direction" value="decreasing"/>
      </operator>
      <connect from_op="Retrieve Books_Ratings_Tags_forUser10" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <!-- Process results: word vector on result 1, sorted word table on result 2,
           unchanged word list on result 3. -->
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="word list" to_port="result 3"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

sort.png

filterer.PNG

lionelderkrikor

@olgakulesza2,

It's a metadata propagation problem; it happens sometimes.

First, open the Process menu and enable "Synchronize Meta Data with Real Data".

If the attribute name is still empty, type the name of your attribute directly.

In your case, it is presumably the attribute "total" generated by the WordList to Data operator:

I hope it helps,

Regards,

Lionel

Tag_Name_2.png

Tag_Name_3.png