TF-IDF in data values
Hello,
I have data like this:
I want to compute TF-IDF to find out which values are the most frequent and the most discriminative.
Do you know how I can apply TF-IDF to these data?
Answers
-
Hi @olgakulesza2,
Here is a general process template for computing TF-IDF (adapt it to your own data):
<?xml version="1.0" encoding="UTF-8"?>
<!-- Template process: reads tag rows from an Excel sheet, concatenates the tags of each
     (Author, Title, Id) group, splits them into separate attributes, converts those
     attributes to text and passes them to Process Documents from Data. -->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Example input: excel_file and imported_cell_range point at the author's own
           workbook and must be adapted to your data. -->
      <operator activated="true" class="read_excel" compatibility="8.2.000" expanded="true" height="68" name="Read Excel" width="90" x="112" y="34">
        <parameter key="excel_file" value="C:\Users\Lionel\Documents\Formations_DataScience\Rapidminer\Tests_Rapidminer\Tag_Name\Tag_name.xlsx"/>
        <parameter key="imported_cell_range" value="A1:D11"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Id.true.integer.attribute"/>
          <parameter key="1" value="Author.true.polynominal.attribute"/>
          <parameter key="2" value="Title.true.polynominal.attribute"/>
          <parameter key="3" value="Tag name.true.polynominal.attribute"/>
        </list>
      </operator>
      <!-- Concatenates "Tag name" per Author|Title|Id group; the result is the
           attribute "concat(Tag name)" consumed by Split below. -->
      <operator activated="true" class="aggregate" compatibility="8.2.000" expanded="true" height="82" name="Aggregate" width="90" x="246" y="34">
        <list key="aggregation_attributes">
          <parameter key="Tag name" value="concatenation"/>
        </list>
        <parameter key="group_by_attributes" value="Author|Title|Id"/>
      </operator>
      <!-- Splits the concatenated value on the literal "|" character. -->
      <operator activated="true" class="split" compatibility="8.2.000" expanded="true" height="82" name="Split" width="90" x="380" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="concat(Tag name)"/>
        <parameter key="split_pattern" value="[|]"/>
      </operator>
      <!-- Applies Rename by Generic Names (stem "tag") to attributes matching "concat.*",
           inside a 10-iteration loop with reuse_results enabled. -->
      <operator activated="true" class="concurrency:loop" compatibility="8.2.000" expanded="true" height="82" name="Loop" width="90" x="514" y="34">
        <parameter key="number_of_iterations" value="10"/>
        <parameter key="reuse_results" value="true"/>
        <process expanded="true">
          <operator activated="true" class="rename_by_generic_names" compatibility="8.2.000" expanded="true" height="82" name="Rename by Generic Names" width="90" x="313" y="85">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value="concat.*"/>
            <parameter key="generic_name_stem" value="tag"/>
          </operator>
          <connect from_port="input 1" to_op="Rename by Generic Names" to_port="example set input"/>
          <connect from_op="Rename by Generic Names" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Converts the renamed "tag..." attributes to text so they can be processed
           as documents. -->
      <operator activated="true" class="nominal_to_text" compatibility="8.2.000" expanded="true" height="82" name="Nominal to Text" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value="tag.*"/>
      </operator>
      <!-- No vector_creation parameter is set, so the operator's default applies
           (presumably TF-IDF, as the thread intends; confirm in the operator's parameters). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="782" y="34">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Wiring: result 1 = example set, result 2 = word list. -->
      <connect from_op="Read Excel" from_port="output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Split" to_port="example set input"/>
      <connect from_op="Split" from_port="example set output" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
I hope it helps,
Regards,
Lionel
0 -
Thank you @lionelderkrikor!
And do you know how I can now keep only the values with the highest TF-IDF?
0 -
Hi again @olgakulesza2,
Yes, simply add the WordList to Data and Sort operators at the end of the process:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Same template process as before, extended so that the word list produced by
     Process Documents from Data is converted to an example set and sorted by the
     "total" attribute in decreasing order. -->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Example input: excel_file and imported_cell_range point at the author's own
           workbook and must be adapted to your data. -->
      <operator activated="true" class="read_excel" compatibility="8.2.000" expanded="true" height="68" name="Read Excel" width="90" x="112" y="34">
        <parameter key="excel_file" value="C:\Users\Lionel\Documents\Formations_DataScience\Rapidminer\Tests_Rapidminer\Tag_Name\Tag_name.xlsx"/>
        <parameter key="imported_cell_range" value="A1:D11"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Id.true.integer.attribute"/>
          <parameter key="1" value="Author.true.polynominal.attribute"/>
          <parameter key="2" value="Title.true.polynominal.attribute"/>
          <parameter key="3" value="Tag name.true.polynominal.attribute"/>
        </list>
      </operator>
      <!-- Concatenates "Tag name" per Author|Title|Id group; the result is the
           attribute "concat(Tag name)" consumed by Split below. -->
      <operator activated="true" class="aggregate" compatibility="8.2.000" expanded="true" height="82" name="Aggregate" width="90" x="246" y="34">
        <list key="aggregation_attributes">
          <parameter key="Tag name" value="concatenation"/>
        </list>
        <parameter key="group_by_attributes" value="Author|Title|Id"/>
      </operator>
      <!-- Splits the concatenated value on the literal "|" character. -->
      <operator activated="true" class="split" compatibility="8.2.000" expanded="true" height="82" name="Split" width="90" x="380" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="concat(Tag name)"/>
        <parameter key="split_pattern" value="[|]"/>
      </operator>
      <!-- Applies Rename by Generic Names (stem "tag") to attributes matching "concat.*",
           inside a 10-iteration loop with reuse_results enabled. -->
      <operator activated="true" class="concurrency:loop" compatibility="8.2.000" expanded="true" height="82" name="Loop" width="90" x="514" y="34">
        <parameter key="number_of_iterations" value="10"/>
        <parameter key="reuse_results" value="true"/>
        <process expanded="true">
          <operator activated="true" class="rename_by_generic_names" compatibility="8.2.000" expanded="true" height="82" name="Rename by Generic Names" width="90" x="313" y="85">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value="concat.*"/>
            <parameter key="generic_name_stem" value="tag"/>
          </operator>
          <connect from_port="input 1" to_op="Rename by Generic Names" to_port="example set input"/>
          <connect from_op="Rename by Generic Names" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Converts the renamed "tag..." attributes to text so they can be processed
           as documents. -->
      <operator activated="true" class="nominal_to_text" compatibility="8.2.000" expanded="true" height="82" name="Nominal to Text" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value="tag.*"/>
      </operator>
      <!-- No vector_creation parameter is set, so the operator's default applies
           (presumably TF-IDF, as the thread intends; confirm in the operator's parameters). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="782" y="34">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Added steps: turn the word list into an example set, then sort it by the
           "total" attribute, largest first. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="782" y="187"/>
      <operator activated="true" class="sort" compatibility="8.2.000" expanded="true" height="82" name="Sort" width="90" x="916" y="187">
        <parameter key="attribute_name" value="total"/>
        <parameter key="sorting_direction" value="decreasing"/>
      </operator>
      <!-- Wiring: result 1 = example set from Process Documents, result 2 = sorted word list data. -->
      <connect from_op="Read Excel" from_port="output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Split" to_port="example set input"/>
      <connect from_op="Split" from_port="example set output" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Regards,
Lionel
0 -
Dear @lionelderkrikor
I have a problem with it. When I apply the Sort or Filter Examples operator, I get this error:
Attribute name is empty.
Here is my code:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Asker's process: retrieves a stored example set, keeps the tag attributes and
     "rating", converts nominal attributes to text, builds a Term Occurrences vector,
     converts the word list to data and passes it to Sort. -->
<process version="8.1.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="8.1.003" expanded="true" height="68" name="Retrieve Books_Ratings_Tags_forUser10" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//NewLocalRepositoryOlga/Books_Ratings_Tags_forUser10"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="8.1.003" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="136">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="tag21|tag20|tag2|tag19|tag18|tag17|tag16|tag15|tag14|tag13|tag12|tag11|tag100|tag10|tag1|tag35|tag34|tag33|tag32|tag31|tag30|tag3|tag29|tag28|tag27|tag26|tag25|tag24|tag23|tag22|tag37|tag36|tag5|tag49|tag48|tag47|tag46|tag45|tag44|tag43|tag42|tag41|tag40|tag4|tag39|tag38|tag55|tag53|tag52|tag51|tag50|tag54|tag68|tag67|tag66|tag65|tag64|tag63|tag62|tag61|tag60|tag6|tag59|tag58|tag57|tag56|tag80|tag8|tag79|tag78|tag77|tag76|tag75|tag74|tag73|tag72|tag71|tag70|tag7|tag69|tag93|tag92|tag91|tag90|tag9|tag89|tag88|tag87|tag86|tag85|tag84|tag83|tag82|tag81|tag99|tag98|tag97|tag96|tag95|tag94|rating"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="8.1.003" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="187"/>
      <!-- vector_creation is set to Term Occurrences here, and the inner process
           connects the document straight to the output without any text operator. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="136">
        <parameter key="vector_creation" value="Term Occurrences"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <connect from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="447" y="187"/>
      <!-- NOTE(review): no attribute_name parameter is set on this Sort operator, which
           matches the reported "Attribute name is empty." error. -->
      <operator activated="true" class="sort" compatibility="8.1.003" expanded="true" height="82" name="Sort" width="90" x="648" y="238"/>
      <!-- Wiring: result 1 = example set, result 2 = Sort output, result 3 = word list. -->
      <connect from_op="Retrieve Books_Ratings_Tags_forUser10" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="word list" to_port="result 3"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
0 -
It's a metadata propagation problem; this happens sometimes.
First, go to the Process menu and check Synchronize Meta Data with Real Data.
If the attribute name is still empty, type the name of your attribute in directly.
In your case, it is most likely the attribute total generated by the WordList to Data operator:
I hope it helps,
Regards,
Lionel
1