A program to recognize and reward our most engaged community members
Hello,
I have data like this:
I want to compute TF-IDF to find out which values are the most popular and the most discriminative.
Do you know how I can apply TF-IDF to these data?
Hi @olgakulesza2,
Here is a general template of a process to obtain the TF-IDF (adapt it to your own data):
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process template: reads a tag table from Excel, gathers the tags of each
  item into separate "tag" attributes, converts them to text and feeds them to
  Process Documents from Data. No vector_creation parameter is set on that operator,
  so its default weighting applies (TF-IDF according to the surrounding thread).
  The excel_file path is machine-specific and has to be adapted to your own data.
-->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Step 1: load cells A1:D11; columns are declared as Id, Author, Title, Tag name. -->
      <operator activated="true" class="read_excel" compatibility="8.2.000" expanded="true"
                height="68" name="Read Excel" width="90" x="112" y="34">
        <parameter key="excel_file" value="C:\Users\Lionel\Documents\Formations_DataScience\Rapidminer\Tests_Rapidminer\Tag_Name\Tag_name.xlsx"/>
        <parameter key="imported_cell_range" value="A1:D11"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Id.true.integer.attribute"/>
          <parameter key="1" value="Author.true.polynominal.attribute"/>
          <parameter key="2" value="Title.true.polynominal.attribute"/>
          <parameter key="3" value="Tag name.true.polynominal.attribute"/>
        </list>
      </operator>
      <!-- Step 2: concatenate the Tag name values of each Author/Title/Id group. -->
      <operator activated="true" class="aggregate" compatibility="8.2.000" expanded="true"
                height="82" name="Aggregate" width="90" x="246" y="34">
        <list key="aggregation_attributes">
          <parameter key="Tag name" value="concatenation"/>
        </list>
        <parameter key="group_by_attributes" value="Author|Title|Id"/>
      </operator>
      <!-- Step 3: split the concatenated attribute on the pattern [|]. -->
      <operator activated="true" class="split" compatibility="8.2.000" expanded="true"
                height="82" name="Split" width="90" x="380" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="concat(Tag name)"/>
        <parameter key="split_pattern" value="[|]"/>
      </operator>
      <!-- Step 4: rename the attributes matching concat.* to generic names with the stem "tag". -->
      <operator activated="true" class="concurrency:loop" compatibility="8.2.000" expanded="true"
                height="82" name="Loop" width="90" x="514" y="34">
        <parameter key="number_of_iterations" value="10"/>
        <parameter key="reuse_results" value="true"/>
        <process expanded="true">
          <operator activated="true" class="rename_by_generic_names" compatibility="8.2.000" expanded="true"
                    height="82" name="Rename by Generic Names" width="90" x="313" y="85">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value="concat.*"/>
            <parameter key="generic_name_stem" value="tag"/>
          </operator>
          <connect from_port="input 1" to_op="Rename by Generic Names" to_port="example set input"/>
          <connect from_op="Rename by Generic Names" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Step 5: turn every attribute matching tag.* from nominal into text. -->
      <operator activated="true" class="nominal_to_text" compatibility="8.2.000" expanded="true"
                height="82" name="Nominal to Text" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value="tag.*"/>
      </operator>
      <!-- Step 6: build the word vectors; each document is passed through Tokenize. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true"
                height="82" name="Process Documents from Data" width="90" x="782" y="34">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true"
                    height="68" name="Tokenize" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Wiring: linear chain; result 1 = example set, result 2 = word list. -->
      <connect from_op="Read Excel" from_port="output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Split" to_port="example set input"/>
      <connect from_op="Split" from_port="example set output" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
I hope it helps,
Regards,
Lionel
Thank you @lionelderkrikor!
And do you know how I can now keep only the values with the highest generated TF-IDF?
Hi again @olgakulesza2,
Yes, simply add the WordList to Data and Sort operators at the end of the process:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: builds word vectors from the per-item tag attributes, then
  converts the resulting word list into an example set and sorts it by the "total"
  attribute in decreasing order, so the entries with the highest totals come first.
  The excel_file path is machine-specific and has to be adapted to your own data.
-->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load cells A1:D11; columns are declared as Id, Author, Title, Tag name. -->
      <operator activated="true" class="read_excel" compatibility="8.2.000" expanded="true"
                height="68" name="Read Excel" width="90" x="112" y="34">
        <parameter key="excel_file" value="C:\Users\Lionel\Documents\Formations_DataScience\Rapidminer\Tests_Rapidminer\Tag_Name\Tag_name.xlsx"/>
        <parameter key="imported_cell_range" value="A1:D11"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Id.true.integer.attribute"/>
          <parameter key="1" value="Author.true.polynominal.attribute"/>
          <parameter key="2" value="Title.true.polynominal.attribute"/>
          <parameter key="3" value="Tag name.true.polynominal.attribute"/>
        </list>
      </operator>
      <!-- Concatenate the Tag name values of each Author/Title/Id group. -->
      <operator activated="true" class="aggregate" compatibility="8.2.000" expanded="true"
                height="82" name="Aggregate" width="90" x="246" y="34">
        <list key="aggregation_attributes">
          <parameter key="Tag name" value="concatenation"/>
        </list>
        <parameter key="group_by_attributes" value="Author|Title|Id"/>
      </operator>
      <!-- Split the concatenated attribute on the pattern [|]. -->
      <operator activated="true" class="split" compatibility="8.2.000" expanded="true"
                height="82" name="Split" width="90" x="380" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="concat(Tag name)"/>
        <parameter key="split_pattern" value="[|]"/>
      </operator>
      <!-- Rename the attributes matching concat.* to generic names with the stem "tag". -->
      <operator activated="true" class="concurrency:loop" compatibility="8.2.000" expanded="true"
                height="82" name="Loop" width="90" x="514" y="34">
        <parameter key="number_of_iterations" value="10"/>
        <parameter key="reuse_results" value="true"/>
        <process expanded="true">
          <operator activated="true" class="rename_by_generic_names" compatibility="8.2.000" expanded="true"
                    height="82" name="Rename by Generic Names" width="90" x="313" y="85">
            <parameter key="attribute_filter_type" value="regular_expression"/>
            <parameter key="regular_expression" value="concat.*"/>
            <parameter key="generic_name_stem" value="tag"/>
          </operator>
          <connect from_port="input 1" to_op="Rename by Generic Names" to_port="example set input"/>
          <connect from_op="Rename by Generic Names" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Turn every attribute matching tag.* from nominal into text. -->
      <operator activated="true" class="nominal_to_text" compatibility="8.2.000" expanded="true"
                height="82" name="Nominal to Text" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="regular_expression"/>
        <parameter key="regular_expression" value="tag.*"/>
      </operator>
      <!-- Build the word vectors; each document is passed through Tokenize. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true"
                height="82" name="Process Documents from Data" width="90" x="782" y="34">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true"
                    height="68" name="Tokenize" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Convert the word list into an example set so it can be sorted. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true"
                height="82" name="WordList to Data" width="90" x="782" y="187"/>
      <!-- Order the word-list rows by "total", largest value first. -->
      <operator activated="true" class="sort" compatibility="8.2.000" expanded="true"
                height="82" name="Sort" width="90" x="916" y="187">
        <parameter key="attribute_name" value="total"/>
        <parameter key="sorting_direction" value="decreasing"/>
      </operator>
      <!-- Wiring: result 1 = word-vector example set, result 2 = sorted word list. -->
      <connect from_op="Read Excel" from_port="output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Split" to_port="example set input"/>
      <connect from_op="Split" from_port="example set output" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Dear @lionelderkrikor
I have some problems with it. When I apply the Sort or Filter Examples operators, I get this error:
Attribute name is empty.
Here is my process:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: retrieves the stored Books_Ratings_Tags_forUser10 example set,
  keeps the tag1..tag100 and rating attributes, converts nominal attributes to text,
  builds word vectors, turns the word list into an example set and sorts it.

  The Sort operator carries an explicit attribute_name ("total", the attribute produced
  by WordList to Data). Without that parameter the operator fails with
  "Attribute name is empty.". The decreasing direction puts the highest totals first.
-->
<process version="8.1.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the stored example set from the local repository. -->
      <operator activated="true" class="retrieve" compatibility="8.1.003" expanded="true"
                height="68" name="Retrieve Books_Ratings_Tags_forUser10" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//NewLocalRepositoryOlga/Books_Ratings_Tags_forUser10"/>
      </operator>
      <!-- Keep only the listed tag attributes plus rating. -->
      <operator activated="true" class="select_attributes" compatibility="8.1.003" expanded="true"
                height="82" name="Select Attributes" width="90" x="45" y="136">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="tag21|tag20|tag2|tag19|tag18|tag17|tag16|tag15|tag14|tag13|tag12|tag11|tag100|tag10|tag1|tag35|tag34|tag33|tag32|tag31|tag30|tag3|tag29|tag28|tag27|tag26|tag25|tag24|tag23|tag22|tag37|tag36|tag5|tag49|tag48|tag47|tag46|tag45|tag44|tag43|tag42|tag41|tag40|tag4|tag39|tag38|tag55|tag53|tag52|tag51|tag50|tag54|tag68|tag67|tag66|tag65|tag64|tag63|tag62|tag61|tag60|tag6|tag59|tag58|tag57|tag56|tag80|tag8|tag79|tag78|tag77|tag76|tag75|tag74|tag73|tag72|tag71|tag70|tag7|tag69|tag93|tag92|tag91|tag90|tag9|tag89|tag88|tag87|tag86|tag85|tag84|tag83|tag82|tag81|tag99|tag98|tag97|tag96|tag95|tag94|rating"/>
      </operator>
      <!-- No attribute filter is set here, so the operator's default selection applies. -->
      <operator activated="true" class="nominal_to_text" compatibility="8.1.003" expanded="true"
                height="82" name="Nominal to Text" width="90" x="179" y="187"/>
      <!--
        NOTE(review): vector_creation is "Term Occurrences", so the example set on
        result 1 is not TF-IDF weighted; confirm this is intended.
        NOTE(review): the inner process passes each document straight through with no
        Tokenize operator; confirm that whole attribute values are meant to be the terms.
      -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true"
                height="82" name="Process Documents from Data" width="90" x="313" y="136">
        <parameter key="vector_creation" value="Term Occurrences"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <connect from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Convert the word list into an example set so it can be sorted. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true"
                height="82" name="WordList to Data" width="90" x="447" y="187"/>
      <!-- Sort by the "total" attribute, largest value first. -->
      <operator activated="true" class="sort" compatibility="8.1.003" expanded="true"
                height="82" name="Sort" width="90" x="648" y="238">
        <parameter key="attribute_name" value="total"/>
        <parameter key="sorting_direction" value="decreasing"/>
      </operator>
      <!-- Wiring: result 1 = word vectors, result 2 = sorted word list, result 3 = raw word list. -->
      <connect from_op="Retrieve Books_Ratings_Tags_forUser10" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="word list" to_port="result 3"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
@olgakulesza2,
It's a metadata propagation problem; this sometimes happens.
First, open the Process menu and check "Synchronize Meta Data with Real Data".
If the attribute name is still empty, type the name of your attribute directly into the parameter field.
In your case, it should be the attribute total generated by the WordList to Data operator: