Select most important words
Hi everybody,
after a classical Process Documents step where I create a word vector (TF-IDF), is it possible to select, for each document, only the terms (attributes) whose TF-IDF values sum to the upper half of that document's total TF-IDF, or to some other percentage? Maybe I have to set a weight operator, but I don't know which one or how. I need this to reduce the number of attributes.
Thank you all!
Answers
-
Hi,
I am not 100% sure what you want to achieve, but it sounds like you could potentially use the pruning parameters for this... I would suggest checking them out and giving this a try.
Hope this helps,
Ingo
1 -
Hi Ingo,
thank you for your answer. Pruning is not good for my use case because it can eliminate words that are important for a single document.
Here is a screenshot to better explain what I need:
"Cumulated" is the aggregate of the TF-IDF values of a single text. I want to select only the high-value terms that represent, for example, 50% of the cumulated value. (I transposed the matrix only for better visualization.) In this way I hope to obtain only the important words for each single document. Sometimes a specific word has a high TF-IDF value in one document and a low value in another; the goal is to keep only the words with a strong weight for each document, weighted against the "cumulated" value, or to set the lower-value words to 0 so I can go on with my analysis.
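To make the idea concrete, here is a minimal sketch of the selection I have in mind, written in NumPy rather than as a RapidMiner process (the function name and data are hypothetical, just for illustration): per document, keep the highest-TF-IDF terms whose values sum to the chosen fraction of the "cumulated" total, and set all other terms to 0.

import numpy as np

def keep_top_mass(tfidf, fraction=0.5):
    """Per document (row), keep the largest TF-IDF values whose sum reaches
    `fraction` of the row's "cumulated" total; zero out everything else."""
    out = np.zeros_like(tfidf)
    for i, row in enumerate(tfidf):
        order = np.argsort(row)[::-1]            # term indices, largest TF-IDF first
        cumsum = np.cumsum(row[order])
        target = fraction * row.sum()            # e.g. 50% of the "cumulated" value
        k = np.searchsorted(cumsum, target) + 1  # smallest prefix reaching the target
        keep = order[:k]
        out[i, keep] = row[keep]
    return out

# Two documents, four terms: only the terms covering the top 50% of each
# document's TF-IDF mass survive.
m = np.array([[0.5, 0.3, 0.1, 0.1],
              [0.0, 0.2, 0.7, 0.1]])
print(keep_top_mass(m))   # [[0.5 0.  0.  0. ]
                          #  [0.  0.  0.7 0. ]]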
Thank you; I hope someone can help me.
0 -
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="9.0.002" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
<parameter key="repository_entry" value="//ProgettoBigData/DatiPreProcess1/RimozioneDuplicatiSelezioneInglese"/>
</operator>
<operator activated="true" class="discretize_by_bins" compatibility="9.0.002" expanded="true" height="103" name="Discretize" width="90" x="179" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="confidence"/>
</operator>
<operator activated="true" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="313" y="34">
<list key="specify_weights"/>
</operator>
<operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents" width="90" x="313" y="136">
<parameter key="add_meta_information" value="false"/>
<parameter key="keep_text" value="true"/>
<parameter key="prune_method" value="percentual"/>
<parameter key="prune_below_percent" value="0.2"/>
<parameter key="prune_above_percent" value="50.0"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="100000"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
<operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
<operator activated="true" class="text:replace_tokens" compatibility="8.1.000" expanded="true" height="68" name="Replace Tokens" width="90" x="112" y="136">
<list key="replace_dictionary">
<parameter key="aa" value="a"/>
</list>
</operator>
<operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="246" y="136"/>
<operator activated="true" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="380" y="34"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (2)" width="90" x="514" y="34"/>
<operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="648" y="34">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="15"/>
</operator>
<operator activated="true" class="text:extract_length" compatibility="8.1.000" expanded="true" height="68" name="Extract Length" width="90" x="782" y="34"/>
<operator activated="true" class="text:extract_token_number" compatibility="8.1.000" expanded="true" height="68" name="Extract Token Number" width="90" x="916" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Replace Tokens" to_port="document"/>
<connect from_op="Replace Tokens" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
<connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
<connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Extract Length" to_port="document"/>
<connect from_op="Extract Length" from_port="document" to_op="Extract Token Number" to_port="document"/>
<connect from_op="Extract Token Number" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role" width="90" x="447" y="34">
<parameter key="attribute_name" value="text"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="multiply" compatibility="9.0.002" expanded="true" height="103" name="Multiply" width="90" x="581" y="34"/>
<operator activated="true" class="generate_aggregation" compatibility="9.0.002" expanded="true" height="82" name="Generate Aggregation" width="90" x="715" y="85">
<parameter key="attribute_name" value="cumulated"/>
<parameter key="keep_all" value="false"/>
</operator>
<operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role (2)" width="90" x="849" y="136">
<parameter key="attribute_name" value="cumulated"/>
<parameter key="target_role" value="weight"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="concurrency:join" compatibility="9.0.002" expanded="true" height="82" name="Join" width="90" x="983" y="85">
<parameter key="use_id_attribute_as_key" value="true"/>
<list key="key_attributes"/>
</operator>
<operator activated="true" class="multiply" compatibility="9.0.002" expanded="true" height="103" name="Multiply (2)" width="90" x="1117" y="85"/>
<operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role (3)" width="90" x="1117" y="238">
<parameter key="attribute_name" value="text"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="transpose" compatibility="9.0.002" expanded="true" height="82" name="Transpose (2)" width="90" x="1251" y="136"/>
<connect from_op="Retrieve" from_port="output" to_op="Discretize" to_port="example set input"/>
<connect from_op="Discretize" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
<connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
<connect from_op="Process Documents" from_port="example set" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Join" to_port="left"/>
<connect from_op="Multiply" from_port="output 2" to_op="Generate Aggregation" to_port="example set input"/>
<connect from_op="Generate Aggregation" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Join" to_port="right"/>
<connect from_op="Join" from_port="join" to_op="Multiply (2)" to_port="input"/>
<connect from_op="Multiply (2)" from_port="output 1" to_port="result 1"/>
<connect from_op="Multiply (2)" from_port="output 2" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Transpose (2)" to_port="example set input"/>
<connect from_op="Transpose (2)" from_port="example set output" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
Here is the XML; sorry, I forgot to include it!
0 -
Hi,
Got it now, thanks :-)
If the threshold is exactly 50%, the easiest way to achieve this is to use a median aggregation. Here is the concept:
- Do the regular text processing first. I kept some pruning in to keep the number of columns a bit lower. You can change this of course if you like. I use the operator "Process Documents from Files" for this, but any of the text processing operators will do.
- The next thing I did was to set "0" as a missing value for all TFIDF columns. This prevents the 0s from being used in the median calculation in the next step (otherwise the median is pretty much always 0, since most values are 0 for most docs). I use the operator "Declare Missing Value" for this.
- Now you can calculate the median of all columns (since the calculation won't use the missing values, this is the median of all non-zero values now, just like in your "cumulated" example above). I use the operator "Generate Aggregation" for this. Please note that you might need to restrict which columns are used for this operator (and for the other ones, for that matter). Since all my regular attributes are TFIDF columns, I can simply use all columns here.
- Now the tricky part: we need to set all TFIDF values to zero if they are lower than the median (or any other threshold, see below). We also need to set the missing values back to zero. Luckily this can be done in the same step. I use the operator "Loop Attributes" for this (I hope you are familiar with the loops and macros in RapidMiner; if not, please check out the links at the bottom of this post). Inside the loop, I replace each TFIDF column with a new one which is 0 if the value is lower than the median threshold or missing, and the actual value if it is bigger than the threshold.
- As the last step, I remove the median threshold attribute that I created before. I use "Select Attributes" for this with an inverted selection.
For percentages other than 50%, you will need to come up with a smarter threshold calculation. Or you simply keep the median and apply a "correction" factor, e.g. 0.9 * median or 1.1 * median, to remove more features or include more (the sketch below exposes this as a factor parameter).
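If it helps to see the concept outside of RapidMiner, here is a rough NumPy sketch of the same idea (hypothetical names and data, not the process itself): zeros are ignored when computing the per-document median, and everything not strictly above factor * median is set back to 0.

import numpy as np

def median_filter(tfidf, factor=1.0):
    """Per document, zero out every TF-IDF value that is not strictly above
    `factor` times the median of that document's non-zero values."""
    out = tfidf.copy()
    for row in out:
        nonzero = row[row > 0]                   # "Declare Missing Value": ignore zeros
        if nonzero.size == 0:
            continue                             # empty document, nothing to filter
        threshold = factor * np.median(nonzero)  # "Generate Aggregation" (median)
        row[row <= threshold] = 0.0              # "Loop Attributes": keep only values above it
    return out

m = np.array([[0.5, 0.3, 0.1, 0.0],
              [0.0, 0.2, 0.7, 0.1]])
print(median_filter(m))   # keeps only values above each row's non-zero median:
                          # [[0.5 0.  0.  0. ]
                          #  [0.  0.  0.7 0. ]]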
Below are the links explaining how to work with macros, as well as a sample process (you will need to adapt the data sources).
Hope this helps,
Ingo
Information on macros:
- https://community.rapidminer.com/t5/RapidMiner-Studio-Knowledge-Base/How-to-Use-Macros/ta-p/32966
- https://docs.rapidminer.com/latest/studio/getting-started/macros.html
Example process:
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="text:process_document_from_file" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Files" width="90" x="45" y="34">
<list key="text_directories">
<parameter key="negative" value="C:\Users\IngoMierswa\Desktop\Latest Materials\Data\sentiment\neg"/>
<parameter key="positive" value="C:\Users\IngoMierswa\Desktop\Latest Materials\Data\sentiment\pos"/>
</list>
<parameter key="prune_method" value="percentual"/>
<parameter key="prune_above_percent" value="100.0"/>
<parameter key="prune_above_rank" value="0.05"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
<operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="30"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="313" y="30"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="declare_missing_value" compatibility="9.0.002" expanded="true" height="82" name="Declare Missing Value" width="90" x="179" y="34">
<parameter key="numeric_value" value="0.0"/>
</operator>
<operator activated="true" class="generate_aggregation" compatibility="9.0.002" expanded="true" height="82" name="Generate Aggregation" width="90" x="313" y="34">
<parameter key="attribute_name" value="median_tfidf"/>
<parameter key="aggregation_function" value="median"/>
</operator>
<operator activated="true" class="concurrency:loop_attributes" compatibility="9.0.002" expanded="true" height="82" name="Loop Attributes" width="90" x="447" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="avg_tfidf"/>
<parameter key="invert_selection" value="true"/>
<parameter key="reuse_results" value="true"/>
<process expanded="true">
<operator activated="true" class="generate_attributes" compatibility="9.0.002" expanded="true" height="82" name="Generate Attributes" width="90" x="45" y="34">
<list key="function_descriptions">
<parameter key="%{loop_attribute}" value="if(eval(%{loop_attribute})>[median_tfidf],eval(%{loop_attribute}),0)"/>
</list>
</operator>
<connect from_port="input 1" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="select_attributes" compatibility="9.0.002" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="median_tfidf"/>
<parameter key="invert_selection" value="true"/>
</operator>
<connect from_op="Process Documents from Files" from_port="example set" to_op="Declare Missing Value" to_port="example set input"/>
<connect from_op="Declare Missing Value" from_port="example set output" to_op="Generate Aggregation" to_port="example set input"/>
<connect from_op="Generate Aggregation" from_port="example set output" to_op="Loop Attributes" to_port="input 1"/>
<connect from_op="Loop Attributes" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="72"/>
</process>
</operator>
</process>
2 -
For all -
I have put @IngoRM's solution & data sets in the community repository:
Selecting 'Most Important Words' of a Document Corpus
Scott
1