"Generate pivoted example set from word vector"
Hi,
I have got some text entries in an excel worksheet that I would like to text mine and find associations(if any) between some words.
So my initial thinking was to process the text into Process Documents from Data->Convert WordList to Data and then Pivot it. The problem is after processing documents, I only get a word vector and their counts(I am using TF-IDF), and I cannot see how I could easily achieve this.
In short if I have this excel sheet:
column A: columnB
review1: This is text for review1
review2: This text for review2
Out of this, I would like to end up with a pivot table that looks like:
This is text for review1 review2
review1 1 1 1 1 1 0
review2 1 0 1 1 0 1
Here is the process XML that I am using:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.1.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="read_excel" compatibility="7.1.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
<parameter key="excel_file" value="D:\data\offers\pizza.xlsx"/>
<parameter key="sheet_number" value="1"/>
<parameter key="imported_cell_range" value="A1:B10"/>
<parameter key="encoding" value="SYSTEM"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<parameter key="date_format" value=""/>
<parameter key="time_zone" value="SYSTEM"/>
<parameter key="locale" value="English (United States)"/>
<list key="data_set_meta_data_information">
<parameter key="0" value="source.true.text.attribute"/>
<parameter key="1" value="textfeed.true.text.attribute"/>
</list>
<parameter key="read_not_matching_values_as_missings" value="true"/>
<parameter key="datamanagement" value="double_array"/>
</operator>
<operator activated="true" breakpoints="before,after" class="text:process_document_from_data" compatibility="7.1.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="179" y="34">
<parameter key="create_word_vector" value="true"/>
<parameter key="vector_creation" value="Term Frequency"/>
<parameter key="add_meta_information" value="true"/>
<parameter key="keep_text" value="false"/>
<parameter key="prune_method" value="none"/>
<parameter key="prune_below_percent" value="3.0"/>
<parameter key="prune_above_percent" value="30.0"/>
<parameter key="prune_below_rank" value="0.05"/>
<parameter key="prune_above_rank" value="0.95"/>
<parameter key="datamanagement" value="double_sparse_array"/>
<parameter key="select_attributes_and_weights" value="false"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.1.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
<parameter key="mode" value="non letters"/>
<parameter key="characters" value=".:"/>
<parameter key="language" value="English"/>
<parameter key="max_token_length" value="3"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:wordlist_to_data" compatibility="7.1.001" expanded="true" height="82" name="WordList to Data" width="90" x="380" y="34"/>
<connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
<connect from_op="WordList to Data" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Best Answer
-
Are you looking for something like this?
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.1.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_excel" compatibility="7.1.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
<parameter key="excel_file" value="C:\Users\tott_000\Desktop\Review Text.xlsx"/>
<parameter key="imported_cell_range" value="A1:B3"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="Label.true.polynominal.label"/>
<parameter key="1" value="Text.true.text.attribute"/>
</list>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="7.1.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
<parameter key="vector_creation" value="Binary Term Occurrences"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.1.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>1
Answers
-
I would suggest giving the WordList to Data operator a try. It will take the Word Vectors and break them into a data table. From there you can pivot or rotate as needed.
0 -
WordList to Data isn't helping either. By the time I get the vector of words as output of Process Document from Data, the original attribute that contains review1 and review2 would have been lost. It doesn't come out of the Process Document operator.
0 -
Hi Chiko,
What I can recommend is generate a ID using teh "generate ID" before you do "process documents from Data"
Once the vectorized exampleset comes out, you can then simply do a join (left/right, inner) all shoudl woudl in this case,
Based on the ID,
Let us know if this helps
0 -
Thanks BP,
I have added a "Generate ID" operator, followed by "Set Role" in which I set my newly generated Id to target role="id", then fed this into the "Process Document from Data" operator. However still doesn't help due to the fact that for some reason, once the examples are fed into "Process Document from Data", I somehow "loose" the previoulsy generated ID, so i have nothing to join this to. See output below:
0 -
Are you looking for something like this?
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.1.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_excel" compatibility="7.1.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
<parameter key="excel_file" value="C:\Users\tott_000\Desktop\Review Text.xlsx"/>
<parameter key="imported_cell_range" value="A1:B3"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="Label.true.polynominal.label"/>
<parameter key="1" value="Text.true.text.attribute"/>
</list>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="7.1.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
<parameter key="vector_creation" value="Binary Term Occurrences"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.1.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>1 -
Thanks TBone, this is precisely what I wanted. Interesting result you came up with, I will compare your process aginst my original process so I can better understand this. A quick observation is your use of a binary occurences word vector as opposed to my TF-IDF.
Thanks very much!!
0