Wordlist with column per link/text

saure
saure New Altair Community Member
edited November 5 in Community Q&A

Hello experts,

I am looking for a solution to subdivide the wordlist after a web mining process.

The process I used is based on this video: https://www.youtube.com/watch?v=OXIKydgGbYk

Read Excel (5 links) > Get Pages > Data to Document > Process Documents > Wordlist

Everything fine!

 

My question is:

Is it possible to subdivide the wordlist with columns from the link list?

Like this:

Word  | Attribute Name | Total Occurences | Document Occurences | Link1 | Link2 | Link3 | ...

power | power               | 14                         | 2                                   | 10      | 0        | 4       | ...

 

Is it possible to do this?

 

Here is the code:

 

<?xml version="1.0" encoding="UTF-8"?>
<!-- RapidMiner process: Read Excel > Get Pages > Data to Documents > Process Documents > WordList to Data. -->
<process version="7.4.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.4.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">

      <!-- Step 1: load the links from cells A1:A5 of the first sheet; column 0 is declared as "Link". -->
      <operator activated="true" class="read_excel" compatibility="7.4.000" expanded="true" height="68" name="Read Excel" width="90" x="179" y="289">
        <parameter key="excel_file" value="C:\Users\saure\Desktop\Links_energate.xlsx"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1:A5"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="German"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Link.true.file_path.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>

      <!-- Step 2: request every URL found in the "Link" attribute (GET, 10000 timeouts, redirects followed). -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="313" y="289">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="connection_timeout" value="10000"/>
        <parameter key="read_timeout" value="10000"/>
        <parameter key="follow_redirects" value="true"/>
        <parameter key="accept_cookies" value="none"/>
        <parameter key="cookie_scope" value="global"/>
        <parameter key="request_method" value="GET"/>
        <parameter key="delay" value="none"/>
        <parameter key="delay_amount" value="1000"/>
        <parameter key="min_delay_amount" value="0"/>
        <parameter key="max_delay_amount" value="1000"/>
      </operator>

      <!-- Step 3: convert the example set into documents. -->
      <operator activated="true" class="text:data_to_documents" compatibility="7.4.001" expanded="true" height="68" name="Data to Documents" width="90" x="447" y="289">
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights">
          <parameter key="Kategorie" value="1.0"/>
        </list>
      </operator>

      <!-- Step 4: build the word vector (binary term occurrences, absolute pruning) and the word list. -->
      <operator activated="true" class="text:process_documents" compatibility="7.4.001" expanded="true" height="103" name="Process Documents" width="90" x="648" y="289">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="Binary Term Occurrences"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_absolute" value="1"/>
        <parameter key="prune_above_absolute" value="2"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <!-- Per-document pipeline: Extract Content > Tokenize > Filter Tokens (by Length). -->
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
            <parameter key="extract_content" value="true"/>
            <parameter key="minimum_text_block_length" value="2"/>
            <parameter key="override_content_type_information" value="false"/>
            <parameter key="neglegt_span_tags" value="true"/>
            <parameter key="neglect_p_tags" value="true"/>
            <parameter key="neglect_b_tags" value="true"/>
            <parameter key="neglect_i_tags" value="true"/>
            <parameter key="neglect_br_tags" value="true"/>
            <parameter key="ignore_non_html_tags" value="true"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="447" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <!-- Only tokens of 4 to 50 characters are kept. -->
          <operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="648" y="34">
            <parameter key="min_chars" value="4"/>
            <parameter key="max_chars" value="50"/>
          </operator>
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Step 5: receives the word list produced by Process Documents. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="7.4.001" expanded="true" height="82" name="WordList to Data" width="90" x="849" y="442"/>

      <!-- Wiring: result 1 = example set of Process Documents, result 2 = "word list" port of WordList to Data. -->
      <connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="word list" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Thanks for the support!

Bernd

 

Best Answers

  • MartinLiebig
    MartinLiebig
    Altair Employee
    Answer ✓

    Dear Bernd,

     

    was trickier than i expected, but i think the attached process should do the trick.


    ~Martin

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      RapidMiner process: fetch the pages behind a list of links, build the
      word vector per page, and join it with the pivoted word-list totals so
      each word appears as its own column.
    -->
    <process version="7.5.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.5.001" expanded="true" name="Process">
        <process expanded="true">

          <!-- Deactivated and not connected: the links are supplied by the two Generate Data operators below. -->
          <operator activated="false" class="read_excel" compatibility="7.5.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="493">
            <parameter key="excel_file" value="C:\Users\saure\Desktop\Links_energate.xlsx"/>
            <parameter key="imported_cell_range" value="A1:A5"/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="locale" value="German"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="Link.true.file_path.attribute"/>
            </list>
          </operator>

          <!--
            Each "Link" value is a quoted string expression holding one URL.
            Escaping is single-level: &quot; for the surrounding quotes and
            &amp; for the query-string separators. A second escaping pass would
            leave literal entity text in the URL and an unterminated string.
          -->
          <operator activated="true" class="generate_data_user_specification" compatibility="7.5.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="238">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.energate-messenger.de/news/suche/index.php?cmdStartSearch=1&amp;categories[]=508&amp;pattern[]=Bundesnetzagentur&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.5.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="340">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.energate-messenger.de/news/suche/index.php?cmdStartSearch=1&amp;categories[]=508&amp;pattern[]=Ausschreibungen&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>

          <!-- Stack the two single-link example sets into one. -->
          <operator activated="true" class="append" compatibility="7.5.001" expanded="true" height="103" name="Append" width="90" x="179" y="289"/>

          <!-- Request every URL found in the "Link" attribute. -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="313" y="289">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="random_user_agent" value="true"/>
          </operator>
          <operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="447" y="289">
            <list key="specify_weights">
              <parameter key="Kategorie" value="1.0"/>
            </list>
          </operator>

          <!--
            Builds the per-document word vector and the word list.
            NOTE(review): with "Binary Term Occurrences" the per-document
            columns only show 1/0; the thread's follow-up suggests switching
            to term occurrences when per-link counts are wanted.
          -->
          <operator activated="true" class="text:process_documents" compatibility="7.5.000" expanded="true" height="103" name="Process Documents" width="90" x="715" y="238">
            <parameter key="vector_creation" value="Binary Term Occurrences"/>
            <parameter key="keep_text" value="true"/>
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_absolute" value="1"/>
            <parameter key="prune_above_absolute" value="2"/>
            <!-- Per-document pipeline: Extract Content > Tokenize > Filter Tokens (by Length). -->
            <process expanded="true">
              <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
                <parameter key="minimum_text_block_length" value="2"/>
                <parameter key="override_content_type_information" value="false"/>
              </operator>
              <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="447" y="34"/>
              <operator activated="true" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="648" y="34">
                <parameter key="max_chars" value="50"/>
              </operator>
              <connect from_port="document" to_op="Extract Content" to_port="document"/>
              <connect from_op="Extract Content" from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>

          <!--
            Word-list branch: turn the word list into an example set, add a
            constant id (1) as the grouping key, drop the "in documents"
            attribute (inverted single selection), then pivot grouped by id
            and indexed by word.
          -->
          <operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data" width="90" x="849" y="289"/>
          <operator activated="true" class="generate_attributes" compatibility="7.5.001" expanded="true" height="82" name="Generate Attributes" width="90" x="983" y="340">
            <list key="function_descriptions">
              <parameter key="id" value="1"/>
            </list>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="7.5.001" expanded="true" height="82" name="Select Attributes" width="90" x="1117" y="340">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="in documents"/>
            <parameter key="invert_selection" value="true"/>
          </operator>
          <operator activated="true" class="pivot" compatibility="7.5.001" expanded="true" height="82" name="Pivot" width="90" x="1251" y="340">
            <parameter key="group_attribute" value="id"/>
            <parameter key="index_attribute" value="word"/>
            <parameter key="skip_constant_attributes" value="false"/>
          </operator>

          <!-- Combine the per-document example set (left) with the pivoted word-list data (right). -->
          <operator activated="true" class="cartesian_product" compatibility="7.5.001" expanded="true" height="82" name="Cartesian" width="90" x="1385" y="238"/>

          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Append" from_port="merged set" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
          <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
          <connect from_op="Process Documents" from_port="example set" to_op="Cartesian" to_port="left"/>
          <connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
          <connect from_op="WordList to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Pivot" to_port="example set input"/>
          <connect from_op="Pivot" from_port="example set output" to_op="Cartesian" to_port="right"/>
          <connect from_op="Cartesian" from_port="join" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • MartinLiebig
    MartinLiebig
    Altair Employee
    Answer ✓

    Dear Bernd,

     

    you can switch the Generate Data operators to your read excel again. This was just my quick and dirty way to get your URLs in.

     

    For the occurrences, you can switch the vector creation in Process Documents from Binary Term Occurrences to Term Occurrences. That should do the trick.

     

    Best,

    Martin

  • saure
    saure New Altair Community Member
    Answer ✓

    Dear Martin,

     

    thanks again. That`s it!

     

    Best regards
    Bernd

Answers

  • MartinLiebig
    MartinLiebig
    Altair Employee

    Dear Bernd,

     

    could you explain a bit more what you mean by "subdivide"? 

     

    BR,

    Martin 

  • Thomas_Ott
    Thomas_Ott New Altair Community Member

    This is in the wrong Forum. Will move. 

  • saure
    saure New Altair Community Member

    Hello Martin,

    subdivide is the wrong term, expand is the better one.

    The wordlist should show the total occurrence and also the occurrence per link/doc.

     

    I hope this is understandable.

    BR, Bernd

  • MartinLiebig
    MartinLiebig
    Altair Employee

    Hi,

     

    Sounds like you can use Aggregate and Join to do this. Any chance you can send me the first 10 lines of your Excel file as a private message?

     

    ~Martin

  • MartinLiebig
    MartinLiebig
    Altair Employee
    Answer ✓

    Dear Bernd,

     

    was trickier than i expected, but i think the attached process should do the trick.


    ~Martin

     

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      RapidMiner process: fetch the pages behind a list of links, build the
      word vector per page, and join it with the pivoted word-list totals so
      each word appears as its own column.
    -->
    <process version="7.5.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="7.5.001" expanded="true" name="Process">
        <process expanded="true">

          <!-- Deactivated and not connected: the links are supplied by the two Generate Data operators below. -->
          <operator activated="false" class="read_excel" compatibility="7.5.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="493">
            <parameter key="excel_file" value="C:\Users\saure\Desktop\Links_energate.xlsx"/>
            <parameter key="imported_cell_range" value="A1:A5"/>
            <parameter key="first_row_as_names" value="false"/>
            <list key="annotations">
              <parameter key="0" value="Name"/>
            </list>
            <parameter key="locale" value="German"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="Link.true.file_path.attribute"/>
            </list>
          </operator>

          <!--
            Each "Link" value is a quoted string expression holding one URL.
            Escaping is single-level: &quot; for the surrounding quotes and
            &amp; for the query-string separators. A second escaping pass would
            leave literal entity text in the URL and an unterminated string.
          -->
          <operator activated="true" class="generate_data_user_specification" compatibility="7.5.001" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="238">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.energate-messenger.de/news/suche/index.php?cmdStartSearch=1&amp;categories[]=508&amp;pattern[]=Bundesnetzagentur&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="7.5.001" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="340">
            <list key="attribute_values">
              <parameter key="Link" value="&quot;http://www.energate-messenger.de/news/suche/index.php?cmdStartSearch=1&amp;categories[]=508&amp;pattern[]=Ausschreibungen&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>

          <!-- Stack the two single-link example sets into one. -->
          <operator activated="true" class="append" compatibility="7.5.001" expanded="true" height="103" name="Append" width="90" x="179" y="289"/>

          <!-- Request every URL found in the "Link" attribute. -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="313" y="289">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="random_user_agent" value="true"/>
          </operator>
          <operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="447" y="289">
            <list key="specify_weights">
              <parameter key="Kategorie" value="1.0"/>
            </list>
          </operator>

          <!--
            Builds the per-document word vector and the word list.
            NOTE(review): with "Binary Term Occurrences" the per-document
            columns only show 1/0; the thread's follow-up suggests switching
            to term occurrences when per-link counts are wanted.
          -->
          <operator activated="true" class="text:process_documents" compatibility="7.5.000" expanded="true" height="103" name="Process Documents" width="90" x="715" y="238">
            <parameter key="vector_creation" value="Binary Term Occurrences"/>
            <parameter key="keep_text" value="true"/>
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_absolute" value="1"/>
            <parameter key="prune_above_absolute" value="2"/>
            <!-- Per-document pipeline: Extract Content > Tokenize > Filter Tokens (by Length). -->
            <process expanded="true">
              <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
                <parameter key="minimum_text_block_length" value="2"/>
                <parameter key="override_content_type_information" value="false"/>
              </operator>
              <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="447" y="34"/>
              <operator activated="true" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="648" y="34">
                <parameter key="max_chars" value="50"/>
              </operator>
              <connect from_port="document" to_op="Extract Content" to_port="document"/>
              <connect from_op="Extract Content" from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>

          <!--
            Word-list branch: turn the word list into an example set, add a
            constant id (1) as the grouping key, drop the "in documents"
            attribute (inverted single selection), then pivot grouped by id
            and indexed by word.
          -->
          <operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data" width="90" x="849" y="289"/>
          <operator activated="true" class="generate_attributes" compatibility="7.5.001" expanded="true" height="82" name="Generate Attributes" width="90" x="983" y="340">
            <list key="function_descriptions">
              <parameter key="id" value="1"/>
            </list>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="7.5.001" expanded="true" height="82" name="Select Attributes" width="90" x="1117" y="340">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="in documents"/>
            <parameter key="invert_selection" value="true"/>
          </operator>
          <operator activated="true" class="pivot" compatibility="7.5.001" expanded="true" height="82" name="Pivot" width="90" x="1251" y="340">
            <parameter key="group_attribute" value="id"/>
            <parameter key="index_attribute" value="word"/>
            <parameter key="skip_constant_attributes" value="false"/>
          </operator>

          <!-- Combine the per-document example set (left) with the pivoted word-list data (right). -->
          <operator activated="true" class="cartesian_product" compatibility="7.5.001" expanded="true" height="82" name="Cartesian" width="90" x="1385" y="238"/>

          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Append" from_port="merged set" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
          <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
          <connect from_op="Process Documents" from_port="example set" to_op="Cartesian" to_port="left"/>
          <connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
          <connect from_op="WordList to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Pivot" to_port="example set input"/>
          <connect from_op="Pivot" from_port="example set output" to_op="Cartesian" to_port="right"/>
          <connect from_op="Cartesian" from_port="join" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • saure
    saure New Altair Community Member

    Hello Martin,

    thanks a lot for this solution.

    Each link by "Generate Data..." is not very comfortable - but it works. (In a project i have nearly 40 links).

    But in the ExampleSet there is only a yes/no flag (1/0) per word, not the total count of the word. Under total_x is the correct sum, but I do not know how many occurrences are in each link. Have I overlooked something?

     

    BR, Bernd

     

  • MartinLiebig
    MartinLiebig
    Altair Employee
    Answer ✓

    Dear Bernd,

     

    you can switch the Generate Data operators to your read excel again. This was just my quick and dirty way to get your URLs in.

     

    For the occurrences, you can switch the vector creation in Process Documents from Binary Term Occurrences to Term Occurrences. That should do the trick.

     

    Best,

    Martin

  • saure
    saure New Altair Community Member
    Answer ✓

    Dear Martin,

     

    thanks again. That`s it!

     

    Best regards
    Bernd