Text Mining Create Association Rules

TobiasNehrig
TobiasNehrig New Altair Community Member
edited November 5 in Community Q&A

Hi experts,

 

I have X web pages, and each web page has an ID. I would like to compute association rules for each single web page with my sub-operator "Word Association", so that I get an association rule graph for each page.

At the moment I only compute association rules over all X web pages.

I have tried to loop my sub-operator with Loop Collection, Loop Cluster (ID), and a normal Loop with a macro (ID). Does anyone have a hint for me?

 

<?xml version="1.0" encoding="UTF-8"?><process version="8.1.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
      <!-- Subprocess "Crawler Spon 10 pages": Crawl Web (2) starts at http://www.spiegel.de and hands its
           example set to Get Pages (2); the example set of Get Pages (2) leaves the subprocess on "out 1". -->
      <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="82" name="Crawler Spon 10 pages" width="90" x="45" y="544">
        <process expanded="true">
          <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web (2)" width="90" x="112" y="34">
            <parameter key="url" value="http://www.spiegel.de"/>
            <!-- Two crawling rules: one deciding which URLs are stored, one deciding which links are followed.
                 NOTE(review): if these values are regular expressions, the unescaped "." matches any
                 character, and the ".+de.+" alternative is very permissive; confirm this is intended. -->
            <list key="crawling_rules">
              <parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
              <parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
            </list>
            <parameter key="max_crawl_depth" value="10"/>
            <parameter key="retrieve_as_html" value="true"/>
            <parameter key="add_content_as_attribute" value="true"/>
            <!-- The crawl is capped at 10 pages (matching the subprocess name). -->
            <parameter key="max_pages" value="10"/>
            <parameter key="delay" value="100"/>
            <parameter key="max_concurrent_connections" value="200"/>
            <parameter key="max_connections_per_host" value="100"/>
            <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"/>
          </operator>
          <!-- NOTE(review): link_attribute is "Link" and page_attribute is "link"; the two names differ only
               in case. Confirm that both refer to the attributes you expect. -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages (2)" width="90" x="246" y="34">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="page_attribute" value="link"/>
            <parameter key="random_user_agent" value="true"/>
          </operator>
          <connect from_op="Crawl Web (2)" from_port="example set" to_op="Get Pages (2)" to_port="Example Set"/>
          <connect from_op="Get Pages (2)" from_port="Example Set" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Process Documents from Data (2): runs Extract Content on each incoming document.
           No word vector is created (create_word_vector=false) and the text is kept (keep_text=true). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="246" y="544">
        <parameter key="create_word_vector" value="false"/>
        <parameter key="keep_text" value="true"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
            <parameter key="ignore_non_html_tags" value="false"/>
          </operator>
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="false" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="68" name="R-Script-Pairwise-Count" width="90" x="514" y="646">
        <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;rm_main = function(data)&#10;{&#10;korpus &lt;- data_frame(id =data$id, text = data$text)&#10;&#10;print(korpus)&#10;&#10;woerter &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; group_by(id)%&gt;%&#10; count(word, sort =TRUE)%&gt;%&#10; filter(n&gt;=10)&#10; print(woerter)&#10;woerter &lt;- as.data.table(woerter)&#10;&#10;cooccurre &lt;- korpus %&gt;%&#10;  unnest_tokens(word, text)%&gt;%&#10;  pairwise_count(word, id, sort = TRUE)%&gt;%&#10; # filter(n&gt;=10)&#10; print(cooccurre)&#10;&#10; cooccurre &lt;- as.data.frame(cooccurre)&#10; &#10; return(list(woerter, cooccurre))&#10;}&#10;"/>
      </operator>
      <!-- R-Script-Bigram (deactivated): embedded R script whose rm_main(data) builds a corpus from data$id
           and data$text, counts words per id (keeping counts of 10 or more), and counts bigrams obtained
           via unnest_tokens(..., token = "ngrams", n = 2). It returns list(woerter, cooccurre).
           The pairwise_count and filter steps are commented out inside the script. -->
      <operator activated="false" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="68" name="R-Script-Bigram" width="90" x="514" y="544">
        <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;rm_main = function(data)&#10;{&#10;korpus &lt;- data_frame(id =data$id, text = data$text)&#10;&#10;print(korpus)&#10;&#10;woerter &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; group_by(id)%&gt;%&#10; count(word, sort =TRUE)%&gt;%&#10; filter(n&gt;=10)&#10; print(woerter)&#10;woerter &lt;- as.data.table(woerter)&#10;&#10;cooccurre &lt;- korpus %&gt;%&#10;  unnest_tokens(bigram, text, token= &quot;ngrams&quot;, n= 2)%&gt;%&#10;  count(bigram, sort = TRUE)&#10;  #pairwise_count(word, id, sort = TRUE)%&gt;%&#10; # filter(n&gt;=10)&#10; print(cooccurre)&#10;&#10; cooccurre &lt;- as.data.frame(cooccurre)&#10;&#10; return(list(woerter, cooccurre))&#10;}&#10;"/>
      </operator>
      <!-- Retrieve 10-Rohseiten-Spiegel (deactivated): reads the repository entry
           ../data/10-Rohseiten-Spiegel. No connection in this process uses it. -->
      <operator activated="false" class="retrieve" compatibility="8.1.003" expanded="true" height="68" name="Retrieve 10-Rohseiten-Spiegel" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/10-Rohseiten-Spiegel"/>
      </operator>
      <!-- Subprocess "Prepare Data". Chain on "in 1":
           Set Role (2), Generate ID, Reorder Attributes, Select Attributes, Filter Examples, Set Macros,
           Multiply uncut. Multiply uncut sends one copy to "cut in sentences" and one copy to "out 2".
           The output of "cut in sentences" is duplicated by Multiply onto "out 1" and "out 3".
           So "out 2" carries the uncut data, while "out 1" and "out 3" carry the sentence-cut data. -->
      <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="124" name="Prepare Data" width="90" x="246" y="34">
        <process expanded="true">
          <operator activated="true" class="set_role" compatibility="8.1.003" expanded="true" height="82" name="Set Role (2)" width="90" x="45" y="34">
            <parameter key="attribute_name" value="text"/>
            <list key="set_additional_roles">
              <parameter key="Title" value="regular"/>
            </list>
          </operator>
          <operator activated="true" class="generate_id" compatibility="8.1.003" expanded="true" height="82" name="Generate ID" width="90" x="45" y="187"/>
          <operator activated="true" class="order_attributes" compatibility="8.1.003" expanded="true" height="82" name="Reorder Attributes" width="90" x="45" y="340">
            <parameter key="attribute_ordering" value="Title|text"/>
          </operator>
          <!-- Keeps the attribute subset "Title|text". -->
          <operator activated="true" class="select_attributes" compatibility="8.1.003" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="493">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="Title|text"/>
          </operator>
          <!-- Single filter condition: Title is not missing. -->
          <operator activated="true" class="filter_examples" compatibility="8.1.003" expanded="true" height="103" name="Filter Examples" width="90" x="246" y="34">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="Title.is_not_missing."/>
            </list>
            <parameter key="filters_logic_and" value="false"/>
            <parameter key="filters_check_metadata" value="false"/>
          </operator>
          <!-- Defines the macro attribute_id with the value "id"; the Loop operator elsewhere in this
               process refers to it as %{attribute_id}. -->
          <operator activated="true" class="set_macros" compatibility="8.1.003" expanded="true" height="82" name="Set Macros" width="90" x="246" y="187">
            <list key="macros">
              <parameter key="attribute_id" value="id"/>
            </list>
          </operator>
          <operator activated="true" class="multiply" compatibility="8.1.003" expanded="true" height="103" name="Multiply uncut" width="90" x="380" y="187"/>
          <!-- "cut in sentences": applies Cut Document with a "Regular Region" query named "sentences";
               each segment is passed through unchanged (segment wired straight to document 1). -->
          <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="cut in sentences" width="90" x="581" y="34">
            <parameter key="create_word_vector" value="false"/>
            <parameter key="keep_text" value="true"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="text:cut_document" compatibility="8.1.000" expanded="true" height="68" name="Cut Document" width="90" x="112" y="34">
                <parameter key="query_type" value="Regular Region"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <!-- NOTE(review): in the value below, the alternative for "!" is preceded by a literal space
                     while the alternatives for "." and "?" are not; confirm whether that space is intended. -->
                <list key="regular_region_queries">
                  <parameter key="sentences" value="\\\.\\s[A-Z]| \\!\\s[A-Z]|\\?\\s[A-Z].\\\.|\\!|\\?"/>
                </list>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <list key="index_queries"/>
                <list key="jsonpath_queries"/>
                <process expanded="true">
                  <connect from_port="segment" to_port="document 1"/>
                  <portSpacing port="source_segment" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="document" to_op="Cut Document" to_port="document"/>
              <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
            <description align="center" color="transparent" colored="false" width="126">for r-scripts&lt;br&gt;tidy text&lt;br/&gt;bigram&lt;br/&gt;pairwise count</description>
          </operator>
          <operator activated="true" class="multiply" compatibility="8.1.003" expanded="true" height="103" name="Multiply" width="90" x="782" y="34"/>
          <connect from_port="in 1" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
          <connect from_op="Reorder Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Set Macros" to_port="through 1"/>
          <connect from_op="Set Macros" from_port="through 1" to_op="Multiply uncut" to_port="input"/>
          <connect from_op="Multiply uncut" from_port="output 1" to_op="cut in sentences" to_port="example set"/>
          <connect from_op="Multiply uncut" from_port="output 2" to_port="out 2"/>
          <connect from_op="cut in sentences" from_port="example set" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_port="out 1"/>
          <connect from_op="Multiply" from_port="output 2" to_port="out 3"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <!-- Subprocess "RM Co-occurrence (3)": the active association-rule pipeline.
           in 1 goes through Process Documents from Data (4), Text to Nominal (3),
           Numerical to Binominal (3) and FP-Growth (3); the frequent sets feed Create Association Rules (3).
           Outputs: out 1 = example set from FP-Growth (3), out 2 = rules, out 3 = word list. -->
      <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="124" name="RM Co-occurrence (3)" width="90" x="715" y="85">
        <process expanded="true">
          <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (4)" width="90" x="112" y="136">
            <parameter key="prune_method" value="percentual"/>
            <parameter key="prune_below_percent" value="0.01"/>
            <parameter key="prune_above_percent" value="100.0"/>
            <list key="specify_weights"/>
            <!-- Token pipeline actually wired: Tokenize Non-letters (3), Tokenize Linguistic (3),
                 Filter Tokens (3), Transform Cases (3). Filter Stopwords (3) and Stem (3) are deactivated
                 and not part of any connection. -->
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Non-letters (3)" width="90" x="112" y="34"/>
              <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Linguistic (3)" width="90" x="246" y="34">
                <parameter key="mode" value="linguistic sentences"/>
                <parameter key="language" value="German"/>
              </operator>
              <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (3)" width="90" x="514" y="34">
                <parameter key="min_chars" value="2"/>
              </operator>
              <operator activated="false" class="text:filter_stopwords_german" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (3)" width="90" x="380" y="34"/>
              <operator activated="false" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (3)" width="90" x="648" y="34"/>
              <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (3)" width="90" x="782" y="34"/>
              <connect from_port="document" to_op="Tokenize Non-letters (3)" to_port="document"/>
              <connect from_op="Tokenize Non-letters (3)" from_port="document" to_op="Tokenize Linguistic (3)" to_port="document"/>
              <connect from_op="Tokenize Linguistic (3)" from_port="document" to_op="Filter Tokens (3)" to_port="document"/>
              <connect from_op="Filter Tokens (3)" from_port="document" to_op="Transform Cases (3)" to_port="document"/>
              <connect from_op="Transform Cases (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text_to_nominal" compatibility="8.1.003" expanded="true" height="82" name="Text to Nominal (3)" width="90" x="246" y="34"/>
          <operator activated="true" class="numerical_to_binominal" compatibility="8.1.003" expanded="true" height="82" name="Numerical to Binominal (3)" width="90" x="380" y="34"/>
          <!-- Frequent item sets: min_support 0.2, at most 2 items per set. -->
          <operator activated="true" class="fp_growth" compatibility="8.1.003" expanded="true" height="82" name="FP-Growth (3)" width="90" x="514" y="34">
            <parameter key="find_min_number_of_itemsets" value="false"/>
            <parameter key="min_support" value="0.2"/>
            <parameter key="max_items" value="2"/>
          </operator>
          <operator activated="true" class="create_association_rules" compatibility="8.1.003" expanded="true" height="82" name="Create Association Rules (3)" width="90" x="715" y="136">
            <parameter key="min_confidence" value="0.01"/>
            <parameter key="gain_theta" value="1.0"/>
          </operator>
          <connect from_port="in 1" to_op="Process Documents from Data (4)" to_port="example set"/>
          <connect from_op="Process Documents from Data (4)" from_port="example set" to_op="Text to Nominal (3)" to_port="example set input"/>
          <connect from_op="Process Documents from Data (4)" from_port="word list" to_port="out 3"/>
          <connect from_op="Text to Nominal (3)" from_port="example set output" to_op="Numerical to Binominal (3)" to_port="example set input"/>
          <connect from_op="Numerical to Binominal (3)" from_port="example set output" to_op="FP-Growth (3)" to_port="example set"/>
          <connect from_op="FP-Growth (3)" from_port="example set" to_port="out 1"/>
          <connect from_op="FP-Growth (3)" from_port="frequent sets" to_op="Create Association Rules (3)" to_port="item sets"/>
          <connect from_op="Create Association Rules (3)" from_port="rules" to_port="out 2"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
          <portSpacing port="sink_out 3" spacing="0"/>
          <portSpacing port="sink_out 4" spacing="0"/>
        </process>
      </operator>
      <!-- Loop (deactivated): wraps "RM Co-occurrence (2)", a copy of the association-rule pipeline.
           input 1 is forwarded to the subprocess and its out 1, out 2, out 3 go to output 1, 2, 3.
           number_of_iterations is 1 and the iteration macro name is taken from %{attribute_id}.
           NOTE(review): with a single iteration the body presumably runs once on whatever arrives at
           input 1, so this alone would not yield one rule set per page; confirm against the Loop
           operator documentation. -->
      <operator activated="false" class="concurrency:loop" compatibility="8.1.003" expanded="true" height="124" name="Loop" width="90" x="715" y="391">
        <parameter key="number_of_iterations" value="1"/>
        <parameter key="iteration_macro" value="%{attribute_id}"/>
        <parameter key="enable_parallel_execution" value="false"/>
        <process expanded="true">
          <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="124" name="RM Co-occurrence (2)" width="90" x="179" y="34">
            <process expanded="true">
              <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (3)" width="90" x="112" y="136">
                <parameter key="prune_method" value="percentual"/>
                <parameter key="prune_below_percent" value="0.01"/>
                <parameter key="prune_above_percent" value="100.0"/>
                <list key="specify_weights"/>
                <!-- Wired token pipeline: Tokenize Non-letters (2), Tokenize Linguistic (2),
                     Filter Tokens (2), Transform Cases (2). Filter Stopwords (2) and Stem (2) are
                     deactivated and not connected. -->
                <process expanded="true">
                  <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Non-letters (2)" width="90" x="112" y="34"/>
                  <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Linguistic (2)" width="90" x="246" y="34">
                    <parameter key="mode" value="linguistic sentences"/>
                    <parameter key="language" value="German"/>
                  </operator>
                  <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (2)" width="90" x="514" y="34">
                    <parameter key="min_chars" value="2"/>
                  </operator>
                  <operator activated="false" class="text:filter_stopwords_german" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (2)" width="90" x="380" y="34"/>
                  <operator activated="false" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (2)" width="90" x="648" y="34"/>
                  <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="782" y="34"/>
                  <connect from_port="document" to_op="Tokenize Non-letters (2)" to_port="document"/>
                  <connect from_op="Tokenize Non-letters (2)" from_port="document" to_op="Tokenize Linguistic (2)" to_port="document"/>
                  <connect from_op="Tokenize Linguistic (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
                  <connect from_op="Filter Tokens (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
                  <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <operator activated="true" class="text_to_nominal" compatibility="8.1.003" expanded="true" height="82" name="Text to Nominal (2)" width="90" x="246" y="34"/>
              <operator activated="true" class="numerical_to_binominal" compatibility="8.1.003" expanded="true" height="82" name="Numerical to Binominal (2)" width="90" x="380" y="34"/>
              <operator activated="true" class="fp_growth" compatibility="8.1.003" expanded="true" height="82" name="FP-Growth (2)" width="90" x="514" y="34">
                <parameter key="find_min_number_of_itemsets" value="false"/>
                <parameter key="min_support" value="0.2"/>
                <parameter key="max_items" value="2"/>
              </operator>
              <operator activated="true" class="create_association_rules" compatibility="8.1.003" expanded="true" height="82" name="Create Association Rules (2)" width="90" x="715" y="85">
                <parameter key="min_confidence" value="0.01"/>
                <parameter key="gain_theta" value="1.0"/>
              </operator>
              <connect from_port="in 1" to_op="Process Documents from Data (3)" to_port="example set"/>
              <connect from_op="Process Documents from Data (3)" from_port="example set" to_op="Text to Nominal (2)" to_port="example set input"/>
              <connect from_op="Process Documents from Data (3)" from_port="word list" to_port="out 3"/>
              <connect from_op="Text to Nominal (2)" from_port="example set output" to_op="Numerical to Binominal (2)" to_port="example set input"/>
              <connect from_op="Numerical to Binominal (2)" from_port="example set output" to_op="FP-Growth (2)" to_port="example set"/>
              <connect from_op="FP-Growth (2)" from_port="example set" to_port="out 1"/>
              <connect from_op="FP-Growth (2)" from_port="frequent sets" to_op="Create Association Rules (2)" to_port="item sets"/>
              <connect from_op="Create Association Rules (2)" from_port="rules" to_port="out 2"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
              <portSpacing port="sink_out 3" spacing="0"/>
              <portSpacing port="sink_out 4" spacing="0"/>
            </process>
          </operator>
          <connect from_port="input 1" to_op="RM Co-occurrence (2)" to_port="in 1"/>
          <connect from_op="RM Co-occurrence (2)" from_port="out 1" to_port="output 1"/>
          <connect from_op="RM Co-occurrence (2)" from_port="out 2" to_port="output 2"/>
          <connect from_op="RM Co-occurrence (2)" from_port="out 3" to_port="output 3"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
          <portSpacing port="sink_output 3" spacing="0"/>
          <portSpacing port="sink_output 4" spacing="0"/>
        </process>
      </operator>
      <!-- Collect (deactivated): its "collection" port is wired to Loop Collection, but no connection
           in this process supplies an input to Collect. -->
      <operator activated="false" class="collect" compatibility="8.1.003" expanded="true" height="68" name="Collect" width="90" x="514" y="238"/>
      <!-- Loop Collection (deactivated): passes each "single" element to "RM Co-occurrence (4)", another
           copy of the association-rule pipeline, and forwards its out 1, out 2, out 3 to output 1, 2, 3.
           The inner operator names mix the suffixes (4), (5) and (6). -->
      <operator activated="false" class="loop_collection" compatibility="8.1.003" expanded="true" height="124" name="Loop Collection" width="90" x="715" y="238">
        <process expanded="true">
          <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="124" name="RM Co-occurrence (4)" width="90" x="179" y="34">
            <process expanded="true">
              <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (6)" width="90" x="112" y="136">
                <parameter key="prune_method" value="percentual"/>
                <parameter key="prune_below_percent" value="0.01"/>
                <parameter key="prune_above_percent" value="100.0"/>
                <list key="specify_weights"/>
                <!-- Wired token pipeline: Tokenize Non-letters (4), Tokenize Linguistic (4),
                     Filter Tokens (4), Transform Cases (4). Filter Stopwords (4) and Stem (4) are
                     deactivated and not connected. -->
                <process expanded="true">
                  <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Non-letters (4)" width="90" x="112" y="34"/>
                  <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Linguistic (4)" width="90" x="246" y="34">
                    <parameter key="mode" value="linguistic sentences"/>
                    <parameter key="language" value="German"/>
                  </operator>
                  <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (4)" width="90" x="514" y="34">
                    <parameter key="min_chars" value="2"/>
                  </operator>
                  <operator activated="false" class="text:filter_stopwords_german" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (4)" width="90" x="380" y="34"/>
                  <operator activated="false" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (4)" width="90" x="648" y="34"/>
                  <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (4)" width="90" x="782" y="34"/>
                  <connect from_port="document" to_op="Tokenize Non-letters (4)" to_port="document"/>
                  <connect from_op="Tokenize Non-letters (4)" from_port="document" to_op="Tokenize Linguistic (4)" to_port="document"/>
                  <connect from_op="Tokenize Linguistic (4)" from_port="document" to_op="Filter Tokens (4)" to_port="document"/>
                  <connect from_op="Filter Tokens (4)" from_port="document" to_op="Transform Cases (4)" to_port="document"/>
                  <connect from_op="Transform Cases (4)" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_document" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <operator activated="true" class="text_to_nominal" compatibility="8.1.003" expanded="true" height="82" name="Text to Nominal (5)" width="90" x="246" y="34"/>
              <operator activated="true" class="numerical_to_binominal" compatibility="8.1.003" expanded="true" height="82" name="Numerical to Binominal (5)" width="90" x="380" y="34"/>
              <operator activated="true" class="fp_growth" compatibility="8.1.003" expanded="true" height="82" name="FP-Growth (5)" width="90" x="514" y="34">
                <parameter key="find_min_number_of_itemsets" value="false"/>
                <parameter key="min_support" value="0.2"/>
                <parameter key="max_items" value="2"/>
              </operator>
              <operator activated="true" class="create_association_rules" compatibility="8.1.003" expanded="true" height="82" name="Create Association Rules (5)" width="90" x="715" y="136">
                <parameter key="min_confidence" value="0.01"/>
                <parameter key="gain_theta" value="1.0"/>
              </operator>
              <connect from_port="in 1" to_op="Process Documents from Data (6)" to_port="example set"/>
              <connect from_op="Process Documents from Data (6)" from_port="example set" to_op="Text to Nominal (5)" to_port="example set input"/>
              <connect from_op="Process Documents from Data (6)" from_port="word list" to_port="out 3"/>
              <connect from_op="Text to Nominal (5)" from_port="example set output" to_op="Numerical to Binominal (5)" to_port="example set input"/>
              <connect from_op="Numerical to Binominal (5)" from_port="example set output" to_op="FP-Growth (5)" to_port="example set"/>
              <connect from_op="FP-Growth (5)" from_port="example set" to_port="out 1"/>
              <connect from_op="FP-Growth (5)" from_port="frequent sets" to_op="Create Association Rules (5)" to_port="item sets"/>
              <connect from_op="Create Association Rules (5)" from_port="rules" to_port="out 2"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
              <portSpacing port="sink_out 3" spacing="0"/>
              <portSpacing port="sink_out 4" spacing="0"/>
            </process>
          </operator>
          <connect from_port="single" to_op="RM Co-occurrence (4)" to_port="in 1"/>
          <connect from_op="RM Co-occurrence (4)" from_port="out 1" to_port="output 1"/>
          <connect from_op="RM Co-occurrence (4)" from_port="out 2" to_port="output 2"/>
          <connect from_op="RM Co-occurrence (4)" from_port="out 3" to_port="output 3"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
          <portSpacing port="sink_output 3" spacing="0"/>
          <portSpacing port="sink_output 4" spacing="0"/>
        </process>
      </operator>
      <!-- Root-level wiring. Active data flow:
           Crawler Spon 10 pages, then Process Documents from Data (2), then Prepare Data.
           Prepare Data "out 1" is delivered as result 1; "out 2" feeds RM Co-occurrence (3), whose
           out 1, out 2, out 3 become result 2, result 3, result 4. Prepare Data "out 3" is not connected
           here. Association rules are therefore computed once, over everything arriving from "out 2".
           The Loop, both R-Script operators and the Retrieve operator have no connections at this level;
           the only wiring among the deactivated operators is Collect to Loop Collection. -->
      <connect from_op="Crawler Spon 10 pages" from_port="out 1" to_op="Process Documents from Data (2)" to_port="example set"/>
      <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Prepare Data" to_port="in 1"/>
      <connect from_op="Prepare Data" from_port="out 1" to_port="result 1"/>
      <connect from_op="Prepare Data" from_port="out 2" to_op="RM Co-occurrence (3)" to_port="in 1"/>
      <connect from_op="RM Co-occurrence (3)" from_port="out 1" to_port="result 2"/>
      <connect from_op="RM Co-occurrence (3)" from_port="out 2" to_port="result 3"/>
      <connect from_op="RM Co-occurrence (3)" from_port="out 3" to_port="result 4"/>
      <connect from_op="Collect" from_port="collection" to_op="Loop Collection" to_port="collection"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <!-- Canvas notes labelling the crawler area and the R-script area of the process view. -->
      <description align="center" color="yellow" colored="false" height="286" resized="true" width="434" x="10" y="480">Crawler &lt;br/&gt;</description>
      <description align="center" color="yellow" colored="false" height="278" resized="true" width="173" x="477" y="488">R-Scripts&lt;br/&gt;</description>
    </process>
  </operator>
</process>

 

Kind regards

 

Tobias

 

 

 

Answers

  • Telcontar120
    Telcontar120 New Altair Community Member

    In theory, you should be able to retrieve your web pages and then store them as documents (you might need "Data to Documents" depending on how you retrieve them). After that, you should be able to use "Loop Collection" to process each one separately, but that doesn't seem to work with Process Documents because it's not returning any wordlists or word vectors at all. So I agree with you, something here isn't working properly.

    Another alternative should be to store the web pages as examplesets and then use "Loop Examples" but that also doesn't seem to work---it returns the same wordlist and word vector across all documents.

    So I think this probably needs to be looked at by RapidMiner developers to understand what is breaking down inside the loops with respect to processing documents. @sgenzer can you forward this to their attention?

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.003">
    <!-- Reproduction process with two independent branches:
         (A) Crawl Web into Loop Examples, delivered to result 3 and result 4;
         (B) Crawl Web (2) into Data to Documents into Loop Collection, delivered to result 1 and result 2.
         Both inner text operators set prune_method to "percentual" and leave vector_creation unset. -->
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
    <!-- Branch A source: crawl starting at the Wikipedia Data_science page, with a follow rule matching en.wikipedia.org links and max_pages set to 10. -->
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="187">
    <parameter key="url" value="https://en.wikipedia.org/wiki/Data_science"/>
    <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+en.wikipedia.org.+"/>
    </list>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="10"/>
    </operator>
    <!-- Branch A loop: Loop Examples wraps Process Documents from Data (keep_text true, prune_method percentual); the innermost process contains only a Tokenize step. -->
    <operator activated="true" class="loop_examples" compatibility="8.1.003" expanded="true" height="103" name="Loop Examples" width="90" x="380" y="187">
    <process expanded="true">
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="246" y="136">
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="percentual"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="313" y="85"/>
    <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <!-- Loop body wiring: the incoming example set goes through Process Documents from Data; its example set is passed on and its word list goes to output 1. -->
    <connect from_port="example set" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="word list" to_port="output 1"/>
    <portSpacing port="source_example set" spacing="0"/>
    <portSpacing port="sink_example set" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <!-- Branch B source: second crawl with the same url, follow rule and max_pages as Crawl Web. -->
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web (2)" width="90" x="45" y="34">
    <parameter key="url" value="https://en.wikipedia.org/wiki/Data_science"/>
    <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+en.wikipedia.org.+"/>
    </list>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="10"/>
    </operator>
    <!-- Sits between Crawl Web (2) and Loop Collection: its "documents" port feeds the loop's "collection" port (see the top-level connects). -->
    <operator activated="true" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="246" y="34">
    <list key="specify_weights"/>
    </operator>
    <!-- Branch B loop: Loop Collection with set_iteration_macro true; the loop's "single" port feeds Process Documents (keep_text true, prune_method percentual, Tokenize only). -->
    <operator activated="true" class="loop_collection" compatibility="8.1.003" expanded="true" height="103" name="Loop Collection" width="90" x="447" y="34">
    <parameter key="set_iteration_macro" value="true"/>
    <process expanded="true">
    <operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents" width="90" x="246" y="34">
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="percentual"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="246" y="85"/>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <!-- Loop body wiring: example set to output 1, word list to output 2. -->
    <connect from_port="single" to_op="Process Documents" to_port="documents 1"/>
    <connect from_op="Process Documents" from_port="example set" to_port="output 1"/>
    <connect from_op="Process Documents" from_port="word list" to_port="output 2"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    <portSpacing port="sink_output 3" spacing="0"/>
    </process>
    </operator>
    <!-- Top-level wiring: branch A ends at result 3 and result 4, branch B ends at result 1 and result 2; the two branches share no connection. -->
    <connect from_op="Crawl Web" from_port="example set" to_op="Loop Examples" to_port="example set"/>
    <connect from_op="Loop Examples" from_port="example set" to_port="result 3"/>
    <connect from_op="Loop Examples" from_port="output 1" to_port="result 4"/>
    <connect from_op="Crawl Web (2)" from_port="example set" to_op="Data to Documents" to_port="example set"/>
    <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
    <connect from_op="Loop Collection" from_port="output 1" to_port="result 1"/>
    <connect from_op="Loop Collection" from_port="output 2" to_port="result 2"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    <portSpacing port="sink_result 5" spacing="0"/>
    </process>
    </operator>
    </process>

    See the example process attached (it's much simpler than the OP, which contains a lot of unnecessary extras not needed for isolating this specific issue).

    Brian

     

     

  • TobiasNehrig
    TobiasNehrig New Altair Community Member

    Hi @Telcontar120

    thank you. I thought there is a failure in this routine of mine.

    Tobias

     

     

     

  • sgenzer
    sgenzer
    Altair Employee

    Hi...so I'm not sure I completely understand the problem. You can use "Loop Collection" on a collection of documents and do whatever you want inside the Loop Collection operator. For example, I just used a piece of your process and did Transform Cases inside the Loop Collection. It works fine.  Am I missing something?

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
    <!-- Single chain: Crawl Web (2) into Data to Documents into Loop Collection, whose output 1 is delivered to result 1.
         Inside the loop only Transform Cases is wired; no Process Documents operator is involved. -->
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
    <process expanded="true">
    <!-- Crawl starting at the Wikipedia Data_science page, with a follow rule matching en.wikipedia.org links and max_pages set to 10. -->
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web (2)" width="90" x="45" y="34">
    <parameter key="url" value="https://en.wikipedia.org/wiki/Data_science"/>
    <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+en.wikipedia.org.+"/>
    </list>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="10"/>
    </operator>
    <operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="179" y="34">
    <list key="specify_weights"/>
    </operator>
    <operator activated="true" class="loop_collection" compatibility="8.1.001" expanded="true" height="82" name="Loop Collection" width="90" x="313" y="34">
    <parameter key="set_iteration_macro" value="true"/>
    <process expanded="true">
    <!-- Tokenize is deactivated (activated="false") and has no connections; the loop's "single" port goes straight to Transform Cases and from there to output 1. -->
    <operator activated="false" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="246" y="187"/>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34"/>
    <connect from_port="single" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_port="output 1"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <!-- Top-level wiring of the single chain. -->
    <connect from_op="Crawl Web (2)" from_port="example set" to_op="Data to Documents" to_port="example set"/>
    <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
    <connect from_op="Loop Collection" from_port="output 1" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    </process>
    </operator>
    </process>

    Scott

     

     

  • Telcontar120
    Telcontar120 New Altair Community Member

    Hey @sgenzer , thanks for looking at this. 

    I think the problem with Loop Collection is specifically with "Process Documents" and specifically with the Word Vector creation part of it.  Did you try running my entire process that I posted?  If you do that, the errors that I describe should be evident (mainly, no word vector output!).

    With Loop Examples and a Macro, there still seems to be a problem, which is that it is returning only a single Word Vector instead of one per example, which is what it logically should be doing.

     

  • sgenzer
    sgenzer
    Altair Employee

    hi @Telcontar120 - so I have no problem seeing the full word vectors if I turn off the pruning with Process Documents:

     

    Screen Shot 2018-04-26 at 5.34.18 PM.pngScreen Shot 2018-04-26 at 5.34.22 PM.png

     

    Let me look into the other one while you take a look at this...

     

    Scott

     

  • sgenzer
    sgenzer
    Altair Employee

    so if I run just the Loop Examples part, I do see a full example set...?

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
    <!-- Two-branch layout in which only the Loop Examples branch is active:
         Crawl Web (2), Data to Documents and Loop Collection all carry activated="false".
         Neither text operator in this XML sets a prune_method parameter. -->
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
    <process expanded="true">
    <!-- Active source: crawl starting at the Wikipedia Data_science page, with a follow rule matching en.wikipedia.org links and max_pages set to 10. -->
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="187">
    <parameter key="url" value="https://en.wikipedia.org/wiki/Data_science"/>
    <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+en.wikipedia.org.+"/>
    </list>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="10"/>
    </operator>
    <!-- Active loop: Loop Examples wraps Process Documents from Data (keep_text true, no prune_method set); the innermost process contains only a Tokenize step. -->
    <operator activated="true" class="loop_examples" compatibility="8.1.001" expanded="true" height="103" name="Loop Examples" width="90" x="380" y="187">
    <process expanded="true">
    <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="246" y="136">
    <parameter key="keep_text" value="true"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="313" y="85"/>
    <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <!-- Loop body wiring: the example set is passed on and the word list goes to output 1. -->
    <connect from_port="example set" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="word list" to_port="output 1"/>
    <portSpacing port="source_example set" spacing="0"/>
    <portSpacing port="sink_example set" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <!-- Deactivated branch starts here: the next three top-level operators are switched off via activated="false". -->
    <operator activated="false" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web (2)" width="90" x="45" y="34">
    <parameter key="url" value="https://en.wikipedia.org/wiki/Data_science"/>
    <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+en.wikipedia.org.+"/>
    </list>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="10"/>
    </operator>
    <operator activated="false" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="246" y="34">
    <list key="specify_weights"/>
    </operator>
    <operator activated="false" class="loop_collection" compatibility="8.1.001" expanded="true" height="103" name="Loop Collection" width="90" x="447" y="34">
    <parameter key="set_iteration_macro" value="true"/>
    <process expanded="true">
    <operator activated="true" class="text:process_documents" compatibility="7.5.000" expanded="true" height="103" name="Process Documents" width="90" x="246" y="34">
    <parameter key="keep_text" value="true"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="246" y="85"/>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <connect from_port="single" to_op="Process Documents" to_port="documents 1"/>
    <connect from_op="Process Documents" from_port="example set" to_port="output 1"/>
    <connect from_op="Process Documents" from_port="word list" to_port="output 2"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    <portSpacing port="sink_output 3" spacing="0"/>
    </process>
    </operator>
    <!-- Top-level wiring: only Loop Examples is connected to result ports (result 3 and result 4). The connects between the deactivated operators are still present, but Loop Collection has no connection to any result port. -->
    <connect from_op="Crawl Web" from_port="example set" to_op="Loop Examples" to_port="example set"/>
    <connect from_op="Loop Examples" from_port="example set" to_port="result 3"/>
    <connect from_op="Loop Examples" from_port="output 1" to_port="result 4"/>
    <connect from_op="Crawl Web (2)" from_port="example set" to_op="Data to Documents" to_port="example set"/>
    <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="168"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    <portSpacing port="sink_result 5" spacing="0"/>
    </process>
    </operator>
    </process>

    Screen Shot 2018-04-26 at 5.41.18 PM.png

     

    Scott

  • Telcontar120
    Telcontar120 New Altair Community Member

    Yep, for Loop Collections, you're right---I should have tested it that way too!  I figured out one problem was the default word vector calculation method of TF-IDF.  Because we are only doing one document at a time, it's going to generate all zero values because there's no document collection to calculate IDF!  Term occurrences works ok, though.  But shouldn't this setup still work if pruning is turned on? (but it doesn't!)

    In terms of the Loop Examples output, I think the problem is different. It is only returning one wordlist with the same values across all documents, but it should be returning one separate wordlist for each document, right?

     

     

  • Telcontar120
    Telcontar120 New Altair Community Member

    @sgenzer  Were you able to duplicate the 2nd error I described in more detail in my response?  And have you already filed a bug report on the first item (no word vector generated with pruning on), or do you want me to do that?  

  • sgenzer
    sgenzer
    Altair Employee

    Hi @Telcontar120 sorry for the delay. So I poked around for a while on the Loop Collection issue and I don't think it's a bug. You see, when you use Loop Collection with Process Documents inside, you're only running Process Documents on one document at a time. I'm not sure this really makes sense. If you want to create word vectors on a collection of documents, I would just feed the collection to Process Documents directly. And then pruning works and so on:

     

    Screen Shot 2018-05-01 at 10.14.35 AM.pngScreen Shot 2018-05-01 at 10.15.47 AM.png

     

    When I played around with breakpoints etc., I also saw that it was not only pruning that "did not work" (did not produce word vectors); ranking failed and absolute worked. This also makes sense - it's only looking at one document, so of course anything that creates a subset via statistics on one document is going to fail. But "absolute" works because, well, there is only one document.

     

    Does this make sense?

     

    Scott

     

  • Telcontar120
    Telcontar120 New Altair Community Member

    @sgenzer  I agree that this isn't the ordinary way of doing things, but I still think the Process Documents operator is not behaving according to its intended design.  Take a look at the example process now.

     

    After some additional testing, it looks like the problem is really with Process Documents and doesn't have to do with the Loop Collections portion.  That is, if you feed Process Documents a single document (no loop involved), it will produce a word vector on that document, but NOT if you select pruning with certain options. 

     

    This doesn't make sense because there isn't anything inherently collective about pruning---it should be able to be done via any method (absolute, percentual, or ranking) on the word vector itself.   And in fact, the process still works if you select pruning "by ranking" as the method, although it doesn't actually do the pruning!  But it fails to produce any word vector output at all if you select pruning by "absolute" or "percentual" methods.  So the only method that is working as expected is when there is no pruning at all.

     

    So basically putting this inside the Loop Collections is irrelevant, since the strange behavior is occurring if you simply use one document with Process Documents alone.  At the very least, I would expect either to get an error or warning message, or to get an unpruned word vector for an individual document, but never to get no word vectors or wordlists at all!

     

    And the problem still exists with the unified wordlist being created when using "Process Documents" inside the generic Loop.  That definitely should not be happening since each document is supposed to be processed separately.

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.003">
    <!-- Single-document test: the two crawl/loop branches are kept in the file but every one of their top-level operators carries activated="false".
         The only active path is Get Page into Process Documents (2), delivered to result 3 and result 4. -->
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
    <!-- Deactivated branch A: Crawl Web and Loop Examples (inner Process Documents from Data with prune_method percentual). -->
    <operator activated="false" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="187">
    <parameter key="url" value="https://en.wikipedia.org/wiki/Data_science"/>
    <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+en.wikipedia.org.+"/>
    </list>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="10"/>
    </operator>
    <operator activated="false" class="loop_examples" compatibility="8.1.003" expanded="true" height="103" name="Loop Examples" width="90" x="380" y="187">
    <process expanded="true">
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="246" y="136">
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="percentual"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="313" y="85"/>
    <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <connect from_port="example set" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="word list" to_port="output 1"/>
    <portSpacing port="source_example set" spacing="0"/>
    <portSpacing port="sink_example set" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <!-- Deactivated branch B: Crawl Web (2), Data to Documents and Loop Collection (inner Process Documents with prune_method percentual). -->
    <operator activated="false" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web (2)" width="90" x="45" y="34">
    <parameter key="url" value="https://en.wikipedia.org/wiki/Data_science"/>
    <list key="crawling_rules">
    <parameter key="follow_link_with_matching_url" value=".+en.wikipedia.org.+"/>
    </list>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="10"/>
    </operator>
    <operator activated="false" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="246" y="34">
    <list key="specify_weights"/>
    </operator>
    <operator activated="false" class="loop_collection" compatibility="8.1.003" expanded="true" height="103" name="Loop Collection" width="90" x="447" y="34">
    <parameter key="set_iteration_macro" value="true"/>
    <process expanded="true">
    <operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents" width="90" x="246" y="34">
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_method" value="percentual"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="246" y="85"/>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <connect from_port="single" to_op="Process Documents" to_port="documents 1"/>
    <connect from_op="Process Documents" from_port="example set" to_port="output 1"/>
    <connect from_op="Process Documents" from_port="word list" to_port="output 2"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    <portSpacing port="sink_output 3" spacing="0"/>
    </process>
    </operator>
    <!-- Active path, step 1: Get Page requests the single Wikipedia Data_science URL; both parameter lists are empty. -->
    <operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="45" y="340">
    <parameter key="url" value="https://en.wikipedia.org/wiki/Data_science"/>
    <list key="query_parameters"/>
    <list key="request_properties"/>
    </operator>
    <!-- Active path, step 2: Process Documents (2) with vector_creation "Term Occurrences", keep_text true, prune_below_absolute 5 and prune_above_absolute 2000; the inner process only tokenizes.
         NOTE(review): no prune_method parameter is present in this XML, so the pruning mode in effect is whatever the operator defaults to. Confirm before drawing conclusions from the absolute thresholds. -->
    <operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents (2)" width="90" x="313" y="340">
    <parameter key="vector_creation" value="Term Occurrences"/>
    <parameter key="keep_text" value="true"/>
    <parameter key="prune_below_absolute" value="5"/>
    <parameter key="prune_above_absolute" value="2000"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (3)" width="90" x="246" y="34"/>
    <connect from_port="document" to_op="Tokenize (3)" to_port="document"/>
    <connect from_op="Tokenize (3)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <!-- Top-level wiring: the connects for the deactivated operators remain, but only Process Documents (2) is connected to result ports (example set to result 3, word list to result 4). -->
    <connect from_op="Crawl Web" from_port="example set" to_op="Loop Examples" to_port="example set"/>
    <connect from_op="Crawl Web (2)" from_port="example set" to_op="Data to Documents" to_port="example set"/>
    <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
    <connect from_op="Get Page" from_port="output" to_op="Process Documents (2)" to_port="documents 1"/>
    <connect from_op="Process Documents (2)" from_port="example set" to_port="result 3"/>
    <connect from_op="Process Documents (2)" from_port="word list" to_port="result 4"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    <portSpacing port="sink_result 5" spacing="0"/>
    </process>
    </operator>
    </process>

     

  • TobiasNehrig
    TobiasNehrig New Altair Community Member

    Hi @sgenzer,

    Hi @Telcontar120,

     

    my problem is that I'd like to analyze the downloaded web pages. For each page I have to create co-occurrence lists and find associations. I'm looking for an operator with which I can create the graphs for each page by using an R script for co-occurrence and associations. For both associations and co-occurrence, I'd like to see the results for each page.

     

    When I try Scotts loop approach with this code:

    <?xml version="1.0" encoding="UTF-8"?><process version="8.1.003">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
    <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="82" name="Crawler" width="90" x="45" y="136">
    <process expanded="true">
    <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="82" name="Crawler Spon 10 pages" width="90" x="45" y="34">
    <process expanded="true">
    <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web (2)" width="90" x="112" y="34">
    <parameter key="url" value="http://www.spiegel.de"/>
    <list key="crawling_rules">
    <parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
    <parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
    </list>
    <parameter key="max_crawl_depth" value="10"/>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="max_pages" value="10"/>
    <parameter key="delay" value="100"/>
    <parameter key="max_concurrent_connections" value="200"/>
    <parameter key="max_connections_per_host" value="100"/>
    <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"/>
    </operator>
    <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages (2)" width="90" x="246" y="34">
    <parameter key="link_attribute" value="Link"/>
    <parameter key="page_attribute" value="link"/>
    <parameter key="random_user_agent" value="true"/>
    </operator>
    <connect from_op="Crawl Web (2)" from_port="example set" to_op="Get Pages (2)" to_port="Example Set"/>
    <connect from_op="Get Pages (2)" from_port="Example Set" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="246" y="34">
    <parameter key="create_word_vector" value="false"/>
    <parameter key="keep_text" value="true"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
    <parameter key="ignore_non_html_tags" value="false"/>
    </operator>
    <connect from_port="document" to_op="Extract Content" to_port="document"/>
    <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <connect from_op="Crawler Spon 10 pages" from_port="out 1" to_op="Process Documents from Data (2)" to_port="example set"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    </process>
    </operator>
    <!-- Prepare Data: sets roles, generates ids, keeps Title/text, filters on Title not missing,
         then hands out a sentence-cut copy (out 1 and out 3) and an uncut copy (out 2). -->
    <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="124" name="Prepare Data" width="90" x="246" y="136">
      <process expanded="true">
        <!-- Role setup: no target_role is given for "text" (operator default applies); "Title" is set to regular. -->
        <operator activated="true" class="set_role" compatibility="8.1.003" expanded="true" height="82" name="Set Role (2)" width="90" x="45" y="34">
          <parameter key="attribute_name" value="text"/>
          <list key="set_additional_roles">
            <parameter key="Title" value="regular"/>
          </list>
        </operator>
        <!-- Runs with default parameters; the "id" attribute is what the macro below and the R scripts refer to. -->
        <operator activated="true" class="generate_id" compatibility="8.1.003" expanded="true" height="82" name="Generate ID" width="90" x="45" y="187"/>
        <operator activated="true" class="order_attributes" compatibility="8.1.003" expanded="true" height="82" name="Reorder Attributes" width="90" x="45" y="340">
          <parameter key="attribute_ordering" value="Title|text"/>
        </operator>
        <operator activated="true" class="select_attributes" compatibility="8.1.003" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="493">
          <parameter key="attribute_filter_type" value="subset"/>
          <parameter key="attributes" value="Title|text"/>
        </operator>
        <!-- Custom filter on the Title attribute (is_not_missing). -->
        <operator activated="true" class="filter_examples" compatibility="8.1.003" expanded="true" height="103" name="Filter Examples" width="90" x="246" y="34">
          <list key="filters_list">
            <parameter key="filters_entry_key" value="Title.is_not_missing."/>
          </list>
          <parameter key="filters_logic_and" value="false"/>
          <parameter key="filters_check_metadata" value="false"/>
        </operator>
        <!-- Defines the macro attribute_id with the value "id"; the data is passed through unchanged. -->
        <operator activated="true" class="set_macros" compatibility="8.1.003" expanded="true" height="82" name="Set Macros" width="90" x="246" y="187">
          <list key="macros">
            <parameter key="attribute_id" value="id"/>
          </list>
        </operator>
        <!-- Branch point: output 1 goes on to sentence cutting, output 2 leaves the subprocess uncut. -->
        <operator activated="true" class="multiply" compatibility="8.1.003" expanded="true" height="103" name="Multiply uncut" width="90" x="380" y="187"/>
        <!-- Sentence segmentation: no word vector is created and the text is kept. -->
        <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="cut in sentences" width="90" x="581" y="34">
          <parameter key="create_word_vector" value="false"/>
          <parameter key="keep_text" value="true"/>
          <list key="specify_weights"/>
          <process expanded="true">
            <!-- Cuts each document with the "sentences" regular-region query; every segment is forwarded as-is. -->
            <operator activated="true" class="text:cut_document" compatibility="8.1.000" expanded="true" height="68" name="Cut Document" width="90" x="112" y="34">
              <parameter key="query_type" value="Regular Region"/>
              <list key="string_machting_queries"/>
              <list key="regular_expression_queries"/>
              <list key="regular_region_queries">
                <parameter key="sentences" value="\\\.\\s[A-Z]| \\!\\s[A-Z]|\\?\\s[A-Z].\\\.|\\!|\\?"/>
              </list>
              <list key="xpath_queries"/>
              <list key="namespaces"/>
              <list key="index_queries"/>
              <list key="jsonpath_queries"/>
              <process expanded="true">
                <connect from_port="segment" to_port="document 1"/>
                <portSpacing port="source_segment" spacing="0"/>
                <portSpacing port="sink_document 1" spacing="0"/>
                <portSpacing port="sink_document 2" spacing="0"/>
              </process>
            </operator>
            <connect from_port="document" to_op="Cut Document" to_port="document"/>
            <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
            <portSpacing port="source_document" spacing="0"/>
            <portSpacing port="sink_document 1" spacing="0"/>
            <portSpacing port="sink_document 2" spacing="0"/>
          </process>
          <description align="center" color="transparent" colored="false" width="126">for r-scripts&lt;br&gt;tidy text&lt;br/&gt;bigram&lt;br/&gt;pairwise count</description>
        </operator>
        <!-- Duplicates the sentence-cut data so it can leave through out 1 and out 3. -->
        <operator activated="true" class="multiply" compatibility="8.1.003" expanded="true" height="103" name="Multiply" width="90" x="782" y="34"/>
        <!-- Wiring: linear chain up to Set Macros, then the two Multiply branches. -->
        <connect from_port="in 1" to_op="Set Role (2)" to_port="example set input"/>
        <connect from_op="Set Role (2)" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
        <connect from_op="Generate ID" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
        <connect from_op="Reorder Attributes" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
        <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
        <connect from_op="Filter Examples" from_port="example set output" to_op="Set Macros" to_port="through 1"/>
        <connect from_op="Set Macros" from_port="through 1" to_op="Multiply uncut" to_port="input"/>
        <connect from_op="Multiply uncut" from_port="output 1" to_op="cut in sentences" to_port="example set"/>
        <connect from_op="Multiply uncut" from_port="output 2" to_port="out 2"/>
        <connect from_op="cut in sentences" from_port="example set" to_op="Multiply" to_port="input"/>
        <connect from_op="Multiply" from_port="output 1" to_port="out 1"/>
        <connect from_op="Multiply" from_port="output 2" to_port="out 3"/>
        <portSpacing port="source_in 1" spacing="0"/>
        <portSpacing port="source_in 2" spacing="0"/>
        <portSpacing port="sink_out 1" spacing="0"/>
        <portSpacing port="sink_out 2" spacing="0"/>
        <portSpacing port="sink_out 3" spacing="0"/>
        <portSpacing port="sink_out 4" spacing="0"/>
      </process>
    </operator>
    <!-- Loop Examples R-Script: runs the "R-Scripts Co-occurrence" subprocess once per example and
         collects its two outputs (word counts and pairwise co-occurrence counts).
         NOTE(review): the loop's "example set" port is wired straight into the subprocess, and the
         thread reports identical results for every iteration; presumably the complete ExampleSet is
         passed on each pass. Confirm against the Loop Examples documentation. -->
    <operator activated="true" class="loop_examples" compatibility="8.1.003" expanded="true" height="124" name="Loop Examples R-Script" width="90" x="447" y="34">
      <process expanded="true">
        <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="103" name="R-Scripts Co-occurrence" width="90" x="179" y="34">
          <process expanded="true">
            <!-- Deactivated alternative: counts words per id and bigrams. It is not connected to any port. -->
            <operator activated="false" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="68" name="R-Script-Bigram" width="90" x="313" y="136">
              <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;rm_main = function(data)&#10;{&#10;korpus &lt;- data_frame(id =data$id, text = data$text)&#10;&#10;print(korpus)&#10;&#10;woerter &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; group_by(id)%&gt;%&#10; count(word, sort =TRUE)%&gt;%&#10; filter(n&gt;=10)&#10; print(woerter)&#10;woerter &lt;- as.data.table(woerter)&#10;&#10;cooccurre &lt;- korpus %&gt;%&#10; unnest_tokens(bigram, text, token= &quot;ngrams&quot;, n= 2)%&gt;%&#10; count(bigram, sort = TRUE)&#10; #pairwise_count(word, id, sort = TRUE)%&gt;%&#10; # filter(n&gt;=10)&#10; print(cooccurre)&#10;&#10; cooccurre &lt;- as.data.frame(cooccurre)&#10;&#10; return(list(woerter, cooccurre))&#10;}&#10;"/>
            </operator>
            <!-- Active script: output 1 = words counted per id (n of at least 10), output 2 = pairwise word counts.
                 Fix: the pairwise_count(...) line used to end with a pipe operator followed only by a
                 commented-out filter, so R continued the pipeline into the print(cooccurre) call, which
                 refers to cooccurre before it has been assigned. The trailing pipe is removed so the
                 assignment ends at pairwise_count(...) and print(cooccurre) is a separate statement. -->
            <operator activated="true" class="r_scripting:execute_r" compatibility="8.1.000" expanded="true" height="103" name="R-Script-Pairwise-Count" width="90" x="313" y="34">
              <parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(widyr)&#10;&#10;rm_main = function(data)&#10;{&#10;korpus &lt;- data_frame(id =data$id, text = data$text)&#10;&#10;print(korpus)&#10;&#10;woerter &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; group_by(id)%&gt;%&#10; count(word, sort =TRUE)%&gt;%&#10; filter(n&gt;=10)&#10; print(woerter)&#10;woerter &lt;- as.data.table(woerter)&#10;&#10;cooccurre &lt;- korpus %&gt;%&#10; unnest_tokens(word, text)%&gt;%&#10; pairwise_count(word, id, sort = TRUE)&#10; # filter(n&gt;=10)&#10; print(cooccurre)&#10;&#10; cooccurre &lt;- as.data.frame(cooccurre)&#10; &#10; return(list(woerter, cooccurre))&#10;}&#10;"/>
            </operator>
            <connect from_port="in 1" to_op="R-Script-Pairwise-Count" to_port="input 1"/>
            <connect from_op="R-Script-Pairwise-Count" from_port="output 1" to_port="out 1"/>
            <connect from_op="R-Script-Pairwise-Count" from_port="output 2" to_port="out 2"/>
            <portSpacing port="source_in 1" spacing="0"/>
            <portSpacing port="source_in 2" spacing="0"/>
            <portSpacing port="sink_out 1" spacing="0"/>
            <portSpacing port="sink_out 2" spacing="0"/>
            <portSpacing port="sink_out 3" spacing="0"/>
          </process>
        </operator>
        <connect from_port="example set" to_op="R-Scripts Co-occurrence" to_port="in 1"/>
        <connect from_op="R-Scripts Co-occurrence" from_port="out 1" to_port="output 1"/>
        <connect from_op="R-Scripts Co-occurrence" from_port="out 2" to_port="output 2"/>
        <portSpacing port="source_example set" spacing="0"/>
        <portSpacing port="sink_example set" spacing="0"/>
        <portSpacing port="sink_output 1" spacing="0"/>
        <portSpacing port="sink_output 2" spacing="0"/>
        <portSpacing port="sink_output 3" spacing="0"/>
      </process>
    </operator>
    <!-- Loop Examples Associations: runs the association-rule pipeline inside a Loop Examples operator.
         Outputs: 1 = example set leaving FP-Growth, 2 = association rules, 3 = word list. -->
    <operator activated="true" class="loop_examples" compatibility="8.1.003" expanded="true" height="145" name="Loop Examples Associations" width="90" x="447" y="238">
      <process expanded="true">
        <!-- Thin wrapper: forwards in 1 to the nested subprocess and passes its three outputs through. -->
        <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="124" name="Word Associstion Rules" width="90" x="313" y="34">
          <process expanded="true">
            <operator activated="true" class="subprocess" compatibility="8.1.003" expanded="true" height="124" name="RM Co-occurrence (3)" width="90" x="246" y="34">
              <process expanded="true">
                <!-- Text to word vector with percentual pruning (below 0.01 percent, above 100 percent). -->
                <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (4)" width="90" x="112" y="136">
                  <parameter key="prune_method" value="percentual"/>
                  <parameter key="prune_below_percent" value="0.01"/>
                  <parameter key="prune_above_percent" value="100.0"/>
                  <list key="specify_weights"/>
                  <process expanded="true">
                    <!-- Active chain: Tokenize Non-letters, Tokenize Linguistic, Filter Tokens, Transform Cases.
                         The stopword filter and the stemmer are deactivated and not connected. -->
                    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Non-letters (3)" width="90" x="112" y="34"/>
                    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Linguistic (3)" width="90" x="246" y="34">
                      <parameter key="mode" value="linguistic sentences"/>
                      <parameter key="language" value="German"/>
                    </operator>
                    <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (3)" width="90" x="514" y="34">
                      <parameter key="min_chars" value="2"/>
                    </operator>
                    <operator activated="false" class="text:filter_stopwords_german" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (3)" width="90" x="380" y="34"/>
                    <operator activated="false" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (3)" width="90" x="648" y="34"/>
                    <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (3)" width="90" x="782" y="34"/>
                    <connect from_port="document" to_op="Tokenize Non-letters (3)" to_port="document"/>
                    <connect from_op="Tokenize Non-letters (3)" from_port="document" to_op="Tokenize Linguistic (3)" to_port="document"/>
                    <connect from_op="Tokenize Linguistic (3)" from_port="document" to_op="Filter Tokens (3)" to_port="document"/>
                    <connect from_op="Filter Tokens (3)" from_port="document" to_op="Transform Cases (3)" to_port="document"/>
                    <connect from_op="Transform Cases (3)" from_port="document" to_port="document 1"/>
                    <portSpacing port="source_document" spacing="0"/>
                    <portSpacing port="sink_document 1" spacing="0"/>
                    <portSpacing port="sink_document 2" spacing="0"/>
                  </process>
                </operator>
                <!-- Type conversions applied to the word vector before frequent-itemset mining. -->
                <operator activated="true" class="text_to_nominal" compatibility="8.1.003" expanded="true" height="82" name="Text to Nominal (3)" width="90" x="246" y="34"/>
                <operator activated="true" class="numerical_to_binominal" compatibility="8.1.003" expanded="true" height="82" name="Numerical to Binominal (3)" width="90" x="380" y="34"/>
                <!-- Frequent itemsets: min_support 0.2, at most 2 items per set. -->
                <operator activated="true" class="fp_growth" compatibility="8.1.003" expanded="true" height="82" name="FP-Growth (3)" width="90" x="514" y="34">
                  <parameter key="find_min_number_of_itemsets" value="false"/>
                  <parameter key="min_support" value="0.2"/>
                  <parameter key="max_items" value="2"/>
                </operator>
                <!-- Rules are built from the frequent sets with min_confidence 0.01. -->
                <operator activated="true" class="create_association_rules" compatibility="8.1.003" expanded="true" height="82" name="Create Association Rules (3)" width="90" x="715" y="136">
                  <parameter key="min_confidence" value="0.01"/>
                  <parameter key="gain_theta" value="1.0"/>
                </operator>
                <connect from_port="in 1" to_op="Process Documents from Data (4)" to_port="example set"/>
                <connect from_op="Process Documents from Data (4)" from_port="example set" to_op="Text to Nominal (3)" to_port="example set input"/>
                <connect from_op="Process Documents from Data (4)" from_port="word list" to_port="out 3"/>
                <connect from_op="Text to Nominal (3)" from_port="example set output" to_op="Numerical to Binominal (3)" to_port="example set input"/>
                <connect from_op="Numerical to Binominal (3)" from_port="example set output" to_op="FP-Growth (3)" to_port="example set"/>
                <connect from_op="FP-Growth (3)" from_port="example set" to_port="out 1"/>
                <connect from_op="FP-Growth (3)" from_port="frequent sets" to_op="Create Association Rules (3)" to_port="item sets"/>
                <connect from_op="Create Association Rules (3)" from_port="rules" to_port="out 2"/>
                <portSpacing port="source_in 1" spacing="0"/>
                <portSpacing port="source_in 2" spacing="0"/>
                <portSpacing port="sink_out 1" spacing="0"/>
                <portSpacing port="sink_out 2" spacing="0"/>
                <portSpacing port="sink_out 3" spacing="0"/>
                <portSpacing port="sink_out 4" spacing="0"/>
              </process>
            </operator>
            <connect from_port="in 1" to_op="RM Co-occurrence (3)" to_port="in 1"/>
            <connect from_op="RM Co-occurrence (3)" from_port="out 1" to_port="out 1"/>
            <connect from_op="RM Co-occurrence (3)" from_port="out 2" to_port="out 2"/>
            <connect from_op="RM Co-occurrence (3)" from_port="out 3" to_port="out 3"/>
            <portSpacing port="source_in 1" spacing="0"/>
            <portSpacing port="source_in 2" spacing="0"/>
            <portSpacing port="sink_out 1" spacing="0"/>
            <portSpacing port="sink_out 2" spacing="0"/>
            <portSpacing port="sink_out 3" spacing="0"/>
            <portSpacing port="sink_out 4" spacing="0"/>
          </process>
        </operator>
        <connect from_port="example set" to_op="Word Associstion Rules" to_port="in 1"/>
        <connect from_op="Word Associstion Rules" from_port="out 1" to_port="output 1"/>
        <connect from_op="Word Associstion Rules" from_port="out 2" to_port="output 2"/>
        <connect from_op="Word Associstion Rules" from_port="out 3" to_port="output 3"/>
        <portSpacing port="source_example set" spacing="0"/>
        <portSpacing port="sink_example set" spacing="0"/>
        <portSpacing port="sink_output 1" spacing="0"/>
        <portSpacing port="sink_output 2" spacing="0"/>
        <portSpacing port="sink_output 3" spacing="0"/>
        <portSpacing port="sink_output 4" spacing="0"/>
      </process>
    </operator>
    <connect from_op="Crawler" from_port="out 1" to_op="Prepare Data" to_port="in 1"/>
    <connect from_op="Prepare Data" from_port="out 1" to_op="Loop Examples R-Script" to_port="example set"/>
    <connect from_op="Prepare Data" from_port="out 2" to_op="Loop Examples Associations" to_port="example set"/>
    <connect from_op="Loop Examples R-Script" from_port="output 1" to_port="result 1"/>
    <connect from_op="Loop Examples R-Script" from_port="output 2" to_port="result 2"/>
    <connect from_op="Loop Examples Associations" from_port="output 1" to_port="result 3"/>
    <connect from_op="Loop Examples Associations" from_port="output 2" to_port="result 4"/>
    <connect from_op="Loop Examples Associations" from_port="output 3" to_port="result 5"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    <portSpacing port="sink_result 5" spacing="0"/>
    <portSpacing port="sink_result 6" spacing="0"/>
    </process>
    </operator>
    </process>

    All items of the collections are the same, I didn't find any difference between them.

     

    Regards

    Tobias

  • Telcontar120
    Telcontar120 New Altair Community Member

    Agreed, that is the same behavior I referenced earlier in this thread.  I believe it is a bug that the developers are going to need to look at.

  • TobiasNehrig
    TobiasNehrig New Altair Community Member

    Hi @sgenzer

    Hi @Telcontar120

    I think I have finally found a solution with the Group Into Collection operator from the Operator Toolbox. But is there a way to combine the results and compare them?

     

    Kind regards

    Tobias

  • MartinLiebig
    MartinLiebig
    Altair Employee

    @TobiasNehrig,

    I think the Converters Extension has an Association Rules to ExampleSet operator? That could help.

     

    BR,

    Martin

  • TobiasNehrig
    TobiasNehrig New Altair Community Member

    Hi @sgenzer,

    hi @Telcontar120,

     

    the previously discussed approach works well enough that I get some results. In the end, however, the results are not very good, so I have to do a better job of preparing the data. My new concept is to crawl the web pages, prepare the data, and then cascade the text mining process inside the Loop Collection operator. First, I split the text of each web page into sentences, giving one ExampleSet per web page. After that, I tokenize the words of the sentences, with a separate ExampleSet for each web page and its sentences. My aim is to have an ExampleSet for each page on which I can calculate the tf-idf, so I use the Loop Collection operator again. But I am missing something: in the results my sentences are not tokenized any further.

     

    <?xml version="1.0" encoding="UTF-8"?>
    <process version="8.2.001">
      <!-- Crawler: crawls spiegel.de, retrieves the pages, and extracts the HTML text content. -->
      <operator activated="true" class="subprocess" compatibility="8.2.001" expanded="true" height="82" name="Crawler" width="90" x="45" y="34">
        <process expanded="true">
          <operator activated="true" class="subprocess" compatibility="8.2.001" expanded="true" height="82" name="Crawler Spon" width="90" x="45" y="34">
            <process expanded="true">
              <!-- Crawl configuration: start URL http://www.spiegel.de, max_pages 10, max_crawl_depth 10,
                   page content added as an attribute, nothing written to disk. -->
              <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="112" y="34">
                <parameter key="url" value="http://www.spiegel.de"/>
                <list key="crawling_rules">
                  <parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
                  <parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
                </list>
                <parameter key="max_crawl_depth" value="10"/>
                <parameter key="retrieve_as_html" value="true"/>
                <parameter key="enable_basic_auth" value="false"/>
                <parameter key="add_content_as_attribute" value="true"/>
                <parameter key="write_pages_to_disk" value="false"/>
                <parameter key="include_binary_content" value="false"/>
                <parameter key="output_file_extension" value="txt"/>
                <parameter key="max_pages" value="10"/>
                <parameter key="max_page_size" value="100000"/>
                <parameter key="delay" value="100"/>
                <parameter key="max_concurrent_connections" value="200"/>
                <parameter key="max_connections_per_host" value="100"/>
                <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0"/>
                <parameter key="ignore_robot_exclusion" value="false"/>
              </operator>
              <!-- Page retrieval driven by the "Link" attribute; GET requests, redirects followed, no cookies. -->
              <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="34">
                <parameter key="link_attribute" value="Link"/>
                <parameter key="page_attribute" value="link"/>
                <parameter key="random_user_agent" value="true"/>
                <parameter key="connection_timeout" value="10000"/>
                <parameter key="read_timeout" value="10000"/>
                <parameter key="follow_redirects" value="true"/>
                <parameter key="accept_cookies" value="none"/>
                <parameter key="cookie_scope" value="global"/>
                <parameter key="request_method" value="GET"/>
                <parameter key="delay" value="none"/>
                <parameter key="delay_amount" value="1000"/>
                <parameter key="min_delay_amount" value="0"/>
                <parameter key="max_delay_amount" value="1000"/>
              </operator>
              <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
              <connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <!-- Content extraction only: create_word_vector is false and keep_text is true. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (3)" width="90" x="246" y="34">
            <parameter key="create_word_vector" value="false"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="true"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <process expanded="true">
              <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="179" y="34">
                <parameter key="extract_content" value="true"/>
                <parameter key="minimum_text_block_length" value="10"/>
                <parameter key="override_content_type_information" value="true"/>
                <parameter key="neglegt_span_tags" value="true"/>
                <parameter key="neglect_p_tags" value="true"/>
                <parameter key="neglect_b_tags" value="true"/>
                <parameter key="neglect_i_tags" value="true"/>
                <parameter key="neglect_br_tags" value="true"/>
                <parameter key="ignore_non_html_tags" value="false"/>
              </operator>
              <connect from_port="document" to_op="Extract Content" to_port="document"/>
              <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Crawler Spon" from_port="out 1" to_op="Process Documents from Data (3)" to_port="example set"/>
          <connect from_op="Process Documents from Data (3)" from_port="example set" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
    </process>
    <?xml version="1.0" encoding="UTF-8"?>
    <process version="8.2.001">
      <!-- Organize Data: sets roles, generates numeric ids, keeps id/text/Title, filters on Title not missing. -->
      <operator activated="true" class="subprocess" compatibility="8.2.001" expanded="true" height="82" name="Organize Data" width="90" x="246" y="34">
        <process expanded="true">
          <!-- "text" becomes a regular attribute; "Title" is given the label role. -->
          <operator activated="true" class="set_role" compatibility="8.2.001" expanded="true" height="82" name="Set Role (2)" width="90" x="45" y="34">
            <parameter key="attribute_name" value="text"/>
            <parameter key="target_role" value="regular"/>
            <list key="set_additional_roles">
              <parameter key="Title" value="label"/>
            </list>
          </operator>
          <!-- create_nominal_ids is false, offset is 0. -->
          <operator activated="true" class="generate_id" compatibility="8.2.001" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34">
            <parameter key="create_nominal_ids" value="false"/>
            <parameter key="offset" value="0"/>
          </operator>
          <!-- Subset selection: id, text and Title. -->
          <operator activated="true" class="select_attributes" compatibility="8.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="id|text|Title"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Custom filter on the Title attribute (is_not_missing), not inverted. -->
          <operator activated="true" class="filter_examples" compatibility="8.2.001" expanded="true" height="103" name="Filter Examples" width="90" x="581" y="34">
            <parameter key="parameter_expression" value=""/>
            <parameter key="condition_class" value="custom_filters"/>
            <parameter key="invert_filter" value="false"/>
            <list key="filters_list">
              <parameter key="filters_entry_key" value="Title.is_not_missing."/>
            </list>
            <parameter key="filters_logic_and" value="false"/>
            <parameter key="filters_check_metadata" value="false"/>
          </operator>
          <connect from_port="in 1" to_op="Set Role (2)" to_port="example set input"/>
          <connect from_op="Set Role (2)" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
          <connect from_op="Generate ID" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
    </process>
    <?xml version="1.0" encoding="UTF-8"?>
    <process version="8.2.001">
      <!-- Operator Toolbox operator configured to group by the "id" attribute with numerical sorting.
           Presumably this yields one ExampleSet per id value; confirm against the extension documentation. -->
      <operator activated="true" class="operator_toolbox:group_into_collection" compatibility="1.2.000" expanded="true" height="82" name="Group Into Collection (3)" width="90" x="380" y="34">
        <parameter key="group_by_attribute" value="id"/>
        <parameter key="group_by_attribute (numerical)" value="id"/>
        <parameter key="sorting_order" value="numerical"/>
      </operator>
    </process>
    <?xml version="1.0" encoding="UTF-8"?><process version="8.2.001">
    <operator activated="true" class="loop_collection" compatibility="8.2.001" expanded="true" height="103" name="Loop Collection (3)" width="90" x="514" y="34">
    <parameter key="set_iteration_macro" value="false"/>
    <parameter key="macro_name" value="iteration"/>
    <parameter key="macro_start_value" value="1"/>
    <parameter key="unfold" value="false"/>
    <process expanded="true">
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="179" y="34">
    <parameter key="create_word_vector" value="true"/>
    <parameter key="vector_creation" value="Binary Term Occurrences"/>
    <parameter key="add_meta_information" value="true"/>
    <parameter key="keep_text" value="false"/>
    <parameter key="prune_method" value="none"/>
    <parameter key="prune_below_percent" value="0.5"/>
    <parameter key="prune_above_percent" value="100.0"/>
    <parameter key="prune_below_absolute" value="20"/>
    <parameter key="prune_above_absolute" value="2000"/>
    <parameter key="prune_below_rank" value="0.05"/>
    <parameter key="prune_above_rank" value="0.95"/>
    <parameter key="datamanagement" value="double_sparse_array"/>
    <parameter key="data_management" value="auto"/>
    <parameter key="select_attributes_and_weights" value="false"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="false" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Non-letters (4)" width="90" x="45" y="34">
    <parameter key="mode" value="non letters"/>
    <parameter key="characters" value=".:"/>
    <parameter key="language" value="English"/>
    <parameter key="max_token_length" value="3"/>
    </operator>
    <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize Linguistic (4)" width="90" x="380" y="34">
    <parameter key="mode" value="linguistic sentences"/>
    <parameter key="characters" value=".:"/>
    <parameter key="language" value="German"/>
    <parameter key="max_token_length" value="3"/>
    </operator>
    <connect from_port="document" to_op="Tokenize Linguistic (4)" to_port="document"/>
    <connect from_op="Tokenize Linguistic (4)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="multiply" compatibility="8.2.001" expanded="true" height="103" name="Multiply" width="90" x="313" y="34"/>
    <operator activated="true" class="operator_toolbox:group_into_collection" compatibility="1.2.000" expanded="true" height="82" name="Group Into Collection (4)" width="90" x="447" y="34">
    <parameter key="group_by_attribute" value="id"/>
    <parameter key="group_by_attribute (numerical)" value="id"/>
    <parameter key="sorting_order" value="numerical"/>
    </operator>
    <operator activated="true" class="loop_collection" compatibility="8.2.001" expanded="true" height="82" name="Loop Collection (4)" width="90" x="648" y="34">
    <parameter key="set_iteration_macro" value="false"/>
    <parameter key="macro_name" value="iteration"/>
    <parameter key="macro_start_value" value="1"/>
    <parameter key="unfold" value="false"/>
    <process expanded="true">
    <operator activated="true" class="transpose" compatibility="8.2.001" expanded="true" height="82" name="Transpose (2)" width="90" x="112" y="34"/>
    <operator activated="true" class="select_attributes" compatibility="8.2.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="246" y="34">
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value="id"/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="attribute_value"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="time"/>
    <parameter key="block_type" value="attribute_block"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="value_matrix_row_start"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    </operator>
    <operator activated="true" class="rename" compatibility="8.2.001" expanded="true" height="82" name="Rename" width="90" x="380" y="34">
    <parameter key="old_name" value="id"/>
    <parameter key="new_name" value="text"/>
    <list key="rename_additional_attributes"/>
    </operator>
    <operator activated="true" class="set_role" compatibility="8.2.001" expanded="true" height="82" name="Set Role" width="90" x="514" y="34">
    <parameter key="attribute_name" value="text"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
    </operator>
    <!--
      Process Documents from Data (2): turns the incoming example set into a TF-IDF
      word vector (create_word_vector=true, vector_creation=TF-IDF) and does not keep
      the raw text (keep_text=false).
      Inner chain, in wiring order: Tokenize, Tokenize (2), Filter Tokens (by Length),
      Adblocker, Filter Stopwords (4).
    -->
    <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="648" y="34">
      <parameter key="create_word_vector" value="true"/>
      <parameter key="vector_creation" value="TF-IDF"/>
      <parameter key="add_meta_information" value="true"/>
      <parameter key="keep_text" value="false"/>
      <!-- prune_method is "none"; the four prune_* thresholds below are presumably
           ignored in that mode (TODO confirm). -->
      <parameter key="prune_method" value="none"/>
      <parameter key="prune_below_percent" value="0.1"/>
      <parameter key="prune_above_percent" value="100.0"/>
      <parameter key="prune_below_rank" value="0.05"/>
      <parameter key="prune_above_rank" value="0.95"/>
      <parameter key="datamanagement" value="double_sparse_array"/>
      <parameter key="data_management" value="auto"/>
      <parameter key="select_attributes_and_weights" value="false"/>
      <list key="specify_weights"/>
      <process expanded="true">
        <!-- First pass: mode "non letters". The characters / language /
             max_token_length values presumably only matter in other modes
             (TODO confirm). -->
        <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="112" y="85">
          <parameter key="mode" value="non letters"/>
          <parameter key="characters" value=".:"/>
          <parameter key="language" value="English"/>
          <parameter key="max_token_length" value="3"/>
        </operator>
        <!-- Second pass: mode "linguistic sentences", language German.
             NOTE(review): it receives the output of the non-letter split above, so
             there may be nothing left for it to split; confirm it is still needed. -->
        <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="246" y="85">
          <parameter key="mode" value="linguistic sentences"/>
          <parameter key="characters" value=".:"/>
          <parameter key="language" value="German"/>
          <parameter key="max_token_length" value="3"/>
        </operator>
        <!-- Length window for tokens: min_chars=2, max_chars=25. -->
        <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="514" y="85">
          <parameter key="min_chars" value="2"/>
          <parameter key="max_chars" value="25"/>
        </operator>
        <!--
          Adblocker: a linear chain of content filters, each case-insensitive and
          with "invert condition" set, so a token that satisfies the condition is
          removed. The strings are the words of the site's ad-blocker notice and
          similar boilerplate.

          Longer, distinctive terms keep condition "contains" so inflected forms are
          caught as well. The very short strings (sehe, mehr, Do, not, SPON, Netz, de)
          use condition "equals": as case-insensitive substrings they would also
          remove unrelated vocabulary (for example "de" occurs inside "oder" and
          "Bundestag", "not" inside "Notarzt", "Do" inside "London").
        -->
        <operator activated="true" class="subprocess" compatibility="8.2.001" expanded="true" height="82" name="Adblocker" width="90" x="648" y="85">
          <process expanded="true">
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Token" width="90" x="45" y="34">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Bitte"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (5)" width="90" x="179" y="34">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="deaktivieren"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (11)" width="90" x="313" y="34">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Ihren"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (12)" width="90" x="447" y="34">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Adblocker"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (13)" width="90" x="581" y="34">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="warum"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <!-- Whole-token match: "sehe" as a substring also hits e.g. "gesehen". -->
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (14)" width="90" x="715" y="34">
              <parameter key="condition" value="equals"/>
              <parameter key="string" value="sehe"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (15)" width="90" x="849" y="34">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="nicht"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <!-- Whole-token match: "mehr" as a substring also hits e.g. "mehrere". -->
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (16)" width="90" x="45" y="136">
              <parameter key="condition" value="equals"/>
              <parameter key="string" value="mehr"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (9)" width="90" x="179" y="136">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Ausnahme"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (17)" width="90" x="313" y="136">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Erweiterungen"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (19)" width="90" x="447" y="136">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="modus"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (3)" width="90" x="581" y="136">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="werbung"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (21)" width="90" x="715" y="136">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="informationen"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (20)" width="90" x="849" y="136">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="bedeutung"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (4)" width="90" x="45" y="238">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Browser"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (8)" width="90" x="179" y="238">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="redaktion"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (10)" width="90" x="313" y="238">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="forum"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Token (2)" width="90" x="447" y="238">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Spiegel"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (6)" width="90" x="581" y="238">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Einstellungen"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (7)" width="90" x="715" y="238">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="klicken"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (22)" width="90" x="849" y="238">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Sicherheit"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <!-- Whole-token match: "Do" as a substring also hits e.g. "London". -->
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (2)" width="90" x="45" y="340">
              <parameter key="condition" value="equals"/>
              <parameter key="string" value="Do"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <!-- Whole-token match: "not" as a substring also hits e.g. "Notarzt". -->
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (18)" width="90" x="179" y="340">
              <parameter key="condition" value="equals"/>
              <parameter key="string" value="not"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (24)" width="90" x="313" y="340">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Track"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (25)" width="90" x="447" y="340">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Addons"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <!-- Whole-token match: "SPON" as a substring also hits e.g. "Sponsor". -->
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (26)" width="90" x="581" y="340">
              <parameter key="condition" value="equals"/>
              <parameter key="string" value="SPON"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (27)" width="90" x="715" y="340">
              <parameter key="condition" value="contains"/>
              <parameter key="string" value="Inkognito"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <!-- Whole-token match: "Netz" as a substring also hits e.g. "vernetzt". -->
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (23)" width="90" x="849" y="340">
              <parameter key="condition" value="equals"/>
              <parameter key="string" value="Netz"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <!-- Whole-token match: "de" as a substring also hits e.g. "oder", "Bundestag". -->
            <operator activated="true" class="text:filter_tokens_by_content" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (28)" width="90" x="849" y="442">
              <parameter key="condition" value="equals"/>
              <parameter key="string" value="de"/>
              <parameter key="case_sensitive" value="false"/>
              <parameter key="invert condition" value="true"/>
            </operator>
            <!-- Wiring: "in 1" feeds the first filter, each filter feeds the next,
                 and the last one feeds "out 1". -->
            <connect from_port="in 1" to_op="Filter Token" to_port="document"/>
            <connect from_op="Filter Token" from_port="document" to_op="Filter Tokens (5)" to_port="document"/>
            <connect from_op="Filter Tokens (5)" from_port="document" to_op="Filter Tokens (11)" to_port="document"/>
            <connect from_op="Filter Tokens (11)" from_port="document" to_op="Filter Tokens (12)" to_port="document"/>
            <connect from_op="Filter Tokens (12)" from_port="document" to_op="Filter Tokens (13)" to_port="document"/>
            <connect from_op="Filter Tokens (13)" from_port="document" to_op="Filter Tokens (14)" to_port="document"/>
            <connect from_op="Filter Tokens (14)" from_port="document" to_op="Filter Tokens (15)" to_port="document"/>
            <connect from_op="Filter Tokens (15)" from_port="document" to_op="Filter Tokens (16)" to_port="document"/>
            <connect from_op="Filter Tokens (16)" from_port="document" to_op="Filter Tokens (9)" to_port="document"/>
            <connect from_op="Filter Tokens (9)" from_port="document" to_op="Filter Tokens (17)" to_port="document"/>
            <connect from_op="Filter Tokens (17)" from_port="document" to_op="Filter Tokens (19)" to_port="document"/>
            <connect from_op="Filter Tokens (19)" from_port="document" to_op="Filter Tokens (3)" to_port="document"/>
            <connect from_op="Filter Tokens (3)" from_port="document" to_op="Filter Tokens (21)" to_port="document"/>
            <connect from_op="Filter Tokens (21)" from_port="document" to_op="Filter Tokens (20)" to_port="document"/>
            <connect from_op="Filter Tokens (20)" from_port="document" to_op="Filter Tokens (4)" to_port="document"/>
            <connect from_op="Filter Tokens (4)" from_port="document" to_op="Filter Tokens (8)" to_port="document"/>
            <connect from_op="Filter Tokens (8)" from_port="document" to_op="Filter Tokens (10)" to_port="document"/>
            <connect from_op="Filter Tokens (10)" from_port="document" to_op="Filter Token (2)" to_port="document"/>
            <connect from_op="Filter Token (2)" from_port="document" to_op="Filter Tokens (6)" to_port="document"/>
            <connect from_op="Filter Tokens (6)" from_port="document" to_op="Filter Tokens (7)" to_port="document"/>
            <connect from_op="Filter Tokens (7)" from_port="document" to_op="Filter Tokens (22)" to_port="document"/>
            <connect from_op="Filter Tokens (22)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
            <connect from_op="Filter Tokens (2)" from_port="document" to_op="Filter Tokens (18)" to_port="document"/>
            <connect from_op="Filter Tokens (18)" from_port="document" to_op="Filter Tokens (24)" to_port="document"/>
            <connect from_op="Filter Tokens (24)" from_port="document" to_op="Filter Tokens (25)" to_port="document"/>
            <connect from_op="Filter Tokens (25)" from_port="document" to_op="Filter Tokens (26)" to_port="document"/>
            <connect from_op="Filter Tokens (26)" from_port="document" to_op="Filter Tokens (27)" to_port="document"/>
            <connect from_op="Filter Tokens (27)" from_port="document" to_op="Filter Tokens (23)" to_port="document"/>
            <connect from_op="Filter Tokens (23)" from_port="document" to_op="Filter Tokens (28)" to_port="document"/>
            <connect from_op="Filter Tokens (28)" from_port="document" to_port="out 1"/>
            <portSpacing port="source_in 1" spacing="0"/>
            <portSpacing port="source_in 2" spacing="0"/>
            <portSpacing port="sink_out 1" spacing="0"/>
            <portSpacing port="sink_out 2" spacing="0"/>
          </process>
        </operator>
        <!-- German stop-word filter using the "Standard" list. -->
        <operator activated="true" class="text:filter_stopwords_german" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (4)" width="90" x="782" y="85">
          <parameter key="stop_word_list" value="Standard"/>
        </operator>
        <connect from_port="document" to_op="Tokenize" to_port="document"/>
        <connect from_op="Tokenize" from_port="document" to_op="Tokenize (2)" to_port="document"/>
        <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
        <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Adblocker" to_port="in 1"/>
        <connect from_op="Adblocker" from_port="out 1" to_op="Filter Stopwords (4)" to_port="document"/>
        <connect from_op="Filter Stopwords (4)" from_port="document" to_port="document 1"/>
        <portSpacing port="source_document" spacing="0"/>
        <portSpacing port="sink_document 1" spacing="0"/>
        <portSpacing port="sink_document 2" spacing="0"/>
      </process>
    </operator>
    <connect from_port="single" to_op="Transpose (2)" to_port="example set input"/>
    <connect from_op="Transpose (2)" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
    <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Rename" to_port="example set input"/>
    <connect from_op="Rename" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_port="output 1"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    </process>
    </operator>
    <connect from_port="single" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
    <connect from_op="Multiply" from_port="output 1" to_op="Group Into Collection (4)" to_port="exa"/>
    <connect from_op="Multiply" from_port="output 2" to_port="output 2"/>
    <connect from_op="Group Into Collection (4)" from_port="col" to_op="Loop Collection (4)" to_port="collection"/>
    <connect from_op="Loop Collection (4)" from_port="output 1" to_port="output 1"/>
    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    <portSpacing port="sink_output 3" spacing="0"/>
    </process>
    </operator>
    </process>

    best regards

    Tobias