Get Page operator stalls RapidMiner (SOLVED)

MarkusH23
MarkusH23 New Altair Community Member
edited November 5 in Community Q&A
When running the process below (with web mining and text mining extensions loaded) RapidMiner stalls when trying to display the results. It eventually shows the results but something seems to be running in the background and it makes RapidMiner very sluggish.

I've been using this for years. Also tried version 10 and I'm experiencing the same issue.

Note: I wasn't allowed to post links which were in the XML code. To replicate just add 2 random links to the Get Page operator.

Any ideas?

<?xml version="1.0" encoding="UTF-8"?><process version="9.10.013">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
    <!-- Settings of the root process: log verbosity, random seed, e-mail notification and encoding. -->
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="web:get_webpage" compatibility="9.7.002" expanded="true" height="68" name="Get Page" width="90" x="45" y="85">
        <parameter key="url" value=""/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="connection_timeout" value="10000"/>
        <parameter key="read_timeout" value="10000"/>
        <parameter key="follow_redirects" value="true"/>
        <parameter key="accept_cookies" value="none"/>
        <parameter key="cookie_scope" value="global"/>
        <parameter key="request_method" value="GET"/>
        <list key="query_parameters"/>
        <list key="request_properties"/>
        <parameter key="override_encoding" value="false"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="keep_sensitive_headers" value="false"/>
      </operator>
      <operator activated="true" class="web:get_webpage" compatibility="9.7.002" expanded="true" height="68" name="Get Page (2)" width="90" x="45" y="187">
        <parameter key="url" value=""/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="connection_timeout" value="10000"/>
        <parameter key="read_timeout" value="10000"/>
        <parameter key="follow_redirects" value="true"/>
        <parameter key="accept_cookies" value="none"/>
        <parameter key="cookie_scope" value="global"/>
        <parameter key="request_method" value="GET"/>
        <list key="query_parameters"/>
        <list key="request_properties"/>
        <parameter key="override_encoding" value="false"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="keep_sensitive_headers" value="false"/>
      </operator>
      <operator activated="false" breakpoints="after" class="read_excel" compatibility="6.0.003" expanded="true" height="68" name="Read Excel" width="90" x="112" y="289">
        <parameter key="excel_file" value="C:/Users/hofma/Dropbox/ITB/2022 - 2023/Sem 1/Text Mining/Module Content/Session 6/S6 RapidMiner Files/daft_urls.xls"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1:A80"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="URLS.true.file_path.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
      </operator>
      <operator activated="false" class="web:retrieve_webpages" compatibility="9.7.002" expanded="true" height="68" name="Get Pages" width="90" x="380" y="289">
        <parameter key="link_attribute" value="URLS"/>
        <parameter key="random_user_agent" value="false"/>
        <parameter key="user_agent" value="RapidMiner"/>
        <parameter key="connection_timeout" value="10000"/>
        <parameter key="read_timeout" value="10000"/>
        <parameter key="follow_redirects" value="true"/>
        <parameter key="accept_cookies" value="none"/>
        <parameter key="cookie_scope" value="global"/>
        <parameter key="request_method" value="GET"/>
        <parameter key="delay" value="none"/>
        <parameter key="delay_amount" value="1000"/>
        <parameter key="min_delay_amount" value="0"/>
        <parameter key="max_delay_amount" value="1000"/>
      </operator>
      <operator activated="false" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="289">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:extract_information" compatibility="8.2.000" expanded="true" height="68" name="Extract Information (3)" width="90" x="313" y="34">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries">
              <parameter key="test2" value="Department:\.*.\.*Institute Code:"/>
            </list>
            <parameter key="attribute_type" value="Nominal"/>
            <list key="regular_expression_queries">
              <parameter key="test" value="\bDepartment:\s+\K\S+"/>
            </list>
            <list key="regular_region_queries">
              <parameter key="test" value="Department:&lt;/th&gt;&lt;td&gt;.&lt;/td&gt;&lt;/tr&gt;"/>
            </list>
            <list key="xpath_queries">
              <parameter key="Title" value="h:html/h:head/h:title/text()"/>
              <parameter key="Price" value="//*[@id=&amp;quot;__next&quot;]/h:main/h:div[3]/h:div[1]/h:div[1]/h:div/h:div[3]/h:div[1]/h:span/text()"/>
              <parameter key="Bed" value="//*[@data-testid=&amp;quot;beds&quot;]/text()"/>
              <parameter key="DescriptionText" value="//*[@data-testid=&amp;quot;description&quot;]/text()"/>
            </list>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="true"/>
            <parameter key="assume_html" value="true"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
          </operator>
          <operator activated="true" class="web:extract_html_text_content" compatibility="9.7.002" expanded="true" height="68" name="Extract Content" width="90" x="581" y="34">
            <parameter key="extract_content" value="true"/>
            <parameter key="minimum_text_block_length" value="5"/>
            <parameter key="override_content_type_information" value="true"/>
            <parameter key="neglegt_span_tags" value="true"/>
            <parameter key="neglect_p_tags" value="true"/>
            <parameter key="neglect_b_tags" value="true"/>
            <parameter key="neglect_i_tags" value="true"/>
            <parameter key="neglect_br_tags" value="true"/>
            <parameter key="ignore_non_html_tags" value="true"/>
          </operator>
          <connect from_port="document" to_op="Extract Information (3)" to_port="document"/>
          <connect from_op="Extract Information (3)" from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="9.4.000" expanded="true" height="124" name="Process Documents" width="90" x="447" y="85">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <process expanded="true">
          <operator activated="true" class="text:extract_information" compatibility="8.2.000" expanded="true" height="68" name="Extract Information" width="90" x="179" y="34">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries">
              <parameter key="test2" value="Department:\.*.\.*Institute Code:"/>
            </list>
            <parameter key="attribute_type" value="Nominal"/>
            <list key="regular_expression_queries">
              <parameter key="test" value="\bDepartment:\s+\K\S+"/>
            </list>
            <list key="regular_region_queries">
              <parameter key="test" value="Department:&lt;/th&gt;&lt;td&gt;.&lt;/td&gt;&lt;/tr&gt;"/>
            </list>
            <list key="xpath_queries">
              <parameter key="Title" value="h:html/h:head/h:title/text()"/>
              <parameter key="Price" value="//*[@id=&amp;quot;__next&quot;]/h:main/h:div[3]/h:div[1]/h:div[1]/h:div/h:div[3]/h:div[1]/h:span/text()"/>
              <parameter key="Bed" value="//*[@data-testid=&amp;quot;beds&quot;]/text()"/>
            </list>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="true"/>
            <parameter key="assume_html" value="true"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Active path: both Get Page operators feed the two document inputs of
           Process Documents. -->
      <connect from_op="Get Page" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Get Page (2)" from_port="output" to_op="Process Documents" to_port="documents 2"/>
      <!-- Deactivated path: Read Excel, then Get Pages, then Process Documents from Data.
           No connection from Process Documents from Data to a result port is defined here. -->
      <connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
      <!-- The example set produced by Process Documents is the only process result. -->
      <connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Answers

  • Caperez
    Caperez Altair Community Member
    Hi @MarkusH23
    I tested your process with another regular expression because the URL is not included in your data.
    Just by changing the compatibility level of the Extract Information operator, the model ran faster and was more stable.

    Please try it.

    Best, 

    Cesar 
  • MarkusH23
    MarkusH23 New Altair Community Member
    Thanks @ceaperez

    It makes a small difference but it still takes minutes to display the results from two web pages. 

    The issue was with the document vector creation: not producing a document vector resolved it. If you need a document vector of the HTML content, adding a tokenizer also eliminates the long wait time and unresponsiveness. In RapidMiner, when no tokenizer is used, the entire document is treated as a single token, and RapidMiner seems to struggle to render this.

    Thanks again

    Markus