When running the process below (with web mining and text mining extensions loaded) RapidMiner stalls when trying to display the results. It eventually shows the results but something seems to be running in the background and it makes RapidMiner very sluggish.
I've been using this for years. Also tried version 10 and I'm experiencing the same issue.
Note: I wasn't allowed to post links which were in the XML code. To replicate just add 2 random links to the Get Page operator.
<?xml version="1.0" encoding="UTF-8"?><process version="9.10.013">
<!-- Process context: the input, output and macros sections are all empty. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<!-- Parameters of the root "Process" operator: log verbosity "init", random seed 2001,
     mail notification set to "never" (no notification address given), encoding SYSTEM. -->
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<!-- Get Page: web:get_webpage operator configured for a GET request with a random user
     agent, redirects followed and cookies not accepted. The url value is empty in this
     copy of the process; supply a URL before running. -->
<operator activated="true"
          class="web:get_webpage"
          compatibility="9.7.002"
          expanded="true"
          height="68"
          name="Get Page"
          width="90"
          x="45"
          y="85">
  <parameter key="url" value=""/>
  <parameter key="random_user_agent" value="true"/>
  <parameter key="connection_timeout" value="10000"/>
  <parameter key="read_timeout" value="10000"/>
  <parameter key="follow_redirects" value="true"/>
  <parameter key="accept_cookies" value="none"/>
  <parameter key="cookie_scope" value="global"/>
  <parameter key="request_method" value="GET"/>
  <list key="query_parameters"/>
  <list key="request_properties"/>
  <parameter key="override_encoding" value="false"/>
  <parameter key="encoding" value="SYSTEM"/>
  <parameter key="keep_sensitive_headers" value="false"/>
</operator>
<!-- Get Page (2): second web:get_webpage operator with the same parameter values as
     "Get Page"; only its name and canvas position differ. The url value is empty in this
     copy of the process; supply a URL before running. -->
<operator activated="true"
          class="web:get_webpage"
          compatibility="9.7.002"
          expanded="true"
          height="68"
          name="Get Page (2)"
          width="90"
          x="45"
          y="187">
  <parameter key="url" value=""/>
  <parameter key="random_user_agent" value="true"/>
  <parameter key="connection_timeout" value="10000"/>
  <parameter key="read_timeout" value="10000"/>
  <parameter key="follow_redirects" value="true"/>
  <parameter key="accept_cookies" value="none"/>
  <parameter key="cookie_scope" value="global"/>
  <parameter key="request_method" value="GET"/>
  <list key="query_parameters"/>
  <list key="request_properties"/>
  <parameter key="override_encoding" value="false"/>
  <parameter key="encoding" value="SYSTEM"/>
  <parameter key="keep_sensitive_headers" value="false"/>
</operator>
<!-- Read Excel (deactivated, breakpoint set to "after"): imports cell range A1:A80 from
     sheet number 1 of a local .xls file; column 0 is declared with the meta data entry
     URLS.true.file_path.attribute.
     NOTE(review): excel_file is an absolute path inside one user's home directory, so it
     will presumably need adjusting on another machine if this operator is re-enabled. -->
<operator activated="false"
          breakpoints="after"
          class="read_excel"
          compatibility="6.0.003"
          expanded="true"
          height="68"
          name="Read Excel"
          width="90"
          x="112"
          y="289">
  <parameter key="excel_file" value="C:/Users/hofma/Dropbox/ITB/2022 - 2023/Sem 1/Text Mining/Module Content/Session 6/S6 RapidMiner Files/daft_urls.xls"/>
  <parameter key="sheet_selection" value="sheet number"/>
  <parameter key="sheet_number" value="1"/>
  <parameter key="imported_cell_range" value="A1:A80"/>
  <parameter key="encoding" value="SYSTEM"/>
  <parameter key="first_row_as_names" value="false"/>
  <list key="annotations">
    <parameter key="0" value="Name"/>
  </list>
  <parameter key="date_format" value=""/>
  <parameter key="time_zone" value="SYSTEM"/>
  <parameter key="locale" value="English (United States)"/>
  <parameter key="read_all_values_as_polynominal" value="false"/>
  <list key="data_set_meta_data_information">
    <parameter key="0" value="URLS.true.file_path.attribute"/>
  </list>
  <parameter key="read_not_matching_values_as_missings" value="true"/>
</operator>
<!-- Get Pages (deactivated): web:retrieve_webpages operator that takes its links from the
     attribute named URLS, sends GET requests with the fixed user agent "RapidMiner", and
     has delay set to "none". -->
<operator activated="false"
          class="web:retrieve_webpages"
          compatibility="9.7.002"
          expanded="true"
          height="68"
          name="Get Pages"
          width="90"
          x="380"
          y="289">
  <parameter key="link_attribute" value="URLS"/>
  <parameter key="random_user_agent" value="false"/>
  <parameter key="user_agent" value="RapidMiner"/>
  <parameter key="connection_timeout" value="10000"/>
  <parameter key="read_timeout" value="10000"/>
  <parameter key="follow_redirects" value="true"/>
  <parameter key="accept_cookies" value="none"/>
  <parameter key="cookie_scope" value="global"/>
  <parameter key="request_method" value="GET"/>
  <parameter key="delay" value="none"/>
  <parameter key="delay_amount" value="1000"/>
  <parameter key="min_delay_amount" value="0"/>
  <parameter key="max_delay_amount" value="1000"/>
</operator>
<!-- Process Documents from Data (deactivated): creates a TF-IDF word vector with
     prune_method "none" and keep_text "false". Its inner process passes each document
     through "Extract Information (3)" and then "Extract Content" before handing it to the
     "document 1" sink. -->
<operator activated="false"
          class="text:process_document_from_data"
          compatibility="8.1.000"
          expanded="true"
          height="82"
          name="Process Documents from Data"
          width="90"
          x="581"
          y="289">
  <parameter key="create_word_vector" value="true"/>
  <parameter key="vector_creation" value="TF-IDF"/>
  <parameter key="add_meta_information" value="true"/>
  <parameter key="keep_text" value="false"/>
  <parameter key="prune_method" value="none"/>
  <parameter key="prune_below_percent" value="3.0"/>
  <parameter key="prune_above_percent" value="30.0"/>
  <parameter key="prune_below_rank" value="0.05"/>
  <parameter key="prune_above_rank" value="0.95"/>
  <parameter key="datamanagement" value="double_sparse_array"/>
  <parameter key="data_management" value="auto"/>
  <parameter key="select_attributes_and_weights" value="false"/>
  <list key="specify_weights"/>
  <process expanded="true">
    <!-- Extract Information (3): query_type is XPath, so the xpath_queries list holds the
         queries selected by that setting (Title, Price, DescriptionText). -->
    <operator activated="true"
              class="text:extract_information"
              compatibility="8.2.000"
              expanded="true"
              height="68"
              name="Extract Information (3)"
              width="90"
              x="313"
              y="34">
      <parameter key="query_type" value="XPath"/>
      <!-- NOTE(review): the key spelling "string_machting_queries" is kept verbatim;
           presumably it is the key the extension expects. Confirm before correcting. -->
      <list key="string_machting_queries">
        <parameter key="test2" value="Department:\.*.\.*Institute Code:"/>
      </list>
      <parameter key="attribute_type" value="Nominal"/>
      <list key="regular_expression_queries">
        <parameter key="test" value="\bDepartment:\s+\K\S+"/>
      </list>
      <!-- The HTML tags in this value are written with entities because a literal
           less-than sign is not allowed inside an XML attribute value. -->
      <list key="regular_region_queries">
        <parameter key="test" value="Department:&lt;/th&gt;&lt;td&gt;.&lt;/td&gt;&lt;/tr&gt;"/>
      </list>
      <!-- Both quotes around each XPath string literal are written as entities; a raw
           double quote would end the attribute value early. Each expression is kept on a
           single line. -->
      <list key="xpath_queries">
        <parameter key="Title" value="h:html/h:head/h:title/text()"/>
        <parameter key="Price" value="//*[@id=&quot;__next&quot;]/h:main/h:div[3]/h:div[1]/h:div[1]/h:div/h:div[3]/h:div[1]/h:span/text()"/>
        <parameter key="DescriptionText" value="//*[@data-testid=&quot;description&quot;]/text()"/>
      </list>
      <list key="namespaces"/>
      <parameter key="ignore_CDATA" value="true"/>
      <parameter key="assume_html" value="true"/>
      <list key="index_queries"/>
      <list key="jsonpath_queries"/>
    </operator>
    <!-- Extract Content: web:extract_html_text_content with a minimum text block length
         of 5 and all "neglect" tag options enabled.
         NOTE(review): the key spelling "neglegt_span_tags" is kept verbatim; presumably
         it is the key the extension expects. Confirm before correcting. -->
    <operator activated="true"
              class="web:extract_html_text_content"
              compatibility="9.7.002"
              expanded="true"
              height="68"
              name="Extract Content"
              width="90"
              x="581"
              y="34">
      <parameter key="extract_content" value="true"/>
      <parameter key="minimum_text_block_length" value="5"/>
      <parameter key="override_content_type_information" value="true"/>
      <parameter key="neglegt_span_tags" value="true"/>
      <parameter key="neglect_p_tags" value="true"/>
      <parameter key="neglect_b_tags" value="true"/>
      <parameter key="neglect_i_tags" value="true"/>
      <parameter key="neglect_br_tags" value="true"/>
      <parameter key="ignore_non_html_tags" value="true"/>
    </operator>
    <connect from_port="document" to_op="Extract Information (3)" to_port="document"/>
    <connect from_op="Extract Information (3)" from_port="document" to_op="Extract Content" to_port="document"/>
    <connect from_op="Extract Content" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Process Documents (active): creates a TF-IDF word vector with prune_method "none"
     and keep_text "false". Its inner process passes each document through
     "Extract Information" and hands it to the "document 1" sink. -->
<operator activated="true"
          class="text:process_documents"
          compatibility="9.4.000"
          expanded="true"
          height="124"
          name="Process Documents"
          width="90"
          x="447"
          y="85">
  <parameter key="create_word_vector" value="true"/>
  <parameter key="vector_creation" value="TF-IDF"/>
  <parameter key="add_meta_information" value="true"/>
  <parameter key="keep_text" value="false"/>
  <parameter key="prune_method" value="none"/>
  <parameter key="prune_below_percent" value="3.0"/>
  <parameter key="prune_above_percent" value="30.0"/>
  <parameter key="prune_below_rank" value="0.05"/>
  <parameter key="prune_above_rank" value="0.95"/>
  <parameter key="datamanagement" value="double_sparse_array"/>
  <parameter key="data_management" value="auto"/>
  <process expanded="true">
    <!-- Extract Information: query_type is XPath, so the xpath_queries list holds the
         queries selected by that setting (Title, Price). -->
    <operator activated="true"
              class="text:extract_information"
              compatibility="8.2.000"
              expanded="true"
              height="68"
              name="Extract Information"
              width="90"
              x="179"
              y="34">
      <parameter key="query_type" value="XPath"/>
      <!-- NOTE(review): the key spelling "string_machting_queries" is kept verbatim;
           presumably it is the key the extension expects. Confirm before correcting. -->
      <list key="string_machting_queries">
        <parameter key="test2" value="Department:\.*.\.*Institute Code:"/>
      </list>
      <parameter key="attribute_type" value="Nominal"/>
      <list key="regular_expression_queries">
        <parameter key="test" value="\bDepartment:\s+\K\S+"/>
      </list>
      <!-- The HTML tags in this value are written with entities because a literal
           less-than sign is not allowed inside an XML attribute value. -->
      <list key="regular_region_queries">
        <parameter key="test" value="Department:&lt;/th&gt;&lt;td&gt;.&lt;/td&gt;&lt;/tr&gt;"/>
      </list>
      <!-- Both quotes around the XPath string literal are written as entities; a raw
           double quote would end the attribute value early. The expression is kept on a
           single line. -->
      <list key="xpath_queries">
        <parameter key="Title" value="h:html/h:head/h:title/text()"/>
        <parameter key="Price" value="//*[@id=&quot;__next&quot;]/h:main/h:div[3]/h:div[1]/h:div[1]/h:div/h:div[3]/h:div[1]/h:span/text()"/>
      </list>
      <list key="namespaces"/>
      <parameter key="ignore_CDATA" value="true"/>
      <parameter key="assume_html" value="true"/>
      <list key="index_queries"/>
      <list key="jsonpath_queries"/>
    </operator>
    <connect from_port="document" to_op="Extract Information" to_port="document"/>
    <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Top-level wiring.
     Active path: "Get Page" and "Get Page (2)" feed ports "documents 1" and "documents 2"
     of "Process Documents", whose "example set" output goes to "result 1".
     Second chain: "Read Excel" to "Get Pages" to "Process Documents from Data"; no
     connection from that last operator to a result port is defined here. -->
<connect from_op="Get Page"
         from_port="output"
         to_op="Process Documents"
         to_port="documents 1"/>
<connect from_op="Get Page (2)"
         from_port="output"
         to_op="Process Documents"
         to_port="documents 2"/>
<connect from_op="Read Excel"
         from_port="output"
         to_op="Get Pages"
         to_port="Example Set"/>
<connect from_op="Get Pages"
         from_port="Example Set"
         to_op="Process Documents from Data"
         to_port="example set"/>
<connect from_op="Process Documents"
         from_port="example set"
         to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>