I'm trying to scrape a web page using the "Process Documents from Web" operator, and I have set an XPath query in a nested "Extract Information" operator. I tested the XPath query with the IMPORTXML function in Google Sheets and it seemed to work fine. However, when I run the process in RapidMiner, it does not return any results.
What could be the reason?
I would really appreciate it if anyone could help me.
My process XML:
<?xml version="1.0" encoding="UTF-8"?><process version="9.8.001">
  <!--
    RapidMiner process: "Process Documents from Web" fetches the configured
    URL and passes each retrieved document to a nested "Extract Information"
    operator, which evaluates one XPath query and stores the match in an
    attribute named "Title". The resulting example set is wired to the
    process output port "result 1".
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="9.8.001"
            expanded="true"
            name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true"
                class="web:process_web_modern"
                compatibility="9.3.001"
                expanded="true"
                height="68"
                name="Process Documents from Web"
                width="90"
                x="112"
                y="85">
        <parameter key="url" value="https://en.wikipedia.org/wiki/List_of_Running_Man_episodes_(2020)"/>
        <list key="crawling_rules"/>
        <parameter key="max_crawl_depth" value="2"/>
        <!--
          NOTE(review): retrieve_as_html is false. If the operator then hands
          plain text instead of markup to the inner process, no XPath query
          can match; try "true" if the result is still empty. Confirm against
          the Web Mining extension documentation.
        -->
        <parameter key="retrieve_as_html" value="false"/>
        <parameter key="enable_basic_auth" value="false"/>
        <parameter key="add_content_as_attribute" value="false"/>
        <parameter key="max_page_size" value="1000"/>
        <parameter key="delay" value="200"/>
        <parameter key="max_concurrent_connections" value="100"/>
        <parameter key="max_connections_per_host" value="100"/>
        <parameter key="user_agent" value="rapidminer-web-mining-extension-crawler"/>
        <parameter key="ignore_robot_exclusion" value="false"/>
        <process expanded="true">
          <!-- The breakpoint pauses execution after this operator so its output can be inspected. -->
          <operator activated="true"
                    breakpoints="after"
                    class="text:extract_information"
                    compatibility="9.3.001"
                    expanded="true"
                    height="68"
                    name="Extract Information"
                    width="90"
                    x="112"
                    y="34">
            <parameter key="query_type" value="XPath"/>
            <!-- Key spelling is kept exactly as exported; presumably it is the operator's real key name. -->
            <list key="string_machting_queries"/>
            <parameter key="attribute_type" value="Nominal"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <!--
                Two fixes compared with the original query:
                1. The closing quote of the id literal was a raw double quote,
                   which ended the XML attribute early and made the file not
                   well-formed. Both quotes are now written as &quot;.
                2. Every step carries the "h:" prefix. With assume_html set to
                   true, RapidMiner places HTML elements in a namespace bound
                   to the prefix h, so unprefixed element names select nothing
                   even when the same path works in a browser or spreadsheet.
                NOTE(review): "tbody" is often inserted by browsers and may be
                absent from the raw page; drop that step if nothing matches.
              -->
              <parameter key="Title" value="//h:*[@id=&quot;mw-content-text&quot;]/h:div[1]/h:table[2]/h:tbody/h:tr[2]/h:td[2]/h:i"/>
            </list>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="true"/>
            <parameter key="assume_html" value="true"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Documents from Web" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>