Hi everyone
I am new here! I have a problem with the Crawl Web operator which I'm not able to solve; I have tried and googled for weeks now. (It seems pretty simple, but I just don't get it.)
I want to crawl a news site (here: http://www.bbc.com/) for a keyword (here: .*zuckerberg.*) and save 100 results as .txt files.
But it just doesn't work; I have tried everything, but I can't seem to get it done.
I hope you can help me; please see my process XML below.
Thank you very much for your help in advance!
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process (version 8.2.000) with two top-level operators:
    1. "Crawl Web" starts at http://www.bbc.com/ and emits an example set.
    2. "Process Documents from Data" runs an inner chain on each document:
       Extract Content, Unescape HTML Document, Write Document, Write File.

  This copy has the browser tree-view "-" markers removed. Those markers were
  stray text in front of start tags and made the document not well-formed.
-->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">

      <!-- Step 1: crawl the site and collect pages as an attribute of the example set. -->
      <operator activated="true" class="web:crawl_web" compatibility="7.3.000" expanded="true"
                height="68" name="Crawl Web" width="90" x="112" y="34">
        <parameter key="url" value="http://www.bbc.com/"/>
        <list key="crawling_rules">
          <!-- Links are followed when their URL matches ".*tech.*" or ".*news.*". -->
          <parameter key="follow_link_with_matching_url" value=".*tech.*"/>
          <!-- NOTE(review): two store rules are listed (URL and content). Whether a page
               must satisfy both or only one of them is not visible here; confirm
               against the operator documentation. -->
          <parameter key="store_with_matching_url" value=".*zuckerberg.*"/>
          <parameter key="follow_link_with_matching_url" value=".*news.*"/>
          <!-- Embedded flags added: (?i) ignores case so "Zuckerberg" matches, and (?s)
               lets "." span line breaks. NOTE(review): presumably the regex is applied
               to the full, multi-line page text; confirm against the operator docs. -->
          <parameter key="store_with_matching_content" value="(?is).*zuckerberg.*"/>
        </list>
        <!-- Pages are not written to disk by the crawler itself; they are added as an
             attribute instead (write_pages_into_files=false, add_pages_as_attribute=true). -->
        <parameter key="write_pages_into_files" value="false"/>
        <parameter key="add_pages_as_attribute" value="true"/>
        <parameter key="extension" value="txt"/>
        <parameter key="max_pages" value="100"/>
        <parameter key="max_depth" value="4"/>
        <parameter key="domain" value="web"/>
        <parameter key="delay" value="1000"/>
        <parameter key="max_threads" value="2"/>
        <!-- NOTE(review): the unit of max_page_size is not stated here; verify that
             10000 is large enough for the pages being crawled. -->
        <parameter key="max_page_size" value="10000"/>
        <parameter key="user_agent" value="rapid-miner-crawler"/>
        <parameter key="obey_robot_exclusion" value="true"/>
        <parameter key="really_ignore_exclusion" value="false"/>
      </operator>

      <!-- Step 2: run the inner document chain on the crawl result. No word vector is
           created (create_word_vector=false); the text is kept (keep_text=true). -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000"
                expanded="true" height="82" name="Process Documents from Data" width="90"
                x="313" y="34">
        <parameter key="create_word_vector" value="false"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true">

          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000"
                    expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
            <parameter key="extract_content" value="true"/>
            <parameter key="minimum_text_block_length" value="5"/>
            <parameter key="override_content_type_information" value="true"/>
            <!-- The key spelling "neglegt_span_tags" is reproduced exactly as in the
                 original export; do not "correct" it without confirming the key the
                 operator expects. -->
            <parameter key="neglegt_span_tags" value="true"/>
            <parameter key="neglect_p_tags" value="true"/>
            <parameter key="neglect_b_tags" value="true"/>
            <parameter key="neglect_i_tags" value="true"/>
            <parameter key="neglect_br_tags" value="true"/>
            <parameter key="ignore_non_html_tags" value="true"/>
          </operator>

          <operator activated="true" class="web:unescape_html" compatibility="7.3.000"
                    expanded="true" height="68" name="Unescape HTML Document" width="90"
                    x="179" y="34"/>

          <operator activated="true" class="text:write_document" compatibility="8.1.000"
                    expanded="true" height="82" name="Write Document" width="90" x="313" y="34">
            <parameter key="overwrite" value="true"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>

          <!-- The target file name is built from the %{t} and %{a} macros. -->
          <operator activated="true" class="write_file" compatibility="8.2.000" expanded="true"
                    height="68" name="Write File" width="90" x="447" y="136">
            <parameter key="resource_type" value="file"/>
            <parameter key="filename" value="C:\Users\Ittaj\Desktop\rapidminer\tests\%{t}-%{a}.txt"/>
            <parameter key="mime_type" value="application/octet-stream"/>
          </operator>

          <!-- Inner wiring: source document, Extract Content, Unescape HTML Document,
               Write Document. Write Document then feeds both the inner sink
               ("document 1") and the "file" port of Write File. -->
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Unescape HTML Document" to_port="document"/>
          <connect from_op="Unescape HTML Document" from_port="document" to_op="Write Document" to_port="document"/>
          <connect from_op="Write Document" from_port="document" to_port="document 1"/>
          <connect from_op="Write Document" from_port="file" to_op="Write File" to_port="file"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Outer wiring: the crawler's example set feeds the document processor, whose
           example set becomes process result 1. -->
      <connect from_op="Crawl Web" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>