Hi,
I have followed all the instructions at
http://auburnbigdata.blogspot.com/2013/04/web-crawling-with-rapidminer.html, but my web crawler folder is empty. What am I doing wrong? The system times out after 42 s. Has anyone else had this problem after changing the crawling rules to .+auburnbigdata.+?
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process definition (process version 5.3.015) holding a single "Crawl Web" operator. -->
<process version="5.3.015">
  <!-- Process context: no inputs, outputs, or macros are declared. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <!-- Root operator that wraps the inner process. -->
  <operator activated="true"
            class="process"
            compatibility="5.3.015"
            expanded="true"
            name="Process">
    <process expanded="true">
      <!-- Web crawler operator (class "web:crawl_web"); height/width/x/y are layout attributes. -->
      <operator activated="true"
                class="web:crawl_web"
                compatibility="5.3.002"
                expanded="true"
                height="60"
                name="Crawl Web"
                width="90"
                x="447"
                y="75">
        <!-- URL the crawl starts from. -->
        <parameter key="url" value="http://auburnbigdata.blogspot.com"/>
        <!-- Both rules share one regex, which needs at least one character on each side of
             "auburnbigdata". NOTE(review): whether the regex must match the whole URL or only
             part of it is not visible here; confirm against the operator documentation. -->
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".+auburnbigdata.+"/>
          <parameter key="store_with_matching_url" value=".+auburnbigdata.+"/>
        </list>
        <!-- Output directory: an absolute Windows path under one user's Desktop, so it is
             machine-specific. NOTE(review): confirm that it exists and is writable. -->
        <parameter key="output_dir" value="C:\Users\cec045\Desktop\CrawlData"/>
        <!-- Crawl limits as configured by the author. -->
        <parameter key="max_depth" value="10"/>
        <parameter key="max_threads" value="2"/>
        <!-- User-agent string passed to the operator (a desktop Chrome 39 identifier). -->
        <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"/>
      </operator>
      <!-- Connects the crawler's "Example Set" port to the process port "result 1". -->
      <connect from_op="Crawl Web" from_port="Example Set" to_port="result 1"/>
      <!-- Port spacing entries (all zero). -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>