Hi
I am attempting to apply machine learning to determine the colours that individuals prefer. As part of this I need to download brand images. I am attempting to store a library of images; here is one of them as an example:
https://www.vodafone.co.uk/cs/groups/public/documents/webcontent/1287x929_vodafone_logo.jpg
I am using the web crawling extension and trying to save the image on the page:
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.1.000">
  <!--
    Generate Data by User Specification: defines one attribute named "image"
    whose value is an expression holding the URL of the picture to download.
    The string quotes inside the expression are written as &quot; because a
    raw double quote would terminate the XML attribute value early and make
    the snippet impossible to parse.
  -->
  <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="112" y="85">
    <list key="attribute_values">
      <parameter key="image" value="(&quot;https://www.vodafone.co.uk/cs/groups/public/documents/webcontent/1287x929_vodafone_logo.jpg&quot;)"/>
    </list>
    <list key="set_additional_roles"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.1.000">
  <!--
    A single Multiply operator. This snippet declares no connections, so how
    the operator is wired to its neighbours is not visible here.
  -->
  <operator name="Multiply"
            class="multiply"
            activated="true"
            compatibility="8.1.000"
            expanded="true"
            height="103"
            width="90"
            x="246"
            y="85"/>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.1.000">
  <!--
    Extract Macro: copies a single data value into the macro "image".
    The value is read from the attribute "image" at example index 1.
  -->
  <operator name="Extract Macro"
            class="extract_macro"
            activated="true"
            compatibility="8.1.000"
            expanded="true"
            height="68"
            width="90"
            x="380"
            y="85">
    <!-- Name of the macro that receives the value. -->
    <parameter key="macro" value="image"/>
    <parameter key="macro_type" value="data_value"/>
    <!-- NOTE(review): presumably unused while macro_type is data_value; confirm. -->
    <parameter key="statistics" value="average"/>
    <!-- Source attribute and the row the value is taken from. -->
    <parameter key="attribute_name" value="image"/>
    <parameter key="example_index" value="1"/>
    <list key="additional_macros"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.1.000">
  <!--
    Loop Examples: runs the nested process for the incoming example set and
    exposes the current iteration through the macro "example". Inside the
    loop the URL of the current row is read into the macro "image" and then
    handed to Crawl Web, which writes the downloaded content to output_dir.
  -->
  <operator activated="true" class="loop_examples" compatibility="8.1.000" expanded="true" height="103" name="Loop Examples" width="90" x="380" y="238">
    <parameter key="iteration_macro" value="example"/>
    <process expanded="true">
      <!--
        Refresh the "image" macro on every iteration. Without this step the
        macro keeps whatever value it had before the loop started, so every
        iteration would request the same URL.
        NOTE(review): assumes the iteration macro holds the 1-based row index
        expected by example_index; confirm in the Loop Examples documentation.
      -->
      <operator activated="true" class="extract_macro" compatibility="8.1.000" expanded="true" height="68" name="Extract Image URL" width="90" x="112" y="34">
        <parameter key="macro" value="image"/>
        <parameter key="macro_type" value="data_value"/>
        <parameter key="statistics" value="average"/>
        <parameter key="attribute_name" value="image"/>
        <parameter key="example_index" value="%{example}"/>
        <list key="additional_macros"/>
      </operator>
      <!--
        Crawl Web has no incoming connection, so it is listed after
        "Extract Image URL" to keep it later in the execution order.
        NOTE(review): verify the execution order in the Studio process view.
      -->
      <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="313" y="289">
        <parameter key="url" value="%{image}"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value=".*"/>
        </list>
        <parameter key="max_crawl_depth" value="1"/>
        <!-- NOTE(review): the target is an image, not an HTML page; confirm this flag is appropriate. -->
        <parameter key="retrieve_as_html" value="true"/>
        <parameter key="enable_basic_auth" value="false"/>
        <parameter key="add_content_as_attribute" value="true"/>
        <parameter key="write_pages_to_disk" value="true"/>
        <!-- Image files are binary content, so binary content has to be included.
             NOTE(review): confirm the exact semantics in the Web Mining extension docs. -->
        <parameter key="include_binary_content" value="true"/>
        <parameter key="output_dir" value="/Users/robinmeisel/Desktop/images"/>
        <!-- Plain extension only. The previous value embedded the whole URL
             (with ":" and "/" characters), which cannot form a valid file name.
             "jpg" matches the sample URL; adjust if other image types are fetched.
             NOTE(review): if written files are numbered per crawl, successive
             iterations may overwrite each other; confirm the file naming. -->
        <parameter key="output_file_extension" value="jpg"/>
        <parameter key="max_pages" value="1"/>
        <parameter key="max_page_size" value="1000"/>
        <parameter key="delay" value="200"/>
        <parameter key="max_concurrent_connections" value="100"/>
        <parameter key="max_connections_per_host" value="50"/>
        <parameter key="user_agent" value="rapidminer-web-mining-extension-crawler"/>
        <parameter key="ignore_robot_exclusion" value="true"/>
      </operator>
      <connect from_port="example set" to_op="Extract Image URL" to_port="example set"/>
      <connect from_op="Extract Image URL" from_port="example set" to_port="example set"/>
      <connect from_op="Crawl Web" from_port="example set" to_port="output 1"/>
      <portSpacing port="source_example set" spacing="0"/>
      <portSpacing port="sink_example set" spacing="0"/>
      <portSpacing port="sink_output 1" spacing="0"/>
      <portSpacing port="sink_output 2" spacing="0"/>
    </process>
  </operator>
</process>
It worked once and then never again. In the process above there is only one image in the Generate Data operator; this would normally reference a database of over a thousand images to download.
What is the best approach to download images from a web page and store them in a local folder before processing them through OCR?
Kind regards
Robin