Hi, I am attempting to apply machine learning to determine the colours that individuals prefer. As part of this I need to download brand images. I am attempting to store a library of images, one of them as an example: https://www.vodafone.co.uk/cs/groups/public/documents/webcontent/1287x929_vodafone_logo.jpg I am using the web crawling extension and trying to save the image on the page: It worked once and then never again. In terms of the above, there is only one image in the Generate Data operator; this would normally reference a database of over a thousand images to download. What is the best approach to get images down from a web page and then store them in a local folder before processing them through OCR? Kind regards Robin

Altair RISE

A program to recognize and reward our most engaged community members

Nominate Yourself Now!

Downloading images for OCR

I am attempting to apply machine learning to determine the colours that individuals prefer. As part of this I need to download brand images. I am attempting to store a library of images, one of them as an example:

https://www.vodafone.co.uk/cs/groups/public/documents/webcontent/1287x929_vodafone_logo.jpg

I am using the web crawling extension and trying to save the image on the page:

<?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
  <!-- Pasted canvas fragment: a single Generate Data by User Specification operator. -->
  <!-- It declares one attribute value, "image", whose expression is the quoted logo URL. -->
  <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="112" y="85">
    <list key="attribute_values">
      <parameter key="image" value="(&quot;https://www.vodafone.co.uk/cs/groups/public/documents/webcontent/1287x929_vodafone_logo.jpg&quot;)"/>
    </list>
    <list key="set_additional_roles"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
  <!-- Pasted canvas fragment: a Multiply operator with no parameters; its connections are not part of this fragment. -->
  <operator activated="true" class="multiply" compatibility="8.1.000" expanded="true" height="103" name="Multiply" width="90" x="246" y="85"/>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
  <!-- Pasted canvas fragment: Extract Macro reads attribute "image" at example_index 1 into macro "image" (macro_type data_value). -->
  <!-- NOTE(review): example_index is the constant 1, so with many rows only row 1 would be extracted; confirm that is intended. -->
  <operator activated="true" class="extract_macro" compatibility="8.1.000" expanded="true" height="68" name="Extract Macro" width="90" x="380" y="85">
    <parameter key="macro" value="image"/>
    <parameter key="macro_type" value="data_value"/>
    <parameter key="statistics" value="average"/>
    <parameter key="attribute_name" value="image"/>
    <parameter key="example_index" value="1"/>
    <list key="additional_macros"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
  <!-- Pasted canvas fragment: Loop Examples (iteration macro "example") wrapping one Crawl Web operator. -->
  <!-- NOTE(review): %{example} is not referenced anywhere inside this loop; the inner operator only uses %{image}. -->
  <operator activated="true" class="loop_examples" compatibility="8.1.000" expanded="true" height="103" name="Loop Examples" width="90" x="380" y="238">
    <parameter key="iteration_macro" value="example"/>
    <process expanded="true">
      <!-- Crawls %{image} with write_pages_to_disk set to true, writing into output_dir. -->
      <!-- NOTE(review): output_file_extension expands to the whole URL plus ".png", and include_binary_content is
           false although the target is an image; both look unintended, confirm against the Crawl Web documentation. -->
      <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="313" y="289">
        <parameter key="url" value="%{image}"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value=".*"/>
        </list>
        <parameter key="max_crawl_depth" value="1"/>
        <parameter key="retrieve_as_html" value="true"/>
        <parameter key="enable_basic_auth" value="false"/>
        <parameter key="add_content_as_attribute" value="true"/>
        <parameter key="write_pages_to_disk" value="true"/>
        <parameter key="include_binary_content" value="false"/>
        <parameter key="output_dir" value="/Users/robinmeisel/Desktop/images"/>
        <parameter key="output_file_extension" value="%{image}.png"/>
        <parameter key="max_pages" value="1"/>
        <parameter key="max_page_size" value="1000"/>
        <parameter key="delay" value="200"/>
        <parameter key="max_concurrent_connections" value="100"/>
        <parameter key="max_connections_per_host" value="50"/>
        <parameter key="user_agent" value="rapidminer-web-mining-extension-crawler"/>
        <parameter key="ignore_robot_exclusion" value="true"/>
      </operator>
      <!-- Only the crawl result is wired out of the loop body; no connection feeds the loop's example set into Crawl Web. -->
      <connect from_op="Crawl Web" from_port="example set" to_port="output 1"/>
      <portSpacing port="source_example set" spacing="0"/>
      <portSpacing port="sink_example set" spacing="0"/>
      <portSpacing port="sink_output 1" spacing="0"/>
      <portSpacing port="sink_output 2" spacing="0"/>
    </process>
  </operator>
</process>

It worked once and then never again. In terms of the above, there is only one image in the Generate Data operator; this would normally reference a database of over a thousand images to download.

What is the best approach to get images down from a web page and then store them in a local folder before processing them through OCR?

Kind regards

Robin

Find more posts tagged with

AI Studio

Accepted answers

sgenzer

Hi @robin ok thanks for that. I now understand. This is rather "quick and dirty" but I hope you may get the idea of how I would approach it. There may be a more clever way.... ?

<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
  <!-- Forum answer process: crawl a site, fetch each crawled page, isolate .png URLs found in the page text,
       and save each image with Open File + Write File. Several operators below have no port connections,
       so their position in this document matters; do not reorder operators without re-testing. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Starts at community.rapidminer.com and follows links matching ".*"; max_pages is 5. -->
      <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
        <parameter key="url" value="https://community.rapidminer.com/"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".*"/>
        </list>
        <parameter key="retrieve_as_html" value="true"/>
        <parameter key="output_dir" value="/Users/GenzerConsulting"/>
        <parameter key="max_pages" value="5"/>
        <parameter key="max_page_size" value="1000000000"/>
      </operator>
      <!-- Iterates over the example set produced by Crawl Web (see the connection near the end of this process). -->
      <operator activated="true" class="loop_examples" compatibility="8.2.000" expanded="true" height="103" name="Loop Examples" width="90" x="179" y="34">
        <process expanded="true">
          <!-- Stores the value of attribute "Link" at row %{example} in macro "URL".
               NOTE(review): this loop does not set iteration_macro; presumably "example" is the operator default, confirm. -->
          <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (2)" width="90" x="45" y="34">
            <parameter key="macro" value="URL"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="Link"/>
            <parameter key="example_index" value="%{example}"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Requests %{URL}. No connect element targets this operator; it only depends on the macro set above. -->
          <operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="179" y="34">
            <parameter key="url" value="%{URL}"/>
            <parameter key="accept_cookies" value="all"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
          </operator>
          <!-- Takes the fetched document and emits an example set; text_attribute is "text". -->
          <operator activated="true" class="text:documents_to_data" compatibility="7.5.000" expanded="true" height="82" name="Documents to Data" width="90" x="313" y="34">
            <parameter key="text_attribute" value="text"/>
            <parameter key="add_meta_information" value="false"/>
          </operator>
          <!-- split_pattern is the entity-escaped regex matching either angle bracket. -->
          <operator activated="true" class="split" compatibility="8.2.000" expanded="true" height="82" name="Split" width="90" x="447" y="34">
            <parameter key="split_pattern" value="[&lt;]|[&gt;]"/>
          </operator>
          <!-- Transposes the split result; the operators after this one all work on attribute att_1. -->
          <operator activated="true" class="transpose" compatibility="8.2.000" expanded="true" height="82" name="Transpose" width="90" x="581" y="34"/>
          <!-- Two filter entries on att_1: contains ".png" (the dot is backslash-escaped in the value) and contains "https".
               NOTE(review): how the two entries are combined (all vs. any) is not set here; confirm the default. -->
          <operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples" width="90" x="715" y="34">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="att_1.contains.\.png"/>
              <parameter key="filters_entry_key" value="att_1.contains.https"/>
            </list>
          </operator>
          <!-- Reassigns att_1 to suffix(att_1, length(att_1) - index(att_1, "http"));
               presumably this drops everything before the first "http". -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes" width="90" x="849" y="34">
            <list key="function_descriptions">
              <parameter key="att_1" value="suffix(att_1,length(att_1)-index(att_1,&quot;http&quot;))"/>
            </list>
          </operator>
          <!-- The next seven operators all have activated="false"; they look like leftovers
               from earlier attempts (reviewer's reading) and are not on the active path. -->
          <operator activated="false" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="391">
            <parameter key="mode" value="regular expression"/>
            <parameter key="expression" value="[&lt;]|[&gt;]"/>
          </operator>
          <operator activated="false" class="text:filter_tokens_by_content" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="179" y="391">
            <parameter key="string" value=".png"/>
          </operator>
          <operator activated="false" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="391">
            <parameter key="max_chars" value="250"/>
          </operator>
          <operator activated="false" breakpoints="after" class="text:write_document" compatibility="7.5.000" expanded="true" height="82" name="Write Document" width="90" x="447" y="391">
            <parameter key="file" value="/Users/GenzerConsulting/foo.txt"/>
          </operator>
          <operator activated="false" class="text:keep_document_parts" compatibility="7.5.000" expanded="true" height="68" name="Keep Document Parts" width="90" x="112" y="187">
            <parameter key="extraction_regex" value="http.*[.]png.*\s"/>
          </operator>
          <operator activated="false" breakpoints="after" class="web:unescape_html" compatibility="7.3.000" expanded="true" height="68" name="Unescape HTML Document" width="90" x="246" y="391"/>
          <operator activated="false" class="web:unescape_html_attribute" compatibility="7.3.000" expanded="true" height="82" name="Unescape HTML" width="90" x="514" y="187"/>
          <!-- Reassigns att_1 to prefix(att_1, 4 + index(att_1, ".png"));
               presumably this cuts the value right after ".png". -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="983" y="34">
            <list key="function_descriptions">
              <parameter key="att_1" value="prefix(att_1,4+index(att_1,&quot;.png&quot;))"/>
            </list>
          </operator>
          <!-- Inner loop over the rows that survived the filter; its iteration macro is "example2". -->
          <operator activated="true" class="loop_examples" compatibility="8.2.000" expanded="true" height="82" name="Loop Examples (2)" width="90" x="1117" y="34">
            <parameter key="iteration_macro" value="example2"/>
            <process expanded="true">
              <!-- Stores att_1 at row %{example2} in macro "imageURL". -->
              <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (3)" width="90" x="45" y="34">
                <parameter key="macro" value="imageURL"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="att_1"/>
                <parameter key="example_index" value="%{example2}"/>
                <list key="additional_macros"/>
              </operator>
              <!-- Wraps the download. The second subprocess below contains only port spacing, so no operators run there.
                   NOTE(review): presumably that is the on-error branch, meaning failed downloads are skipped silently; confirm. -->
              <operator activated="true" class="handle_exception" compatibility="8.2.000" expanded="true" height="82" name="Handle Exception" width="90" x="179" y="34">
                <process expanded="true">
                  <!-- imageName = suffix(%{imageURL}, 15) with "/" removed, i.e. presumably the last 15 characters of the URL.
                       NOTE(review): different URLs sharing the same trailing characters would map to the same file name. -->
                  <operator activated="true" class="generate_macro" compatibility="8.2.000" expanded="true" height="68" name="Generate Macro" width="90" x="112" y="34">
                    <list key="function_descriptions">
                      <parameter key="imageName" value="replace(suffix(%{imageURL},15),&quot;/&quot;,&quot;&quot;)"/>
                    </list>
                  </operator>
                  <!-- Opens %{imageURL} as a URL resource. -->
                  <operator activated="true" class="open_file" compatibility="8.2.000" expanded="true" height="68" name="Open File" width="90" x="246" y="34">
                    <parameter key="resource_type" value="URL"/>
                    <parameter key="url" value="%{imageURL}"/>
                  </operator>
                  <!-- Writes the opened file to /Users/GenzerConsulting/%{imageName}; the target folder is hard-coded. -->
                  <operator activated="true" class="write_file" compatibility="8.2.000" expanded="true" height="68" name="Write File" width="90" x="380" y="34">
                    <parameter key="filename" value="/Users/GenzerConsulting/%{imageName}"/>
                  </operator>
                  <connect from_op="Open File" from_port="file" to_op="Write File" to_port="file"/>
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                </process>
                <process expanded="true">
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="example set" to_op="Extract Macro (3)" to_port="example set"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
            </process>
          </operator>
          <!-- Wiring of the active path, followed by the wiring of the deactivated token operators. -->
          <connect from_port="example set" to_op="Extract Macro (2)" to_port="example set"/>
          <connect from_op="Get Page" from_port="output" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Split" to_port="example set input"/>
          <connect from_op="Split" from_port="example set output" to_op="Transpose" to_port="example set input"/>
          <connect from_op="Transpose" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Write Document" to_port="document"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Loop Examples (2)" to_port="example set"/>
          <connect from_op="Loop Examples (2)" from_port="example set" to_port="output 1"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Crawl Web" from_port="example set" to_op="Loop Examples" to_port="example set"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process>

Scott

All comments

Maerkli

Hallo Robin,

I tried to open your XML file inside RM Studio 8.2.000, but it does not populate the process window.

Maerkli

sgenzer

@robin yes I think (?) you did "select all" and then copy/pasted directly from the design canvas. This unfortunately produces broken XML. Can you please either attach your .rmp file or go to the XML panel and then copy/paste from there?

Scott

robin

Tried pasting inside of Chrome.


<?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
  <!-- Re-pasted fragment: a single Generate Data by User Specification operator. -->
  <!-- It declares one attribute value, "data_value", whose expression is the quoted logo URL. -->
  <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="112" y="85">
    <list key="attribute_values">
      <parameter key="data_value" value="(&quot;https://www.vodafone.co.uk/cs/groups/public/documents/webcontent/1287x929_vodafone_logo.jpg&quot;)"/>
    </list>
    <list key="set_additional_roles"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
  <!-- Re-pasted fragment: Extract Macro reads attribute "data_value" at example_index 1 into macro "image" (macro_type data_value). -->
  <!-- NOTE(review): example_index is the constant 1, so with many rows only row 1 would be extracted; confirm that is intended. -->
  <operator activated="true" class="extract_macro" compatibility="8.1.000" expanded="true" height="68" name="Extract Macro" width="90" x="313" y="85">
    <parameter key="macro" value="image"/>
    <parameter key="macro_type" value="data_value"/>
    <parameter key="statistics" value="average"/>
    <parameter key="attribute_name" value="data_value"/>
    <parameter key="example_index" value="1"/>
    <list key="additional_macros"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
  <!-- Re-pasted fragment: Crawl Web pointed at %{image}, with write_pages_to_disk set to true and a fixed output_dir. -->
  <!-- NOTE(review): output_file_extension expands to the whole URL plus ".png", and include_binary_content is
       false although the target is an image; both look unintended, confirm against the Crawl Web documentation. -->
  <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="313" y="238">
    <parameter key="url" value="%{image}"/>
    <list key="crawling_rules">
      <parameter key="store_with_matching_url" value=".*"/>
    </list>
    <parameter key="max_crawl_depth" value="1"/>
    <parameter key="retrieve_as_html" value="true"/>
    <parameter key="enable_basic_auth" value="false"/>
    <parameter key="add_content_as_attribute" value="true"/>
    <parameter key="write_pages_to_disk" value="true"/>
    <parameter key="include_binary_content" value="false"/>
    <parameter key="output_dir" value="/Users/robinmeisel/Desktop/images"/>
    <parameter key="output_file_extension" value="%{image}.png"/>
    <parameter key="max_pages" value="1"/>
    <parameter key="max_page_size" value="1000"/>
    <parameter key="delay" value="200"/>
    <parameter key="max_concurrent_connections" value="100"/>
    <parameter key="max_connections_per_host" value="50"/>
    <parameter key="user_agent" value="rapidminer-web-mining-extension-crawler"/>
    <parameter key="ignore_robot_exclusion" value="true"/>
  </operator>
</process>

robin

Temporary link to file

Put a temporary link to the file.

sgenzer

Hi @robin ok thanks for that. I now understand. This is rather "quick and dirty" but I hope you may get the idea of how I would approach it. There may be a more clever way.... ?

<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
  <!-- Forum answer process: crawl a site, fetch each crawled page, isolate .png URLs found in the page text,
       and save each image with Open File + Write File. Several operators below have no port connections,
       so their position in this document matters; do not reorder operators without re-testing. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Starts at community.rapidminer.com and follows links matching ".*"; max_pages is 5. -->
      <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
        <parameter key="url" value="https://community.rapidminer.com/"/>
        <list key="crawling_rules">
          <parameter key="follow_link_with_matching_url" value=".*"/>
        </list>
        <parameter key="retrieve_as_html" value="true"/>
        <parameter key="output_dir" value="/Users/GenzerConsulting"/>
        <parameter key="max_pages" value="5"/>
        <parameter key="max_page_size" value="1000000000"/>
      </operator>
      <!-- Iterates over the example set produced by Crawl Web (see the connection near the end of this process). -->
      <operator activated="true" class="loop_examples" compatibility="8.2.000" expanded="true" height="103" name="Loop Examples" width="90" x="179" y="34">
        <process expanded="true">
          <!-- Stores the value of attribute "Link" at row %{example} in macro "URL".
               NOTE(review): this loop does not set iteration_macro; presumably "example" is the operator default, confirm. -->
          <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (2)" width="90" x="45" y="34">
            <parameter key="macro" value="URL"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="Link"/>
            <parameter key="example_index" value="%{example}"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Requests %{URL}. No connect element targets this operator; it only depends on the macro set above. -->
          <operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="179" y="34">
            <parameter key="url" value="%{URL}"/>
            <parameter key="accept_cookies" value="all"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
          </operator>
          <!-- Takes the fetched document and emits an example set; text_attribute is "text". -->
          <operator activated="true" class="text:documents_to_data" compatibility="7.5.000" expanded="true" height="82" name="Documents to Data" width="90" x="313" y="34">
            <parameter key="text_attribute" value="text"/>
            <parameter key="add_meta_information" value="false"/>
          </operator>
          <!-- split_pattern is the entity-escaped regex matching either angle bracket. -->
          <operator activated="true" class="split" compatibility="8.2.000" expanded="true" height="82" name="Split" width="90" x="447" y="34">
            <parameter key="split_pattern" value="[&lt;]|[&gt;]"/>
          </operator>
          <!-- Transposes the split result; the operators after this one all work on attribute att_1. -->
          <operator activated="true" class="transpose" compatibility="8.2.000" expanded="true" height="82" name="Transpose" width="90" x="581" y="34"/>
          <!-- Two filter entries on att_1: contains ".png" (the dot is backslash-escaped in the value) and contains "https".
               NOTE(review): how the two entries are combined (all vs. any) is not set here; confirm the default. -->
          <operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples" width="90" x="715" y="34">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="att_1.contains.\.png"/>
              <parameter key="filters_entry_key" value="att_1.contains.https"/>
            </list>
          </operator>
          <!-- Reassigns att_1 to suffix(att_1, length(att_1) - index(att_1, "http"));
               presumably this drops everything before the first "http". -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes" width="90" x="849" y="34">
            <list key="function_descriptions">
              <parameter key="att_1" value="suffix(att_1,length(att_1)-index(att_1,&quot;http&quot;))"/>
            </list>
          </operator>
          <!-- The next seven operators all have activated="false"; they look like leftovers
               from earlier attempts (reviewer's reading) and are not on the active path. -->
          <operator activated="false" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="391">
            <parameter key="mode" value="regular expression"/>
            <parameter key="expression" value="[&lt;]|[&gt;]"/>
          </operator>
          <operator activated="false" class="text:filter_tokens_by_content" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="179" y="391">
            <parameter key="string" value=".png"/>
          </operator>
          <operator activated="false" class="text:filter_by_length" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="391">
            <parameter key="max_chars" value="250"/>
          </operator>
          <operator activated="false" breakpoints="after" class="text:write_document" compatibility="7.5.000" expanded="true" height="82" name="Write Document" width="90" x="447" y="391">
            <parameter key="file" value="/Users/GenzerConsulting/foo.txt"/>
          </operator>
          <operator activated="false" class="text:keep_document_parts" compatibility="7.5.000" expanded="true" height="68" name="Keep Document Parts" width="90" x="112" y="187">
            <parameter key="extraction_regex" value="http.*[.]png.*\s"/>
          </operator>
          <operator activated="false" breakpoints="after" class="web:unescape_html" compatibility="7.3.000" expanded="true" height="68" name="Unescape HTML Document" width="90" x="246" y="391"/>
          <operator activated="false" class="web:unescape_html_attribute" compatibility="7.3.000" expanded="true" height="82" name="Unescape HTML" width="90" x="514" y="187"/>
          <!-- Reassigns att_1 to prefix(att_1, 4 + index(att_1, ".png"));
               presumably this cuts the value right after ".png". -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="983" y="34">
            <list key="function_descriptions">
              <parameter key="att_1" value="prefix(att_1,4+index(att_1,&quot;.png&quot;))"/>
            </list>
          </operator>
          <!-- Inner loop over the rows that survived the filter; its iteration macro is "example2". -->
          <operator activated="true" class="loop_examples" compatibility="8.2.000" expanded="true" height="82" name="Loop Examples (2)" width="90" x="1117" y="34">
            <parameter key="iteration_macro" value="example2"/>
            <process expanded="true">
              <!-- Stores att_1 at row %{example2} in macro "imageURL". -->
              <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (3)" width="90" x="45" y="34">
                <parameter key="macro" value="imageURL"/>
                <parameter key="macro_type" value="data_value"/>
                <parameter key="attribute_name" value="att_1"/>
                <parameter key="example_index" value="%{example2}"/>
                <list key="additional_macros"/>
              </operator>
              <!-- Wraps the download. The second subprocess below contains only port spacing, so no operators run there.
                   NOTE(review): presumably that is the on-error branch, meaning failed downloads are skipped silently; confirm. -->
              <operator activated="true" class="handle_exception" compatibility="8.2.000" expanded="true" height="82" name="Handle Exception" width="90" x="179" y="34">
                <process expanded="true">
                  <!-- imageName = suffix(%{imageURL}, 15) with "/" removed, i.e. presumably the last 15 characters of the URL.
                       NOTE(review): different URLs sharing the same trailing characters would map to the same file name. -->
                  <operator activated="true" class="generate_macro" compatibility="8.2.000" expanded="true" height="68" name="Generate Macro" width="90" x="112" y="34">
                    <list key="function_descriptions">
                      <parameter key="imageName" value="replace(suffix(%{imageURL},15),&quot;/&quot;,&quot;&quot;)"/>
                    </list>
                  </operator>
                  <!-- Opens %{imageURL} as a URL resource. -->
                  <operator activated="true" class="open_file" compatibility="8.2.000" expanded="true" height="68" name="Open File" width="90" x="246" y="34">
                    <parameter key="resource_type" value="URL"/>
                    <parameter key="url" value="%{imageURL}"/>
                  </operator>
                  <!-- Writes the opened file to /Users/GenzerConsulting/%{imageName}; the target folder is hard-coded. -->
                  <operator activated="true" class="write_file" compatibility="8.2.000" expanded="true" height="68" name="Write File" width="90" x="380" y="34">
                    <parameter key="filename" value="/Users/GenzerConsulting/%{imageName}"/>
                  </operator>
                  <connect from_op="Open File" from_port="file" to_op="Write File" to_port="file"/>
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                </process>
                <process expanded="true">
                  <portSpacing port="source_in 1" spacing="0"/>
                  <portSpacing port="sink_out 1" spacing="0"/>
                  <portSpacing port="sink_out 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="example set" to_op="Extract Macro (3)" to_port="example set"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="sink_example set" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
            </process>
          </operator>
          <!-- Wiring of the active path, followed by the wiring of the deactivated token operators. -->
          <connect from_port="example set" to_op="Extract Macro (2)" to_port="example set"/>
          <connect from_op="Get Page" from_port="output" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Split" to_port="example set input"/>
          <connect from_op="Split" from_port="example set output" to_op="Transpose" to_port="example set input"/>
          <connect from_op="Transpose" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Write Document" to_port="document"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Loop Examples (2)" to_port="example set"/>
          <connect from_op="Loop Examples (2)" from_port="example set" to_port="output 1"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Crawl Web" from_port="example set" to_op="Loop Examples" to_port="example set"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process>

Scott