I want to get data from google policies in many languages.
I am using crawl web operator which points to the greek language policies.
I am setting the language as label and filtering to get only the url's which contain policies.
So i end up with an exampleset that has all url's for greek policies.
What i want to achieve is to get all data from these policies and get them in one file.
I found that "get page" operator works perfect for what i want but it is only for one url.
The "get pages" operator has a problem with utf-8 encoding and gives me wrong output.
So i tried a macro to loop for the exampleset i have.
But i cann't figure out exactly how a macro works and the process never compiles.
This is my xml code:
<?xml version="1.0" encoding="UTF-8"?><process version="8.2.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
<parameter key="encoding" value="UTF-8"/>
<process expanded="true">
<operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
<parameter key="url" value="https://policies.google.com/privacy/archive?hl=el&gl=gr"/>
<list key="crawling_rules"/>
</operator>
<operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="34">
<list key="function_descriptions">
<parameter key="policy_language" value="if(contains(Link,"intl/el_GR/policies"),"EL","Other")"/>
</list>
</operator>
<operator activated="true" class="set_role" compatibility="8.2.000" expanded="true" height="82" name="Set Role" width="90" x="313" y="34">
<parameter key="attribute_name" value="policy_language"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples" width="90" x="447" y="34">
<list key="filters_list">
<parameter key="filters_entry_key" value="policy_language.equals.EL"/>
</list>
</operator>
<operator activated="true" class="loop_examples" compatibility="8.2.000" expanded="true" height="82" name="Loop Examples" width="90" x="581" y="187">
<process expanded="true">
<operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="447" y="136">
<parameter key="url" value="%test"/>
<list key="query_parameters"/>
<list key="request_properties"/>
</operator>
<operator activated="true" class="filter_example_range" compatibility="8.2.000" expanded="true" height="82" name="Filter Example Range" width="90" x="45" y="34">
<parameter key="first_example" value="%{example}"/>
<parameter key="last_example" value="%{example}"/>
</operator>
<operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro" width="90" x="246" y="34">
<parameter key="macro" value="test"/>
<parameter key="macro_type" value="data_value"/>
<parameter key="attribute_name" value="Link"/>
<parameter key="example_index" value="1"/>
<list key="additional_macros"/>
</operator>
<connect from_port="example set" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Get Page" from_port="output" to_port="example set"/>
<connect from_op="Filter Example Range" from_port="example set output" to_op="Extract Macro" to_port="example set"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
</process>
</operator>
<connect from_op="Crawl Web" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Loop Examples" to_port="example set"/>
<connect from_op="Loop Examples" from_port="example set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>