Hi everyone,
I'm stuck on something since several hours and can't definitely find the solution by myself.
I've searched for a solution on the forum without any success so far.
Here's the situation:
- The source file is an Excel file "input.xls" containing 4 columns/attributes:
ID ---- industry --- location ------- title
111 xyz Cayman Islands xyz
222 xyz Peru xyz
333 xyz USA xyz
- I want to remove the lines whose attribute "location" has a value contained in a blacklist "location-to-filter-out.xls".
This Excel file looks like
location2
Cayman
France
USA
- Then save the filtered file as "output.xls". Here it would look like:
ID ---- industry --- location ------- title
222 xyz Peru xyz
What I have done so far:
- In the code below, I have a "Loop Attributes" on "location"
- Inside it (subprocess), I have a "Read Excel" operator, extracting the "location2" values from the blacklist. And a second "Loop attributes" running on "location2" this time.
- Inside this second "Loop attributes" (sub-subprocess), I have the filtering condition with the operator "Filter Examples", with the condition class "attribute_value_filter".
The parameter string is: %{loop_attribute}=(.*?)%{loop_attribute2}(.*?)
Where the %{loop_attribute} is the original attribute "location" in the source file. For example "Cayman Islands".
And %{loop_attribute2} is a location blacklisted. For example "Cayman" (the blacklist contains only single words for the sake of simplicity. That's why I use "(.*?)" in the filtering condition.
Problem:
It does not work and I'm unable to find the flaw in my logic.
Maybe can somebody enlighten me?
Thanks in advance for your help.
Alex
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (3)" width="90" x="45" y="30">
<parameter key="excel_file" value="E:\Rapidminer\example\input.xls"/>
<parameter key="imported_cell_range" value="A1:G650"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="ID.true.integer.attribute"/>
<parameter key="1" value="industry.true.polynominal.attribute"/>
<parameter key="2" value="location.true.polynominal.attribute"/>
<parameter key="3" value="title.true.polynominal.attribute"/>
</list>
</operator>
<operator activated="true" class="loop_attributes" compatibility="5.3.015" expanded="true" height="94" name="Loop Attributes" width="90" x="514" y="30">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="location"/>
<process expanded="true">
<operator activated="true" class="read_excel" compatibility="5.3.015" expanded="true" height="60" name="Read Excel (4)" width="90" x="246" y="165">
<parameter key="excel_file" value="E:\Rapidminer\example\location-to-filter-out.xls"/>
<parameter key="imported_cell_range" value="A1:E67"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="location2.true.polynominal.attribute"/>
</list>
</operator>
<operator activated="true" class="loop_attributes" compatibility="5.3.015" expanded="true" height="76" name="Loop Attributes (2)" width="90" x="514" y="165">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="location2"/>
<parameter key="iteration_macro" value="loop_attribute2"/>
<process expanded="true">
<operator activated="true" class="filter_examples" compatibility="5.3.015" expanded="true" height="76" name="Filter Examples (3)" width="90" x="447" y="30">
<parameter key="condition_class" value="attribute_value_filter"/>
<parameter key="parameter_string" value="%{loop_attribute}=(.*?)%{loop_attribute2}(.*?)"/>
<parameter key="invert_filter" value="true"/>
</operator>
<connect from_port="example set" to_op="Filter Examples (3)" to_port="example set input"/>
<connect from_op="Filter Examples (3)" from_port="example set output" to_port="example set"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
</process>
</operator>
<connect from_port="example set" to_port="example set"/>
<connect from_op="Read Excel (4)" from_port="output" to_op="Loop Attributes (2)" to_port="example set"/>
<connect from_op="Loop Attributes (2)" from_port="example set" to_port="result 1"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="write_excel" compatibility="5.3.015" expanded="true" height="76" name="Write Excel" width="90" x="849" y="30">
<parameter key="excel_file" value="E:\Rapidminer\example\output.xls"/>
</operator>
<connect from_op="Read Excel (3)" from_port="output" to_op="Loop Attributes" to_port="example set"/>
<connect from_op="Loop Attributes" from_port="example set" to_op="Write Excel" to_port="input"/>
<connect from_op="Write Excel" from_port="through" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>