Hello,
I'm having problems with join operator behavior when using outer join. I expect the behavior to be like a database join, but it is not. The behavior even changes if the type of id attribute changes. I will ilustrate the problem:
ExampleSet 1
id (nominal) | attr1 (integer) |
1 | 11 |
2 | 12 |
3 | 13 |
ExampleSet 2
id (nominal) | attr1 (integer) | attr2 (integer) |
3 | 13 | 23 |
4 | 14 | 24 |
5 | 15 | 25 |
Doing an outer join I obtain the following result buggy result:
id (nominal) | attr1 (integer) | attr2 (integer) |
3 | 13 | 23 |
3 | ? | ? |
4 | 14 | 24 |
4 | ? | ? |
5 | 15 | 25 |
If the type of id is changed to numeric, then we obtain the another different buggy result:
id (numeric) | attr1 (integer) | attr2 (integer) |
1 | ? | ? |
2 | ? | ? |
3 | 13 | 23 |
4 | 14 | 24 |
5 | 15 | 25 |
The result that I expect is this one
id (numeric) | attr1 (integer) | attr2 (integer) |
1 | 11 | ? |
2 | 12 | ? |
3 | 13 | 23 |
4 | 14 | 24 |
5 | 15 | 25 |
I found already a bug regarding join operator (left and right sides changed) but the one below seems to be now. I have the impression that this bug appeared after an automatic update of rapidminer, but as the version number didn't change (5.1.006), I am not sure.
Any help will be greatly appreciated! My process stopped working and I am near the deadline of a paper submission...
Best regards,
Vinicius
Follows a process that demonstates the bug generating the data automatically
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.1.006">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
<process expanded="true" height="451" width="1371">
<operator activated="true" class="generate_data" compatibility="5.1.006" expanded="true" height="60" name="Generate Data" width="90" x="45" y="165">
<parameter key="number_examples" value="1"/>
<parameter key="number_of_attributes" value="2"/>
<parameter key="attributes_lower_bound" value="1.0"/>
<parameter key="use_local_random_seed" value="true"/>
</operator>
<operator activated="true" class="rename" compatibility="5.1.006" expanded="true" height="76" name="Rename" width="90" x="179" y="165">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="id"/>
<list key="rename_additional_attributes">
<parameter key="att2" value="a1"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.1.006" expanded="true" height="76" name="Select Attributes (3)" width="90" x="313" y="165">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (4)" width="90" x="447" y="165">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="94" name="Multiply" width="90" x="581" y="120"/>
<operator activated="true" class="generate_data" compatibility="5.1.006" expanded="true" height="60" name="Generate Data (2)" width="90" x="45" y="75">
<parameter key="number_examples" value="2"/>
<parameter key="number_of_attributes" value="2"/>
<parameter key="attributes_lower_bound" value="11.0"/>
<parameter key="attributes_upper_bound" value="20.0"/>
<parameter key="use_local_random_seed" value="true"/>
</operator>
<operator activated="true" class="rename" compatibility="5.1.006" expanded="true" height="76" name="Rename (3)" width="90" x="179" y="75">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="id"/>
<list key="rename_additional_attributes">
<parameter key="att2" value="a1"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.1.006" expanded="true" height="76" name="Select Attributes (4)" width="90" x="313" y="75">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (3)" width="90" x="447" y="75">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="union" compatibility="5.1.006" expanded="true" height="76" name="Union" width="90" x="715" y="30"/>
<operator activated="true" class="generate_data" compatibility="5.1.006" expanded="true" height="60" name="Generate Data (3)" width="90" x="45" y="255">
<parameter key="number_examples" value="2"/>
<parameter key="number_of_attributes" value="2"/>
<parameter key="attributes_lower_bound" value="21.0"/>
<parameter key="attributes_upper_bound" value="30.0"/>
<parameter key="use_local_random_seed" value="true"/>
</operator>
<operator activated="true" class="rename" compatibility="5.1.006" expanded="true" height="76" name="Rename (4)" width="90" x="179" y="255">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="id"/>
<list key="rename_additional_attributes">
<parameter key="att2" value="a1"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.1.006" expanded="true" height="76" name="Select Attributes (5)" width="90" x="313" y="255">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (5)" width="90" x="447" y="255">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="union" compatibility="5.1.006" expanded="true" height="76" name="Union (2)" width="90" x="715" y="255"/>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Example set 1" width="90" x="849" y="30"/>
<operator activated="true" class="numerical_to_polynominal" compatibility="5.1.006" expanded="true" height="76" name="Numerical to Polynominal (2)" width="90" x="916" y="165">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="id"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Example set 2" width="90" x="849" y="255"/>
<operator activated="true" class="numerical_to_polynominal" compatibility="5.1.006" expanded="true" height="76" name="Numerical to Polynominal" width="90" x="983" y="345">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="id"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="join" compatibility="5.1.006" expanded="true" height="76" name="Join with nominal id" width="90" x="1184" y="300">
<parameter key="join_type" value="outer"/>
</operator>
<operator activated="true" class="join" compatibility="5.1.006" expanded="true" height="76" name="Join with numerical id" width="90" x="1184" y="120">
<parameter key="join_type" value="outer"/>
</operator>
<connect from_op="Generate Data" from_port="output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Select Attributes (3)" to_port="example set input"/>
<connect from_op="Select Attributes (3)" from_port="example set output" to_op="Set Role (4)" to_port="example set input"/>
<connect from_op="Set Role (4)" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Union (2)" to_port="example set 1"/>
<connect from_op="Multiply" from_port="output 2" to_op="Union" to_port="example set 2"/>
<connect from_op="Generate Data (2)" from_port="output" to_op="Rename (3)" to_port="example set input"/>
<connect from_op="Rename (3)" from_port="example set output" to_op="Select Attributes (4)" to_port="example set input"/>
<connect from_op="Select Attributes (4)" from_port="example set output" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Union" to_port="example set 1"/>
<connect from_op="Union" from_port="union" to_op="Example set 1" to_port="input"/>
<connect from_op="Generate Data (3)" from_port="output" to_op="Rename (4)" to_port="example set input"/>
<connect from_op="Rename (4)" from_port="example set output" to_op="Select Attributes (5)" to_port="example set input"/>
<connect from_op="Select Attributes (5)" from_port="example set output" to_op="Set Role (5)" to_port="example set input"/>
<connect from_op="Set Role (5)" from_port="example set output" to_op="Union (2)" to_port="example set 2"/>
<connect from_op="Union (2)" from_port="union" to_op="Example set 2" to_port="input"/>
<connect from_op="Example set 1" from_port="output 1" to_op="Join with numerical id" to_port="left"/>
<connect from_op="Example set 1" from_port="output 2" to_port="result 2"/>
<connect from_op="Example set 1" from_port="output 3" to_op="Numerical to Polynominal (2)" to_port="example set input"/>
<connect from_op="Numerical to Polynominal (2)" from_port="example set output" to_op="Join with nominal id" to_port="left"/>
<connect from_op="Example set 2" from_port="output 1" to_op="Join with numerical id" to_port="right"/>
<connect from_op="Example set 2" from_port="output 2" to_port="result 3"/>
<connect from_op="Example set 2" from_port="output 3" to_op="Numerical to Polynominal" to_port="example set input"/>
<connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Join with nominal id" to_port="right"/>
<connect from_op="Join with nominal id" from_port="join" to_port="result 4"/>
<connect from_op="Join with numerical id" from_port="join" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>