"Bug in Join operator - outer joins (HELP!)"
vme64
New Altair Community Member
Hello,
I'm having problems with join operator behavior when using outer join. I expect the behavior to be like a database join, but it is not. The behavior even changes if the type of id attribute changes. I will ilustrate the problem:
ExampleSet 1
ExampleSet 2
Doing an outer join I obtain the following result buggy result:
If the type of id is changed to numeric, then we obtain the another different buggy result:
The result that I expect is this one
I found already a bug regarding join operator (left and right sides changed) but the one below seems to be now. I have the impression that this bug appeared after an automatic update of rapidminer, but as the version number didn't change (5.1.006), I am not sure.
Any help will be greatly appreciated! My process stopped working and I am near the deadline of a paper submission...
Best regards,
Vinicius
Follows a process that demonstates the bug generating the data automatically
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.1.006">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
<process expanded="true" height="451" width="1371">
<operator activated="true" class="generate_data" compatibility="5.1.006" expanded="true" height="60" name="Generate Data" width="90" x="45" y="165">
<parameter key="number_examples" value="1"/>
<parameter key="number_of_attributes" value="2"/>
<parameter key="attributes_lower_bound" value="1.0"/>
<parameter key="use_local_random_seed" value="true"/>
</operator>
<operator activated="true" class="rename" compatibility="5.1.006" expanded="true" height="76" name="Rename" width="90" x="179" y="165">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="id"/>
<list key="rename_additional_attributes">
<parameter key="att2" value="a1"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.1.006" expanded="true" height="76" name="Select Attributes (3)" width="90" x="313" y="165">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (4)" width="90" x="447" y="165">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="94" name="Multiply" width="90" x="581" y="120"/>
<operator activated="true" class="generate_data" compatibility="5.1.006" expanded="true" height="60" name="Generate Data (2)" width="90" x="45" y="75">
<parameter key="number_examples" value="2"/>
<parameter key="number_of_attributes" value="2"/>
<parameter key="attributes_lower_bound" value="11.0"/>
<parameter key="attributes_upper_bound" value="20.0"/>
<parameter key="use_local_random_seed" value="true"/>
</operator>
<operator activated="true" class="rename" compatibility="5.1.006" expanded="true" height="76" name="Rename (3)" width="90" x="179" y="75">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="id"/>
<list key="rename_additional_attributes">
<parameter key="att2" value="a1"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.1.006" expanded="true" height="76" name="Select Attributes (4)" width="90" x="313" y="75">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (3)" width="90" x="447" y="75">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="union" compatibility="5.1.006" expanded="true" height="76" name="Union" width="90" x="715" y="30"/>
<operator activated="true" class="generate_data" compatibility="5.1.006" expanded="true" height="60" name="Generate Data (3)" width="90" x="45" y="255">
<parameter key="number_examples" value="2"/>
<parameter key="number_of_attributes" value="2"/>
<parameter key="attributes_lower_bound" value="21.0"/>
<parameter key="attributes_upper_bound" value="30.0"/>
<parameter key="use_local_random_seed" value="true"/>
</operator>
<operator activated="true" class="rename" compatibility="5.1.006" expanded="true" height="76" name="Rename (4)" width="90" x="179" y="255">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="id"/>
<list key="rename_additional_attributes">
<parameter key="att2" value="a1"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.1.006" expanded="true" height="76" name="Select Attributes (5)" width="90" x="313" y="255">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (5)" width="90" x="447" y="255">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="union" compatibility="5.1.006" expanded="true" height="76" name="Union (2)" width="90" x="715" y="255"/>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Example set 1" width="90" x="849" y="30"/>
<operator activated="true" class="numerical_to_polynominal" compatibility="5.1.006" expanded="true" height="76" name="Numerical to Polynominal (2)" width="90" x="916" y="165">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="id"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Example set 2" width="90" x="849" y="255"/>
<operator activated="true" class="numerical_to_polynominal" compatibility="5.1.006" expanded="true" height="76" name="Numerical to Polynominal" width="90" x="983" y="345">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="id"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="join" compatibility="5.1.006" expanded="true" height="76" name="Join with nominal id" width="90" x="1184" y="300">
<parameter key="join_type" value="outer"/>
</operator>
<operator activated="true" class="join" compatibility="5.1.006" expanded="true" height="76" name="Join with numerical id" width="90" x="1184" y="120">
<parameter key="join_type" value="outer"/>
</operator>
<connect from_op="Generate Data" from_port="output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Select Attributes (3)" to_port="example set input"/>
<connect from_op="Select Attributes (3)" from_port="example set output" to_op="Set Role (4)" to_port="example set input"/>
<connect from_op="Set Role (4)" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Union (2)" to_port="example set 1"/>
<connect from_op="Multiply" from_port="output 2" to_op="Union" to_port="example set 2"/>
<connect from_op="Generate Data (2)" from_port="output" to_op="Rename (3)" to_port="example set input"/>
<connect from_op="Rename (3)" from_port="example set output" to_op="Select Attributes (4)" to_port="example set input"/>
<connect from_op="Select Attributes (4)" from_port="example set output" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Union" to_port="example set 1"/>
<connect from_op="Union" from_port="union" to_op="Example set 1" to_port="input"/>
<connect from_op="Generate Data (3)" from_port="output" to_op="Rename (4)" to_port="example set input"/>
<connect from_op="Rename (4)" from_port="example set output" to_op="Select Attributes (5)" to_port="example set input"/>
<connect from_op="Select Attributes (5)" from_port="example set output" to_op="Set Role (5)" to_port="example set input"/>
<connect from_op="Set Role (5)" from_port="example set output" to_op="Union (2)" to_port="example set 2"/>
<connect from_op="Union (2)" from_port="union" to_op="Example set 2" to_port="input"/>
<connect from_op="Example set 1" from_port="output 1" to_op="Join with numerical id" to_port="left"/>
<connect from_op="Example set 1" from_port="output 2" to_port="result 2"/>
<connect from_op="Example set 1" from_port="output 3" to_op="Numerical to Polynominal (2)" to_port="example set input"/>
<connect from_op="Numerical to Polynominal (2)" from_port="example set output" to_op="Join with nominal id" to_port="left"/>
<connect from_op="Example set 2" from_port="output 1" to_op="Join with numerical id" to_port="right"/>
<connect from_op="Example set 2" from_port="output 2" to_port="result 3"/>
<connect from_op="Example set 2" from_port="output 3" to_op="Numerical to Polynominal" to_port="example set input"/>
<connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Join with nominal id" to_port="right"/>
<connect from_op="Join with nominal id" from_port="join" to_port="result 4"/>
<connect from_op="Join with numerical id" from_port="join" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
I'm having problems with join operator behavior when using outer join. I expect the behavior to be like a database join, but it is not. The behavior even changes if the type of id attribute changes. I will ilustrate the problem:
ExampleSet 1
id (nominal) | attr1 (integer) |
1 | 11 |
2 | 12 |
3 | 13 |
id (nominal) | attr1 (integer) | attr2 (integer) |
3 | 13 | 23 |
4 | 14 | 24 |
5 | 15 | 25 |
id (nominal) | attr1 (integer) | attr2 (integer) |
3 | 13 | 23 |
3 | ? | ? |
4 | 14 | 24 |
4 | ? | ? |
5 | 15 | 25 |
id (numeric) | attr1 (integer) | attr2 (integer) |
1 | ? | ? |
2 | ? | ? |
3 | 13 | 23 |
4 | 14 | 24 |
5 | 15 | 25 |
id (numeric) | attr1 (integer) | attr2 (integer) |
1 | 11 | ? |
2 | 12 | ? |
3 | 13 | 23 |
4 | 14 | 24 |
5 | 15 | 25 |
Any help will be greatly appreciated! My process stopped working and I am near the deadline of a paper submission...
Best regards,
Vinicius
Follows a process that demonstates the bug generating the data automatically
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.1.006">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
<process expanded="true" height="451" width="1371">
<operator activated="true" class="generate_data" compatibility="5.1.006" expanded="true" height="60" name="Generate Data" width="90" x="45" y="165">
<parameter key="number_examples" value="1"/>
<parameter key="number_of_attributes" value="2"/>
<parameter key="attributes_lower_bound" value="1.0"/>
<parameter key="use_local_random_seed" value="true"/>
</operator>
<operator activated="true" class="rename" compatibility="5.1.006" expanded="true" height="76" name="Rename" width="90" x="179" y="165">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="id"/>
<list key="rename_additional_attributes">
<parameter key="att2" value="a1"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.1.006" expanded="true" height="76" name="Select Attributes (3)" width="90" x="313" y="165">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (4)" width="90" x="447" y="165">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="94" name="Multiply" width="90" x="581" y="120"/>
<operator activated="true" class="generate_data" compatibility="5.1.006" expanded="true" height="60" name="Generate Data (2)" width="90" x="45" y="75">
<parameter key="number_examples" value="2"/>
<parameter key="number_of_attributes" value="2"/>
<parameter key="attributes_lower_bound" value="11.0"/>
<parameter key="attributes_upper_bound" value="20.0"/>
<parameter key="use_local_random_seed" value="true"/>
</operator>
<operator activated="true" class="rename" compatibility="5.1.006" expanded="true" height="76" name="Rename (3)" width="90" x="179" y="75">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="id"/>
<list key="rename_additional_attributes">
<parameter key="att2" value="a1"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.1.006" expanded="true" height="76" name="Select Attributes (4)" width="90" x="313" y="75">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (3)" width="90" x="447" y="75">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="union" compatibility="5.1.006" expanded="true" height="76" name="Union" width="90" x="715" y="30"/>
<operator activated="true" class="generate_data" compatibility="5.1.006" expanded="true" height="60" name="Generate Data (3)" width="90" x="45" y="255">
<parameter key="number_examples" value="2"/>
<parameter key="number_of_attributes" value="2"/>
<parameter key="attributes_lower_bound" value="21.0"/>
<parameter key="attributes_upper_bound" value="30.0"/>
<parameter key="use_local_random_seed" value="true"/>
</operator>
<operator activated="true" class="rename" compatibility="5.1.006" expanded="true" height="76" name="Rename (4)" width="90" x="179" y="255">
<parameter key="old_name" value="att1"/>
<parameter key="new_name" value="id"/>
<list key="rename_additional_attributes">
<parameter key="att2" value="a1"/>
</list>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.1.006" expanded="true" height="76" name="Select Attributes (5)" width="90" x="313" y="255">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="label"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.1.006" expanded="true" height="76" name="Set Role (5)" width="90" x="447" y="255">
<parameter key="name" value="id"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="union" compatibility="5.1.006" expanded="true" height="76" name="Union (2)" width="90" x="715" y="255"/>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Example set 1" width="90" x="849" y="30"/>
<operator activated="true" class="numerical_to_polynominal" compatibility="5.1.006" expanded="true" height="76" name="Numerical to Polynominal (2)" width="90" x="916" y="165">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="id"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Example set 2" width="90" x="849" y="255"/>
<operator activated="true" class="numerical_to_polynominal" compatibility="5.1.006" expanded="true" height="76" name="Numerical to Polynominal" width="90" x="983" y="345">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="id"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="join" compatibility="5.1.006" expanded="true" height="76" name="Join with nominal id" width="90" x="1184" y="300">
<parameter key="join_type" value="outer"/>
</operator>
<operator activated="true" class="join" compatibility="5.1.006" expanded="true" height="76" name="Join with numerical id" width="90" x="1184" y="120">
<parameter key="join_type" value="outer"/>
</operator>
<connect from_op="Generate Data" from_port="output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Select Attributes (3)" to_port="example set input"/>
<connect from_op="Select Attributes (3)" from_port="example set output" to_op="Set Role (4)" to_port="example set input"/>
<connect from_op="Set Role (4)" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Union (2)" to_port="example set 1"/>
<connect from_op="Multiply" from_port="output 2" to_op="Union" to_port="example set 2"/>
<connect from_op="Generate Data (2)" from_port="output" to_op="Rename (3)" to_port="example set input"/>
<connect from_op="Rename (3)" from_port="example set output" to_op="Select Attributes (4)" to_port="example set input"/>
<connect from_op="Select Attributes (4)" from_port="example set output" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Union" to_port="example set 1"/>
<connect from_op="Union" from_port="union" to_op="Example set 1" to_port="input"/>
<connect from_op="Generate Data (3)" from_port="output" to_op="Rename (4)" to_port="example set input"/>
<connect from_op="Rename (4)" from_port="example set output" to_op="Select Attributes (5)" to_port="example set input"/>
<connect from_op="Select Attributes (5)" from_port="example set output" to_op="Set Role (5)" to_port="example set input"/>
<connect from_op="Set Role (5)" from_port="example set output" to_op="Union (2)" to_port="example set 2"/>
<connect from_op="Union (2)" from_port="union" to_op="Example set 2" to_port="input"/>
<connect from_op="Example set 1" from_port="output 1" to_op="Join with numerical id" to_port="left"/>
<connect from_op="Example set 1" from_port="output 2" to_port="result 2"/>
<connect from_op="Example set 1" from_port="output 3" to_op="Numerical to Polynominal (2)" to_port="example set input"/>
<connect from_op="Numerical to Polynominal (2)" from_port="example set output" to_op="Join with nominal id" to_port="left"/>
<connect from_op="Example set 2" from_port="output 1" to_op="Join with numerical id" to_port="right"/>
<connect from_op="Example set 2" from_port="output 2" to_port="result 3"/>
<connect from_op="Example set 2" from_port="output 3" to_op="Numerical to Polynominal" to_port="example set input"/>
<connect from_op="Numerical to Polynominal" from_port="example set output" to_op="Join with nominal id" to_port="right"/>
<connect from_op="Join with nominal id" from_port="join" to_port="result 4"/>
<connect from_op="Join with numerical id" from_port="join" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>
Tagged:
0
Answers
-
Hi,
please submit a bug report on this issue.
Regards,
Marco0 -
Hello,
Thanks. I added my description to an existing bug (368) that seems to be the same. Any workaround would be greatly appreciated, as the bug is around one year open and it was not considered a major one (!). Up to now RapidMiner proved to be a great tool, but this bug completely blocked me...
Best regards,
Vinicius0 -
Hi Vincius,
I don't know if you are using a database for your processes. If yes, a possible workaround using the db is very easy. If you are not using one, you should consider installing a local db (I suggest XAMPP as a simple solution) to quickly get around your problem.
You could write your example sets into the database and then perform joins and other operations on the db system (using SQL for example). In the recommended package you can use phpMyAdmin for this or simply do it from RapidMiner (using "Read Database" or "Execute SQL"). Moving the data around just to be able to join tables seems a bit idiotic, but if you need a quick solution...
Best regards
Matthias0 -
Hi,
you're right. I changed the bug severityfrom normal to major. Unfortunately it may still take a while.. so much to do
Regards,
Marco0 -
Hello,
Thanks for the support. The database solution would be ok, but my processes are starting to get complicated and introducing much other operators to put data, to get it back, to fix types etc. would end up introducing bugs. I manage do find an workaround: I simulated the outer join using Set Minus, Inner join and Union. Follows the subprocess that does this. I did some tests and it seems to work, but no guarantee...
Best regards,
Vinicius
<operator activated="true" class="subprocess" compatibility="5.1.006" expanded="true" height="94" name="Outer join (2)" width="90" x="179" y="30">
<process expanded="true" height="496" width="969">
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="94" name="Multiply (2)" width="90" x="112" y="30"/>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="94" name="Multiply (7)" width="90" x="112" y="255"/>
<operator activated="true" class="join" compatibility="5.1.006" expanded="true" height="76" name="Join (3)" width="90" x="246" y="210"/>
<operator activated="true" class="multiply" compatibility="5.1.006" expanded="true" height="112" name="Multiply (9)" width="90" x="380" y="210"/>
<operator activated="true" class="set_minus" compatibility="5.1.006" expanded="true" height="76" name="Set Minus (2)" width="90" x="514" y="75"/>
<operator activated="true" class="set_minus" compatibility="5.1.006" expanded="true" height="76" name="Set Minus (3)" width="90" x="514" y="390"/>
<operator activated="true" class="union" compatibility="5.1.006" expanded="true" height="76" name="Union (2)" width="90" x="648" y="165"/>
<operator activated="true" class="union" compatibility="5.1.006" expanded="true" height="76" name="Union (3)" width="90" x="849" y="255"/>
<connect from_port="in 1" to_op="Multiply (7)" to_port="input"/>
<connect from_port="in 2" to_op="Multiply (2)" to_port="input"/>
<connect from_op="Multiply (2)" from_port="output 1" to_op="Join (3)" to_port="left"/>
<connect from_op="Multiply (2)" from_port="output 2" to_op="Set Minus (2)" to_port="example set input"/>
<connect from_op="Multiply (7)" from_port="output 1" to_op="Join (3)" to_port="right"/>
<connect from_op="Multiply (7)" from_port="output 2" to_op="Set Minus (3)" to_port="example set input"/>
<connect from_op="Join (3)" from_port="join" to_op="Multiply (9)" to_port="input"/>
<connect from_op="Multiply (9)" from_port="output 1" to_op="Set Minus (2)" to_port="subtrahend"/>
<connect from_op="Multiply (9)" from_port="output 2" to_op="Set Minus (3)" to_port="subtrahend"/>
<connect from_op="Multiply (9)" from_port="output 3" to_op="Union (3)" to_port="example set 2"/>
<connect from_op="Set Minus (2)" from_port="example set output" to_op="Union (2)" to_port="example set 1"/>
<connect from_op="Set Minus (3)" from_port="example set output" to_op="Union (2)" to_port="example set 2"/>
<connect from_op="Union (2)" from_port="union" to_op="Union (3)" to_port="example set 1"/>
<connect from_op="Union (3)" from_port="union" to_port="out 1"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="source_in 2" spacing="0"/>
<portSpacing port="source_in 3" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
0