Hi guys,
I want to loop over attributes in Radoop to compute some aggregates and append them back to my original table.
The output is a collection, and I'd really love to be able to loop through that collection with Remember/Recall and then join all of the datasets together.
Unfortunately, this is Radoop, so those operators aren't available to me and I need a smarter solution.
For efficiency I want to make the most of my cluster resources, so using Reuse Results isn't a good option. Each iteration has to wait for the previous one to finish, which creates a bottleneck: in a test run each iteration takes 10 seconds, and because the iterations can't run in parallel the entire process takes 7 hours, with or without the full dataset.
Anyone got any suggestions on how to rejig it to combine the output collections so I can use more cluster nodes in the execution?
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process definition (process version 8.1.001).
  Flow: Retrieve Iris feeds a Radoop Nest; inside the nest, Loop Attributes (Radoop)
  runs an Aggregate + Join subprocess per selected attribute, and its output is
  wired into Join (2), whose result leaves the nest as the process result.
-->
<process version="8.1.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Loads the repository entry //Samples/data/Iris. -->
      <operator activated="true" class="retrieve" compatibility="8.1.001" expanded="true" height="68" name="Retrieve Iris" width="90" x="45" y="85">
        <parameter key="repository_entry" value="//Samples/data/Iris"/>
      </operator>
      <!-- Radoop Nest bound to the connection named "Hive Test"; everything inside is the nested Radoop subprocess. -->
      <operator activated="true" class="radoop:radoop_nest" compatibility="8.1.001" expanded="true" height="82" name="Radoop Nest" width="90" x="179" y="85">
        <parameter key="connection" value="Hive Test"/>
        <enumeration key="tables_to_reload"/>
        <process expanded="true">
          <!-- Loops over the attribute subset a1|a3; the inner subprocess refers to the current attribute as %{loop_attribute}. -->
          <operator activated="true" class="radoop:loop_attributes_radoop" compatibility="8.1.001" expanded="true" height="82" name="Loop Attributes (Radoop)" width="90" x="112" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attributes" value="a1|a3"/>
            <process expanded="true">
              <!-- Averages the current loop attribute, grouped by label. -->
              <operator activated="true" class="radoop:aggregate" compatibility="8.1.001" expanded="true" height="82" name="Aggregate" width="90" x="246" y="34">
                <list key="aggregation_attributes">
                  <parameter key="%{loop_attribute}" value="average"/>
                </list>
                <parameter key="group_by_attributes" value="label"/>
              </operator>
              <!-- Joins the aggregate (left) with Aggregate's "original" port (right) on label = label. -->
              <operator activated="true" class="radoop:join" compatibility="8.1.001" expanded="true" height="82" name="Join" width="90" x="447" y="34">
                <list key="key_attributes">
                  <parameter key="label" value="label"/>
                </list>
              </operator>
              <connect from_port="input 1" to_op="Aggregate" to_port="example set input"/>
              <connect from_op="Aggregate" from_port="example set output" to_op="Join" to_port="left"/>
              <connect from_op="Aggregate" from_port="original" to_op="Join" to_port="right"/>
              <connect from_op="Join" from_port="join" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="source_input 2" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!--
            Join on id = id. NOTE(review): only the "left" port of this operator is connected
            below; nothing is wired to its "right" port. Per the accompanying post the loop
            output is a collection, and combining its members is the open question. TODO confirm
            the intended second input.
          -->
          <operator activated="true" class="radoop:join" compatibility="8.1.001" expanded="true" height="82" name="Join (2)" width="90" x="313" y="34">
            <list key="key_attributes">
              <parameter key="id" value="id"/>
            </list>
            <description align="center" color="transparent" colored="false" width="126">:'(</description>
          </operator>
          <connect from_port="input 1" to_op="Loop Attributes (Radoop)" to_port="input 1"/>
          <connect from_op="Loop Attributes (Radoop)" from_port="output 1" to_op="Join (2)" to_port="left"/>
          <connect from_op="Join (2)" from_port="join" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
          <!-- Canvas note. The line break is escaped as &lt;br&gt; so the note stays plain text and the document stays well-formed. -->
          <description align="left" color="yellow" colored="false" height="96" resized="true" width="354" x="110" y="178">For efficiency in execution I don't want to reuse results. I want to join the resulting sets together.&lt;br&gt;Unfortunately I can't find the operator that would help with this.</description>
        </process>
      </operator>
      <connect from_op="Retrieve Iris" from_port="output" to_op="Radoop Nest" to_port="input 1"/>
      <connect from_op="Radoop Nest" from_port="output 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>