After using the suggested solution for a few weeks, I would like to re-express the need for a dedicated option for this (which is probably feasible since it is already available for other outputs of this operator).
Although the suggested solutions work and are simple enough, I found that they do not scale well to larger datasets. I tried running Explain Predictions on a dataset with ~80k rows and ~70 columns.
The Explain Predictions operator itself ran in a reasonable time, but the filter to keep only the top 5 most important explaining attributes per example was taking too long to run: after 14 hours, it was only 25% done, so I terminated it. After more investigation, it seems the "Append" operator is the one taking the longest to execute under these conditions.
Also, some of our users have a Professional license, which is limited to 100,000 data rows, so I assume this limit will apply in this case as well, which is a problem because the data that is actually used (for example top 5 explaining attributes for each prediction) would be below that limit.
This is probably not a situation that will happen on a production model (it will be used on smaller datasets covering at most a few weeks of recent data), but I'm trying to use this feature to help investigate cases where the model was wrong in a dataset with several years of data. So I could probably filter to only keep the examples where the model was the most wrong, but it's also useful to have examples where the model was right for comparison.
Thanks
<!-- Root element of the process definition. The opening tag was missing, which left the closing process tag at the end of the file unmatched. NOTE(review): the version value is inferred from the compatibility="9.6.000" attributes on the core operators; confirm against the original export. -->
<process version="9.6.000">
<!-- Process context: the input, output, and macros elements are all empty, so nothing is declared here. -->
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.6.000" expanded="true" name="Process">
<!-- Settings of the root "Process" operator: log verbosity "init", random seed 2001, system encoding. -->
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<!-- Mail notification is set to "never"; the address is empty. -->
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<!-- Step 1: retrieve the repository entry //Samples/data/Golf. Its "output" port feeds the training set of the Generalized Linear Model. -->
<operator activated="true" class="retrieve" compatibility="9.6.000" expanded="true" height="68" name="Retrieve Golf" width="90" x="112" y="85">
<parameter key="repository_entry" value="//Samples/data/Golf"/>
</operator>
<!-- Step 2: H2O Generalized Linear Model (class h2o:generalized_linear_model). It is trained on the Retrieve Golf output; its "model" and "exampleSet" ports are both consumed downstream. -->
<operator activated="true" class="h2o:generalized_linear_model" compatibility="9.3.001" expanded="true" height="124" name="Generalized Linear Model" width="90" x="313" y="85">
<!-- Family and solver are left on AUTO; the link uses the family default. -->
<parameter key="family" value="AUTO"/>
<parameter key="link" value="family_default"/>
<parameter key="solver" value="AUTO"/>
<!-- reproducible is false and up to 4 threads are allowed. -->
<parameter key="reproducible" value="false"/>
<parameter key="maximum_number_of_threads" value="4"/>
<!-- Regularization is enabled; lambda search is disabled. -->
<parameter key="use_regularization" value="true"/>
<parameter key="lambda_search" value="false"/>
<parameter key="number_of_lambdas" value="0"/>
<parameter key="lambda_min_ratio" value="0.0"/>
<!-- Early stopping is enabled with 3 stopping rounds and a tolerance of 0.001. -->
<parameter key="early_stopping" value="true"/>
<parameter key="stopping_rounds" value="3"/>
<parameter key="stopping_tolerance" value="0.001"/>
<parameter key="standardize" value="true"/>
<parameter key="non-negative_coefficients" value="false"/>
<parameter key="add_intercept" value="true"/>
<parameter key="compute_p-values" value="false"/>
<parameter key="remove_collinear_columns" value="false"/>
<!-- Missing values are handled with the "MeanImputation" setting. -->
<parameter key="missing_values_handling" value="MeanImputation"/>
<!-- NOTE(review): max_iterations and max_runtime_seconds are both 0; presumably 0 means "no explicit limit", confirm against the operator documentation. -->
<parameter key="max_iterations" value="0"/>
<parameter key="specify_beta_constraints" value="false"/>
<list key="beta_constraints"/>
<parameter key="max_runtime_seconds" value="0"/>
<list key="expert_parameters"/>
</operator>
<!-- Step 3: Multiply takes the model's exampleSet output and exposes it on two output ports, which are wired to the "training data" and "test data" inputs of Explain Predictions. -->
<operator activated="true" class="multiply" compatibility="9.6.000" expanded="true" height="103" name="Multiply" width="90" x="447" y="106"/>
<!-- Step 4: Explain Predictions (class model_simulator:explain_predictions). It receives the GLM model plus the same data set as both training and test data; only its "importances output" port is used downstream. -->
<operator activated="true" class="model_simulator:explain_predictions" compatibility="9.6.000" expanded="true" height="124" name="Explain Predictions" width="90" x="581" y="85">
<!-- Maximal explaining attributes is 3, matching the 1..3 range kept later by Filter Example Range inside Loop Collection. -->
<parameter key="maximal explaining attributes" value="3"/>
<parameter key="local sample size" value="500"/>
<parameter key="only create predictions" value="false"/>
<parameter key="normalize global weights" value="false"/>
<!-- Weights are sorted in descending order. -->
<parameter key="sort_weights" value="true"/>
<parameter key="sort_direction" value="descending"/>
</operator>
<!-- Step 5: Group Into Collection (Operator Toolbox extension) groups the importances table by the "Row No" attribute and emits a collection on its "col" port. Presumably this yields one example set per original row; confirm against the extension documentation. -->
<operator activated="true" class="operator_toolbox:group_into_collection" compatibility="2.4.000-SNAPSHOT" expanded="true" height="82" name="Group Into Collection" width="90" x="715" y="136">
<parameter key="group_by_attribute" value="Row No"/>
<parameter key="group_by_attribute (numerical)" value=""/>
<!-- No sorting is requested at this stage; sorting happens inside Loop Collection. -->
<parameter key="sorting_order" value="none"/>
</operator>
<!-- Step 6: Loop Collection runs the inner process (Sort, then Filter Example Range) on the collection produced by Group Into Collection and passes the result to Append. -->
<operator activated="true" class="loop_collection" compatibility="9.6.000" expanded="true" height="82" name="Loop Collection" width="90" x="849" y="136">
<!-- The iteration macro is switched off. -->
<parameter key="set_iteration_macro" value="false"/>
<parameter key="macro_name" value="iteration"/>
<parameter key="macro_start_value" value="1"/>
<parameter key="unfold" value="false"/>
<process expanded="true">
<!-- Sort the incoming example set by the "Value" attribute in decreasing order. NOTE(review): if the goal is the top-N attributes by importance, confirm that "Value" is the right attribute to sort on; the schema of the importances table is not visible here. -->
<operator activated="true" class="sort" compatibility="9.6.000" expanded="true" height="82" name="Sort" width="90" x="112" y="34">
<parameter key="attribute_name" value="Value"/>
<parameter key="sorting_direction" value="decreasing"/>
</operator>
<!-- Keep examples 1 through 3 of the sorted set (filter not inverted). -->
<operator activated="true" class="filter_example_range" compatibility="9.6.000" expanded="true" height="82" name="Filter Example Range" width="90" x="313" y="34">
<parameter key="first_example" value="1"/>
<parameter key="last_example" value="3"/>
<parameter key="invert_filter" value="false"/>
</operator>
<!-- Inner wiring: loop input "single" to Sort, Sort to Filter Example Range, then to the loop's "output 1". -->
<connect from_port="single" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Filter Example Range" from_port="example set output" to_port="output 1"/>
<portSpacing port="source_single" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<!-- Step 7: Append receives the Loop Collection output on "example set 1" and delivers a single "merged set" as the process result. The accompanying post identifies this operator as the slowest step on large inputs. -->
<operator activated="true" class="append" compatibility="9.6.000" expanded="true" height="82" name="Append" width="90" x="983" y="136">
<!-- NOTE(review): two similarly named keys are present (datamanagement and data_management); presumably one is a legacy key kept by the exporter, confirm before removing either. -->
<parameter key="datamanagement" value="double_array"/>
<parameter key="data_management" value="auto"/>
<parameter key="merge_type" value="all"/>
</operator>
<!-- Top-level wiring. Golf data trains the GLM. -->
<connect from_op="Retrieve Golf" from_port="output" to_op="Generalized Linear Model" to_port="training set"/>
<!-- The trained model goes to Explain Predictions; the GLM's exampleSet is duplicated by Multiply and used as both training data and test data. -->
<connect from_op="Generalized Linear Model" from_port="model" to_op="Explain Predictions" to_port="model"/>
<connect from_op="Generalized Linear Model" from_port="exampleSet" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Explain Predictions" to_port="training data"/>
<connect from_op="Multiply" from_port="output 2" to_op="Explain Predictions" to_port="test data"/>
<!-- The importances table is grouped into a collection, reduced per group inside Loop Collection, and merged again by Append. -->
<connect from_op="Explain Predictions" from_port="importances output" to_op="Group Into Collection" to_port="exa"/>
<connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
<connect from_op="Loop Collection" from_port="output 1" to_op="Append" to_port="example set 1"/>
<!-- The merged set is the only result delivered by the process. -->
<connect from_op="Append" from_port="merged set" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>