AUPRC with imbalanced classes
Hi, it seems I am not getting the expected results when using Performance (AUPRC) with a highly imbalanced dataset.
The relationship between recall and precision for the positive class seems pretty intuitive, but I still get AUPRC = 0.010 no matter what I change:
Here I am using the imbalanced credit card fraud dataset.
At the same time, when I artificially balance the data, AUPRC shows the expected 'normal' values:
Process attached:
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.1.003">
  <!--
    Process definition, version 8.1.003 (the operator classes suggest a RapidMiner export; confirm).
    Active data flow, as wired by the connect elements below:
      Retrieve creditcard > equalize classes > Split Data (0.8 / 0.2)
        partition 1 > Validation (cross validation)         > result 1
        partition 2 > apply on test > perf test > perf test (2) > result 2, result 3
    Operators with activated="false" are kept in the file but are not part of this flow.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
      <!-- Load the data set from the repository entry ../data/creditcard. -->
      <operator activated="true" class="retrieve" compatibility="8.1.003" expanded="true" height="68" name="Retrieve creditcard" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/creditcard"/>
      </operator>
      <!-- Balanced sampling: 492 examples are requested for class "1" and 492 for class "0". -->
      <operator activated="true" class="sample" compatibility="8.1.003" expanded="true" height="82" name="equalize classes" width="90" x="179" y="34">
        <parameter key="balance_data" value="true"/>
        <list key="sample_size_per_class">
          <parameter key="1" value="492"/>
          <parameter key="0" value="492"/>
        </list>
        <list key="sample_ratio_per_class"/>
        <list key="sample_probability_per_class"/>
      </operator>
      <!-- Deactivated and not connected: stratified sample of 50000 examples. -->
      <operator activated="false" class="sample_stratified" compatibility="8.1.003" expanded="true" height="82" name="sample 50k" width="90" x="45" y="340">
        <parameter key="sample_size" value="50000"/>
      </operator>
      <!-- Deactivated and not connected: fixed threshold of 0.09 between class "0" and class "1". -->
      <operator activated="false" class="create_threshold" compatibility="8.1.003" expanded="true" height="68" name="Create Threshold" width="90" x="581" y="391">
        <parameter key="threshold" value="0.09"/>
        <parameter key="first_class" value="0"/>
        <parameter key="second_class" value="1"/>
      </operator>
      <!-- Stratified split into two partitions: 0.8 feeds Validation, 0.2 feeds "apply on test". -->
      <operator activated="true" class="split_data" compatibility="8.1.003" expanded="true" height="103" name="Split Data" width="90" x="246" y="136">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.8"/>
          <parameter key="ratio" value="0.2"/>
        </enumeration>
        <parameter key="sampling_type" value="stratified sampling"/>
      </operator>
      <!-- Cross validation on the 0.8 partition, with sampling_type set to shuffled sampling. -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="8.1.003" expanded="true" height="145" name="Validation" width="90" x="380" y="34">
        <parameter key="sampling_type" value="shuffled sampling"/>
        <process expanded="true">
          <!--
            First subprocess of Validation (model building). Only "Generalized Linear Model" is
            activated and connected; Decision Tree, Deep Learning and Stacking are deactivated.
          -->
          <operator activated="false" class="concurrency:parallel_decision_tree" compatibility="8.1.003" expanded="true" height="103" name="Decision Tree" width="90" x="112" y="136">
            <parameter key="apply_pruning" value="false"/>
            <parameter key="apply_prepruning" value="false"/>
          </operator>
          <operator activated="true" class="h2o:generalized_linear_model" compatibility="7.2.000" expanded="true" height="124" name="Generalized Linear Model" width="90" x="246" y="34">
            <list key="beta_constraints"/>
            <list key="expert_parameters"/>
          </operator>
          <operator activated="false" class="h2o:deep_learning" compatibility="7.6.001" expanded="true" height="82" name="Deep Learning" width="90" x="380" y="136">
            <enumeration key="hidden_layer_sizes">
              <parameter key="hidden_layer_sizes" value="50"/>
              <parameter key="hidden_layer_sizes" value="50"/>
            </enumeration>
            <enumeration key="hidden_dropout_ratios"/>
            <list key="expert_parameters"/>
            <list key="expert_parameters_"/>
          </operator>
          <operator activated="false" class="stacking" compatibility="8.1.003" expanded="true" height="68" name="Stacking" width="90" x="179" y="289">
            <process expanded="true">
              <!-- Base learners: three models wired to the "base model 1" to "base model 3" ports. -->
              <operator activated="true" class="h2o:generalized_linear_model" compatibility="7.2.000" expanded="true" height="124" name="Generalized Linear Model (2)" width="90" x="179" y="187">
                <list key="beta_constraints"/>
                <list key="expert_parameters"/>
              </operator>
              <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="8.1.003" expanded="true" height="103" name="Decision Tree (2)" width="90" x="112" y="34">
                <parameter key="apply_pruning" value="false"/>
                <parameter key="apply_prepruning" value="false"/>
              </operator>
              <operator activated="true" class="h2o:deep_learning" compatibility="7.6.001" expanded="true" height="82" name="Deep Learning (2)" width="90" x="112" y="340">
                <enumeration key="hidden_layer_sizes">
                  <parameter key="hidden_layer_sizes" value="20"/>
                  <parameter key="hidden_layer_sizes" value="20"/>
                </enumeration>
                <enumeration key="hidden_dropout_ratios"/>
                <list key="expert_parameters"/>
                <list key="expert_parameters_"/>
              </operator>
              <connect from_port="training set 1" to_op="Decision Tree (2)" to_port="training set"/>
              <connect from_port="training set 2" to_op="Generalized Linear Model (2)" to_port="training set"/>
              <connect from_port="training set 3" to_op="Deep Learning (2)" to_port="training set"/>
              <connect from_op="Generalized Linear Model (2)" from_port="model" to_port="base model 2"/>
              <connect from_op="Decision Tree (2)" from_port="model" to_port="base model 1"/>
              <connect from_op="Deep Learning (2)" from_port="model" to_port="base model 3"/>
              <portSpacing port="source_training set 1" spacing="0"/>
              <portSpacing port="source_training set 2" spacing="0"/>
              <portSpacing port="source_training set 3" spacing="0"/>
              <portSpacing port="source_training set 4" spacing="0"/>
              <portSpacing port="sink_base model 1" spacing="0"/>
              <portSpacing port="sink_base model 2" spacing="0"/>
              <portSpacing port="sink_base model 3" spacing="0"/>
              <portSpacing port="sink_base model 4" spacing="0"/>
            </process>
            <process expanded="true">
              <!-- Stacking learner: a generalized linear model fed from the "stacking examples" port. -->
              <operator activated="true" class="h2o:generalized_linear_model" compatibility="7.6.001" expanded="true" height="124" name="Generalized Linear Model (3)" width="90" x="45" y="34">
                <list key="beta_constraints"/>
                <list key="expert_parameters"/>
              </operator>
              <connect from_port="stacking examples" to_op="Generalized Linear Model (3)" to_port="training set"/>
              <connect from_op="Generalized Linear Model (3)" from_port="model" to_port="stacking model"/>
              <portSpacing port="source_stacking examples" spacing="0"/>
              <portSpacing port="sink_stacking model" spacing="0"/>
            </process>
          </operator>
          <connect from_port="training set" to_op="Generalized Linear Model" to_port="training set"/>
          <connect from_op="Generalized Linear Model" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <!--
            Second subprocess of Validation (model evaluation). Note that the operator named
            "apply on train" is wired to the "test set" port, not to a training set. Its output is
            scored by "perf train" with AUPRC as the main criterion and AUC also enabled.
          -->
          <operator activated="true" class="apply_model" compatibility="8.1.003" expanded="true" height="82" name="apply on train" width="90" x="45" y="34">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="operator_toolbox:performance_auprc" compatibility="1.0.000" expanded="true" height="82" name="perf train" width="90" x="246" y="34">
            <parameter key="main_criterion" value="AUPRC"/>
            <parameter key="AUC" value="true"/>
            <parameter key="AUPRC" value="true"/>
          </operator>
          <connect from_port="model" to_op="apply on train" to_port="model"/>
          <connect from_port="test set" to_op="apply on train" to_port="unlabelled data"/>
          <connect from_op="apply on train" from_port="labelled data" to_op="perf train" to_port="labelled data"/>
          <connect from_op="perf train" from_port="performance" to_port="performance 1"/>
          <connect from_op="perf train" from_port="example set" to_port="test set results"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
        </process>
      </operator>
      <!-- Apply the model delivered by Validation to the 0.2 partition from Split Data. -->
      <operator activated="true" class="apply_model" compatibility="8.1.003" expanded="true" height="82" name="apply on test" width="90" x="581" y="136">
        <list key="application_parameters"/>
      </operator>
      <!-- Deactivated pair, wired only to each other: threshold selection for min_recall 0.8 on label "1". -->
      <operator activated="false" class="select_recall" compatibility="8.1.003" expanded="true" height="82" name="Select Recall" width="90" x="581" y="289">
        <parameter key="min_recall" value="0.8"/>
        <parameter key="positive_label" value="1"/>
      </operator>
      <operator activated="false" class="apply_threshold" compatibility="8.1.003" expanded="true" height="82" name="Apply Threshold" width="90" x="715" y="289"/>
      <!-- Score the 0.2 partition: "perf test" passes its performance and example set on to "perf test (2)". -->
      <operator activated="true" class="performance" compatibility="8.1.003" expanded="true" height="82" name="perf test" width="90" x="715" y="136"/>
      <operator activated="true" class="operator_toolbox:performance_auprc" compatibility="1.0.000" expanded="true" height="82" name="perf test (2)" width="90" x="849" y="136">
        <parameter key="main_criterion" value="AUPRC"/>
        <parameter key="accuracy" value="false"/>
        <parameter key="AUPRC" value="true"/>
      </operator>
      <!-- Top-level wiring. result 1 = cross-validation performance; result 2 and 3 = scores and examples for the 0.2 partition. -->
      <connect from_op="Retrieve creditcard" from_port="output" to_op="equalize classes" to_port="example set input"/>
      <connect from_op="equalize classes" from_port="example set output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Validation" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="apply on test" to_port="unlabelled data"/>
      <connect from_op="Validation" from_port="model" to_op="apply on test" to_port="model"/>
      <connect from_op="Validation" from_port="performance 1" to_port="result 1"/>
      <connect from_op="apply on test" from_port="labelled data" to_op="perf test" to_port="labelled data"/>
      <connect from_op="Select Recall" from_port="example set" to_op="Apply Threshold" to_port="example set"/>
      <connect from_op="Select Recall" from_port="threshold" to_op="Apply Threshold" to_port="threshold"/>
      <connect from_op="perf test" from_port="performance" to_op="perf test (2)" to_port="performance"/>
      <connect from_op="perf test" from_port="example set" to_op="perf test (2)" to_port="labelled data"/>
      <connect from_op="perf test (2)" from_port="performance" to_port="result 2"/>
      <connect from_op="perf test (2)" from_port="example set" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>