🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

AUPRC with imbalanced classes

User: "kypexin"
New Altair Community Member
Updated by Jocelyn

Hi, it seems I am not getting the expected results when using Performance (AUPRC) with a highly imbalanced dataset.

 

The relationship between recall and precision of the positive class seems pretty intuitive, but I still get AUPRC = 0.010 regardless of anything:

 

Screenshot 2018-04-25 23.28.32.png, Screenshot 2018-04-25 23.28.14.png

Here I am using the imbalanced credit card fraud dataset.

 

At the same time, when I artificially balance the data, AUPRC shows the expected 'normal' values:

 

Screenshot 2018-04-25 23.35.06.png, Screenshot 2018-04-25 23.34.59.png

Process attached:

 

<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process (version 8.1.003).
  Active data flow, as wired by the <connect> elements below:
    Retrieve creditcard -> equalize classes -> Split Data (0.8 / 0.2, stratified)
    partition 1 -> Validation (cross validation around a Generalized Linear Model)
    partition 2 -> apply on test -> perf test -> perf test (2) (AUPRC)
  Operators carrying activated="false" are disabled and take no part in that flow.
-->
<process version="8.1.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.003" expanded="true" name="Process">
    <process expanded="true">
      <!-- Data source: repository entry ../data/creditcard. -->
      <operator activated="true" class="retrieve" compatibility="8.1.003" expanded="true" height="68" name="Retrieve creditcard" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/creditcard"/>
      </operator>
      <!-- Balanced sampling: 492 examples requested for each of the classes "1" and "0". -->
      <operator activated="true" class="sample" compatibility="8.1.003" expanded="true" height="82" name="equalize classes" width="90" x="179" y="34">
        <parameter key="balance_data" value="true"/>
        <list key="sample_size_per_class">
          <parameter key="1" value="492"/>
          <parameter key="0" value="492"/>
        </list>
        <list key="sample_ratio_per_class"/>
        <list key="sample_probability_per_class"/>
      </operator>
      <!-- Disabled alternative: stratified sample of 50000 examples. -->
      <operator activated="false" class="sample_stratified" compatibility="8.1.003" expanded="true" height="82" name="sample 50k" width="90" x="45" y="340">
        <parameter key="sample_size" value="50000"/>
      </operator>
      <!-- Disabled: fixed threshold of 0.09 between classes "0" and "1". -->
      <operator activated="false" class="create_threshold" compatibility="8.1.003" expanded="true" height="68" name="Create Threshold" width="90" x="581" y="391">
        <parameter key="threshold" value="0.09"/>
        <parameter key="first_class" value="0"/>
        <parameter key="second_class" value="1"/>
      </operator>
      <!-- Stratified 0.8 / 0.2 split; partition 1 goes to Validation, partition 2 to "apply on test". -->
      <operator activated="true" class="split_data" compatibility="8.1.003" expanded="true" height="103" name="Split Data" width="90" x="246" y="136">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.8"/>
          <parameter key="ratio" value="0.2"/>
        </enumeration>
        <parameter key="sampling_type" value="stratified sampling"/>
      </operator>
      <!-- Cross validation using shuffled sampling. -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="8.1.003" expanded="true" height="145" name="Validation" width="90" x="380" y="34">
        <parameter key="sampling_type" value="shuffled sampling"/>
        <!-- First inner process: receives "training set" and delivers "model". Only the GLM is wired in. -->
        <process expanded="true">
          <operator activated="false" class="concurrency:parallel_decision_tree" compatibility="8.1.003" expanded="true" height="103" name="Decision Tree" width="90" x="112" y="136">
            <parameter key="apply_pruning" value="false"/>
            <parameter key="apply_prepruning" value="false"/>
          </operator>
          <operator activated="true" class="h2o:generalized_linear_model" compatibility="7.2.000" expanded="true" height="124" name="Generalized Linear Model" width="90" x="246" y="34">
            <list key="beta_constraints"/>
            <list key="expert_parameters"/>
          </operator>
          <operator activated="false" class="h2o:deep_learning" compatibility="7.6.001" expanded="true" height="82" name="Deep Learning" width="90" x="380" y="136">
            <enumeration key="hidden_layer_sizes">
              <parameter key="hidden_layer_sizes" value="50"/>
              <parameter key="hidden_layer_sizes" value="50"/>
            </enumeration>
            <enumeration key="hidden_dropout_ratios"/>
            <list key="expert_parameters"/>
            <list key="expert_parameters_"/>
          </operator>
          <!-- Disabled stacking ensemble: three base learners feeding a GLM stacking model. -->
          <operator activated="false" class="stacking" compatibility="8.1.003" expanded="true" height="68" name="Stacking" width="90" x="179" y="289">
            <process expanded="true">
              <operator activated="true" class="h2o:generalized_linear_model" compatibility="7.2.000" expanded="true" height="124" name="Generalized Linear Model (2)" width="90" x="179" y="187">
                <list key="beta_constraints"/>
                <list key="expert_parameters"/>
              </operator>
              <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="8.1.003" expanded="true" height="103" name="Decision Tree (2)" width="90" x="112" y="34">
                <parameter key="apply_pruning" value="false"/>
                <parameter key="apply_prepruning" value="false"/>
              </operator>
              <operator activated="true" class="h2o:deep_learning" compatibility="7.6.001" expanded="true" height="82" name="Deep Learning (2)" width="90" x="112" y="340">
                <enumeration key="hidden_layer_sizes">
                  <parameter key="hidden_layer_sizes" value="20"/>
                  <parameter key="hidden_layer_sizes" value="20"/>
                </enumeration>
                <enumeration key="hidden_dropout_ratios"/>
                <list key="expert_parameters"/>
                <list key="expert_parameters_"/>
              </operator>
              <connect from_port="training set 1" to_op="Decision Tree (2)" to_port="training set"/>
              <connect from_port="training set 2" to_op="Generalized Linear Model (2)" to_port="training set"/>
              <connect from_port="training set 3" to_op="Deep Learning (2)" to_port="training set"/>
              <connect from_op="Generalized Linear Model (2)" from_port="model" to_port="base model 2"/>
              <connect from_op="Decision Tree (2)" from_port="model" to_port="base model 1"/>
              <connect from_op="Deep Learning (2)" from_port="model" to_port="base model 3"/>
              <portSpacing port="source_training set 1" spacing="0"/>
              <portSpacing port="source_training set 2" spacing="0"/>
              <portSpacing port="source_training set 3" spacing="0"/>
              <portSpacing port="source_training set 4" spacing="0"/>
              <portSpacing port="sink_base model 1" spacing="0"/>
              <portSpacing port="sink_base model 2" spacing="0"/>
              <portSpacing port="sink_base model 3" spacing="0"/>
              <portSpacing port="sink_base model 4" spacing="0"/>
            </process>
            <process expanded="true">
              <operator activated="true" class="h2o:generalized_linear_model" compatibility="7.6.001" expanded="true" height="124" name="Generalized Linear Model (3)" width="90" x="45" y="34">
                <list key="beta_constraints"/>
                <list key="expert_parameters"/>
              </operator>
              <connect from_port="stacking examples" to_op="Generalized Linear Model (3)" to_port="training set"/>
              <connect from_op="Generalized Linear Model (3)" from_port="model" to_port="stacking model"/>
              <portSpacing port="source_stacking examples" spacing="0"/>
              <portSpacing port="sink_stacking model" spacing="0"/>
            </process>
          </operator>
          <connect from_port="training set" to_op="Generalized Linear Model" to_port="training set"/>
          <connect from_op="Generalized Linear Model" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!--
          Second inner process: receives "model" and "test set", delivers "performance 1".
          Note: despite its name, "apply on train" is connected to the "test set" port.
        -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="8.1.003" expanded="true" height="82" name="apply on train" width="90" x="45" y="34">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="operator_toolbox:performance_auprc" compatibility="1.0.000" expanded="true" height="82" name="perf train" width="90" x="246" y="34">
            <parameter key="main_criterion" value="AUPRC"/>
            <parameter key="AUC" value="true"/>
            <parameter key="AUPRC" value="true"/>
          </operator>
          <connect from_port="model" to_op="apply on train" to_port="model"/>
          <connect from_port="test set" to_op="apply on train" to_port="unlabelled data"/>
          <connect from_op="apply on train" from_port="labelled data" to_op="perf train" to_port="labelled data"/>
          <connect from_op="perf train" from_port="performance" to_port="performance 1"/>
          <connect from_op="perf train" from_port="example set" to_port="test set results"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
        </process>
      </operator>
      <!-- Hold-out scoring: the Validation model is applied to partition 2 of Split Data. -->
      <operator activated="true" class="apply_model" compatibility="8.1.003" expanded="true" height="82" name="apply on test" width="90" x="581" y="136">
        <list key="application_parameters"/>
      </operator>
      <!-- Disabled: recall-driven threshold selection (min_recall 0.8, positive label "1"). -->
      <operator activated="false" class="select_recall" compatibility="8.1.003" expanded="true" height="82" name="Select Recall" width="90" x="581" y="289">
        <parameter key="min_recall" value="0.8"/>
        <parameter key="positive_label" value="1"/>
      </operator>
      <operator activated="false" class="apply_threshold" compatibility="8.1.003" expanded="true" height="82" name="Apply Threshold" width="90" x="715" y="289"/>
      <!-- Hold-out performance: "perf test" passes its performance and example set on to "perf test (2)". -->
      <operator activated="true" class="performance" compatibility="8.1.003" expanded="true" height="82" name="perf test" width="90" x="715" y="136"/>
      <operator activated="true" class="operator_toolbox:performance_auprc" compatibility="1.0.000" expanded="true" height="82" name="perf test (2)" width="90" x="849" y="136">
        <parameter key="main_criterion" value="AUPRC"/>
        <parameter key="accuracy" value="false"/>
        <parameter key="AUPRC" value="true"/>
      </operator>
      <!-- Top-level wiring. -->
      <connect from_op="Retrieve creditcard" from_port="output" to_op="equalize classes" to_port="example set input"/>
      <connect from_op="equalize classes" from_port="example set output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Validation" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="apply on test" to_port="unlabelled data"/>
      <connect from_op="Validation" from_port="model" to_op="apply on test" to_port="model"/>
      <connect from_op="Validation" from_port="performance 1" to_port="result 1"/>
      <connect from_op="apply on test" from_port="labelled data" to_op="perf test" to_port="labelled data"/>
      <connect from_op="Select Recall" from_port="example set" to_op="Apply Threshold" to_port="example set"/>
      <connect from_op="Select Recall" from_port="threshold" to_op="Apply Threshold" to_port="threshold"/>
      <connect from_op="perf test" from_port="performance" to_op="perf test (2)" to_port="performance"/>
      <connect from_op="perf test" from_port="example set" to_op="perf test (2)" to_port="labelled data"/>
      <connect from_op="perf test (2)" from_port="performance" to_port="result 2"/>
      <connect from_op="perf test (2)" from_port="example set" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

 

 

Find more posts tagged with