Naive Bayes - Execute Python vs RM : different AUC

I am continuing my RM / Execute Python experiments with the NB model.
Sorry, but I feel obliged to appeal to you:
mschmitz, here with numerical examples for both the RM model and the Execute Python model.
Indeed, I get strictly the same scoring results in both models (accuracy, weighted mean recall, weighted mean precision, recall (positive class no/yes), precision (positive class no/yes)), except for the AUC:
AUC(RM)= 0.942
AUC(Python) = 0.883
I suppose that the AUC is calculated from the ROC curve.
But how is it calculated? How can this difference be explained?
Here is the process:
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve Deals" width="90" x="45" y="136">
<parameter key="repository_entry" value="//Samples/data/Deals"/>
<operator activated="true" class="nominal_to_numerical" compatibility="8.0.001" expanded="true" height="103" name="Nominal to Numerical (3)" width="90" x="179" y="85">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Future Customer"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
<list key="comparison_groups"/>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply" width="90" x="380" y="85"/>
<operator activated="true" class="naive_bayes" compatibility="8.0.001" expanded="true" height="82" name="Naive Bayes" width="90" x="514" y="85"/>
<operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve Deals (2)" width="90" x="45" y="340">
<parameter key="repository_entry" value="//Samples/data/Deals"/>
<operator activated="true" class="nominal_to_numerical" compatibility="8.0.001" expanded="true" height="103" name="Nominal to Numerical (2)" width="90" x="179" y="340">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Future Customer"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
<list key="comparison_groups"/>
<operator activated="true" class="nominal_to_numerical" compatibility="8.0.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="380" y="340">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Future Customer"/>
<parameter key="include_special_attributes" value="true"/>
<parameter key="coding_type" value="unique integers"/>
<list key="comparison_groups"/>
<operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="166" name="Build / Apply model" width="90" x="514" y="289">
<parameter key="script" value="import pandas as pd import numpy as np from sklearn.naive_bayes import GaussianNB from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import confusion_matrix from sklearn.metrics import accuracy_score from sklearn.metrics import recall_score from sklearn.metrics import precision_score from sklearn.metrics import roc_auc_score from sklearn import metrics # rm_main is a mandatory function, # the number of arguments has to be the number of input ports (can be none) def rm_main(data): # Build the model X = data.iloc[:,1:] y = data.iloc[:,0] NB = GaussianNB(),y) NB_Calib = CalibratedClassifierCV(base_estimator = NB,method = 'sigmoid'),y) #Calculate probability of each class. pr = NB.class_prior_ #Calculate mean of each feature per class th= NB.theta_ #Apply the model y_pred = NB.predict(X) y_prob = NB_Calib.predict_proba(X) # Calculate the scoring #confusion matrix conf_matrix = confusion_matrix(y,y_pred) #accuracy acc_score = 100*accuracy_score(y,y_pred) #weighted recall reca_score = 100*recall_score(y,y_pred,average = 'weighted') #weighted precision precisionscore = 100*precision_score(y,y_pred,average='weighted') #recall (positive class : yes / positive class : no ) reca_no = 100*recall_score(y,y_pred,average =None) #precision (positive class : yes / positive class : no ) precision_no = 100*precision_score(y,y_pred,average=None) #AUC (positive class : no) AUCscore = roc_auc_score(y,y_pred,average=None) #AUC (positive class : no) méthode n°2 fpr, tpr, thresholds = metrics.roc_curve(y, y_pred, pos_label=1) AUC_2 = metrics.auc(fpr, tpr) #Write the y_pred and scores in dataframe y_prediction = pd.DataFrame(data = y_pred,columns = ['prediction(Future Customer)']) y_probability = pd.DataFrame(data = y_prob,columns = ['confidence(yes)','confidence(no)']) data = data.join(y_prediction) data = data.join(y_probability) accu_score = pd.DataFrame(data = [acc_score],columns = ['accuracy']) recall_weighted = pd.DataFrame(data 
= [reca_score],columns = ['weighted_mean_recall']) precision_weighted = pd.DataFrame(data = [precisionscore],columns = ['weighted_mean_precision']) recall_no = pd.DataFrame(data = [reca_no],columns = ['recall (positive class : yes)','recall (positive class : no)']) precision_no = pd.DataFrame(data = [precision_no],columns = ['precision (positive class : yes)','precision (positive class : no)']) AUC = pd.DataFrame(data = [AUCscore],columns = ['AUC']) AUC2 = pd.DataFrame(data = [AUC_2],columns = ['AUC_method2']) score = accu_score.join(recall_weighted) score = score.join(precision_weighted) score = score.join(recall_no) score = score.join(precision_no) score = score.join(AUC) score = score.join(AUC2) theta = pd.DataFrame(data = th,columns = ['Gender = Male','Gender = Female','PM = Credit card','PM = cheque','PM = cash','Age']) proba = pd.DataFrame(data = pr, columns = ['probability']) confus_matrix = pd.DataFrame(data = conf_matrix,columns = ['true yes','true no']) 	 #data.rm_metadata['prediction(Future Customer)']=(None,'prediction(Future Customer)') # connect 4 output ports to see the results return score,theta, confus_matrix,proba,data"/>
<operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model" width="90" x="648" y="85">
<list key="application_parameters"/>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply (2)" width="90" x="782" y="85"/>
<operator activated="true" class="performance" compatibility="8.0.001" expanded="true" height="82" name="Performance (2)" width="90" x="916" y="136"/>
<operator activated="true" class="performance_classification" compatibility="8.0.001" expanded="true" height="82" name="Performance" width="90" x="916" y="34">
<parameter key="weighted_mean_recall" value="true"/>
<parameter key="weighted_mean_precision" value="true"/>
<list key="class_weights"/>
<connect from_op="Retrieve Deals" from_port="output" to_op="Nominal to Numerical (3)" to_port="example set input"/>
<connect from_op="Nominal to Numerical (3)" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Naive Bayes" to_port="training set"/>
<connect from_op="Multiply" from_port="output 2" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Naive Bayes" from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_op="Retrieve Deals (2)" from_port="output" to_op="Nominal to Numerical (2)" to_port="example set input"/>
<connect from_op="Nominal to Numerical (2)" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
<connect from_op="Nominal to Numerical" from_port="example set output" to_op="Build / Apply model" to_port="input 1"/>
<connect from_op="Build / Apply model" from_port="output 1" to_port="result 1"/>
<connect from_op="Build / Apply model" from_port="output 2" to_port="result 2"/>
<connect from_op="Build / Apply model" from_port="output 3" to_port="result 3"/>
<connect from_op="Build / Apply model" from_port="output 4" to_port="result 4"/>
<connect from_op="Build / Apply model" from_port="output 5" to_port="result 7"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Multiply (2)" to_port="input"/>
<connect from_op="Multiply (2)" from_port="output 1" to_op="Performance" to_port="labelled data"/>
<connect from_op="Multiply (2)" from_port="output 2" to_op="Performance (2)" to_port="labelled data"/>
<connect from_op="Performance (2)" from_port="performance" to_port="result 8"/>
<connect from_op="Performance" from_port="performance" to_port="result 5"/>
<connect from_op="Performance" from_port="example set" to_port="result 6"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
<portSpacing port="sink_result 6" spacing="0"/>
<portSpacing port="sink_result 7" spacing="0"/>
<portSpacing port="sink_result 8" spacing="0"/>
<portSpacing port="sink_result 9" spacing="0"/>
Thank you,
Best regards,
Best Answer
Hi @lionelderkrikor,
I think one of the main differences is this line:
NB_Calib = CalibratedClassifierCV(base_estimator = NB,method = 'sigmoid')
I am not sure exactly what it does, but it changes the confidences. RM is not doing that in its X-Val, so different results are to be expected.
A fairer comparison of the AUC itself would be to take the example set that was scored in RM and calculate the AUC on it in both Python and RM. There are always slight differences in how AUC is calculated, but your difference is rather large for that. I would expect the line above to account for more of the difference.
It likely has to do with the way that ties are handled, because there are multiple options for that when calculating ROC/AUC and not all software uses the same method. You'll either have to dive into the details of the ROC/AUC calculations in Python vs. RapidMiner (via the Java code on GitHub), or maybe one of the developers will chime in because they already know the answer :-)
Hi @mschmitz
Here are two points:
1. Probability calibration:
Recently, during my Python/RM comparison experiments, I also became interested in
NB_Calib = CalibratedClassifierCV(base_estimator = NB,method = 'sigmoid')
Indeed, at first the confidences calculated by the model (SVM) in Python were aberrant (for the predicted class, the confidence was < 0.5 for a binary problem!). After investigating, I discovered this Python class, which seems to improve the relevance of classifier confidences. So I built an SVM model (strictly the same in both Python and RM) and used this class to calculate the new confidences in Python: they were different from RM's.
To go further:
To confirm this with the NB model, in the following process I also applied the class above with Execute Python: the confidences from "Execute Python" are indeed different from the RM confidences (the training example set Chapter09DataSet_Training.csv is in the attached file).
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve Chapter09DataSet_Training" width="90" x="45" y="85">
<parameter key="repository_entry" value="//Rapidminer_Tests/data/Chapter09DataSet_Training"/>
<operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="85">
<parameter key="attribute_name" value="2nd_Heart_Attack"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply" width="90" x="313" y="187"/>
<operator activated="true" class="naive_bayes" compatibility="8.0.001" expanded="true" height="82" name="Naive Bayes" width="90" x="447" y="85"/>
<operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model" width="90" x="581" y="136">
<list key="application_parameters"/>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply (2)" width="90" x="648" y="34"/>
<operator activated="true" class="performance" compatibility="8.0.001" expanded="true" height="82" name="Performance" width="90" x="782" y="136"/>
<operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve Chapter09DataSet_Training (2)" width="90" x="45" y="442">
<parameter key="repository_entry" value="//Rapidminer_Tests/data/Chapter09DataSet_Training"/>
<operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role (2)" width="90" x="179" y="442">
<parameter key="attribute_name" value="2nd_Heart_Attack"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply (3)" width="90" x="313" y="544"/>
<operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="103" name="Naive Bayes Python" width="90" x="447" y="442">
<parameter key="script" value="import pandas as pd from sklearn.naive_bayes import GaussianNB from sklearn.calibration import CalibratedClassifierCV # rm_main is a mandatory function, # the number of arguments has to be the number of input ports (can be none) def rm_main(data): y = data['2nd_Heart_Attack'] X = data.drop('2nd_Heart_Attack',axis = 1) #List of attributes features = list(X) #Build the model model_NB = GaussianNB(),y) #Create de calibrated model model_NB_calib =CalibratedClassifierCV(model_NB,method = 'sigmoid'),y) #Calculation of distribution table (mean) th = model_NB.theta_ th_1 = th[0,:] th_2 = th[1,:] #Calculation of distribution table (stv) std = model_NB.sigma_ std_1 = (std[0,:])**0.5 std_2 = (std[1,:])**0.5 #Write the results theta_2 = pd.DataFrame(data = th_2,columns = ['Yes (main)']) theta_1 = pd.DataFrame(data = th_1,columns = ['No (main)']) sigma_2 = pd.DataFrame(data = std_2,columns = ['Yes (std)']) sigma_1 = pd.DataFrame(data = std_1,columns = ['No (std)']) theta = pd.DataFrame(data = features,columns = ['Attribute']) theta = theta.join(theta_2) theta = theta.join(sigma_2) theta = theta.join(theta_1) theta = theta.join(sigma_1) # connect 1 output port to see the results return model_NB_calib,theta"/>
<operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="103" name="Apply model Python (2)" width="90" x="581" y="544">
<parameter key="script" value="import pandas as pd from sklearn.metrics import accuracy_score from sklearn.metrics import roc_auc_score from sklearn.preprocessing import LabelEncoder # rm_main is a mandatory function, # the number of arguments has to be the number of input ports (can be none) def rm_main(model,data): y = data['2nd_Heart_Attack'] X = data.drop('2nd_Heart_Attack',axis = 1) #Prediction : Applying of the model y_pred = model.predict(X) y_prob = model.predict_proba(X) #Transform (Yes/No) ==> (0/1) mandatory for python le = LabelEncoder() y_bin = le.fit_transform(y) y_pred_bin = le.fit_transform(y_pred) 	 #Calculation of accuracy acc = 100*accuracy_score(y,y_pred) #Calculation of AUC auc_ = roc_auc_score(y_bin,y_pred_bin,average = 'weighted') #Write the results data['prediction(2nd_Heart_Attack)'] = y_pred data['confidence(Yes)'] = y_prob[:,1] data['confidence(No)'] = y_prob[:,0] performance = pd.DataFrame(data = [acc],columns = ['accuracy']) AUC = pd.DataFrame(data = [auc_],columns = ['AUC']) performance = performance.join(AUC) data.rm_metadata['prediction(2nd_Heart_Attack)']=(None,'prediction(2nd_Heart_Attack)') data.rm_metadata['confidence(Yes)'] = (None,'confidence(Yes)') data.rm_metadata['confidence(No)'] = (None,'confidence(No)') # connect 2 output ports to see the results return data, performance"/>
<connect from_op="Retrieve Chapter09DataSet_Training" from_port="output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Naive Bayes" to_port="training set"/>
<connect from_op="Multiply" from_port="output 2" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Naive Bayes" from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Multiply (2)" to_port="input"/>
<connect from_op="Apply Model" from_port="model" to_port="result 5"/>
<connect from_op="Multiply (2)" from_port="output 1" to_op="Performance" to_port="labelled data"/>
<connect from_op="Multiply (2)" from_port="output 2" to_port="result 2"/>
<connect from_op="Performance" from_port="performance" to_port="result 1"/>
<connect from_op="Retrieve Chapter09DataSet_Training (2)" from_port="output" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Multiply (3)" to_port="input"/>
<connect from_op="Multiply (3)" from_port="output 1" to_op="Naive Bayes Python" to_port="input 1"/>
<connect from_op="Multiply (3)" from_port="output 2" to_op="Apply model Python (2)" to_port="input 2"/>
<connect from_op="Naive Bayes Python" from_port="output 1" to_op="Apply model Python (2)" to_port="input 1"/>
<connect from_op="Naive Bayes Python" from_port="output 2" to_port="result 6"/>
<connect from_op="Apply model Python (2)" from_port="output 1" to_port="result 3"/>
<connect from_op="Apply model Python (2)" from_port="output 2" to_port="result 4"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
<portSpacing port="sink_result 6" spacing="0"/>
<portSpacing port="sink_result 7" spacing="0"/>
</process>
2. The ROC curve:
In parallel, I built the ROC curve with Python, and it looks weird:
Python uses only one point to create the ROC curve. Here is a screenshot of this ROC curve:
NB ROC python curve
RM, on the other hand, uses many more points:
NB_ROC_RM curve
The number of points taken into account is not the same in the two cases. RM is more accurate than Python, so
the two curves do not have the same "shape", and therefore the Area Under the Curve is different. To me, there is a "bug", or at least
a simplification / lack of precision, in Python.
Best regards,