Hi,
I continue my experiments on RM/Execute Python with the NB model.
Sorry, but I feel obliged to appeal to you :
mschmitz , that is with numerical examples for both model RM and execute Python.
Indeed, I retrieve in both models strictly the same scoring results (accuracy, weighted mean recall, weighted mean precision, recall (positive class no/yes), precision (positive class no/yes) ) except..... for the AUC :
AUC(RM)= 0.942
AUC(Python) = 0.883
I suppose that the AUC is calculated from the ROC curve.
But how it is calculated ?. How explain this difference?
Here the process :
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve Deals" width="90" x="45" y="136">
<parameter key="repository_entry" value="//Samples/data/Deals"/>
</operator>
<operator activated="true" class="nominal_to_numerical" compatibility="8.0.001" expanded="true" height="103" name="Nominal to Numerical (3)" width="90" x="179" y="85">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Future Customer"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
<list key="comparison_groups"/>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply" width="90" x="380" y="85"/>
<operator activated="true" class="naive_bayes" compatibility="8.0.001" expanded="true" height="82" name="Naive Bayes" width="90" x="514" y="85"/>
<operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve Deals (2)" width="90" x="45" y="340">
<parameter key="repository_entry" value="//Samples/data/Deals"/>
</operator>
<operator activated="true" class="nominal_to_numerical" compatibility="8.0.001" expanded="true" height="103" name="Nominal to Numerical (2)" width="90" x="179" y="340">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Future Customer"/>
<parameter key="invert_selection" value="true"/>
<parameter key="include_special_attributes" value="true"/>
<list key="comparison_groups"/>
</operator>
<operator activated="true" class="nominal_to_numerical" compatibility="8.0.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="380" y="340">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Future Customer"/>
<parameter key="include_special_attributes" value="true"/>
<parameter key="coding_type" value="unique integers"/>
<list key="comparison_groups"/>
</operator>
<operator activated="true" class="python_scripting:execute_python" compatibility="7.4.000" expanded="true" height="166" name="Build / Apply model" width="90" x="514" y="289">
<parameter key="script" value="import pandas as pd import numpy as np from sklearn.naive_bayes import GaussianNB from sklearn.calibration import CalibratedClassifierCV from sklearn.metrics import confusion_matrix from sklearn.metrics import accuracy_score from sklearn.metrics import recall_score from sklearn.metrics import precision_score from sklearn.metrics import roc_auc_score from sklearn import metrics # rm_main is a mandatory function, # the number of arguments has to be the number of input ports (can be none) def rm_main(data): # Build the model X = data.iloc[:,1:] y = data.iloc[:,0] NB = GaussianNB() NB.fit(X,y) NB_Calib = CalibratedClassifierCV(base_estimator = NB,method = 'sigmoid') NB_Calib.fit(X,y) #Calculate probability of each class. pr = NB.class_prior_ #Calculate mean of each feature per class th= NB.theta_ #Apply the model y_pred = NB.predict(X) y_prob = NB_Calib.predict_proba(X) # Calculate the scoring #confusion matrix conf_matrix = confusion_matrix(y,y_pred) #accuracy acc_score = 100*accuracy_score(y,y_pred) #weighted recall reca_score = 100*recall_score(y,y_pred,average = 'weighted') #weighted precision precisionscore = 100*precision_score(y,y_pred,average='weighted') #recall (positive class : yes / positive class : no ) reca_no = 100*recall_score(y,y_pred,average =None) #precision (positive class : yes / positive class : no ) precision_no = 100*precision_score(y,y_pred,average=None) #AUC (positive class : no) AUCscore = roc_auc_score(y,y_pred,average=None) #AUC (positive class : no) méthode n°2 fpr, tpr, thresholds = metrics.roc_curve(y, y_pred, pos_label=1) AUC_2 = metrics.auc(fpr, tpr) #Write the y_pred and scores in dataframe y_prediction = pd.DataFrame(data = y_pred,columns = ['prediction(Future Customer)']) y_probability = pd.DataFrame(data = y_prob,columns = ['confidence(yes)','confidence(no)']) data = data.join(y_prediction) data = data.join(y_probability) accu_score = pd.DataFrame(data = [acc_score],columns = ['accuracy']) recall_weighted = pd.DataFrame(data = [reca_score],columns = ['weighted_mean_recall']) precision_weighted = pd.DataFrame(data = [precisionscore],columns = ['weighted_mean_precision']) recall_no = pd.DataFrame(data = [reca_no],columns = ['recall (positive class : yes)','recall (positive class : no)']) precision_no = pd.DataFrame(data = [precision_no],columns = ['precision (positive class : yes)','precision (positive class : no)']) AUC = pd.DataFrame(data = [AUCscore],columns = ['AUC']) AUC2 = pd.DataFrame(data = [AUC_2],columns = ['AUC_method2']) score = accu_score.join(recall_weighted) score = score.join(precision_weighted) score = score.join(recall_no) score = score.join(precision_no) score = score.join(AUC) score = score.join(AUC2) theta = pd.DataFrame(data = th,columns = ['Gender = Male','Gender = Female','PM = Credit card','PM = cheque','PM = cash','Age']) proba = pd.DataFrame(data = pr, columns = ['probability']) confus_matrix = pd.DataFrame(data = conf_matrix,columns = ['true yes','true no']) 	 #data.rm_metadata['prediction(Future Customer)']=(None,'prediction(Future Customer)') # connect 4 output ports to see the results return score,theta, confus_matrix,proba,data"/>
</operator>
<operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model" width="90" x="648" y="85">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply (2)" width="90" x="782" y="85"/>
<operator activated="true" class="performance" compatibility="8.0.001" expanded="true" height="82" name="Performance (2)" width="90" x="916" y="136"/>
<operator activated="true" class="performance_classification" compatibility="8.0.001" expanded="true" height="82" name="Performance" width="90" x="916" y="34">
<parameter key="weighted_mean_recall" value="true"/>
<parameter key="weighted_mean_precision" value="true"/>
<list key="class_weights"/>
</operator>
<connect from_op="Retrieve Deals" from_port="output" to_op="Nominal to Numerical (3)" to_port="example set input"/>
<connect from_op="Nominal to Numerical (3)" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Naive Bayes" to_port="training set"/>
<connect from_op="Multiply" from_port="output 2" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Naive Bayes" from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_op="Retrieve Deals (2)" from_port="output" to_op="Nominal to Numerical (2)" to_port="example set input"/>
<connect from_op="Nominal to Numerical (2)" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
<connect from_op="Nominal to Numerical" from_port="example set output" to_op="Build / Apply model" to_port="input 1"/>
<connect from_op="Build / Apply model" from_port="output 1" to_port="result 1"/>
<connect from_op="Build / Apply model" from_port="output 2" to_port="result 2"/>
<connect from_op="Build / Apply model" from_port="output 3" to_port="result 3"/>
<connect from_op="Build / Apply model" from_port="output 4" to_port="result 4"/>
<connect from_op="Build / Apply model" from_port="output 5" to_port="result 7"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Multiply (2)" to_port="input"/>
<connect from_op="Multiply (2)" from_port="output 1" to_op="Performance" to_port="labelled data"/>
<connect from_op="Multiply (2)" from_port="output 2" to_op="Performance (2)" to_port="labelled data"/>
<connect from_op="Performance (2)" from_port="performance" to_port="result 8"/>
<connect from_op="Performance" from_port="performance" to_port="result 5"/>
<connect from_op="Performance" from_port="example set" to_port="result 6"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
<portSpacing port="sink_result 6" spacing="0"/>
<portSpacing port="sink_result 7" spacing="0"/>
<portSpacing port="sink_result 8" spacing="0"/>
<portSpacing port="sink_result 9" spacing="0"/>
</process>
</operator>
</process>
Thanks you,
Best regards,
Lionel