🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Naive Bayes - Execute Python vs RM : same model / different scoring results

User: "lionelderkrikor"
New Altair Community Member
Updated by Jocelyn

Hi,

 

I'm running some experiments in RM: I am comparing the results of the "Naive Bayes" operator

and those obtained from the "Execute Python" operator, using the "Deals" dataset.

In "Execute Python", building the model, applying it and calculating the scores are all performed using scikit-learn.

For RM,  I use the "Naive-Bayes" operator and the "Cross-Validation" operator.

 

After executing the process, something is weird : 

 - In both cases I get strictly the same "Distribution Table" (so I think the built model is the same in both cases)

but 

 - The confusion matrix, the mean accuracy, the weighted mean recall and the weighted mean precision are systematically different: the confusion matrices differ, and the performance of RM (~92%) is higher than that of Execute Python (~88%) on the same dataset.

 

Here you can find my process : 

<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<!-- Loads the "Deals" sample data set from the repository (input of the RapidMiner branch). -->
<operator name="Retrieve Deals" class="retrieve" compatibility="8.0.001"
          activated="true" expanded="true"
          height="68" width="90" x="45" y="85">
  <parameter key="repository_entry" value="//Samples/data/Deals"/>
</operator>
<!-- Cross Validation: the first subprocess trains a Naive Bayes model on the training set,
     the second applies that model to the test set and measures classification performance. -->
<operator name="Cross Validation" class="concurrency:cross_validation" compatibility="8.0.001"
          activated="true" expanded="true"
          height="145" width="90" x="179" y="34">
  <!-- Training subprocess -->
  <process expanded="true">
    <operator name="Naive Bayes" class="naive_bayes" compatibility="8.0.001"
              activated="true" expanded="true"
              height="82" width="90" x="179" y="34"/>
    <connect from_port="training set" to_op="Naive Bayes" to_port="training set"/>
    <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
  </process>
  <!-- Testing subprocess -->
  <process expanded="true">
    <operator name="Apply Model" class="apply_model" compatibility="8.0.001"
              activated="true" expanded="true"
              height="82" width="90" x="45" y="34">
      <list key="application_parameters"/>
    </operator>
    <!-- Weighted mean recall and weighted mean precision are switched on explicitly. -->
    <operator name="Performance" class="performance_classification" compatibility="8.0.001"
              activated="true" expanded="true"
              height="82" width="90" x="179" y="34">
      <parameter key="weighted_mean_recall" value="true"/>
      <parameter key="weighted_mean_precision" value="true"/>
      <list key="class_weights"/>
    </operator>
    <connect from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
    <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
  </process>
</operator>
<!-- Second copy of the "Deals" sample data, used as input of the Python branch. -->
<operator name="Retrieve Deals (2)" class="retrieve" compatibility="8.0.001"
          activated="true" expanded="true"
          height="68" width="90" x="45" y="289">
  <parameter key="repository_entry" value="//Samples/data/Deals"/>
</operator>
<!-- Converts every nominal attribute except "Future Customer" to numerical
     (single-attribute filter with inverted selection, special attributes included).
     NOTE(review): coding_type is not set here, so the operator default applies; confirm it is dummy coding. -->
<operator name="Nominal to Numerical (2)" class="nominal_to_numerical" compatibility="8.0.001"
          activated="true" expanded="true"
          height="103" width="90" x="179" y="238">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="Future Customer"/>
  <parameter key="invert_selection" value="true"/>
  <parameter key="include_special_attributes" value="true"/>
  <list key="comparison_groups"/>
</operator>
<!-- Converts only the "Future Customer" attribute to numbers, using unique-integer coding. -->
<operator name="Nominal to Numerical" class="nominal_to_numerical" compatibility="8.0.001"
          activated="true" expanded="true"
          height="103" width="90" x="380" y="238">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="Future Customer"/>
  <parameter key="include_special_attributes" value="true"/>
  <parameter key="coding_type" value="unique integers"/>
  <list key="comparison_groups"/>
</operator>
<!-- Execute Python: fits a scikit-learn GaussianNB on the incoming example set and returns four data frames:
     cross-validated scores, per-class feature means, confusion matrix of the training-set predictions, class priors.
     The script lives in an attribute value, so line breaks are written as &#10; character references
     (literal line breaks inside an attribute would be normalised to spaces by the XML parser). -->
<operator name="Build / Apply model" class="python_scripting:execute_python" compatibility="7.4.000"
          activated="true" expanded="true"
          height="145" width="90" x="514" y="238">
  <parameter key="script" value="import pandas as pd&#10;from sklearn.naive_bayes import GaussianNB&#10;from sklearn.model_selection import cross_val_score&#10;from sklearn.metrics import confusion_matrix&#10;&#10;&#10;def _cv_summary(model, X, y, scoring):&#10;    # Run the 10-fold cross-validation once and format mean +/- std in percent&#10;    scores = cross_val_score(model, X, y, cv = 10, scoring = scoring)&#10;    return str(100 * scores.mean()) + &quot; +/- &quot; + str(100 * scores.std())&#10;&#10;&#10;# rm_main is a mandatory function,&#10;# the number of arguments has to be the number of input ports (can be none)&#10;def rm_main(data):&#10;&#10;    # Build the model: column 0 is used as the target, the remaining columns as features&#10;    X = data.iloc[:,1:]&#10;    y = data.iloc[:,0]&#10;    NB = GaussianNB()&#10;    NB.fit(X,y)&#10;&#10;    # Prior probability of each class&#10;    pr = NB.class_prior_&#10;&#10;    # Mean of each feature per class&#10;    th = NB.theta_&#10;&#10;    # Apply the model to the data it was trained on&#10;    y_pred = NB.predict(X)&#10;&#10;    # confusion_matrix returns rows = true class and columns = predicted class;&#10;    # transpose it so that the columns really are the true classes named below&#10;    conf_matrix = confusion_matrix(y,y_pred).T&#10;&#10;    # Cross-validated scores (weighted recall equals accuracy by definition)&#10;    acc_score = _cv_summary(NB, X, y, 'accuracy')&#10;    reca_score = _cv_summary(NB, X, y, 'recall_weighted')&#10;    precision_score = _cv_summary(NB, X, y, 'precision_weighted')&#10;&#10;    # Write the scores in dataframe&#10;    accu_score = pd.DataFrame(data = [acc_score],columns = ['accuracy'])&#10;    recall_weighted = pd.DataFrame(data = [reca_score],columns = ['weighted_mean_recall'])&#10;    precision_weighted = pd.DataFrame(data = [precision_score],columns = ['weighted_mean_precision'])&#10;    score = accu_score.join(recall_weighted)&#10;    score = score.join(precision_weighted)&#10;&#10;    # NOTE(review): the hard-coded names assume this exact column / class order in the input - confirm&#10;    theta = pd.DataFrame(data = th,columns = ['Gender = Male','Gender = Female','PM = Credit card','PM = cheque','PM = cash','Age'])&#10;    proba = pd.DataFrame(data = pr, columns = ['probability'])&#10;&#10;    confus_matrix = pd.DataFrame(data = conf_matrix,columns = ['true yes','true no'])&#10;&#10;    # connect 4 output ports to see the results&#10;    return score,theta, confus_matrix,proba"/>
</operator>
<!-- RapidMiner branch: Deals data into Cross Validation; its model, example set and performance go to result ports. -->
<connect from_op="Retrieve Deals" from_port="output" to_op="Cross Validation" to_port="example set"/>
<connect from_op="Cross Validation" from_port="model" to_port="result 5"/>
<connect from_op="Cross Validation" from_port="example set" to_port="result 3"/>
<connect from_op="Cross Validation" from_port="performance 1" to_port="result 6"/>
<!-- Python branch: Deals data through both Nominal to Numerical steps into the Execute Python operator. -->
<connect from_op="Retrieve Deals (2)" from_port="output" to_op="Nominal to Numerical (2)" to_port="example set input"/>
<connect from_op="Nominal to Numerical (2)" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
<connect from_op="Nominal to Numerical" from_port="example set output" to_op="Build / Apply model" to_port="input 1"/>
<!-- The four outputs of the script are delivered to result ports 1, 2, 4 and 7. -->
<connect from_op="Build / Apply model" from_port="output 1" to_port="result 1"/>
<connect from_op="Build / Apply model" from_port="output 2" to_port="result 2"/>
<connect from_op="Build / Apply model" from_port="output 3" to_port="result 4"/>
<connect from_op="Build / Apply model" from_port="output 4" to_port="result 7"/>
<!-- Port spacing entries (all 0). -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
<portSpacing port="sink_result 6" spacing="0"/>
<portSpacing port="sink_result 7" spacing="0"/>
<portSpacing port="sink_result 8" spacing="0"/>
</process>
</operator>
</process>

NB: I think there is a bug in scikit-learn: the mean accuracy and the weighted mean recall are strictly and systematically equal (tested on other datasets).

 

How can we explain these mysterious results ?

 

Thank you for your explanation,

 

Regards,

 

Lionel

 

 

 

Find more posts tagged with

Sort by:
1 - 1 of 11
    User: "MartinLiebig"
    Altair Employee
    Accepted Answer

    Hi Lionel,

     

    you need to compare apples with apples. You hand over a numericalized example set to Python. The python NB assumes Gaussian distribution for all attributes.

    In RM you hand over partly numerical, partly nominal data. RM also assumes Gaussian data for the numerical parts, but for the nominal ones we get the probability from the ratios (20% of the data are female => p=0.2).

     

    If you use all-numerical data everywhere you get the same results (see attached process). I think sklearn is not able to handle nominals correctly the way we do.

     

    Best,

    Martin

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
    <process expanded="true">
    <!-- Loads the "Deals" sample data set from the repository; single source for both branches. -->
    <operator name="Retrieve Deals (2)" class="retrieve" compatibility="8.0.001"
              activated="true" expanded="true"
              height="68" width="90" x="45" y="238">
      <parameter key="repository_entry" value="//Samples/data/Deals"/>
    </operator>
    <!-- Converts every nominal attribute except "Future Customer" to numerical
         (single-attribute filter with inverted selection, special attributes included).
         NOTE(review): coding_type is not set here, so the operator default applies; confirm it is dummy coding. -->
    <operator name="Nominal to Numerical (2)" class="nominal_to_numerical" compatibility="8.0.001"
              activated="true" expanded="true"
              height="103" width="90" x="179" y="238">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="Future Customer"/>
      <parameter key="invert_selection" value="true"/>
      <parameter key="include_special_attributes" value="true"/>
      <list key="comparison_groups"/>
    </operator>
    <!-- Duplicates the numericalised example set so both branches receive the same data. -->
    <operator name="Multiply (3)" class="multiply" compatibility="8.0.001"
              activated="true" expanded="true"
              height="103" width="90" x="313" y="238"/>
    <!-- Converts only the "Future Customer" attribute to numbers, using unique-integer coding.
         NOTE(review): breakpoints="after" is set on this operator; presumably execution pauses here, confirm it is intended. -->
    <operator name="Nominal to Numerical" class="nominal_to_numerical" compatibility="8.0.001"
              activated="true" breakpoints="after" expanded="true"
              height="103" width="90" x="514" y="289">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="Future Customer"/>
      <parameter key="include_special_attributes" value="true"/>
      <parameter key="coding_type" value="unique integers"/>
      <list key="comparison_groups"/>
    </operator>
    <!-- RapidMiner Naive Bayes learner, trained on the second copy coming out of Multiply (3). -->
    <operator name="Naive Bayes" class="naive_bayes" compatibility="8.0.001"
              activated="true" expanded="true"
              height="82" width="90" x="514" y="34"/>
    <!-- Execute Python: fits a scikit-learn GaussianNB on the incoming example set, scores it on the same data and
         returns four data frames: scores, per-class feature means, confusion matrix, class priors.
         The script lives in an attribute value, so line breaks are written as &#10; character references
         (literal line breaks inside an attribute would be normalised to spaces by the XML parser). -->
    <operator name="Build / Apply model" class="python_scripting:execute_python" compatibility="7.4.000"
              activated="true" expanded="true"
              height="145" width="90" x="715" y="289">
      <parameter key="script" value="import pandas as pd&#10;from sklearn.naive_bayes import GaussianNB&#10;from sklearn.metrics import confusion_matrix&#10;from sklearn.metrics import accuracy_score&#10;from sklearn.metrics import recall_score&#10;from sklearn.metrics import precision_score&#10;&#10;&#10;# rm_main is a mandatory function,&#10;# the number of arguments has to be the number of input ports (can be none)&#10;def rm_main(data):&#10;&#10;    # Build the model: column 0 is used as the target, the remaining columns as features&#10;    X = data.iloc[:,1:]&#10;    y = data.iloc[:,0]&#10;    NB = GaussianNB()&#10;    NB.fit(X,y)&#10;&#10;    # Prior probability of each class&#10;    pr = NB.class_prior_&#10;&#10;    # Mean of each feature per class&#10;    th = NB.theta_&#10;&#10;    # Apply the model to the data it was trained on&#10;    y_pred = NB.predict(X)&#10;&#10;    # confusion_matrix returns rows = true class and columns = predicted class;&#10;    # transpose it so that the columns really are the true classes named below&#10;    conf_matrix = confusion_matrix(y,y_pred).T&#10;&#10;    # Scores in percent, computed on the training-set predictions&#10;    acc_score = 100*accuracy_score(y,y_pred)&#10;    reca_score = 100*recall_score(y,y_pred,average='weighted')&#10;    precisionscore = 100*precision_score(y,y_pred,average='weighted')&#10;&#10;    # Write the scores in dataframe&#10;    accu_score = pd.DataFrame(data = [acc_score],columns = ['accuracy'])&#10;    recall_weighted = pd.DataFrame(data = [reca_score],columns = ['weighted_mean_recall'])&#10;    precision_weighted = pd.DataFrame(data = [precisionscore],columns = ['weighted_mean_precision'])&#10;    score = accu_score.join(recall_weighted)&#10;    score = score.join(precision_weighted)&#10;&#10;    # NOTE(review): the hard-coded names assume this exact column / class order in the input - confirm&#10;    theta = pd.DataFrame(data = th,columns = ['Gender = Male','Gender = Female','PM = Credit card','PM = cheque','PM = cash','Age'])&#10;    proba = pd.DataFrame(data = pr, columns = ['probability'])&#10;&#10;    confus_matrix = pd.DataFrame(data = conf_matrix,columns = ['true yes','true no'])&#10;&#10;    # connect 4 output ports to see the results&#10;    return score,theta, confus_matrix,proba"/>
    </operator>
    <!-- Applies the Naive Bayes model; no application parameters are set. -->
    <operator name="Apply Model" class="apply_model" compatibility="8.0.001"
              activated="true" expanded="true"
              height="82" width="90" x="715" y="85">
      <list key="application_parameters"/>
    </operator>
    <!-- Classification performance with weighted mean recall and weighted mean precision switched on. -->
    <operator name="Performance" class="performance_classification" compatibility="8.0.001"
              activated="true" expanded="true"
              height="82" width="90" x="849" y="85">
      <parameter key="weighted_mean_recall" value="true"/>
      <parameter key="weighted_mean_precision" value="true"/>
      <list key="class_weights"/>
    </operator>
    <!-- Shared preprocessing: Deals data is numericalised once, then duplicated by Multiply (3). -->
    <connect from_op="Retrieve Deals (2)" from_port="output" to_op="Nominal to Numerical (2)" to_port="example set input"/>
    <connect from_op="Nominal to Numerical (2)" from_port="example set output" to_op="Multiply (3)" to_port="input"/>
    <!-- Copy 1 goes to the Python branch (via Nominal to Numerical), copy 2 to the RapidMiner Naive Bayes. -->
    <connect from_op="Multiply (3)" from_port="output 1" to_op="Nominal to Numerical" to_port="example set input"/>
    <connect from_op="Multiply (3)" from_port="output 2" to_op="Naive Bayes" to_port="training set"/>
    <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Build / Apply model" to_port="input 1"/>
    <!-- The Naive Bayes model is applied to the example set passed through by the learner itself. -->
    <connect from_op="Naive Bayes" from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_op="Naive Bayes" from_port="exampleSet" to_op="Apply Model" to_port="unlabelled data"/>
    <!-- The four outputs of the script are delivered to result ports 1 to 4. -->
    <connect from_op="Build / Apply model" from_port="output 1" to_port="result 1"/>
    <connect from_op="Build / Apply model" from_port="output 2" to_port="result 2"/>
    <connect from_op="Build / Apply model" from_port="output 3" to_port="result 3"/>
    <connect from_op="Build / Apply model" from_port="output 4" to_port="result 4"/>
    <!-- RapidMiner results: performance on result 5, the model on result 6. -->
    <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
    <connect from_op="Apply Model" from_port="model" to_port="result 6"/>
    <connect from_op="Performance" from_port="performance" to_port="result 5"/>
    <!-- Port spacing entries (all 0). -->
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    <portSpacing port="sink_result 5" spacing="0"/>
    <portSpacing port="sink_result 6" spacing="0"/>
    <portSpacing port="sink_result 7" spacing="0"/>
    </process>
    </operator>
    </process>