Combination of Ada Boost and SMOTE operators
Hi !
I am currently dealing with imbalanced data for my classification problem. I have read a couple of articles that use boosting-based ensembles. I would like to use a combination of AdaBoost and SMOTE, but I am not sure whether I have applied it correctly. Please find the XML version below. If you have any ideas for improving this process, or if you believe that the algorithms are applied incorrectly, could you please help me?
Özge
I am currently dealing with imbalanced data for my classification problem. I have read a couple of articles that use boosting-based ensembles. I would like to use a combination of AdaBoost and SMOTE, but I am not sure whether I have applied it correctly. Please find the XML version below. If you have any ideas for improving this process, or if you believe that the algorithms are applied incorrectly, could you please help me?
Özge
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<!-- Process context: the input, output and macro declarations are all empty. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<!-- Read Excel: loads cell range A1:AL1102 (38 columns, matching meta data entries 0-37)
     from a workbook addressed by an absolute local path. Row 0 carries the annotation
     "Name" while first_row_as_names is false.
     NOTE(review): each meta data value looks like name.selected.type.role; entries 22-35
     carry "false" and are presumably excluded from the import. Confirm in the import wizard. -->
<operator activated="true" class="read_excel" compatibility="8.0.001" expanded="true"
          height="68" name="Read Excel" width="90" x="45" y="34">
  <parameter key="excel_file" value="C:\Users\AngelsChange\Desktop\SPSS-TEZ\Descriptive 2\MICRO\KNN-MICRO.xlsx"/>
  <parameter key="imported_cell_range" value="A1:AL1102"/>
  <parameter key="first_row_as_names" value="false"/>
  <list key="annotations">
    <parameter key="0" value="Name"/>
  </list>
  <list key="data_set_meta_data_information">
    <parameter key="0" value="DISEASE_DURATION.true.integer.attribute"/>
    <parameter key="1" value="AGE.true.integer.attribute"/>
    <parameter key="2" value="WEIGHT.true.numeric.attribute"/>
    <parameter key="3" value="HEIGHT.true.real.attribute"/>
    <parameter key="4" value="BMIkgm2.true.numeric.attribute"/>
    <parameter key="5" value="FPG.true.integer.attribute"/>
    <parameter key="6" value="PBG.true.numeric.attribute"/>
    <parameter key="7" value="HbA1c.true.numeric.attribute"/>
    <parameter key="8" value="FPG_HbA1c.true.numeric.attribute"/>
    <parameter key="9" value="PBG_HbA1c.true.numeric.attribute"/>
    <parameter key="10" value="TColl.true.numeric.attribute"/>
    <parameter key="11" value="TG.true.numeric.attribute"/>
    <parameter key="12" value="HDLK.true.numeric.attribute"/>
    <parameter key="13" value="LDLK.true.numeric.attribute"/>
    <parameter key="14" value="LDL_HDL.true.numeric.attribute"/>
    <parameter key="15" value="TColl_HDL.true.numeric.attribute"/>
    <parameter key="16" value="Cr.true.numeric.attribute"/>
    <parameter key="17" value="ALB.true.numeric.attribute"/>
    <parameter key="18" value="ALB_CR.true.numeric.attribute"/>
    <parameter key="19" value="GFR.true.numeric.attribute"/>
    <parameter key="20" value="CCI_A.true.numeric.attribute"/>
    <parameter key="21" value="GENDER.true.binominal.attribute"/>
    <parameter key="22" value="BACKGROUND_INFORMATION.false.integer.attribute"/>
    <parameter key="23" value="FAMILY_HEALTH_STORY.false.integer.attribute"/>
    <parameter key="24" value="INSULINE_TREATMENT.false.integer.attribute"/>
    <parameter key="25" value="BMI_DEGREE.false.integer.attribute"/>
    <parameter key="26" value="PATIENTS_STATUS.false.integer.attribute"/>
    <parameter key="27" value="SMOKING_HABIT.false.integer.attribute"/>
    <parameter key="28" value="HYPERTENSION.false.integer.attribute"/>
    <parameter key="29" value="MACRO.false.integer.attribute"/>
    <parameter key="30" value="CODISEASE.false.integer.attribute"/>
    <parameter key="31" value="HBA1C_DEGREE.false.integer.attribute"/>
    <parameter key="32" value="GLUCOSE_LEVEL_RISK_DEGREE.false.integer.attribute"/>
    <parameter key="33" value="LIPID_PROFILE.false.integer.attribute"/>
    <parameter key="34" value="CREATININE_DEGREE.false.integer.attribute"/>
    <parameter key="35" value="ALBUMIN_DEGREE.false.integer.attribute"/>
    <parameter key="36" value="GFR_DEGREE.true.polynominal.attribute"/>
    <parameter key="37" value="MICRO.true.binominal.attribute"/>
  </list>
</operator>
<!-- Detect Outlier (Distances): deactivated, no parameters set and not wired into the process. -->
<operator activated="false"
          class="detect_outlier_distances"
          compatibility="8.0.001"
          expanded="true"
          height="82"
          name="Detect Outlier (Distances)"
          width="90"
          x="246"
          y="238"/>
<!-- Filter Examples: deactivated; its single filter entry is "outlier.equals.false". -->
<operator activated="false" class="filter_examples" compatibility="8.0.001" expanded="true"
          height="103" name="Filter Examples" width="90" x="246" y="238">
  <list key="filters_list">
    <parameter key="filters_entry_key" value="outlier.equals.false"/>
  </list>
</operator>
<!-- Set Role: assigns the role "label" to attribute MICRO; no additional roles are set. -->
<operator activated="true" class="set_role" compatibility="8.0.001" expanded="true"
          height="82" name="Set Role" width="90" x="179" y="34">
  <parameter key="attribute_name" value="MICRO"/>
  <parameter key="target_role" value="label"/>
  <list key="set_additional_roles"/>
</operator>
<!-- Multiply: deactivated and not wired into the process. -->
<operator activated="false"
          class="multiply"
          compatibility="8.0.001"
          expanded="true"
          height="68"
          name="Multiply"
          width="90"
          x="246"
          y="238"/>
<!-- Optimize Parameters (Grid) [deactivated]: grid search over the main_criterion parameter of
     "Binomial (2)", wrapping an 80-fold cross validation of a decision tree that is trained on
     the output of Add Noise.
     Fix: the grid value list named "AUC (optimistic)" twice; the duplicate entry is removed so
     the same candidate is not listed (and evaluated) twice.
     NOTE(review): "Binomial (2)" is itself deactivated and the performance port is fed by
     "Classification", so the optimized parameter looks to have no influence on the delivered
     performance. Confirm the intended target before re-enabling this operator. -->
<operator activated="false" class="concurrency:optimize_parameters_grid" compatibility="8.0.001" expanded="true" height="124" name="Optimize Parameters (Grid)" width="90" x="246" y="238">
  <list key="parameters">
    <parameter key="Binomial (2).main_criterion" value="first,accuracy,classification_error,kappa,AUC (optimistic),AUC,AUC (pessimistic),precision,recall,lift,fallout,f_measure,false_positive,false_negative,true_positive,true_negative,sensitivity,specificity,youden,positive_predictive_value,negative_predictive_value,psep"/>
  </list>
  <process expanded="true">
    <operator activated="true" class="concurrency:cross_validation" compatibility="8.0.001" expanded="true" height="145" name="Cross Validation (2)" width="90" x="179" y="34">
      <parameter key="number_of_folds" value="80"/>
      <!-- Training subprocess: Add Noise, then Decision Tree (2). -->
      <process expanded="true">
        <operator activated="true" class="add_noise" compatibility="8.0.001" expanded="true" height="103" name="Add Noise" width="90" x="45" y="34">
          <list key="noise"/>
        </operator>
        <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="8.0.001" expanded="true" height="103" name="Decision Tree (2)" width="90" x="179" y="34">
          <parameter key="criterion" value="gini_index"/>
          <parameter key="maximal_depth" value="9"/>
          <parameter key="confidence" value="0.35"/>
          <parameter key="apply_prepruning" value="false"/>
          <parameter key="minimal_gain" value="0.136"/>
        </operator>
        <connect from_port="training set" to_op="Add Noise" to_port="example set input"/>
        <connect from_op="Add Noise" from_port="example set output" to_op="Decision Tree (2)" to_port="training set"/>
        <connect from_op="Decision Tree (2)" from_port="model" to_port="model"/>
        <portSpacing port="source_training set" spacing="0"/>
        <portSpacing port="sink_model" spacing="0"/>
        <portSpacing port="sink_through 1" spacing="0"/>
      </process>
      <!-- Testing subprocess: only Apply Model (2) and "Classification" are active and wired. -->
      <process expanded="true">
        <operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="45" y="34">
          <list key="application_parameters"/>
        </operator>
        <operator activated="true" class="performance_classification" compatibility="8.0.001" expanded="true" height="82" name="Classification" width="90" x="179" y="238">
          <list key="class_weights"/>
        </operator>
        <operator activated="false" class="performance" compatibility="8.0.001" expanded="true" height="82" name="Performance (2)" width="90" x="179" y="34"/>
        <operator activated="false" class="performance_binominal_classification" compatibility="8.0.001" expanded="true" height="82" name="Binomial (2)" width="90" x="179" y="136">
          <parameter key="classification_error" value="true"/>
          <parameter key="kappa" value="true"/>
          <parameter key="AUC (optimistic)" value="true"/>
          <parameter key="AUC" value="true"/>
          <parameter key="AUC (pessimistic)" value="true"/>
          <parameter key="precision" value="true"/>
          <parameter key="recall" value="true"/>
          <parameter key="f_measure" value="true"/>
          <parameter key="false_positive" value="true"/>
          <parameter key="false_negative" value="true"/>
          <parameter key="true_positive" value="true"/>
          <parameter key="true_negative" value="true"/>
          <parameter key="sensitivity" value="true"/>
          <parameter key="specificity" value="true"/>
          <parameter key="youden" value="true"/>
          <parameter key="positive_predictive_value" value="true"/>
          <parameter key="negative_predictive_value" value="true"/>
        </operator>
        <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
        <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
        <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Classification" to_port="labelled data"/>
        <connect from_op="Classification" from_port="performance" to_port="performance 1"/>
        <connect from_op="Classification" from_port="example set" to_port="test set results"/>
        <portSpacing port="source_model" spacing="0"/>
        <portSpacing port="source_test set" spacing="0"/>
        <portSpacing port="source_through 1" spacing="0"/>
        <portSpacing port="sink_test set results" spacing="0"/>
        <portSpacing port="sink_performance 1" spacing="0"/>
        <portSpacing port="sink_performance 2" spacing="0"/>
      </process>
    </operator>
    <!-- Log: records two values of "Cross Validation (2)" to a file on the local desktop. -->
    <operator activated="true" class="log" compatibility="8.0.001" expanded="true" height="82" name="Log" width="90" x="447" y="85">
      <parameter key="filename" value="C:\Users\AngelsChange\Desktop\log 1.log"/>
      <list key="log">
        <parameter key="Perfomance 2" value="operator.Cross Validation (2).value.performance main criterion"/>
        <parameter key="Criterion" value="operator.Cross Validation (2).value.applycount"/>
      </list>
    </operator>
    <connect from_port="input 1" to_op="Cross Validation (2)" to_port="example set"/>
    <connect from_op="Cross Validation (2)" from_port="performance 1" to_op="Log" to_port="through 1"/>
    <connect from_op="Log" from_port="through 1" to_port="performance"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_performance" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
  </process>
</operator>
<!-- Optimize Parameters (4): grid search over number_of_neighbours of "SMOTE Upsampling (3)"
     (grid spec [1.0;100.0;10;linear]). Each candidate is scored by a 61-fold cross validation
     of AdaBoost whose inner learner is SMOTE followed by a decision tree.
     SMOTE sits inside the training subprocess, so only training data is upsampled; the test
     fold goes straight to Apply Model.
     Fixes:
       1. "Log 1" read its values from "SMOTE Upsampling (2)", but no operator of that name
          appears in this process. It now reads from "SMOTE Upsampling (3)", the operator
          whose parameter is being optimized.
       2. The breakpoints="after" attribute on "SMOTE Upsampling (3)" is removed. A breakpoint
          there would halt the run after each SMOTE execution, i.e. in every boosting
          iteration of every fold of every grid point.
     NOTE(review): "Binomial" sets main_criterion to false_negative, while the chained
     "Performance (3)" sets it to logistic_loss. Confirm which criterion the optimizer
     actually ranks candidates by.
     NOTE(review): the "rate" log column records nominal_change_rate, which the grid does not
     vary, so it presumably stays constant across rows.
     NOTE(review): neighbour counts up to 100 may exceed the number of minority-class examples
     in a training fold. Verify against the data. -->
<operator activated="true" class="concurrency:optimize_parameters_grid" compatibility="8.0.001" expanded="true" height="124" name="Optimize Parameters (4)" width="90" x="380" y="34">
  <list key="parameters">
    <parameter key="SMOTE Upsampling (3).number_of_neighbours" value="[1.0;100.0;10;linear]"/>
  </list>
  <process expanded="true">
    <operator activated="true" class="concurrency:cross_validation" compatibility="8.0.001" expanded="true" height="145" name="Cross Validation" width="90" x="112" y="34">
      <parameter key="number_of_folds" value="61"/>
      <!-- Training subprocess: AdaBoost wrapping SMOTE and Decision Tree. -->
      <process expanded="true">
        <operator activated="true" class="adaboost" compatibility="8.0.001" expanded="true" height="82" name="AdaBoost" width="90" x="112" y="34">
          <process expanded="true">
            <operator activated="true" class="operator_toolbox:smote" compatibility="1.7.000" expanded="true" height="82" name="SMOTE Upsampling (3)" width="90" x="112" y="34">
              <parameter key="number_of_neighbours" value="1"/>
            </operator>
            <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="8.0.001" expanded="true" height="103" name="Decision Tree" width="90" x="246" y="34">
              <parameter key="criterion" value="gini_index"/>
              <parameter key="maximal_depth" value="100"/>
              <parameter key="apply_prepruning" value="false"/>
              <parameter key="minimal_gain" value="0.136"/>
            </operator>
            <connect from_port="training set" to_op="SMOTE Upsampling (3)" to_port="exa"/>
            <connect from_op="SMOTE Upsampling (3)" from_port="ups" to_op="Decision Tree" to_port="training set"/>
            <connect from_op="Decision Tree" from_port="model" to_port="model"/>
            <portSpacing port="source_training set" spacing="0"/>
            <portSpacing port="sink_model" spacing="0"/>
          </process>
        </operator>
        <connect from_port="training set" to_op="AdaBoost" to_port="training set"/>
        <connect from_op="AdaBoost" from_port="model" to_port="model"/>
        <portSpacing port="source_training set" spacing="0"/>
        <portSpacing port="sink_model" spacing="0"/>
        <portSpacing port="sink_through 1" spacing="0"/>
      </process>
      <!-- Testing subprocess: Apply Model, then "Binomial", then "Performance (3)". -->
      <process expanded="true">
        <operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model" width="90" x="45" y="34">
          <list key="application_parameters"/>
        </operator>
        <operator activated="false" class="multiply" compatibility="8.0.001" expanded="true" height="68" name="Multiply (2)" width="90" x="45" y="187"/>
        <operator activated="true" class="performance_binominal_classification" compatibility="8.0.001" expanded="true" height="82" name="Binomial" width="90" x="179" y="34">
          <parameter key="main_criterion" value="false_negative"/>
          <parameter key="classification_error" value="true"/>
          <parameter key="kappa" value="true"/>
          <parameter key="AUC (optimistic)" value="true"/>
          <parameter key="AUC" value="true"/>
          <parameter key="AUC (pessimistic)" value="true"/>
          <parameter key="precision" value="true"/>
          <parameter key="recall" value="true"/>
          <parameter key="lift" value="true"/>
          <parameter key="fallout" value="true"/>
          <parameter key="f_measure" value="true"/>
          <parameter key="false_positive" value="true"/>
          <parameter key="false_negative" value="true"/>
          <parameter key="true_positive" value="true"/>
          <parameter key="true_negative" value="true"/>
          <parameter key="sensitivity" value="true"/>
          <parameter key="specificity" value="true"/>
          <parameter key="youden" value="true"/>
          <parameter key="positive_predictive_value" value="true"/>
          <parameter key="negative_predictive_value" value="true"/>
          <parameter key="psep" value="true"/>
        </operator>
        <operator activated="true" class="performance_classification" compatibility="8.0.001" expanded="true" height="82" name="Performance (3)" width="90" x="313" y="34">
          <parameter key="main_criterion" value="logistic_loss"/>
          <parameter key="classification_error" value="true"/>
          <parameter key="kappa" value="true"/>
          <parameter key="weighted_mean_recall" value="true"/>
          <parameter key="weighted_mean_precision" value="true"/>
          <parameter key="spearman_rho" value="true"/>
          <parameter key="kendall_tau" value="true"/>
          <parameter key="absolute_error" value="true"/>
          <parameter key="relative_error" value="true"/>
          <parameter key="relative_error_lenient" value="true"/>
          <parameter key="relative_error_strict" value="true"/>
          <parameter key="normalized_absolute_error" value="true"/>
          <parameter key="root_mean_squared_error" value="true"/>
          <parameter key="root_relative_squared_error" value="true"/>
          <parameter key="squared_error" value="true"/>
          <parameter key="correlation" value="true"/>
          <parameter key="squared_correlation" value="true"/>
          <parameter key="cross-entropy" value="true"/>
          <parameter key="margin" value="true"/>
          <parameter key="soft_margin_loss" value="true"/>
          <parameter key="logistic_loss" value="true"/>
          <list key="class_weights"/>
        </operator>
        <connect from_port="model" to_op="Apply Model" to_port="model"/>
        <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
        <connect from_op="Apply Model" from_port="labelled data" to_op="Binomial" to_port="labelled data"/>
        <connect from_op="Binomial" from_port="performance" to_op="Performance (3)" to_port="performance"/>
        <connect from_op="Binomial" from_port="example set" to_op="Performance (3)" to_port="labelled data"/>
        <connect from_op="Performance (3)" from_port="performance" to_port="performance 1"/>
        <portSpacing port="source_model" spacing="0"/>
        <portSpacing port="source_test set" spacing="0"/>
        <portSpacing port="source_through 1" spacing="0"/>
        <portSpacing port="sink_test set results" spacing="0"/>
        <portSpacing port="sink_performance 1" spacing="0"/>
        <portSpacing port="sink_performance 2" spacing="0"/>
      </process>
    </operator>
    <!-- Log 1: records the SMOTE parameters of the optimized operator to a local log file. -->
    <operator activated="true" class="log" compatibility="8.0.001" expanded="true" height="82" name="Log 1" width="90" x="313" y="85">
      <parameter key="filename" value="C:\Users\AngelsChange\Desktop\Optimization-DT.log"/>
      <list key="log">
        <parameter key="neıghbor" value="operator.SMOTE Upsampling (3).parameter.number_of_neighbours"/>
        <parameter key="rate" value="operator.SMOTE Upsampling (3).parameter.nominal_change_rate"/>
      </list>
    </operator>
    <connect from_port="input 1" to_op="Cross Validation" to_port="example set"/>
    <connect from_op="Cross Validation" from_port="performance 1" to_op="Log 1" to_port="through 1"/>
    <connect from_op="Log 1" from_port="through 1" to_port="performance"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_performance" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
  </process>
</operator>
<!-- Cross Validation (3) [deactivated]: 61-fold cross validation of SMOTE followed by a
     decision tree, without the AdaBoost wrapper. It is not wired into the top-level process. -->
<operator activated="false" class="concurrency:cross_validation" compatibility="8.0.001" expanded="true"
          height="145" name="Cross Validation (3)" width="90" x="246" y="238">
  <parameter key="number_of_folds" value="61"/>
  <!-- Training subprocess: SMOTE Upsampling, then Decision Tree (4). -->
  <process expanded="true">
    <operator activated="true" class="operator_toolbox:smote" compatibility="1.7.000" expanded="true"
              height="82" name="SMOTE Upsampling" width="90" x="112" y="136">
      <parameter key="nominal_change_rate" value="0.2"/>
    </operator>
    <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="8.0.001" expanded="true"
              height="103" name="Decision Tree (4)" width="90" x="246" y="85">
      <parameter key="criterion" value="gini_index"/>
      <parameter key="maximal_depth" value="100"/>
      <parameter key="apply_prepruning" value="false"/>
      <parameter key="minimal_gain" value="0.136"/>
    </operator>
    <connect from_port="training set" to_op="SMOTE Upsampling" to_port="exa"/>
    <connect from_op="SMOTE Upsampling" from_port="ups" to_op="Decision Tree (4)" to_port="training set"/>
    <connect from_op="Decision Tree (4)" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
  </process>
  <!-- Testing subprocess: Apply Model (3), then "Binomial (3)", then "Performance (4)". -->
  <process expanded="true">
    <operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true"
              height="82" name="Apply Model (3)" width="90" x="45" y="34">
      <list key="application_parameters"/>
    </operator>
    <operator activated="false" class="multiply" compatibility="8.0.001" expanded="true"
              height="68" name="Multiply (3)" width="90" x="45" y="187"/>
    <operator activated="true" class="performance_binominal_classification" compatibility="8.0.001" expanded="true"
              height="82" name="Binomial (3)" width="90" x="179" y="34">
      <parameter key="main_criterion" value="false_negative"/>
      <parameter key="classification_error" value="true"/>
      <parameter key="kappa" value="true"/>
      <parameter key="AUC (optimistic)" value="true"/>
      <parameter key="AUC" value="true"/>
      <parameter key="AUC (pessimistic)" value="true"/>
      <parameter key="precision" value="true"/>
      <parameter key="recall" value="true"/>
      <parameter key="lift" value="true"/>
      <parameter key="fallout" value="true"/>
      <parameter key="f_measure" value="true"/>
      <parameter key="false_positive" value="true"/>
      <parameter key="false_negative" value="true"/>
      <parameter key="true_positive" value="true"/>
      <parameter key="true_negative" value="true"/>
      <parameter key="sensitivity" value="true"/>
      <parameter key="specificity" value="true"/>
      <parameter key="youden" value="true"/>
      <parameter key="positive_predictive_value" value="true"/>
      <parameter key="negative_predictive_value" value="true"/>
      <parameter key="psep" value="true"/>
    </operator>
    <operator activated="true" class="performance_classification" compatibility="8.0.001" expanded="true"
              height="82" name="Performance (4)" width="90" x="313" y="34">
      <parameter key="main_criterion" value="logistic_loss"/>
      <parameter key="classification_error" value="true"/>
      <parameter key="kappa" value="true"/>
      <parameter key="weighted_mean_recall" value="true"/>
      <parameter key="weighted_mean_precision" value="true"/>
      <parameter key="spearman_rho" value="true"/>
      <parameter key="kendall_tau" value="true"/>
      <parameter key="absolute_error" value="true"/>
      <parameter key="relative_error" value="true"/>
      <parameter key="relative_error_lenient" value="true"/>
      <parameter key="relative_error_strict" value="true"/>
      <parameter key="normalized_absolute_error" value="true"/>
      <parameter key="root_mean_squared_error" value="true"/>
      <parameter key="root_relative_squared_error" value="true"/>
      <parameter key="squared_error" value="true"/>
      <parameter key="correlation" value="true"/>
      <parameter key="squared_correlation" value="true"/>
      <parameter key="cross-entropy" value="true"/>
      <parameter key="margin" value="true"/>
      <parameter key="soft_margin_loss" value="true"/>
      <parameter key="logistic_loss" value="true"/>
      <list key="class_weights"/>
    </operator>
    <connect from_port="model" to_op="Apply Model (3)" to_port="model"/>
    <connect from_port="test set" to_op="Apply Model (3)" to_port="unlabelled data"/>
    <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Binomial (3)" to_port="labelled data"/>
    <connect from_op="Binomial (3)" from_port="performance" to_op="Performance (4)" to_port="performance"/>
    <connect from_op="Binomial (3)" from_port="example set" to_op="Performance (4)" to_port="labelled data"/>
    <connect from_op="Performance (4)" from_port="performance" to_port="performance 1"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
  </process>
</operator>
<!-- Top-level wiring: process input 1 feeds Read Excel, whose output goes through Set Role
     into Optimize Parameters (4); the optimizer's performance is delivered at result 1. -->
<connect from_port="input 1"
         to_op="Read Excel"
         to_port="file"/>
<connect from_op="Read Excel"
         from_port="output"
         to_op="Set Role"
         to_port="example set input"/>
<connect from_op="Set Role"
         from_port="example set output"
         to_op="Optimize Parameters (4)"
         to_port="input 1"/>
<connect from_op="Optimize Parameters (4)"
         from_port="performance"
         to_port="result 1"/>
<!-- Port spacing entries for the process canvas (all zero). -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>