Hi,
I used the Backward Elimination operator to optimize the AUC of my logistic regression by eliminating some attributes. However, when I stop using the Backward Elimination operator and eliminate the same attributes myself using the Select Attributes operator (based on the Backward Elimination operator's results), the resulting AUC/performance is not the same (it is lower). The same happens with other optimization operators (Optimize Parameters (Grid), Forward Selection).
How do these optimization operators work, and how is using them different from doing the same steps manually (without an optimization operator)?
My data has 2030 instances with 33 features and 1 binary dependent variable.
<?xml version="1.0" encoding="UTF-8"?>
<!-- Fragment: loads the input example set from the local repository. -->
<process version="8.1.003">
  <operator activated="true"
            class="retrieve"
            compatibility="8.1.003"
            expanded="true"
            height="68"
            name="Retrieve Data Screen without EV to EBITDA and EV to EBIT"
            width="90"
            x="45"
            y="85">
    <!-- Repository path of the data set that is read. -->
    <parameter key="repository_entry" value="//NewLocalRepository/Data/3 Year (No Outlier)"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Fragment: marks the attribute "Outperform/Underperform" as the label. -->
<process version="8.1.003">
  <operator activated="true"
            class="set_role"
            compatibility="8.1.003"
            expanded="true"
            height="82"
            name="Set Role"
            width="90"
            x="179"
            y="187">
    <!-- Attribute that receives the role given in target_role. -->
    <parameter key="attribute_name" value="Outperform/Underperform"/>
    <parameter key="target_role" value="label"/>
    <!-- No further role assignments are configured. -->
    <list key="set_additional_roles"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Fragment: keeps an explicit subset of attributes. The list below names
  33 predictors (11 ratios, each for YEAR 1 to YEAR 3) plus the
  "Outperform/Underperform" attribute.
-->
<process version="8.1.003">
  <operator activated="true"
            class="select_attributes"
            compatibility="8.1.003"
            expanded="true"
            height="82"
            name="Select Attributes (3)"
            width="90"
            x="313"
            y="187">
    <!-- "subset" filter: the pipe-separated list in "attributes" is what gets selected. -->
    <parameter key="attribute_filter_type" value="subset"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value="ASSET TURNOVER_YEAR 1|ASSET TURNOVER_YEAR 2|ASSET TURNOVER_YEAR 3|DIV YIELD_YEAR 1|DIV YIELD_YEAR 2|DIV YIELD_YEAR 3|INCOME GROWTH_YEAR 1|INCOME GROWTH_YEAR 2|INCOME GROWTH_YEAR 3|NET DEBT TO EQUITY_YEAR 1|NET DEBT TO EQUITY_YEAR 2|NET DEBT TO EQUITY_YEAR 3|Outperform/Underperform|PB_YEAR 1|PB_YEAR 2|PB_YEAR 3|PE_YEAR 1|PE_YEAR 2|PE_YEAR 3|PROFIT MARGIN_YEAR 1|PROFIT MARGIN_YEAR 2|PROFIT MARGIN_YEAR 3|REVENUE GROWTH_YEAR 1|REVENUE GROWTH_YEAR 2|REVENUE GROWTH_YEAR 3|ROA_YEAR 1|ROA_YEAR 2|ROA_YEAR 3|ROE_YEAR 1|ROE_YEAR 2|ROE_YEAR 3|ROIC_YEAR 1|ROIC_YEAR 2|ROIC_YEAR 3"/>
    <parameter key="use_except_expression" value="false"/>
    <!-- Value-type and block-type settings; presumably unused while the filter type is "subset" (confirm in the operator docs). -->
    <parameter key="value_type" value="attribute_value"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="time"/>
    <parameter key="block_type" value="attribute_block"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="value_matrix_row_start"/>
    <!-- Selection is not inverted, and special attributes (such as the label) are included in the filter. -->
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="true"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Fragment: normalizes attributes with the Z-transformation method.
  NOTE(review): judging by the x positions this operator sits before the
  Backward Elimination fragment, so it presumably sees the full data set,
  including rows that later end up in the 30% test partition. Confirm the
  wiring, which is not shown in these fragments.
-->
<process version="8.1.003">
  <operator activated="true"
            class="normalize"
            compatibility="8.1.003"
            expanded="true"
            height="103"
            name="Normalize"
            width="90"
            x="447"
            y="187">
    <!-- The preprocessing model is not returned and no view is created. -->
    <parameter key="return_preprocessing_model" value="false"/>
    <parameter key="create_view" value="false"/>

    <!-- Attribute filter: "all", not inverted, special attributes excluded. -->
    <parameter key="attribute_filter_type" value="all"/>
    <parameter key="attribute" value=""/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="numeric"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="real"/>
    <parameter key="block_type" value="value_series"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="value_series_end"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>

    <!-- Normalization method. min/max presumably only matter for a range-based method (confirm). -->
    <parameter key="method" value="Z-transformation"/>
    <parameter key="min" value="0.0"/>
    <parameter key="max" value="0.5"/>
    <parameter key="allow_negative_values" value="false"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Fragment: principal component analysis in "keep variance" mode with a
  variance threshold of 1.0.
  NOTE(review): if this operator feeds the Backward Elimination fragment,
  the selection there presumably runs on principal components rather than
  on the original attributes. Removing original attributes by hand with
  Select Attributes would then not be an equivalent experiment. Confirm
  against the actual process wiring.
-->
<process version="8.1.003">
  <operator activated="true"
            class="principal_component_analysis"
            compatibility="8.1.003"
            expanded="true"
            height="103"
            name="PCA"
            width="90"
            x="581"
            y="187">
    <parameter key="dimensionality_reduction" value="keep variance"/>
    <parameter key="variance_threshold" value="1.0"/>
    <!-- Presumably ignored while dimensionality_reduction is "keep variance" (confirm). -->
    <parameter key="number_of_components" value="1"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Fragment: Backward Elimination wrapped around a hold-out evaluation.
  For every candidate attribute set the inner process splits the data
  70/30, trains an H2O logistic regression on partition 1, applies it to
  partition 2 and returns the performance vector (main criterion: AUC).

  Change versus the posted version: the inner evaluation is now
  reproducible. Split Data uses its local random seed and the learner has
  "reproducible" switched on. Before, the 70/30 split was not pinned by
  the configured seed, so the AUC that selected the attributes was tied to
  splits that a manual re-run with Select Attributes does not reproduce.
  To compare manually, reuse the same Split Data settings (same seed).
-->
<process version="8.1.003">
  <operator activated="true"
            class="optimize_selection_backward"
            compatibility="8.1.003"
            expanded="true"
            height="103"
            name="Backward Elimination"
            width="90"
            x="715"
            y="187">
    <!-- Search limits and stopping rule of the elimination loop. -->
    <parameter key="maximal_number_of_eliminations" value="10"/>
    <parameter key="speculative_rounds" value="50"/>
    <parameter key="stopping_behavior" value="with decrease"/>
    <parameter key="use_relative_decrease" value="true"/>
    <parameter key="alpha" value="0.05"/>
    <process expanded="true">
      <!-- 70% training / 30% test partitions. -->
      <operator activated="true"
                class="split_data"
                compatibility="8.1.003"
                expanded="true"
                height="103"
                name="Split Data"
                width="90"
                x="112"
                y="85">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.7"/>
          <parameter key="ratio" value="0.3"/>
        </enumeration>
        <parameter key="sampling_type" value="automatic"/>
        <!-- Fixed: was "false". Every evaluation now uses the split defined by local_random_seed. -->
        <parameter key="use_local_random_seed" value="true"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Unregularized logistic regression (H2O extension). -->
      <operator activated="true"
                class="h2o:logistic_regression"
                compatibility="7.6.001"
                expanded="true"
                height="124"
                name="Logistic Regression (3)"
                width="90"
                x="313"
                y="34">
        <parameter key="solver" value="AUTO"/>
        <!-- Fixed: was "false". Repeated fits on the same data should give the same model. -->
        <parameter key="reproducible" value="true"/>
        <parameter key="maximum_number_of_threads" value="4"/>
        <parameter key="use_regularization" value="false"/>
        <parameter key="lambda_search" value="false"/>
        <parameter key="number_of_lambdas" value="0"/>
        <parameter key="lambda_min_ratio" value="0.0"/>
        <parameter key="early_stopping" value="true"/>
        <parameter key="stopping_rounds" value="3"/>
        <parameter key="stopping_tolerance" value="0.001"/>
        <parameter key="standardize" value="false"/>
        <parameter key="non-negative_coefficients" value="false"/>
        <parameter key="add_intercept" value="true"/>
        <parameter key="compute_p-values" value="true"/>
        <parameter key="remove_collinear_columns" value="true"/>
        <parameter key="missing_values_handling" value="MeanImputation"/>
        <parameter key="max_iterations" value="0"/>
        <parameter key="max_runtime_seconds" value="0"/>
      </operator>
      <!-- Scores the 30% partition with the trained model. -->
      <operator activated="true"
                class="apply_model"
                compatibility="8.1.003"
                expanded="true"
                height="82"
                name="Apply Model (3)"
                width="90"
                x="447"
                y="187">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <!-- Binominal performance; AUC is the main criterion delivered to the optimizer. -->
      <operator activated="true"
                class="performance_binominal_classification"
                compatibility="8.1.003"
                expanded="true"
                height="82"
                name="Performance (3)"
                width="90"
                x="581"
                y="187">
        <parameter key="main_criterion" value="AUC"/>
        <parameter key="accuracy" value="true"/>
        <parameter key="classification_error" value="false"/>
        <parameter key="kappa" value="false"/>
        <parameter key="AUC (optimistic)" value="true"/>
        <parameter key="AUC" value="true"/>
        <parameter key="AUC (pessimistic)" value="true"/>
        <parameter key="precision" value="false"/>
        <parameter key="recall" value="false"/>
        <parameter key="lift" value="false"/>
        <parameter key="fallout" value="false"/>
        <parameter key="f_measure" value="false"/>
        <parameter key="false_positive" value="false"/>
        <parameter key="false_negative" value="false"/>
        <parameter key="true_positive" value="false"/>
        <parameter key="true_negative" value="false"/>
        <parameter key="sensitivity" value="false"/>
        <parameter key="specificity" value="false"/>
        <parameter key="youden" value="false"/>
        <parameter key="positive_predictive_value" value="false"/>
        <parameter key="negative_predictive_value" value="false"/>
        <parameter key="psep" value="false"/>
        <parameter key="skip_undefined_labels" value="true"/>
        <parameter key="use_example_weights" value="true"/>
      </operator>
      <!-- Wiring: input, split, train on partition 1, apply to partition 2, measure, output. -->
      <connect from_port="example set" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Logistic Regression (3)" to_port="training set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model (3)" to_port="unlabelled data"/>
      <connect from_op="Logistic Regression (3)" from_port="model" to_op="Apply Model (3)" to_port="model"/>
      <connect from_op="Apply Model (3)" from_port="labelled data" to_op="Performance (3)" to_port="labelled data"/>
      <connect from_op="Performance (3)" from_port="performance" to_port="performance"/>
      <portSpacing port="source_example set" spacing="0"/>
      <portSpacing port="sink_performance" spacing="0"/>
    </process>
  </operator>
</process>
Help please