🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Logistic Regression and Categorical Inputs

User: "btibert"
New Altair Community Member
Updated by Jocelyn
How does the Logistic Regression operator determine the category to be left out of the regression model?  In some tools, the first category is left out, but this does not appear to be the case here, at least based on alpha-sorting.  Any insight on how the reference category is chosen?  I also attempted to look at the docs for H2O, but it's not jumping out at me from this page (though I could be missing something obvious):

http://docs.h2o.ai/h2o/latest-stable/h2o-docs/data-science/glm.html

Find more posts tagged with

Sort by:
1 - 3 of 31
    User: "YYH"
    Altair Employee
    Accepted Answer
    Updated by YYH
    Hi @btibert,

    Good catch! When it comes to choosing the reference (baseline) category of a categorical variable in logistic regression, there is no fixed rule.

    By default R uses the alpha-numerically first category as the reference category (e.g. “a” with letters, “0” with numbers). We can set a specific reference category by explicitly placing one of the levels first when specifying the levels. 

    Based on the behavior of the H2O logistic regression in RapidMiner, I can confirm that the default reference category is the one that appears first in the data. The example below walks through the test using shuffled data.

    <?xml version="1.0" encoding="UTF-8"?><process version="9.4.000-BETA">
      <!--
        RapidMiner process that trains the H2O Logistic Regression operator twice
        on the "Deals" sample data: once on the data as retrieved, and once after
        the rows have been shuffled and passed through a CSV write/read round trip.
        Both models are delivered as process results (result 1 and result 2).
      -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.4.000-BETA" expanded="true" name="Process" origin="GENERATED_TUTORIAL">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Step 1: load the sample data set from the repository. -->
          <operator activated="true" class="retrieve" compatibility="9.4.000-BETA" expanded="true" height="68" name="Retrieve Deals" origin="GENERATED_TUTORIAL" width="90" x="45" y="34">
            <parameter key="repository_entry" value="//Samples/data/Deals"/>
          </operator>
          <!-- Step 2: baseline model on the data in its original row order.
               Regularization is switched off and p-values are computed. -->
          <operator activated="true" class="h2o:logistic_regression" compatibility="9.3.001" expanded="true" height="124" name="Logistic Regression" origin="GENERATED_TUTORIAL" width="90" x="179" y="34">
            <parameter key="solver" value="AUTO"/>
            <parameter key="reproducible" value="true"/>
            <parameter key="maximum_number_of_threads" value="4"/>
            <parameter key="use_regularization" value="false"/>
            <parameter key="lambda_search" value="false"/>
            <parameter key="number_of_lambdas" value="0"/>
            <parameter key="lambda_min_ratio" value="0.0"/>
            <parameter key="early_stopping" value="true"/>
            <parameter key="stopping_rounds" value="3"/>
            <parameter key="stopping_tolerance" value="0.001"/>
            <parameter key="standardize" value="true"/>
            <parameter key="non-negative_coefficients" value="false"/>
            <parameter key="add_intercept" value="true"/>
            <parameter key="compute_p-values" value="true"/>
            <parameter key="remove_collinear_columns" value="true"/>
            <parameter key="missing_values_handling" value="MeanImputation"/>
            <parameter key="max_iterations" value="0"/>
            <parameter key="max_runtime_seconds" value="0"/>
          </operator>
          <!-- Step 3: define the macro "seed" from the current time
               (date_millis(date_now()) % 1000). Presumably this makes the
               shuffle order differ between runs; confirm against the
               date_millis documentation. -->
          <operator activated="true" class="generate_macro" compatibility="9.4.000-BETA" expanded="true" height="82" name="Generate Macro" width="90" x="313" y="136">
            <list key="function_descriptions">
              <parameter key="seed" value="date_millis(date_now())%1000"/>
            </list>
            <description align="center" color="transparent" colored="false" width="126">a &amp;quot;random&amp;quot; seed</description>
          </operator>
          <!-- Step 4: shuffle the rows, using the "seed" macro as the local random seed. -->
          <operator activated="true" class="shuffle" compatibility="9.4.000-BETA" expanded="true" height="82" name="Shuffle" width="90" x="447" y="238">
            <parameter key="use_local_random_seed" value="true"/>
            <parameter key="local_random_seed" value="%{seed}"/>
          </operator>
          <!-- Step 5: write the shuffled data as semicolon-separated CSV. No
               target file parameter is set here; the written file is handed to
               Read CSV through the "file" port (see the connections below). -->
          <operator activated="true" class="write_csv" compatibility="9.4.000-BETA" expanded="true" height="82" name="Write CSV" width="90" x="581" y="136">
            <parameter key="column_separator" value=";"/>
            <parameter key="write_attribute_names" value="true"/>
            <parameter key="quote_nominal_values" value="true"/>
            <parameter key="format_date_attributes" value="true"/>
            <parameter key="append_to_file" value="false"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Step 6: read the CSV back in with matching separator and quoting.
               NOTE(review): presumably the round trip rebuilds the nominal
               value mappings in order of appearance in the shuffled data;
               verify against RapidMiner's Read CSV behavior. -->
          <operator activated="true" class="read_csv" compatibility="9.4.000-BETA" expanded="true" height="68" name="Read CSV" width="90" x="715" y="136">
            <parameter key="column_separators" value=";"/>
            <parameter key="trim_lines" value="false"/>
            <parameter key="use_quotes" value="true"/>
            <parameter key="quotes_character" value="&quot;"/>
            <parameter key="escape_character" value="\"/>
            <parameter key="skip_comments" value="false"/>
            <parameter key="comment_characters" value="#"/>
            <parameter key="starting_row" value="1"/>
            <parameter key="parse_numbers" value="true"/>
            <parameter key="decimal_character" value="."/>
            <parameter key="grouped_digits" value="false"/>
            <parameter key="grouping_character" value=","/>
            <parameter key="infinity_representation" value=""/>
            <parameter key="date_format" value=""/>
            <parameter key="first_row_as_names" value="true"/>
            <list key="annotations"/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information"/>
            <parameter key="read_not_matching_values_as_missings" value="true"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <!-- Step 7: mark "Future Customer" as the label on the re-read data.
               Presumably needed because attribute roles do not survive the CSV
               round trip; confirm. -->
          <operator activated="true" class="set_role" compatibility="9.4.000-BETA" expanded="true" height="82" name="shuffled data" width="90" x="849" y="136">
            <parameter key="attribute_name" value="Future Customer"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Step 8: second model, trained on the shuffled data with the same
               parameter values as the baseline "Logistic Regression" operator. -->
          <operator activated="true" class="h2o:logistic_regression" compatibility="9.3.001" expanded="true" height="124" name="Logistic Regression shuffled" origin="GENERATED_TUTORIAL" width="90" x="983" y="136">
            <parameter key="solver" value="AUTO"/>
            <parameter key="reproducible" value="true"/>
            <parameter key="maximum_number_of_threads" value="4"/>
            <parameter key="use_regularization" value="false"/>
            <parameter key="lambda_search" value="false"/>
            <parameter key="number_of_lambdas" value="0"/>
            <parameter key="lambda_min_ratio" value="0.0"/>
            <parameter key="early_stopping" value="true"/>
            <parameter key="stopping_rounds" value="3"/>
            <parameter key="stopping_tolerance" value="0.001"/>
            <parameter key="standardize" value="true"/>
            <parameter key="non-negative_coefficients" value="false"/>
            <parameter key="add_intercept" value="true"/>
            <parameter key="compute_p-values" value="true"/>
            <parameter key="remove_collinear_columns" value="true"/>
            <parameter key="missing_values_handling" value="MeanImputation"/>
            <parameter key="max_iterations" value="0"/>
            <parameter key="max_runtime_seconds" value="0"/>
          </operator>
          <!-- Wiring. The baseline model goes to result 1 and its example set
               output feeds the shuffle branch (Generate Macro, Shuffle,
               Write CSV, Read CSV, set role, second model). -->
          <connect from_op="Retrieve Deals" from_port="output" to_op="Logistic Regression" to_port="training set"/>
          <connect from_op="Logistic Regression" from_port="model" to_port="result 1"/>
          <connect from_op="Logistic Regression" from_port="exampleSet" to_op="Generate Macro" to_port="through 1"/>
          <connect from_op="Generate Macro" from_port="through 1" to_op="Shuffle" to_port="example set input"/>
          <connect from_op="Shuffle" from_port="example set output" to_op="Write CSV" to_port="input"/>
          <!-- Shuffle's "original" port output is exposed as result 4. -->
          <connect from_op="Shuffle" from_port="original" to_port="result 4"/>
          <connect from_op="Write CSV" from_port="file" to_op="Read CSV" to_port="file"/>
          <connect from_op="Read CSV" from_port="output" to_op="shuffled data" to_port="example set input"/>
          <connect from_op="shuffled data" from_port="example set output" to_op="Logistic Regression shuffled" to_port="training set"/>
          <!-- The model trained on shuffled data goes to result 2, its example set to result 3. -->
          <connect from_op="Logistic Regression shuffled" from_port="model" to_port="result 2"/>
          <connect from_op="Logistic Regression shuffled" from_port="exampleSet" to_port="result 3"/>
          <!-- NOTE(review): portSpacing entries appear to be canvas layout hints only; confirm. -->
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="63"/>
          <portSpacing port="sink_result 3" spacing="21"/>
          <portSpacing port="sink_result 4" spacing="84"/>
          <portSpacing port="sink_result 5" spacing="21"/>
        </process>
      </operator>
    </process>
    



    Cheers,
    YY

    User: "btibert"
    New Altair Community Member
    OP
    Got it, thank you.
    User: "YYH"
    Altair Employee
    Always welcome!