select attributes with at least one entry


Hi... I'm having a brain freeze with what should be a very simple thing. I have a data set with 250+ nominal and numerical attributes, many of which are completely empty (all missing values). Others have perhaps one or two entries with the rest missing. How do I eliminate all attributes, nominal and numerical, that contain ONLY missing values? I have tried every combination of operators (including Ingo's workaround posted a while ago) to no avail.
Scott
Best Answer
-
Hi,
have a look at the attached process. You can filter on >X or <X non-missing values per attribute.
~Martin
<?xml version="1.0" encoding="UTF-8"?>
<process version="7.4.000">
  <!--
    Removes attributes that contain only missing values, whatever their type.
    Approach: count the non-missing values of every attribute, keep the
    attributes whose count passes a numeric threshold, turn that selection
    into attribute weights, and apply the weights to the original data.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.4.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Demo input: the Golf sample data set from the repository. -->
      <operator activated="true" class="retrieve" compatibility="7.4.000" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Samples/data/Golf"/>
      </operator>
      <!--
        Adds a test attribute named only_missing. str(0/0) presumably evaluates
        to a missing value in every row, so this attribute should be the one
        that gets removed (TODO confirm in RapidMiner).
      -->
      <operator activated="true" class="generate_attributes" compatibility="7.4.000" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="34">
        <list key="function_descriptions">
          <parameter key="only_missing" value="str(0/0)"/>
        </list>
      </operator>
      <!--
        Counts the non-missing values per attribute using the default
        aggregation. Wind is additionally listed explicitly with the same
        function.
      -->
      <operator activated="true" class="aggregate" compatibility="7.4.000" expanded="true" height="82" name="Aggregate" width="90" x="380" y="136">
        <parameter key="use_default_aggregation" value="true"/>
        <parameter key="default_aggregation_function" value="count (ignoring missings)"/>
        <list key="aggregation_attributes">
          <parameter key="Wind" value="count (ignoring missings)"/>
        </list>
      </operator>
      <!--
        Strips the countWithOutMissings(...) wrapper so each count column
        carries the name of the attribute it was computed from.
      -->
      <operator activated="true" class="rename_by_replacing" compatibility="7.4.000" expanded="true" height="82" name="Rename by Replacing" width="90" x="514" y="34">
        <parameter key="replace_what" value="countWithOutMissings\((.+)\)"/>
        <parameter key="replace_by" value="$1"/>
      </operator>
      <!--
        Keeps the attributes whose non-missing count is greater than 0, i.e.
        every attribute with at least one entry. Raise the threshold to demand
        more entries per attribute.
      -->
      <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="Select Attributes" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="numeric_value_filter"/>
        <parameter key="numeric_condition" value=">0"/>
      </operator>
      <!-- Converts the surviving attributes into an attribute weights object. -->
      <operator activated="true" class="data_to_weights" compatibility="7.4.000" expanded="true" height="82" name="Data to Weights" width="90" x="782" y="34"/>
      <!--
        Applies the weights to the original (unaggregated) data. Default
        parameters are used; assumed to keep exactly the weighted attributes
        (TODO confirm the operator defaults).
      -->
      <operator activated="true" class="select_by_weights" compatibility="7.4.000" expanded="true" height="103" name="Select by Weights" width="90" x="916" y="136"/>
      <connect from_op="Retrieve Golf" from_port="output" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Rename by Replacing" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="original" to_op="Select by Weights" to_port="example set input"/>
      <connect from_op="Rename by Replacing" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Data to Weights" to_port="example set"/>
      <connect from_op="Data to Weights" from_port="weights" to_op="Select by Weights" to_port="weights"/>
      <connect from_op="Select by Weights" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
1
Answers
-
Remove Useless Attributes doesn't work on missings? What if you first run a Replace Missing Values with a constant value and then use it?
0 -
Hi,
have a look at the attached process. You can filter on >X or <X non-missing values per attribute.
~Martin
<?xml version="1.0" encoding="UTF-8"?>
<process version="7.4.000">
  <!--
    Removes attributes that contain only missing values, whatever their type.
    Approach: count the non-missing values of every attribute, keep the
    attributes whose count passes a numeric threshold, turn that selection
    into attribute weights, and apply the weights to the original data.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.4.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Demo input: the Golf sample data set from the repository. -->
      <operator activated="true" class="retrieve" compatibility="7.4.000" expanded="true" height="68" name="Retrieve Golf" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Samples/data/Golf"/>
      </operator>
      <!--
        Adds a test attribute named only_missing. str(0/0) presumably evaluates
        to a missing value in every row, so this attribute should be the one
        that gets removed (TODO confirm in RapidMiner).
      -->
      <operator activated="true" class="generate_attributes" compatibility="7.4.000" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="34">
        <list key="function_descriptions">
          <parameter key="only_missing" value="str(0/0)"/>
        </list>
      </operator>
      <!--
        Counts the non-missing values per attribute using the default
        aggregation. Wind is additionally listed explicitly with the same
        function.
      -->
      <operator activated="true" class="aggregate" compatibility="7.4.000" expanded="true" height="82" name="Aggregate" width="90" x="380" y="136">
        <parameter key="use_default_aggregation" value="true"/>
        <parameter key="default_aggregation_function" value="count (ignoring missings)"/>
        <list key="aggregation_attributes">
          <parameter key="Wind" value="count (ignoring missings)"/>
        </list>
      </operator>
      <!--
        Strips the countWithOutMissings(...) wrapper so each count column
        carries the name of the attribute it was computed from.
      -->
      <operator activated="true" class="rename_by_replacing" compatibility="7.4.000" expanded="true" height="82" name="Rename by Replacing" width="90" x="514" y="34">
        <parameter key="replace_what" value="countWithOutMissings\((.+)\)"/>
        <parameter key="replace_by" value="$1"/>
      </operator>
      <!--
        Keeps the attributes whose non-missing count is greater than 0, i.e.
        every attribute with at least one entry. Raise the threshold to demand
        more entries per attribute.
      -->
      <operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="Select Attributes" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="numeric_value_filter"/>
        <parameter key="numeric_condition" value=">0"/>
      </operator>
      <!-- Converts the surviving attributes into an attribute weights object. -->
      <operator activated="true" class="data_to_weights" compatibility="7.4.000" expanded="true" height="82" name="Data to Weights" width="90" x="782" y="34"/>
      <!--
        Applies the weights to the original (unaggregated) data. Default
        parameters are used; assumed to keep exactly the weighted attributes
        (TODO confirm the operator defaults).
      -->
      <operator activated="true" class="select_by_weights" compatibility="7.4.000" expanded="true" height="103" name="Select by Weights" width="90" x="916" y="136"/>
      <connect from_op="Retrieve Golf" from_port="output" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="example set output" to_op="Rename by Replacing" to_port="example set input"/>
      <connect from_op="Aggregate" from_port="original" to_op="Select by Weights" to_port="example set input"/>
      <connect from_op="Rename by Replacing" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Data to Weights" to_port="example set"/>
      <connect from_op="Data to Weights" from_port="weights" to_op="Select by Weights" to_port="weights"/>
      <connect from_op="Select by Weights" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
1 -
that works very nicely. Thanks, Martin.
Scott0 -
Yes, I tried that for a while. It does not appear to work because of the mixture of nominal and numerical attributes... Martin's solution takes care of that.
0