hi,
I tried out 3 different outlier detection algorithms on my dataset...
<?xml version="1.0" encoding="UTF-8"?>
<process version="7.2.001">
  <!--
    Outlier detection comparison.
    Data flow: Retrieve, then Generate ID, then Normalize, then Multiply; the three
    Multiply outputs feed k-NN Global Anomaly Score, LOF and COF respectively.
    Each detector delivers its example set and its model to a separate result port.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="6.0.002"
            expanded="true"
            name="Process">
    <process expanded="true">

      <!-- Data preparation chain -->

      <!-- Loads the data set from the repository entry given below. -->
      <operator activated="true"
                class="retrieve"
                compatibility="7.2.001"
                expanded="true"
                height="68"
                name="Retrieve Master3Klassen_nominal"
                width="90"
                x="45"
                y="30">
        <parameter key="repository_entry" value="../../../data/Master3Klassen_nominal"/>
      </operator>

      <!-- No parameters are set here, so the operator runs with its defaults. -->
      <operator activated="true"
                class="generate_id"
                compatibility="7.2.001"
                expanded="true"
                height="82"
                name="Generate ID"
                width="90"
                x="112"
                y="165"/>

      <!-- Range transformation with the lower bound set to -1.0; no upper bound is given here. -->
      <operator activated="true"
                class="normalize"
                compatibility="7.1.001"
                expanded="true"
                height="103"
                name="Normalize"
                width="90"
                x="246"
                y="165">
        <parameter key="method" value="range transformation"/>
        <parameter key="min" value="-1.0"/>
      </operator>

      <!-- Fans the normalized data out to the three detectors (outputs 1 to 3 are wired below). -->
      <operator activated="true"
                class="multiply"
                compatibility="7.2.001"
                expanded="true"
                height="124"
                name="Multiply"
                width="90"
                x="346"
                y="142"/>

      <!-- Outlier detectors (anomalydetection extension) -->

      <!-- COF with k = 5. Unlike the other two detectors, no numerical_measure is set here. -->
      <operator activated="true"
                class="anomalydetection:Connectivity-Based Outlier Factor (COF)"
                compatibility="2.3.002"
                expanded="true"
                height="103"
                name="Connectivity-Based Outlier Factor (COF)"
                width="90"
                x="581"
                y="442">
        <parameter key="k" value="5"/>
        <parameter key="parallelize evaluation process" value="true"/>
      </operator>

      <!--
        LOF with MinPts bounds 3 and 30.
        NOTE(review): the measure name "CamberraDistance" is an identifier read by the
        operator; it is kept verbatim. Do not respell it without checking the extension.
      -->
      <operator activated="true"
                class="anomalydetection:Local Outlier Factor (LOF)"
                compatibility="2.3.002"
                expanded="true"
                height="103"
                name="Local Outlier Factor (LOF)"
                width="90"
                x="581"
                y="187">
        <parameter key="k_min (MinPtsLB)" value="3"/>
        <parameter key="k_max (MinPtsUB)" value="30"/>
        <parameter key="numerical_measure" value="CamberraDistance"/>
        <parameter key="parallelize evaluation process" value="true"/>
      </operator>

      <!-- k-NN global score with k = 5, using only the distance to the k-th neighbor (no average). -->
      <operator activated="true"
                class="anomalydetection:k-NN Global Anomaly Score"
                compatibility="2.3.002"
                expanded="true"
                height="103"
                name="k-NN Global Anomaly Score (2)"
                width="90"
                x="514"
                y="75">
        <parameter key="k" value="5"/>
        <parameter key="use k-th neighbor distance only (no average)" value="true"/>
        <parameter key="numerical_measure" value="CamberraDistance"/>
        <parameter key="parallelize evaluation process" value="true"/>
      </operator>

      <!-- Wiring: preparation chain -->
      <connect from_op="Retrieve Master3Klassen_nominal" from_port="output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Normalize" to_port="example set input"/>
      <connect from_op="Normalize" from_port="example set output" to_op="Multiply" to_port="input"/>

      <!-- Wiring: one copy of the data per detector -->
      <connect from_op="Multiply" from_port="output 1" to_op="k-NN Global Anomaly Score (2)" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Local Outlier Factor (LOF)" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 3" to_op="Connectivity-Based Outlier Factor (COF)" to_port="example set"/>

      <!-- Wiring: results. k-NN uses ports 1 and 2, LOF ports 3 and 4, COF ports 5 and 6. -->
      <connect from_op="Connectivity-Based Outlier Factor (COF)" from_port="example set" to_port="result 6"/>
      <connect from_op="Connectivity-Based Outlier Factor (COF)" from_port="model" to_port="result 5"/>
      <connect from_op="Local Outlier Factor (LOF)" from_port="example set" to_port="result 3"/>
      <connect from_op="Local Outlier Factor (LOF)" from_port="model" to_port="result 4"/>
      <connect from_op="k-NN Global Anomaly Score (2)" from_port="example set" to_port="result 1"/>
      <connect from_op="k-NN Global Anomaly Score (2)" from_port="model" to_port="result 2"/>

      <!-- Canvas layout metadata for the process ports -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <portSpacing port="sink_result 6" spacing="0"/>
      <portSpacing port="sink_result 7" spacing="0"/>
    </process>
  </operator>
</process>
Now,
I want to sort the outliers for all 3 detection algorithms so that the most significant ones are on top (e.g. the top 100 outliers), and then select those examples whose ID appears among the top 100 of all three algorithms...
Then I want to remove them from the dataset, identified by those shared IDs. Is this somehow possible? I don't know how to do this...