Hello Team,
I am fairly new to RM and currently conducting some research on online text.
In particular, I am trying to detect outliers in a set of documents by using the LOF operator.
However, I am running into trouble: the LOF for each document is very close to 1, no matter how I set MinPtsUB and MinPtsLB.
Basically, I have represented each document as a vector of term frequencies (TF) and as a vector of TF-IDF values before applying the LOF operator.
So I have two ExampleSets representing the corpus, one as a matrix of TF values and one as a matrix of TF-IDF values, to check the differences.
However, for both matrices I get LOF values that are equal or very close to one, which does not make any sense to me.
Could you tell me whether I am doing something wrong and, if so, what?
Best
Please find my XML enclosed:
<?xml version="1.0" encoding="UTF-8"?>
<!-- RapidMiner process definition (process version 9.2.000).
     The browser tree-view markers ("- ") and the line breaks inside
     attribute values have been removed so that the values parse exactly
     (for example activated="true" instead of a value with a leading space). -->
<process version="9.2.000">
  <!-- Root operator: process-wide settings; the nested process below holds
       the operator chain. -->
  <operator activated="true" class="process" compatibility="9.2.000"
            expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
<!-- Retrieves the repository entry ../Data/PreppedDatabase_TF
     (the TF matrix, going by the entry name). -->
<operator activated="true" class="retrieve" compatibility="9.2.000"
          expanded="true" height="68" name="Retrieve PreppedTestData"
          width="90" x="112" y="34">
  <parameter key="repository_entry" value="../Data/PreppedDatabase_TF"/>
</operator>
<!-- Attribute filter for the TF branch: filter type is value_type with
     value_type "real", selection not inverted, special attributes included
     in the filter.
     NOTE(review): the "attributes" value "Date" is presumably ignored while
     attribute_filter_type is value_type; confirm against the operator docs. -->
<operator activated="true" class="select_attributes" compatibility="9.2.000"
          expanded="true" height="82" name="Select Attributes"
          width="90" x="246" y="34">
  <parameter key="attribute_filter_type" value="value_type"/>
  <parameter key="attribute" value=""/>
  <parameter key="attributes" value="Date"/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="value_type" value="real"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="time"/>
  <parameter key="block_type" value="attribute_block"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="value_matrix_row_start"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="true"/>
</operator>
<!-- LOF outlier detection for the TF branch: MinPts lower bound 1,
     upper bound 3, Euclidean distance.
     NOTE(review): on sparse, high-dimensional term vectors, Euclidean
     neighbourhoods with such a small MinPts range may look nearly uniform,
     which would be consistent with LOF values close to 1 for every document.
     Consider a wider MinPts range and a cosine-based distance_function;
     confirm the available options in the operator documentation. -->
<operator activated="true" class="detect_outlier_lof" compatibility="9.2.000"
          expanded="true" height="82" name="Detect Outlier (LOF)"
          width="90" x="447" y="34">
  <parameter key="minimal_points_lower_bound" value="1"/>
  <parameter key="minimal_points_upper_bound" value="3"/>
  <parameter key="distance_function" value="euclidian distance"/>
</operator>
<!-- Disabled operator (activated="false") of class
     "anomalydetection:Local Outlier Factor (LOF)": k range 1..10,
     MixedMeasures / MixedEuclideanDistance. No connect element in this
     process references it. -->
<operator activated="false" class="anomalydetection:Local Outlier Factor (LOF)"
          compatibility="2.4.001" expanded="true" height="103"
          name="Local Outlier Factor (LOF)" width="90" x="380" y="340">
  <parameter key="k_min (MinPtsLB)" value="1"/>
  <parameter key="k_max (MinPtsUB)" value="10"/>
  <parameter key="measure_types" value="MixedMeasures"/>
  <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
  <parameter key="nominal_measure" value="NominalDistance"/>
  <parameter key="numerical_measure" value="EuclideanDistance"/>
  <parameter key="divergence" value="GeneralizedIDivergence"/>
  <parameter key="kernel_type" value="radial"/>
  <parameter key="kernel_gamma" value="1.0"/>
  <parameter key="kernel_sigma1" value="1.0"/>
  <parameter key="kernel_sigma2" value="0.0"/>
  <parameter key="kernel_sigma3" value="2.0"/>
  <parameter key="kernel_degree" value="3.0"/>
  <parameter key="kernel_shift" value="1.0"/>
  <parameter key="kernel_a" value="1.0"/>
  <parameter key="kernel_b" value="0.0"/>
  <parameter key="parallelize evaluation process" value="false"/>
  <parameter key="number of threads" value="4"/>
</operator>
<!-- Stores the TF-branch result under the repository entry
     ../Results/LOF_TF. -->
<operator activated="true" class="store" compatibility="9.2.000"
          expanded="true" height="68" name="Store"
          width="90" x="648" y="34">
  <parameter key="repository_entry" value="../Results/LOF_TF"/>
</operator>
<!-- Disabled operator (activated="false"): Excel export for the TF branch
     (sheet LOF_TF, xlsx format). No connect element in this process
     references it.
     The excel_file value was wrapped across two lines in the paste; it is
     joined here with the single space that the line break normalises to. -->
<operator activated="false" class="write_excel" compatibility="9.2.000"
          expanded="true" height="82" name="Write Excel"
          width="90" x="581" y="442">
  <parameter key="excel_file" value="\\ads.dlh.de\lhuser$\LHT\HAM99\U801591\Documents\000_Masterarbeit\05_Praxis\04_LOF - Outlier Detection\042_Results\LOF_TF.xlsx"/>
  <parameter key="file_format" value="xlsx"/>
  <parameter key="encoding" value="SYSTEM"/>
  <parameter key="sheet_name" value="LOF_TF"/>
  <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
  <parameter key="number_format" value="#.0"/>
</operator>
<!-- Retrieves the repository entry ../Data/PreppedDatabase_TF-IDF
     (the TF-IDF matrix, going by the entry name). -->
<operator activated="true" class="retrieve" compatibility="9.2.000"
          expanded="true" height="68" name="Retrieve PreppedTestData (2)"
          width="90" x="112" y="187">
  <parameter key="repository_entry" value="../Data/PreppedDatabase_TF-IDF"/>
</operator>
<!-- Attribute filter for the TF-IDF branch: filter type is value_type with
     value_type "real", selection not inverted, special attributes included
     in the filter.
     NOTE(review): the "attributes" value "Date" is presumably ignored while
     attribute_filter_type is value_type; confirm against the operator docs. -->
<operator activated="true" class="select_attributes" compatibility="9.2.000"
          expanded="true" height="82" name="Select Attributes (2)"
          width="90" x="246" y="187">
  <parameter key="attribute_filter_type" value="value_type"/>
  <parameter key="attribute" value=""/>
  <parameter key="attributes" value="Date"/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="value_type" value="real"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="time"/>
  <parameter key="block_type" value="attribute_block"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="value_matrix_row_start"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="true"/>
</operator>
<!-- LOF outlier detection for the TF-IDF branch: MinPts lower bound 1,
     upper bound 3, Euclidean distance.
     NOTE(review): on sparse, high-dimensional term vectors, Euclidean
     neighbourhoods with such a small MinPts range may look nearly uniform,
     which would be consistent with LOF values close to 1 for every document.
     Consider a wider MinPts range and a cosine-based distance_function;
     confirm the available options in the operator documentation. -->
<operator activated="true" class="detect_outlier_lof" compatibility="9.2.000"
          expanded="true" height="82" name="Detect Outlier (2)"
          width="90" x="447" y="187">
  <parameter key="minimal_points_lower_bound" value="1"/>
  <parameter key="minimal_points_upper_bound" value="3"/>
  <parameter key="distance_function" value="euclidian distance"/>
</operator>
<!-- Disabled operator (activated="false") of class
     "anomalydetection:Local Outlier Factor (LOF)": k range 1..10,
     MixedMeasures / MixedEuclideanDistance. No connect element in this
     process references it. -->
<operator activated="false" class="anomalydetection:Local Outlier Factor (LOF)"
          compatibility="2.4.001" expanded="true" height="103"
          name="Local Outlier Factor (2)" width="90" x="380" y="493">
  <parameter key="k_min (MinPtsLB)" value="1"/>
  <parameter key="k_max (MinPtsUB)" value="10"/>
  <parameter key="measure_types" value="MixedMeasures"/>
  <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
  <parameter key="nominal_measure" value="NominalDistance"/>
  <parameter key="numerical_measure" value="EuclideanDistance"/>
  <parameter key="divergence" value="GeneralizedIDivergence"/>
  <parameter key="kernel_type" value="radial"/>
  <parameter key="kernel_gamma" value="1.0"/>
  <parameter key="kernel_sigma1" value="1.0"/>
  <parameter key="kernel_sigma2" value="0.0"/>
  <parameter key="kernel_sigma3" value="2.0"/>
  <parameter key="kernel_degree" value="3.0"/>
  <parameter key="kernel_shift" value="1.0"/>
  <parameter key="kernel_a" value="1.0"/>
  <parameter key="kernel_b" value="0.0"/>
  <parameter key="parallelize evaluation process" value="false"/>
  <parameter key="number of threads" value="4"/>
</operator>
<!-- Stores the TF-IDF-branch result under the repository entry
     ../Results/LOF_TF-IDF. -->
<operator activated="true" class="store" compatibility="9.2.000"
          expanded="true" height="68" name="Store (2)"
          width="90" x="648" y="187">
  <parameter key="repository_entry" value="../Results/LOF_TF-IDF"/>
</operator>
<!-- Disabled operator (activated="false"): Excel export for the TF-IDF
     branch (sheet LOF_TF-IDF, xlsx format). No connect element in this
     process references it.
     The excel_file value was wrapped across two lines in the paste; it is
     joined here with the single space that the line break normalises to. -->
<operator activated="false" class="write_excel" compatibility="9.2.000"
          expanded="true" height="82" name="Write Excel (2)"
          width="90" x="648" y="595">
  <parameter key="excel_file" value="\\ads.dlh.de\lhuser$\LHT\HAM99\U801591\Documents\000_Masterarbeit\05_Praxis\04_LOF - Outlier Detection\042_Results\LOF_TF-IDF.xlsx"/>
  <parameter key="file_format" value="xlsx"/>
  <parameter key="encoding" value="SYSTEM"/>
  <parameter key="sheet_name" value="LOF_TF-IDF"/>
  <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
  <parameter key="number_format" value="#.0"/>
</operator>
<!-- Wiring, TF branch:
     Retrieve PreppedTestData, Select Attributes, Detect Outlier (LOF),
     Store, then process output "result 1". -->
<connect from_op="Retrieve PreppedTestData" from_port="output"
         to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output"
         to_op="Detect Outlier (LOF)" to_port="example set input"/>
<connect from_op="Detect Outlier (LOF)" from_port="example set output"
         to_op="Store" to_port="input"/>
<connect from_op="Store" from_port="through" to_port="result 1"/>
<!-- Wiring, TF-IDF branch:
     Retrieve PreppedTestData (2), Select Attributes (2), Detect Outlier (2),
     Store (2), then process output "result 2". -->
<connect from_op="Retrieve PreppedTestData (2)" from_port="output"
         to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output"
         to_op="Detect Outlier (2)" to_port="example set input"/>
<connect from_op="Detect Outlier (2)" from_port="example set output"
         to_op="Store (2)" to_port="input"/>
<connect from_op="Store (2)" from_port="through" to_port="result 2"/>
<!-- Port spacing entries for the process source and result ports. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>