Hi,
Because my datasets fluctuate heavily, I'm trying to filter the spikes out of them. With the spikes filtered out I might get a better prediction. Now I'm wondering whether I'm doing this right. I would rather not delete these spikes from my dataset: they really exist, and if I delete those data points I may lose valuable information.


- Lagging the series by 1.
- Calculating the standard deviation.
- Generating a new variable, "Maintainence" (spelled as in the process below). See picture above.
- Keeping only the examples where this new variable equals 0.
- Finally, selecting the modified data.
Because of my strange outcomes I'm wondering whether I'm doing this right. Could anyone confirm this or suggest another method?
Regards,
Maurits Freriks
Below is the process XML, so you can check the detailed parameters. I didn't attach my datasets because there are several different ones, but the method should work on each of them.
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<!-- Process context: the input, output and macros entries are all empty in this process. -->
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<!-- Loads the example set stored at the relative repository entry "../data/flow ANJ Train". -->
<operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve flow ANJ Train" width="90" x="45" y="187">
<parameter key="repository_entry" value="../data/flow ANJ Train"/>
</operator>
<!-- Sorts the examples by attribute A. No sorting direction is set here, so the operator default applies. -->
<!-- NOTE(review): A is presumably the time/index column of the series; confirm against the data set. -->
<operator activated="true" class="sort" compatibility="8.0.001" expanded="true" height="82" name="Sort" width="90" x="45" y="34">
<parameter key="attribute_name" value="A"/>
</operator>
<!-- Keeps only the single attribute B; every following step works on B. -->
<operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="B"/>
<description align="center" color="transparent" colored="false" width="126">Select the 'data' column</description>
</operator>
<!-- Lags attribute B by 1 example. -->
<!-- NOTE(review): the lagged copy is presumably the attribute referenced as [B-1] in the Generate Attributes expression; confirm the generated name. -->
<operator activated="true" class="series:lag_series" compatibility="7.4.000" expanded="true" height="82" name="Lag Series" width="90" x="313" y="34">
<list key="attributes">
<parameter key="B" value="1"/>
</list>
<description align="center" color="transparent" colored="false" width="126">Lag 'B' column for stripping out spikes</description>
</operator>
<!-- Aggregates attribute B with the standard_deviation function. -->
<!-- The aggregated output feeds Extract Macro; the "original" output port feeds Generate Attributes (see the connections at the end of the process). -->
<operator activated="true" class="aggregate" compatibility="8.0.001" expanded="true" height="82" name="Aggregate" width="90" x="447" y="34">
<list key="aggregation_attributes">
<parameter key="B" value="standard_deviation"/>
</list>
<description align="center" color="transparent" colored="false" width="126">Calculate std dev of the data, push to macro</description>
</operator>
<!-- Reads the data value of attribute "standard_deviation(B)" at example index 1 and stores it in the macro "stdev". -->
<!-- NOTE(review): no %{stdev} reference appears anywhere in the visible process; in particular the Generate Attributes expression does not use it. Confirm whether the spike threshold was meant to depend on this value. -->
<operator activated="true" class="extract_macro" compatibility="8.0.001" expanded="true" height="68" name="Extract Macro" width="90" x="648" y="34">
<parameter key="macro" value="stdev"/>
<parameter key="macro_type" value="data_value"/>
<parameter key="attribute_name" value="standard_deviation(B)"/>
<parameter key="example_index" value="1"/>
<list key="additional_macros"/>
<description align="center" color="transparent" colored="false" width="126">extract std dev value to use in Generate Attributes</description>
</operator>
<!-- Creates the attribute "Maintainence": 1 when B is less than ([B-1] - B), otherwise 0. -->
<!-- Algebraically the condition is 2*B less than [B-1], i.e. B is below half of the value held in [B-1]. -->
<!-- The less-than sign is written as the entity &lt; because a literal one is not allowed inside an XML attribute value. -->
<!-- NOTE(review): the expression does not use the %{stdev} macro set by Extract Macro; confirm whether the threshold should depend on the standard deviation. -->
<operator activated="true" class="generate_attributes" compatibility="8.0.001" expanded="true" height="82" name="Generate Attributes" width="90" x="648" y="238">
<list key="function_descriptions">
<parameter key="Maintainence" value="if(B &lt; ([B-1]-B), 1, 0)"/>
</list>
<description align="center" color="transparent" colored="false" width="126">Create a Maintenance attribute to help filter out the days it's in maintenance mode</description>
</operator>
<!-- Keeps only the examples whose "Maintainence" attribute equals 0, i.e. drops the rows flagged with 1 by Generate Attributes. -->
<!-- The attribute name must match the (misspelled) key "Maintainence" defined in Generate Attributes. -->
<operator activated="true" class="filter_examples" compatibility="8.0.001" expanded="true" height="103" name="Filter Examples" width="90" x="782" y="238">
<list key="filters_list">
<parameter key="filters_entry_key" value="Maintainence.eq.0"/>
</list>
<description align="center" color="transparent" colored="false" width="126">Select only non maintenance mode days</description>
</operator>
<!-- Keeps only attribute B again, dropping the helper attributes added since the first selection, before the data enters the optimization. -->
<operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="916" y="238">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="B"/>
<description align="center" color="transparent" colored="false" width="126">Select 'B' again</description>
</operator>
<!-- Grid optimization over the SVM and sliding-window validation parameters listed in the "parameters" list. -->
<!-- The nested process windows the series, validates an SVM with a sliding window, and logs the setting and performance of each run. -->
<operator activated="true" class="optimize_parameters_grid" compatibility="8.0.001" expanded="true" height="145" name="Optimize Parameters (Grid)" width="90" x="1050" y="238">
<list key="parameters">
<parameter key="Validation.cumulative_training" value="true,false"/>
<parameter key="SVM.kernel_gamma" value="[0.01;1;5;logarithmic]"/>
<parameter key="SVM.C" value="[0;10000;4;linear]"/>
<parameter key="Validation.training_window_width" value="[40;60;5;linear]"/>
<parameter key="Validation.training_window_step_size" value="[4;6;2;linear]"/>
<parameter key="Validation.test_window_width" value="[3;5;2;linear]"/>
</list>
<process expanded="true">
<!-- Sets the macro "day_ahead" to 5; both Windowing operators below read it as their window_size. -->
<operator activated="true" class="set_macro" compatibility="8.0.001" expanded="true" height="82" name="Set Macro" width="90" x="45" y="34">
<parameter key="macro" value="day_ahead"/>
<parameter key="value" value="5"/>
</operator>
<!-- Windows the series with window_size %{day_ahead} and creates a label from attribute B. -->
<operator activated="true" class="series:windowing" compatibility="7.4.000" expanded="true" height="82" name="Windowing Train" width="90" x="179" y="34">
<parameter key="window_size" value="%{day_ahead}"/>
<parameter key="create_label" value="true"/>
<parameter key="label_attribute" value="B"/>
</operator>
<!-- Windows the "original" output of Windowing Train with the same window_size; create_label is not set here. Its output leaves the optimization on "result 2". -->
<operator activated="true" class="series:windowing" compatibility="7.4.000" expanded="true" height="82" name="Windowing Test" width="90" x="380" y="187">
<parameter key="window_size" value="%{day_ahead}"/>
</operator>
<!-- Sliding-window validation of the SVM. The three window parameters set here also appear in the grid list above. -->
<!-- NOTE(review): the grid presumably overrides these values on each iteration; confirm. -->
<operator activated="true" class="series:sliding_window_validation" compatibility="7.4.000" expanded="true" height="124" name="Validation" width="90" x="380" y="34">
<parameter key="training_window_width" value="60"/>
<parameter key="training_window_step_size" value="6"/>
<parameter key="test_window_width" value="5"/>
<parameter key="horizon" value="2"/>
<!-- Training subprocess: fits an SVM with a radial kernel; C and kernel_gamma are also listed in the grid above. -->
<process expanded="true">
<operator activated="true" class="support_vector_machine" compatibility="8.0.001" expanded="true" height="124" name="SVM" width="90" x="112" y="34">
<parameter key="kernel_type" value="radial"/>
<parameter key="C" value="10000.0"/>
</operator>
<connect from_port="training" to_op="SVM" to_port="training set"/>
<connect from_op="SVM" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<!-- Testing subprocess: applies the model to the test window and computes forecasting performance with horizon 2. -->
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="series:forecasting_performance" compatibility="7.4.000" expanded="true" height="82" name="Performance" width="90" x="246" y="34">
<parameter key="horizon" value="2"/>
</operator>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<!-- Logs the current SVM and validation parameter values, the validation performance and the Set Macro value to the file "tmp". -->
<operator activated="true" class="log" compatibility="8.0.001" expanded="true" height="82" name="Log" width="90" x="581" y="238">
<parameter key="filename" value="tmp"/>
<list key="log">
<parameter key="C" value="operator.SVM.parameter.C"/>
<parameter key="Gamma" value="operator.SVM.parameter.kernel_gamma"/>
<parameter key="Training Width" value="operator.Validation.parameter.training_window_width"/>
<parameter key="Step Width" value="operator.Validation.parameter.training_window_step_size"/>
<parameter key="Testing Width" value="operator.Validation.parameter.test_window_width"/>
<parameter key="Perf" value="operator.Validation.value.performance"/>
<parameter key="Set Macro Value" value="operator.Set Macro.value.macro_value"/>
</list>
</operator>
<!-- Inner wiring: input 1 goes through Set Macro to Windowing Train and then Validation. Validation's model leaves on result 1, Windowing Test's output on result 2, and the performance passes through Log to the performance port. -->
<connect from_port="input 1" to_op="Set Macro" to_port="through 1"/>
<connect from_op="Set Macro" from_port="through 1" to_op="Windowing Train" to_port="example set input"/>
<connect from_op="Windowing Train" from_port="example set output" to_op="Validation" to_port="training"/>
<connect from_op="Windowing Train" from_port="original" to_op="Windowing Test" to_port="example set input"/>
<connect from_op="Windowing Test" from_port="example set output" to_port="result 2"/>
<connect from_op="Validation" from_port="model" to_port="result 1"/>
<connect from_op="Validation" from_port="averagable 1" to_op="Log" to_port="through 1"/>
<connect from_op="Log" from_port="through 1" to_port="performance"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_performance" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
<description align="center" color="transparent" colored="false" width="126">Optimize and store optimized model</description>
</operator>
<!-- Stores the object arriving from "result 1" of Optimize Parameters (Grid), which is the model output of the inner Validation, at the relative repository entry "../data/Thomas ott test ANJ". -->
<!-- The same object is passed on through the "through" port to Apply Model (2). -->
<operator activated="true" class="store" compatibility="8.0.001" expanded="true" height="68" name="Store" width="90" x="1251" y="187">
<parameter key="repository_entry" value="../data/Thomas ott test ANJ"/>
<description align="center" color="transparent" colored="false" width="126">Store optimized model</description>
</operator>
<!-- Applies the stored model to the windowed data delivered on "result 2" of Optimize Parameters (Grid), i.e. the Windowing Test output. -->
<!-- Only attribute B reaches this point (see Select Attributes (2)), so the description refers to the 'B' series. -->
<operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="1385" y="289">
<list key="application_parameters"/>
<description align="center" color="transparent" colored="false" width="126">Sanity Check. Review 'B' time series against predicted 'B' time series from training data set.</description>
</operator>
<!-- Writes the labelled data from Apply Model (2) to an Excel file. -->
<!-- NOTE(review): excel_file is an absolute path on the author's machine; anyone else running this process must adjust it. -->
<operator activated="true" class="write_excel" compatibility="8.0.001" expanded="true" height="82" name="Write Excel" width="90" x="1519" y="187">
<parameter key="excel_file" value="/Users/Maurits/Documents/Stage/Tests/SVM/ANJ/Output RapidMiner Thomas ott ANJ Train.xlsx"/>
</operator>
<!-- Top-level wiring. Main chain: Retrieve, Sort, Select Attributes, Lag Series, Aggregate. -->
<!-- Aggregate's aggregated output goes to Extract Macro; its "original" output continues through Generate Attributes, Filter Examples and Select Attributes (2) into Optimize Parameters (Grid). -->
<!-- From the optimization: performance and parameter set go to the process results; result 1 (model) goes through Store to Apply Model (2); result 2 (windowed data) is the data Apply Model (2) scores; the scored data goes to Write Excel and on to result 3. -->
<connect from_op="Retrieve flow ANJ Train" from_port="output" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Lag Series" to_port="example set input"/>
<connect from_op="Lag Series" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
<connect from_op="Aggregate" from_port="example set output" to_op="Extract Macro" to_port="example set"/>
<connect from_op="Aggregate" from_port="original" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
<connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 2"/>
<connect from_op="Optimize Parameters (Grid)" from_port="parameter" to_port="result 1"/>
<connect from_op="Optimize Parameters (Grid)" from_port="result 1" to_op="Store" to_port="input"/>
<connect from_op="Optimize Parameters (Grid)" from_port="result 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
<connect from_op="Store" from_port="through" to_op="Apply Model (2)" to_port="model"/>
<connect from_op="Apply Model (2)" from_port="labelled data" to_op="Write Excel" to_port="input"/>
<connect from_op="Write Excel" from_port="through" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>