Classification and Feature Construction on Time Series Data
Hello everyone,
As part of a case study, I've been working on a time series classification task. The goal is to classify time series data (each example in the dataset represents one time series) into 7 different classes. With a basic process (k-NN with Dynamic Time Warping) I got a classification accuracy of 98.93 and an RMSE of 0.011 +/- 0.103 (which is strange). Since I am new to time series classification, I built a simple process without any feature construction.
I would like to have your comments on the process that I have built, and on the feature engineering (preprocessing) techniques and RapidMiner operators that I can apply to time series data (each example represents a time series) for classification.
I have attached the sample data and the XML of the process. Please review the process and the data; it would be great if you could let me know the right way to handle time series data (one series per example) for a classification task in RapidMiner.
About the dataset:
* Each example (each row) represents a time series and has 34 regular attributes (features), which represent the different periods of the time series.
* The class label Type has 7 different classes (1, 2, ..., 7); see the picture below.
Your comments are valuable,
Many thanks and best regards,
Surya
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.1.001">
  <!--
    1-NN classification of whole time series using Dynamic Time Warping distance.

    Data flow:
      Retrieve, then Split Data (80 % / 20 %)
        partition 1 (80 %) goes to Validation, which estimates performance and
                           delivers the model
        partition 2 (20 %) goes to Select Attributes (drops "Type"), then to
                           Apply Model (2)

    Results:
      result 1 = partition 2 with predictions
      result 2 = averaged validation performance
      result 3 = the example set passed through Validation

    NOTE(review): "Type" must carry the label role in the stored example set,
    otherwise k-NN cannot be trained. Confirm in the repository entry.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <!-- Process-wide seed; Split Data below does not use a local seed. -->
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Load the data set: one row per time series, label attribute "Type". -->
      <operator activated="true" class="retrieve" compatibility="8.1.001" expanded="true" height="68" name="Retrieve Classfication_timeseries_with classnames" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/Classfication_timeseries_with classnames"/>
      </operator>

      <!-- Hold-out split: 80 % for validation/training, 20 % for scoring. -->
      <operator activated="true" class="split_data" compatibility="8.1.001" expanded="true" height="103" name="Split Data" width="90" x="179" y="85">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.8"/>
          <parameter key="ratio" value="0.2"/>
        </enumeration>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>

      <!--
        Removes "Type" from the 20 % partition: a single-attribute filter on
        "Type" with the selection inverted, special attributes included.
        Result 1 therefore contains predictions only; no hold-out performance
        is computed anywhere in this process.
      -->
      <operator activated="true" class="select_attributes" compatibility="8.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="289">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Type"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>

      <!--
        Performance estimation by 10-fold cross validation.

        This replaces series:sliding_window_validation (training window 10,
        test window 10, horizon 1). Sliding windows assume that consecutive
        rows are consecutive points in time. In this data set every row is an
        independent series, so the row order has no temporal meaning and each
        evaluated model was trained on only 10 examples. Cross validation
        trains every fold on about 90 % of the input. Because the model port is
        connected, it also delivers a model built on the complete input.

        NOTE(review): assumes every class has enough examples for 10 folds;
        lower number_of_folds if some classes are very small.
      -->
      <operator activated="true" class="concurrency:cross_validation" compatibility="8.1.001" expanded="true" height="145" name="Validation" width="90" x="581" y="34">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="10"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <!-- Training subprocess: 1-nearest-neighbour with DTW distance. -->
        <process expanded="true">
          <operator activated="true" class="k_nn" compatibility="8.1.001" expanded="true" height="82" name="k-NN" width="90" x="112" y="34">
            <parameter key="k" value="1"/>
            <parameter key="weighted_vote" value="false"/>
            <parameter key="measure_types" value="NumericalMeasures"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="DynamicTimeWarpingDistance"/>
            <parameter key="divergence" value="GeneralizedIDivergence"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
          </operator>
          <connect from_port="training set" to_op="k-NN" to_port="training set"/>
          <connect from_op="k-NN" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: score the held-out fold and measure performance. -->
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="7.1.001" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <!--
            Accuracy is the main criterion (main_criterion = first).
            NOTE(review): root_mean_squared_error on a nominal label is
            presumably derived from prediction confidences, which would explain
            the odd-looking value reported for it. Confirm, and prefer accuracy
            or kappa when reporting.
          -->
          <operator activated="true" class="performance_classification" compatibility="8.1.001" expanded="true" height="82" name="Performance (2)" width="90" x="313" y="34">
            <parameter key="main_criterion" value="first"/>
            <parameter key="accuracy" value="true"/>
            <parameter key="classification_error" value="false"/>
            <parameter key="kappa" value="false"/>
            <parameter key="weighted_mean_recall" value="false"/>
            <parameter key="weighted_mean_precision" value="false"/>
            <parameter key="spearman_rho" value="false"/>
            <parameter key="kendall_tau" value="false"/>
            <parameter key="absolute_error" value="false"/>
            <parameter key="relative_error" value="false"/>
            <parameter key="relative_error_lenient" value="false"/>
            <parameter key="relative_error_strict" value="false"/>
            <parameter key="normalized_absolute_error" value="false"/>
            <parameter key="root_mean_squared_error" value="true"/>
            <parameter key="root_relative_squared_error" value="false"/>
            <parameter key="squared_error" value="false"/>
            <parameter key="correlation" value="false"/>
            <parameter key="squared_correlation" value="false"/>
            <parameter key="cross-entropy" value="false"/>
            <parameter key="margin" value="false"/>
            <parameter key="soft_margin_loss" value="false"/>
            <parameter key="logistic_loss" value="false"/>
            <parameter key="skip_undefined_labels" value="true"/>
            <parameter key="use_example_weights" value="true"/>
            <list key="class_weights"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
          <connect from_op="Performance (2)" from_port="performance" to_port="performance 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
        </process>
      </operator>

      <!-- Scores the unlabelled 20 % partition with the model from Validation. -->
      <operator activated="true" class="apply_model" compatibility="7.1.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="849" y="136">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>

      <connect from_op="Retrieve Classfication_timeseries_with classnames" from_port="output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Validation" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Validation" from_port="example set" to_port="result 3"/>
      <connect from_op="Validation" from_port="performance 1" to_port="result 2"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>