Hi,
I am using Linear Regression Model to train a Model. I have several training and test files stored in different folders.
For example,
Training files are stored in:
/path/to/training/files/s2_merged_train.csv
/path/to/training/files/s3_merged_train.csv
/path/to/training/files/s5_merged_train.csv
....
Similarly the Testing files are stored as:
/path/to/testing/files/s2_merged_test.csv
/path/to/testing/files/s3_merged_test.csv
/path/to/testing/files/s5_merged_test.csv
.....
There are 271 training and 271 testing files. File names have the same pattern but they are not continuous, that is, there may be s2_merged_train.csv, s3_merged_train.csv but may not be s4_merged_train.csv. Similarly for the test files.
I have written the following process, which works correctly for the given inputs. I am using macros for the file names and for one attribute name, and I pass the macro values on the command line.
I am also storing the resulting file and the performance vectors in files.
I want to execute that process for all the training and testing files without inserting the macro values manually.
After searching on the internet, I found that RapidMiner has a Loop Files operator that can be used to solve this problem, but I am having trouble changing the macro values for each file.
My xml code is as follows:
<?xml version="1.0" encoding="UTF-8"?>
<!-- Read CSV: loads the file named by the %{training-file} macro.
     Columns are comma separated and the first row supplies the attribute names. -->
<process version="8.1.000">
  <operator activated="true" class="read_csv" compatibility="8.1.000"
            expanded="true" height="68" name="Read CSV"
            width="90" x="45" y="34">
    <parameter key="csv_file" value="%{training-file}"/>
    <parameter key="column_separators" value=","/>
    <parameter key="trim_lines" value="false"/>
    <parameter key="use_quotes" value="true"/>
    <!-- A literal double quote inside a double-quoted attribute value must be
         written as the predefined entity; a bare quote here is not well-formed XML. -->
    <parameter key="quotes_character" value="&quot;"/>
    <parameter key="escape_character" value="\"/>
    <parameter key="skip_comments" value="false"/>
    <parameter key="comment_characters" value="#"/>
    <parameter key="parse_numbers" value="true"/>
    <parameter key="decimal_character" value="."/>
    <parameter key="grouped_digits" value="false"/>
    <parameter key="grouping_character" value=","/>
    <parameter key="date_format" value=""/>
    <parameter key="first_row_as_names" value="true"/>
    <list key="annotations"/>
    <parameter key="time_zone" value="SYSTEM"/>
    <parameter key="locale" value="English (United States)"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="read_all_values_as_polynominal" value="false"/>
    <list key="data_set_meta_data_information"/>
    <parameter key="read_not_matching_values_as_missings" value="true"/>
    <parameter key="datamanagement" value="double_array"/>
    <parameter key="data_management" value="auto"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Nominal to Date: parses the "Time" attribute as date_time using the
     pattern yyyy-MM-dd HH:mm:ss; the old attribute is not kept. -->
<process version="8.1.000">
  <operator activated="true" class="nominal_to_date" compatibility="8.1.000"
            expanded="true" height="82" name="Nominal to Date"
            width="90" x="179" y="34">
    <parameter key="attribute_name" value="Time"/>
    <parameter key="date_type" value="date_time"/>
    <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
    <parameter key="time_zone" value="SYSTEM"/>
    <parameter key="locale" value="English (United States)"/>
    <parameter key="keep_old_attribute" value="false"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Read CSV (2): loads the file named by the %{testing-file} macro.
     Columns are comma separated and the first row supplies the attribute names. -->
<process version="8.1.000">
  <operator activated="true" class="read_csv" compatibility="8.1.000"
            expanded="true" height="68" name="Read CSV (2)"
            width="90" x="45" y="289">
    <parameter key="csv_file" value="%{testing-file}"/>
    <parameter key="column_separators" value=","/>
    <parameter key="trim_lines" value="false"/>
    <parameter key="use_quotes" value="true"/>
    <!-- A literal double quote inside a double-quoted attribute value must be
         written as the predefined entity; a bare quote here is not well-formed XML. -->
    <parameter key="quotes_character" value="&quot;"/>
    <parameter key="escape_character" value="\"/>
    <parameter key="skip_comments" value="false"/>
    <parameter key="comment_characters" value="#"/>
    <parameter key="parse_numbers" value="true"/>
    <parameter key="decimal_character" value="."/>
    <parameter key="grouped_digits" value="false"/>
    <parameter key="grouping_character" value=","/>
    <parameter key="date_format" value=""/>
    <parameter key="first_row_as_names" value="true"/>
    <list key="annotations"/>
    <parameter key="time_zone" value="SYSTEM"/>
    <parameter key="locale" value="English (United States)"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="read_all_values_as_polynominal" value="false"/>
    <list key="data_set_meta_data_information"/>
    <parameter key="read_not_matching_values_as_missings" value="true"/>
    <parameter key="datamanagement" value="double_array"/>
    <parameter key="data_management" value="auto"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Store: saves its input at the repository entry named by the
     %{training-repository} macro. -->
<process version="8.1.000">
  <operator activated="true" class="store" compatibility="8.1.000"
            expanded="true" height="68" name="Store"
            width="90" x="313" y="34">
    <parameter key="repository_entry" value="%{training-repository}"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Set Role: gives the "label" role to the attribute named by the
     %{training-role-attribute-name} macro; no additional roles are set. -->
<process version="8.1.000">
  <operator activated="true" class="set_role" compatibility="8.1.000"
            expanded="true" height="82" name="Set Role"
            width="90" x="447" y="34">
    <parameter key="attribute_name" value="%{training-role-attribute-name}"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Linear Regression: learner configured with "M5 prime" feature selection,
     colinear feature elimination, a bias term and a ridge of 1.0E-8. -->
<process version="8.1.000">
  <operator activated="true" class="linear_regression" compatibility="8.1.000"
            expanded="true" height="103" name="Linear Regression"
            width="90" x="581" y="34">
    <parameter key="feature_selection" value="M5 prime"/>
    <parameter key="alpha" value="0.05"/>
    <parameter key="max_iterations" value="10"/>
    <parameter key="forward_alpha" value="0.05"/>
    <parameter key="backward_alpha" value="0.05"/>
    <parameter key="eliminate_colinear_features" value="true"/>
    <parameter key="min_tolerance" value="0.05"/>
    <parameter key="use_bias" value="true"/>
    <parameter key="ridge" value="1.0E-8"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Nominal to Date (2): parses the "Time" attribute as date_time using the
     pattern yyyy-MM-dd HH:mm:ss; the old attribute is not kept. -->
<process version="8.1.000">
  <operator activated="true" class="nominal_to_date" compatibility="8.1.000"
            expanded="true" height="82" name="Nominal to Date (2)"
            width="90" x="179" y="289">
    <parameter key="attribute_name" value="Time"/>
    <parameter key="date_type" value="date_time"/>
    <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
    <parameter key="time_zone" value="SYSTEM"/>
    <parameter key="locale" value="English (United States)"/>
    <parameter key="keep_old_attribute" value="false"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Store (2): saves its input at the repository entry named by the
     %{testing-repository} macro. -->
<process version="8.1.000">
  <operator activated="true" class="store" compatibility="8.1.000"
            expanded="true" height="68" name="Store (2)"
            width="90" x="313" y="289">
    <parameter key="repository_entry" value="%{testing-repository}"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Set Role (2): gives the "label" role to the attribute named by the
     %{testing-role-attribute-name} macro; no additional roles are set. -->
<process version="8.1.000">
  <operator activated="true" class="set_role" compatibility="8.1.000"
            expanded="true" height="82" name="Set Role (2)"
            width="90" x="447" y="289">
    <parameter key="attribute_name" value="%{testing-role-attribute-name}"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Apply Model: configured without application parameters and with
     create_view disabled. -->
<process version="8.1.000">
  <operator activated="true" class="apply_model" compatibility="8.1.000"
            expanded="true" height="82" name="Apply Model"
            width="90" x="715" y="187">
    <list key="application_parameters"/>
    <parameter key="create_view" value="false"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Performance (regression): main criterion is absolute_error. Enabled criteria:
     root_mean_squared_error, absolute_error, normalized_absolute_error and
     prediction_average; all other criteria are switched off. -->
<process version="8.1.000">
  <operator activated="true" class="performance_regression" compatibility="8.1.000"
            expanded="true" height="82" name="Performance"
            width="90" x="849" y="187">
    <parameter key="main_criterion" value="absolute_error"/>
    <parameter key="root_mean_squared_error" value="true"/>
    <parameter key="absolute_error" value="true"/>
    <parameter key="relative_error" value="false"/>
    <parameter key="relative_error_lenient" value="false"/>
    <parameter key="relative_error_strict" value="false"/>
    <parameter key="normalized_absolute_error" value="true"/>
    <parameter key="root_relative_squared_error" value="false"/>
    <parameter key="squared_error" value="false"/>
    <parameter key="correlation" value="false"/>
    <parameter key="squared_correlation" value="false"/>
    <parameter key="prediction_average" value="true"/>
    <parameter key="spearman_rho" value="false"/>
    <parameter key="kendall_tau" value="false"/>
    <parameter key="skip_undefined_labels" value="true"/>
    <parameter key="use_example_weights" value="true"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Write as Text: writes its input to the file named by the
     %{performance-file} macro, using the system encoding. -->
<process version="8.1.000">
  <operator activated="true" class="write_as_text" compatibility="8.1.000"
            expanded="true" height="82" name="Write as Text"
            width="90" x="1050" y="85">
    <parameter key="result_file" value="%{performance-file}"/>
    <parameter key="encoding" value="SYSTEM"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Format Numbers: selects attributes by value type (numeric), special
     attributes included, and formats them as integers without grouping. -->
<process version="8.1.000">
  <operator activated="true" class="format_numbers" compatibility="8.1.000"
            expanded="true" height="82" name="Format Numbers"
            width="90" x="983" y="238">
    <parameter key="attribute_filter_type" value="value_type"/>
    <parameter key="attribute" value="Time"/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="numeric"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="real"/>
    <parameter key="block_type" value="value_series"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="value_series_end"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="true"/>
    <parameter key="format_type" value="integer"/>
    <parameter key="locale" value="English (United States)"/>
    <parameter key="use_grouping" value="false"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Write CSV: writes its input to the file named by the %{result-file} macro,
     comma separated, with a header row and quoted nominal values; the file is
     not opened in append mode. -->
<process version="8.1.000">
  <operator activated="true" class="write_csv" compatibility="8.1.000"
            expanded="true" height="82" name="Write CSV"
            width="90" x="1117" y="238">
    <parameter key="csv_file" value="%{result-file}"/>
    <parameter key="column_separator" value=","/>
    <parameter key="write_attribute_names" value="true"/>
    <parameter key="quote_nominal_values" value="true"/>
    <parameter key="format_date_attributes" value="true"/>
    <parameter key="append_to_file" value="false"/>
    <parameter key="encoding" value="SYSTEM"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<!-- Store (3): saves its input at the repository entry named by the
     %{result-repository} macro. -->
<process version="8.1.000">
  <operator activated="true" class="store" compatibility="8.1.000"
            expanded="true" height="68" name="Store (3)"
            width="90" x="1251" y="238">
    <parameter key="repository_entry" value="%{result-repository}"/>
  </operator>
</process>
For executing through command line, I am using the following command in Windows cmd:
C:/Softwares/RapidMiner/RapidMinerStudio/scripts>rapidminer-batch.bat "//Local Repository/processes/rp2" "-Mtraining-file=D:\ME_Thesis\Data_v2\experiments\x_prediction_time\10min_ahead\training\input_training_merged\s2_merged_train.csv" "-Mtraining-repository=//Local Repository/data/s2_train.csv" "-Mtraining-role-attribute-name=s2predicted" "-Mtesting-file=D:\ME_Thesis\Data_v2\experiments\x_prediction_time\10min_ahead\testing\input_testing_merged\s2_merged_test.csv" "-Mtesting-repository=//Local Repository/data/s2_test.csv" "-Mtesting-role-attribute-name=s2predicted" "-Mperformance-file=D:\ME_Thesis\Data_v2\experiments\x_prediction_time\10min_ahead\performance\s2_performance.res" "-Mresult-file=D:\ME_Thesis\Data_v2\experiments\x_prediction_time\10min_ahead\results\s2_result.csv" "-Mresult-repository=//Local Repository/data/s2_result.csv"
There may be 2 solutions for the mentioned problem.
1. Use RapidMiner operators to execute the process on all the available files
2. Write a batch file that runs the command once per file with the appropriate parameters.
I don't have any prior experience in writing Windows Batch Files. So I would prefer the first solution.
Can anyone guide me how to use Loop operators and macros for this purpose?
Any help would be highly appreciated.
Thanks.