Importing CSV

Jamia
Jamia New Altair Community Member
edited November 5 in Community Q&A
If I already have splitted data in test.csv and Train.csv. what do I do? how I import two CSV files? how we apply the model on both files.
Normally we import a CSV/excel file and apply x_validation on it which break the input in two parts that is train and test.

Answers

  • sgenzer
    sgenzer
    Altair Employee
    hi @Jamia all of these questions are fundamental to the software and can be answered by going through the "Getting Started" course in RapidMiner Academy. I would highly recommend taking some time with the course. It will give you a foundation to do a lot more.

    Scott

  • hughesfleming68
    hughesfleming68 New Altair Community Member
    Jamia, build your model using cross validation using just your train.csv. As a final step you can import your test.csv  and apply your model to your test data. This keeps your test data out of sample.
  • varunm1
    varunm1 New Altair Community Member
    As @hughesfleming68 mentioned you can do this with cross-validation for training and then connect the trained model to test data. Example with Titanic data set below. If you have a label attribute in test dataset as well, you can connect a performance operator to apply model which gives you test performance. You can run the below XML code by copying it from here and open a new process in RapidMiner --> (View --> Show Panel --> XML) --> Paste this code in XML window and press the green tick mark. It shows you the process and you can run this.

    <?xml version="1.0" encoding="UTF-8"?><process version="9.2.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="9.2.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
    <operator activated="true" class="retrieve" compatibility="9.2.001" expanded="true" height="68" name="Retrieve Titanic Training" width="90" x="112" y="34">
    <parameter key="repository_entry" value="//Samples/data/Titanic Training"/>
    </operator>
    <operator activated="true" class="concurrency:cross_validation" compatibility="9.2.001" expanded="true" height="145" name="Cross Validation" width="90" x="313" y="34">
    <parameter key="split_on_batch_attribute" value="false"/>
    <parameter key="leave_one_out" value="false"/>
    <parameter key="number_of_folds" value="5"/>
    <parameter key="sampling_type" value="automatic"/>
    <parameter key="use_local_random_seed" value="false"/>
    <parameter key="local_random_seed" value="1992"/>
    <parameter key="enable_parallel_execution" value="true"/>
    <process expanded="true">
    <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="9.2.001" expanded="true" height="103" name="Decision Tree" width="90" x="112" y="34">
    <parameter key="criterion" value="gain_ratio"/>
    <parameter key="maximal_depth" value="10"/>
    <parameter key="apply_pruning" value="true"/>
    <parameter key="confidence" value="0.1"/>
    <parameter key="apply_prepruning" value="true"/>
    <parameter key="minimal_gain" value="0.01"/>
    <parameter key="minimal_leaf_size" value="2"/>
    <parameter key="minimal_size_for_split" value="4"/>
    <parameter key="number_of_prepruning_alternatives" value="3"/>
    </operator>
    <connect from_port="training set" to_op="Decision Tree" to_port="training set"/>
    <connect from_op="Decision Tree" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
    </process>
    <process expanded="true">
    <operator activated="true" class="apply_model" compatibility="9.2.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="45" y="34">
    <list key="application_parameters"/>
    <parameter key="create_view" value="false"/>
    </operator>
    <operator activated="true" class="performance_classification" compatibility="9.2.001" expanded="true" height="82" name="Performance" width="90" x="179" y="34">
    <parameter key="main_criterion" value="first"/>
    <parameter key="accuracy" value="true"/>
    <parameter key="classification_error" value="false"/>
    <parameter key="kappa" value="true"/>
    <parameter key="weighted_mean_recall" value="false"/>
    <parameter key="weighted_mean_precision" value="false"/>
    <parameter key="spearman_rho" value="false"/>
    <parameter key="kendall_tau" value="false"/>
    <parameter key="absolute_error" value="false"/>
    <parameter key="relative_error" value="false"/>
    <parameter key="relative_error_lenient" value="false"/>
    <parameter key="relative_error_strict" value="false"/>
    <parameter key="normalized_absolute_error" value="false"/>
    <parameter key="root_mean_squared_error" value="true"/>
    <parameter key="root_relative_squared_error" value="false"/>
    <parameter key="squared_error" value="false"/>
    <parameter key="correlation" value="false"/>
    <parameter key="squared_correlation" value="false"/>
    <parameter key="cross-entropy" value="false"/>
    <parameter key="margin" value="false"/>
    <parameter key="soft_margin_loss" value="false"/>
    <parameter key="logistic_loss" value="false"/>
    <parameter key="skip_undefined_labels" value="true"/>
    <parameter key="use_example_weights" value="true"/>
    <list key="class_weights"/>
    </operator>
    <connect from_port="model" to_op="Apply Model (2)" to_port="model"/>
    <connect from_port="test set" to_op="Apply Model (2)" to_port="unlabelled data"/>
    <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
    <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
    </process>
    </operator>
    <operator activated="true" class="retrieve" compatibility="9.2.001" expanded="true" height="68" name="Retrieve Titanic Unlabeled" width="90" x="112" y="187">
    <parameter key="repository_entry" value="//Samples/data/Titanic Unlabeled"/>
    </operator>
    <operator activated="true" class="apply_model" compatibility="9.2.001" expanded="true" height="82" name="Apply Model" width="90" x="514" y="187">
    <list key="application_parameters"/>
    <parameter key="create_view" value="false"/>
    </operator>
    <connect from_op="Retrieve Titanic Training" from_port="output" to_op="Cross Validation" to_port="example set"/>
    <connect from_op="Cross Validation" from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_op="Cross Validation" from_port="performance 1" to_port="result 2"/>
    <connect from_op="Retrieve Titanic Unlabeled" from_port="output" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    </process>
    </operator>
    </process>