Funny Results after processing new data on a trained model

User: "AKO"
New Altair Community Member
Updated by Jocelyn

I have a trained text-classification model that works fine with Cross-Validation. Now I am trying to apply the model to new data to predict the different categories. The results are below.

funny2.png

 

What is going wrong? The zip contains the model and the training and test data files.

PS: I looked at the post https://community.rapidminer.com/t5/RapidMiner-Studio-Forum/SOLVED-Applying-a-pre-trained-model-on-new-data/td-p/16643 

But applying it gives the results above. I probably forgot something?

 

 

 

Find more posts tagged with

Sort by:
1 - 2 of 21
    User: "Thomas_Ott"
    New Altair Community Member
    Accepted Answer

    @AKO, @land makes very good points, but from what I see, the very tail end of your process is not what I would do.

     

    Try this (just repath everything)

     

    <?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
    <context>
    <input/>
    <output/>
    <macros/>
    </context>
    <operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
    <process expanded="true">
    <!-- Read XML: loads the training file. Every node matched by xpath_for_examples
         becomes one example; the title, body and categorie text nodes are extracted
         as three attributes, each declared polynominal in the meta data list.
         The file path is machine-specific and must be re-pointed before running. -->
    <operator activated="true" class="advanced_file_connectors:read_xml" compatibility="8.0.001" expanded="true" height="68" name="Read XML" width="90" x="45" y="34">
    <parameter key="file" value="C:\Users\Thomas Ott\Downloads\CIM_kNN_NewData\Training_items_RapMin.xml"/>
    <parameter key="xpath_for_examples" value="//root/site/page/children/page/children/page"/>
    <enumeration key="xpaths_for_attributes">
    <parameter key="xpath_for_attribute" value="title[1]/text()"/>
    <parameter key="xpath_for_attribute" value="body[1]/text()"/>
    <parameter key="xpath_for_attribute" value="categorie[1]/text()"/>
    </enumeration>
    <list key="namespaces"/>
    <parameter key="use_default_namespace" value="false"/>
    <parameter key="parse_numbers" value="false"/>
    <list key="annotations"/>
    <parameter key="locale" value="Dutch"/>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="title[1]/text().true.polynominal.attribute"/>
    <parameter key="1" value="body[1]/text().true.polynominal.attribute"/>
    <parameter key="2" value="categorie[1]/text().true.polynominal.attribute"/>
    </list>
    </operator>
    <!-- Rename: replaces the XPath-derived attribute names with short names:
         body[1]/text() to Text, categorie[1]/text() to Categorie, title[1]/text() to Titel.
         Downstream operators refer to these short names. -->
    <operator activated="true" class="rename" compatibility="8.0.001" expanded="true" height="82" name="Rename" width="90" x="112" y="136">
    <parameter key="old_name" value="body[1]/text()"/>
    <parameter key="new_name" value="Text"/>
    <list key="rename_additional_attributes">
    <parameter key="categorie[1]/text()" value="Categorie"/>
    <parameter key="title[1]/text()" value="Titel"/>
    </list>
    </operator>
    <!-- Set Role: marks Categorie as the label of the training data and explicitly
         keeps Text and Titel as regular attributes. -->
    <operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role" width="90" x="246" y="34">
    <parameter key="attribute_name" value="Categorie"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles">
    <parameter key="Text" value="regular"/>
    <parameter key="Titel" value="regular"/>
    </list>
    </operator>
    <!-- Unescape HTML: applied to the Text attribute (the renamed body text) of the
         training data. The validation branch has a matching operator. -->
    <operator activated="true" class="web:unescape_html_attribute" compatibility="7.3.000" expanded="true" height="82" name="Unescape HTML" width="90" x="380" y="34">
    <parameter key="attribute" value="Text"/>
    </operator>
    <!-- Nominal to Text: converts nominal training attributes to the text type ahead
         of document processing.
         NOTE(review): only the "attribute" key (Titel) is set and no
         attribute_filter_type is given, so whether just Titel or every nominal
         attribute is converted depends on the operator default. Confirm in Studio. -->
    <operator activated="true" class="nominal_to_text" compatibility="8.0.001" expanded="true" height="82" name="Nominal to Text" width="90" x="514" y="34">
    <parameter key="attribute" value="Titel"/>
    </operator>
    <!-- Process Documents from Data (training branch): builds the word vector for the
         training examples. Its example set output feeds Cross Validation and its word
         list output is wired into "Process Documents from Data (2)", so the validation
         data is vectorised against the same word list.
         Inner pipeline, in order: Tokenize, Transform Cases, Filter Stopwords
         (Dictionary), Generate n-Grams (Terms) with max_length 1.
         prune_method is "percentual"; no pruning thresholds are set in this XML.
         The stopword file path is machine-specific and must be re-pointed. -->
    <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="648" y="34">
    <parameter key="prune_method" value="percentual"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
    <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="7.5.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="380" y="85">
    <parameter key="file" value="C:\Users\Thomas Ott\Downloads\CIM_kNN_NewData\stopwords_nl.txt"/>
    </operator>
    <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="581" y="34">
    <parameter key="max_length" value="1"/>
    </operator>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
    <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
    <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <!-- Cross Validation: validates a k-NN learner on the vectorised training data.
         Its model output is wired to "Apply Model (2)" and its first performance
         output to process result 2. The Performance operator sits inside the testing
         subprocess, where it receives the labelled test set. -->
    <operator activated="true" class="concurrency:cross_validation" compatibility="8.0.001" expanded="true" height="145" name="Cross Validation" width="90" x="849" y="34">
    <!-- Training subprocess: feeds the "training set" port into k-NN (k = 3) and
         returns the resulting model. -->
    <process expanded="true">
    <operator activated="true" class="k_nn" compatibility="8.0.001" expanded="true" height="82" name="k-NN" width="90" x="112" y="34">
    <parameter key="k" value="3"/>
    </operator>
    <connect from_port="training set" to_op="k-NN" to_port="training set"/>
    <connect from_op="k-NN" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
    <!-- NOTE(review): the description below still holds the placeholder text
         "Type your comment"; it looks like a leftover canvas annotation. -->
    <description align="center" color="yellow" colored="false" height="105" resized="false" width="180" x="905" y="151">Type your comment</description>
    </process>
    <!-- Testing subprocess: applies the model to the "test set" port, then passes the
         labelled data to Performance, which emits the performance vector and the
         test set results. -->
    <process expanded="true">
    <operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model" width="90" x="112" y="34">
    <list key="application_parameters"/>
    </operator>
    <operator activated="true" class="performance" compatibility="8.0.001" expanded="true" height="82" name="Performance" width="90" x="313" y="34"/>
    <connect from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
    <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
    <connect from_op="Performance" from_port="example set" to_port="test set results"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
    </process>
    </operator>
    <!-- Read XML (2): loads the new (validation) data file using the same example
         XPath, attribute XPaths, locale and meta data declarations as the training
         reader, so both branches start from the same attribute layout.
         The file path is machine-specific and must be re-pointed before running. -->
    <operator activated="true" class="advanced_file_connectors:read_xml" compatibility="8.0.001" expanded="true" height="68" name="Read XML (2)" width="90" x="45" y="391">
    <parameter key="file" value="C:\Users\Thomas Ott\Downloads\CIM_kNN_NewData\Validatie_items_RapMin.xml"/>
    <parameter key="xpath_for_examples" value="//root/site/page/children/page/children/page"/>
    <enumeration key="xpaths_for_attributes">
    <parameter key="xpath_for_attribute" value="title[1]/text()"/>
    <parameter key="xpath_for_attribute" value="body[1]/text()"/>
    <parameter key="xpath_for_attribute" value="categorie[1]/text()"/>
    </enumeration>
    <list key="namespaces"/>
    <parameter key="use_default_namespace" value="false"/>
    <parameter key="parse_numbers" value="false"/>
    <list key="annotations"/>
    <parameter key="locale" value="Dutch"/>
    <list key="data_set_meta_data_information">
    <parameter key="0" value="title[1]/text().true.polynominal.attribute"/>
    <parameter key="1" value="body[1]/text().true.polynominal.attribute"/>
    <parameter key="2" value="categorie[1]/text().true.polynominal.attribute"/>
    </list>
    </operator>
    <!-- Rename (2): applies the same renaming as the training branch (Text, Categorie,
         Titel) so the attribute names of the new data match those used for training. -->
    <operator activated="true" class="rename" compatibility="8.0.001" expanded="true" height="82" name="Rename (2)" width="90" x="179" y="391">
    <parameter key="old_name" value="body[1]/text()"/>
    <parameter key="new_name" value="Text"/>
    <list key="rename_additional_attributes">
    <parameter key="categorie[1]/text()" value="Categorie"/>
    <parameter key="title[1]/text()" value="Titel"/>
    </list>
    </operator>
    <!-- Select Attributes: single-attribute filter on Categorie with invert_selection
         set to true, i.e. Categorie is removed from the new data and the remaining
         attributes are passed on. The data to be scored therefore carries no label. -->
    <operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="391">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Categorie"/>
    <parameter key="invert_selection" value="true"/>
    </operator>
    <!-- Set Role (2): deactivated (activated="false") and not referenced by any
         connect element in this process. It is kept only as a disabled leftover that
         would have set Categorie as label on the new data. -->
    <operator activated="false" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role (2)" width="90" x="380" y="493">
    <parameter key="attribute_name" value="Categorie"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles">
    <parameter key="Text" value="regular"/>
    <parameter key="Titel" value="regular"/>
    </list>
    </operator>
    <!-- Unescape HTML (2): same step as in the training branch, applied to the Text
         attribute of the new data. -->
    <operator activated="true" class="web:unescape_html_attribute" compatibility="7.3.000" expanded="true" height="82" name="Unescape HTML (2)" width="90" x="514" y="391">
    <parameter key="attribute" value="Text"/>
    </operator>
    <!-- Nominal to Text (2): mirrors the training branch conversion for the new data.
         NOTE(review): as in the training branch, no attribute_filter_type is set, so
         the scope of the conversion depends on the operator default. Confirm. -->
    <operator activated="true" class="nominal_to_text" compatibility="8.0.001" expanded="true" height="82" name="Nominal to Text (2)" width="90" x="648" y="391">
    <parameter key="attribute" value="Titel"/>
    </operator>
    <!-- Process Documents from Data (2): vectorises the new data. Its "word list"
         input is wired from the training "Process Documents from Data" operator, and
         its example set output goes to "Apply Model (2)".
         The inner pipeline repeats the training one step for step: Tokenize,
         Transform Cases, Filter Stopwords (same dictionary file), Generate n-Grams
         with max_length 1. Keep both pipelines in sync when editing either.
         The stopword file path is machine-specific and must be re-pointed. -->
    <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data (2)" width="90" x="782" y="289">
    <parameter key="prune_method" value="percentual"/>
    <list key="specify_weights"/>
    <process expanded="true">
    <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34"/>
    <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34"/>
    <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="7.5.000" expanded="true" height="82" name="Filter Stopwords (2)" width="90" x="380" y="34">
    <parameter key="file" value="C:\Users\Thomas Ott\Downloads\CIM_kNN_NewData\stopwords_nl.txt"/>
    </operator>
    <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.5.000" expanded="true" height="68" name="Generate n-Grams (2)" width="90" x="581" y="34">
    <parameter key="max_length" value="1"/>
    </operator>
    <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
    <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
    <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
    <connect from_op="Generate n-Grams (2)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
    </process>
    </operator>
    <!-- Apply Model (2): scores the vectorised new data with the model delivered by
         Cross Validation. Its labelled data output is process result 1. No
         Performance operator follows it. -->
    <operator activated="true" class="apply_model" compatibility="8.0.001" expanded="true" height="82" name="Apply Model (2)" width="90" x="1050" y="238">
    <list key="application_parameters"/>
    </operator>
    <!-- Top-level wiring.
         Training branch: Read XML, Rename, Set Role, Unescape HTML, Nominal to Text,
         Process Documents from Data, Cross Validation.
         Hand-offs between branches: the training word list goes to
         "Process Documents from Data (2)" and the Cross Validation model goes to
         "Apply Model (2)".
         New-data branch: Read XML (2), Rename (2), Select Attributes,
         Unescape HTML (2), Nominal to Text (2), Process Documents from Data (2),
         Apply Model (2).
         Results: result 1 is the scored new data, result 2 is the cross-validation
         performance. "Set Role (2)" is intentionally absent from the wiring. -->
    <connect from_op="Read XML" from_port="output" to_op="Rename" to_port="example set input"/>
    <connect from_op="Rename" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Unescape HTML" to_port="example set input"/>
    <connect from_op="Unescape HTML" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
    <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="example set" to_op="Cross Validation" to_port="example set"/>
    <connect from_op="Process Documents from Data" from_port="word list" to_op="Process Documents from Data (2)" to_port="word list"/>
    <connect from_op="Cross Validation" from_port="model" to_op="Apply Model (2)" to_port="model"/>
    <connect from_op="Cross Validation" from_port="performance 1" to_port="result 2"/>
    <connect from_op="Read XML (2)" from_port="output" to_op="Rename (2)" to_port="example set input"/>
    <connect from_op="Rename (2)" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Unescape HTML (2)" to_port="example set input"/>
    <connect from_op="Unescape HTML (2)" from_port="example set output" to_op="Nominal to Text (2)" to_port="example set input"/>
    <connect from_op="Nominal to Text (2)" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
    <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model (2)" to_port="unlabelled data"/>
    <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 1"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    </process>
    </operator>
    </process>
    User: "Thomas_Ott"
    New Altair Community Member
    Accepted Answer

    @AKO, the problem you had was the position of the Performance operator. The Performance operator needs to see a label that matches what the model was trained on, so that it can compare that label with the model's predictions. In your case, your test set provided a label, but it was not what the k-NN was trained on, so when the two were compared, you got bad results.