An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition (version 5.2.001).
  Two parallel branches are wired together:
    * training: Generate Training Data, Nominal to Text, Process Documents from Data, Naive Bayes
    * testing:  Generate Testing Data, Nominal to Text (2), Process Documents from Data (2), Apply Model
  The "word list" output of the training document processor is connected to the
  "word list" input of the testing document processor.
  NOTE(review): presumably this makes the test example set use the same token
  attributes as the training example set; confirm against the Text Processing
  extension documentation.
-->
<process version="5.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="5.2.001"
            expanded="true"
            name="Process">
    <process expanded="true" height="370" width="850">

      <!-- Training branch: generated nominal data, one attribute with 50 values. -->
      <operator activated="true"
                class="generate_nominal_data"
                compatibility="5.2.001"
                expanded="true"
                height="60"
                name="Generate Training Data"
                width="90"
                x="45"
                y="30">
        <parameter key="number_of_attributes" value="1"/>
        <parameter key="number_of_values" value="50"/>
      </operator>
      <!-- Converts the single attribute att1 to text before document processing. -->
      <operator activated="true"
                class="nominal_to_text"
                compatibility="5.2.001"
                expanded="true"
                height="76"
                name="Nominal to Text"
                width="90"
                x="179"
                y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="att1"/>
      </operator>
      <!-- Document processing for the training data; the inner process only tokenizes. -->
      <operator activated="true"
                class="text:process_document_from_data"
                compatibility="5.2.001"
                expanded="true"
                height="76"
                name="Process Documents from Data"
                width="90"
                x="313"
                y="30">
        <list key="specify_weights"/>
        <process expanded="true" height="541" width="969">
          <operator activated="true"
                    class="text:tokenize"
                    compatibility="5.2.001"
                    expanded="true"
                    height="60"
                    name="Tokenize"
                    width="90"
                    x="313"
                    y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Learner trained on the example set produced by the training document processor. -->
      <operator activated="true"
                class="naive_bayes"
                compatibility="5.2.001"
                expanded="true"
                height="76"
                name="Naive Bayes"
                width="90"
                x="447"
                y="30"/>

      <!-- Testing branch: same generator settings as the training branch. -->
      <operator activated="true"
                class="generate_nominal_data"
                compatibility="5.2.001"
                expanded="true"
                height="60"
                name="Generate Testing Data"
                width="90"
                x="45"
                y="210">
        <parameter key="number_of_attributes" value="1"/>
        <parameter key="number_of_values" value="50"/>
      </operator>
      <operator activated="true"
                class="nominal_to_text"
                compatibility="5.2.001"
                expanded="true"
                height="76"
                name="Nominal to Text (2)"
                width="90"
                x="179"
                y="210">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="att1"/>
      </operator>
      <!-- Document processing for the test data; receives the training word list (see connections below). -->
      <operator activated="true"
                class="text:process_document_from_data"
                compatibility="5.2.001"
                expanded="true"
                height="76"
                name="Process Documents from Data (2)"
                width="90"
                x="447"
                y="210">
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true"
                    class="text:tokenize"
                    compatibility="5.2.001"
                    expanded="true"
                    name="Tokenize (2)"/>
          <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Applies the trained model to the processed test example set. -->
      <operator activated="true"
                class="apply_model"
                compatibility="5.2.001"
                expanded="true"
                height="76"
                name="Apply Model"
                width="90"
                x="581"
                y="210">
        <list key="application_parameters"/>
      </operator>

      <!-- Wiring of the training branch. -->
      <connect from_op="Generate Training Data" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Naive Bayes" to_port="training set"/>
      <!-- Training word list handed over to the test-side document processor. -->
      <connect from_op="Process Documents from Data" from_port="word list" to_op="Process Documents from Data (2)" to_port="word list"/>
      <connect from_op="Naive Bayes" from_port="model" to_op="Apply Model" to_port="model"/>
      <!-- Wiring of the testing branch. -->
      <connect from_op="Generate Testing Data" from_port="output" to_op="Nominal to Text (2)" to_port="example set input"/>
      <connect from_op="Nominal to Text (2)" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
      <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
      <!-- Process results: labelled test data on result 1, the model on result 2. -->
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
How missing or additional features are handled depends on the classification algorithm and its implementation in RapidMiner. In general, the behaviour is undefined, though in some cases you may get reasonable results.
I found out that in the second case the classification accuracy is much lower (it dropped from 70% to 20%). Is that something I should have expected, or is my logic wrong here?
as even though I use training data information during test data feature extraction,
...you put information about the distributions of the test set into the training already.