Hi everyone!
I'm trying to train a classifier so that it can classify a data as true, false and "unclassified" depending on the data set. The output should be that if the data is within or is relation to the data set, it will be classified accordingly. If not, it should be classified as "unclassified".
Currently i only have a process that can classify true/false. Is there a process in rapid miner in which i can include the unclassified class or do i need to add "unclassified" data into my data set for it to work?
Our data is about health beliefs involving asthma and diabetes. Whenever it is not included in our data set, it should be unclassified.
Sample data:
Tweet Category
diabetes causes heart problems false
Diabetes causes shingles false
Dirt treats Asthma false
oatmeal medicates diabetes true
Obesity causes Asthma true
Obesity causes Diabetes true
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="read_excel" compatibility="5.3.013" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
<parameter key="excel_file" value="C:\Users\user1.user\Desktop\Training Set.xlsx"/>
<parameter key="imported_cell_range" value="A1:B77"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="Text.true.text.attribute"/>
<parameter key="1" value="Category.true.binominal.label"/>
</list>
<parameter key="read_not_matching_values_as_missings" value="false"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="5.3.013" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="30">
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="set_role" compatibility="5.3.013" expanded="true" height="76" name="Set Role" width="90" x="313" y="30">
<parameter key="attribute_name" value="Category"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="30">
<parameter key="keep_text" value="true"/>
<parameter key="prune_below_absolute" value="2"/>
<parameter key="prune_above_absolute" value="9999"/>
<list key="specify_weights">
<parameter key="Text" value="1.0"/>
</list>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="112" y="75"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="165"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="246" y="75">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="999"/>
</operator>
<operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (Snowball)" width="90" x="246" y="165"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
<connect from_op="Stem (Snowball)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="x_validation" compatibility="5.3.013" expanded="true" height="112" name="Validation" width="90" x="581" y="30">
<process expanded="true">
<operator activated="true" class="support_vector_machine" compatibility="5.3.013" expanded="true" height="112" name="SVM" width="90" x="45" y="30">
<parameter key="kernel_cache" value="80"/>
</operator>
<operator activated="false" class="naive_bayes" compatibility="5.3.013" expanded="true" height="76" name="Naive Bayes" width="90" x="45" y="165"/>
<operator activated="false" class="decision_tree" compatibility="5.3.013" expanded="true" height="76" name="Decision Tree" width="90" x="179" y="165"/>
<operator activated="false" class="weka:W-J48" compatibility="5.3.001" expanded="true" height="76" name="W-J48" width="90" x="45" y="255"/>
<connect from_port="training" to_op="SVM" to_port="training set"/>
<connect from_op="SVM" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<process expanded="true">
<operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="store" compatibility="5.3.013" expanded="true" height="60" name="Store" width="90" x="45" y="165">
<parameter key="repository_entry" value="../data/AsthmaDiabetesModel"/>
</operator>
<operator activated="true" class="performance_classification" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="213" y="30">
<parameter key="main_criterion" value="accuracy"/>
<parameter key="classification_error" value="true"/>
<parameter key="kappa" value="true"/>
<list key="class_weights"/>
</operator>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Apply Model" from_port="model" to_op="Store" to_port="input"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<connect from_op="Read Excel" from_port="output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Validation" to_port="training"/>
<connect from_op="Validation" from_port="model" to_port="result 1"/>
<connect from_op="Validation" from_port="training" to_port="result 3"/>
<connect from_op="Validation" from_port="averagable 1" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>
Thanks in advance!
Yvan