Bayes Learner and Text Classification - Index Out of Bounds
I am trying to classify text with W-Naive Bayes Multinomial, but I keep getting error 263, an Array Index Out of Bounds error.
I use a sample of 200 text records from a database as a training set, save the model, then read 2000 text records to classify. I know there are additional terms in the full set that are not in the training set. Is this causing the index out of bounds error?
What will correct this?
Thanks
B.
training code
I use a sample of 200 text records from a database as a training set, save the model, then read 2000 text records to classify. I know there are additional terms in the full set that are not in the training set. Is this causing the index out of bounds error?
What will correct this?
Thanks
B.
training code
model applier
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
  <!--
    Training process.
    Reads the training examples, turns the "posttitle" text into a word
    vector, trains W-NaiveBayesMultinomialUpdateable and writes the model
    to M:\RM\mdl_test_01.mod.

    The word list produced while vectorising the training text is also
    stored in the repository (//RMRepository/train_wordlist). The model
    applier process must vectorise new text against this same word list;
    otherwise its attribute set differs from the one the model was trained
    on, which is the likely cause of the Array Index Out of Bounds error.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.0" expanded="true" name="Root">
    <description>Using a simple Naive Bayes classifier.</description>
    <process expanded="true" height="584" width="962">
      <operator activated="true" class="retrieve" compatibility="5.0.10" expanded="true" height="60" name="Retrieve" width="90" x="45" y="75">
        <parameter key="repository_entry" value="//RMRepository/train_source"/>
      </operator>
      <!-- invert_selection=true: keep every attribute except "trainentry". -->
      <operator activated="true" class="select_attributes" compatibility="5.0.10" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="165">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="trainentry"/>
        <parameter key="invert_selection" value="true"/>
      </operator>
      <!-- "trainlabel" is the class to learn; "id" identifies the record. -->
      <operator activated="true" class="set_role" compatibility="5.0.10" expanded="true" height="76" name="Set Role" width="90" x="45" y="300">
        <parameter key="name" value="trainlabel"/>
        <parameter key="target_role" value="label"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.0.10" expanded="true" height="76" name="Set Role (2)" width="90" x="45" y="390">
        <parameter key="name" value="id"/>
        <parameter key="target_role" value="id"/>
      </operator>
      <!-- Convert "posttitle" to the text type so it can be processed as a document. -->
      <operator activated="true" class="nominal_to_text" compatibility="5.0.10" expanded="true" height="76" name="Nominal to Text" width="90" x="246" y="345">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="posttitle"/>
      </operator>
      <!-- Builds the word vector AND the word list (vocabulary) from the training text. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.0.6" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="255">
        <list key="specify_weights"/>
        <process expanded="true" height="565" width="827">
          <operator activated="true" class="text:transform_cases" compatibility="5.0.6" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="165"/>
          <operator activated="true" class="text:tokenize" compatibility="5.0.6" expanded="true" height="60" name="Tokenize" width="90" x="313" y="120"/>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Persist the training vocabulary so the applier can reuse it unchanged. -->
      <operator activated="true" class="store" compatibility="5.0.10" expanded="true" height="60" name="Store Word List" width="90" x="581" y="255">
        <parameter key="repository_entry" value="//RMRepository/train_wordlist"/>
      </operator>
      <operator activated="true" class="weka:W-NaiveBayesMultinomialUpdateable" compatibility="5.0.1" expanded="true" height="76" name="W-NaiveBayesMultinomialUpdateable" width="90" x="447" y="120">
        <parameter key="D" value="true"/>
      </operator>
      <operator activated="true" class="write_model" compatibility="5.0.10" expanded="true" height="60" name="Write Model" width="90" x="455" y="30">
        <parameter key="model_file" value="M:\RM\mdl_test_01.mod"/>
        <parameter key="output_type" value="XML"/>
      </operator>
      <connect from_op="Retrieve" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
      <connect from_op="Set Role (2)" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="W-NaiveBayesMultinomialUpdateable" to_port="training set"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="Store Word List" to_port="input"/>
      <connect from_op="W-NaiveBayesMultinomialUpdateable" from_port="model" to_op="Write Model" to_port="input"/>
      <connect from_op="W-NaiveBayesMultinomialUpdateable" from_port="exampleSet" to_port="result 2"/>
      <connect from_op="Write Model" from_port="through" to_port="result 1"/>
      <connect from_op="Store Word List" from_port="through" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
  <!--
    Model applier process.
    Reads the saved model from M:\RM\mdl_test_01.mod, vectorises the
    "posttitle" text of the records to classify, and applies the model.

    The word list stored by the training process
    (//RMRepository/train_wordlist) is fed into the "word list" input of
    Process Documents from Data. With that input connected, the word vector
    is built over the training vocabulary instead of a new vocabulary
    derived from the records being scored, so the attributes line up with
    what the model expects. Terms that never occurred in the training data
    are not part of that vocabulary and so do not become attributes.
    Run the training process first so the word list entry exists.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.0.10" expanded="true" name="Process">
    <process expanded="true" height="341" width="681">
      <operator activated="true" class="read_model" compatibility="5.0.10" expanded="true" height="60" name="Read Model" width="90" x="376" y="34">
        <parameter key="model_file" value="M:\RM\mdl_test_01.mod"/>
      </operator>
      <operator activated="true" class="retrieve" compatibility="5.0.10" expanded="true" height="60" name="Retrieve" width="90" x="44" y="22">
        <parameter key="repository_entry" value="assignlabel_01"/>
      </operator>
      <!-- Vocabulary saved by the training process. -->
      <operator activated="true" class="retrieve" compatibility="5.0.10" expanded="true" height="60" name="Retrieve Word List" width="90" x="179" y="210">
        <parameter key="repository_entry" value="//RMRepository/train_wordlist"/>
      </operator>
      <!-- invert_selection=true: keep every attribute except "entrylabel". -->
      <operator activated="true" class="select_attributes" compatibility="5.0.10" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="120">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="entrylabel"/>
        <parameter key="invert_selection" value="true"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="5.0.10" expanded="true" height="76" name="Set Role" width="90" x="45" y="255">
        <parameter key="name" value="id"/>
        <parameter key="target_role" value="id"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="5.0.10" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="75">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="posttitle"/>
      </operator>
      <!-- Same inner pipeline as in training; the word list input pins the vocabulary. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="5.0.6" expanded="true" height="76" name="Process Documents from Data" width="90" x="313" y="165">
        <list key="specify_weights"/>
        <process expanded="true" height="583" width="845">
          <operator activated="true" class="text:transform_cases" compatibility="5.0.6" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="51"/>
          <operator activated="true" class="text:tokenize" compatibility="5.0.6" expanded="true" height="60" name="Tokenize" width="90" x="313" y="75"/>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="apply_model" compatibility="5.0.10" expanded="true" height="76" name="Apply Model" width="90" x="581" y="75">
        <list key="application_parameters"/>
      </operator>
      <connect from_op="Read Model" from_port="output" to_op="Apply Model" to_port="model"/>
      <connect from_op="Retrieve" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Retrieve Word List" from_port="output" to_op="Process Documents from Data" to_port="word list"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>