"good performance, almost zero accuracy"

kersor
kersor New Altair Community Member
edited November 5 in Community Q&A
With this training process, the accuracy exceeds 75%

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Training process: builds term vectors from two directories of text files
  (list keys "neg" and "pos"), runs the Validation operator with a Naive Bayes
  learner, and writes the model delivered by Validation to a file.
-->
<process version="5.1.011">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.011" expanded="true" name="Process">
    <parameter key="parallelize_main_process" value="true"/>
    <process expanded="true" height="378" width="663">
      <!--
        Step 1: read the documents from the directories listed under
        "text_directories" and create vectors using Term Frequency.
        Presumably each list key ("neg" / "pos") becomes the class label of
        the documents in that directory; confirm against the operator docs.
      -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="112" y="75">
        <list key="text_directories">
          <parameter key="neg" value="C:\Users\Αlkis_!!\Desktop\DATA_MINIMG\Negative"/>
          <parameter key="pos" value="C:\Users\Αlkis_!!\Desktop\DATA_MINIMG\positive"/>
        </list>
        <parameter key="vector_creation" value="Term Frequency"/>
        <parameter key="parallelize_vector_creation" value="true"/>
        <!--
          Per-document preprocessing, in the order wired by the connect
          elements below: Tokenize, then Filter Stopwords (Dictionary), then
          Transform Cases, then Generate n-Grams (Terms).
          NOTE(review): stopwords are filtered before the case transformation;
          whether differently-cased stopwords are still matched depends on the
          filter operator's case handling. Confirm.
        -->
        <process expanded="true" height="396" width="681">
          <operator activated="true" class="text:tokenize" compatibility="5.1.002" expanded="true" height="60" name="Tokenize" width="90" x="45" y="75"/>
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.1.002" expanded="true" height="60" name="Filter Stopwords (Dictionary)" width="90" x="246" y="165">
            <parameter key="file" value="C:\Users\Αlkis_!!\Desktop\DATA_MINIMG\stopwords_greek\stopwords_greek.txt"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="5.1.002" expanded="true" height="60" name="Transform Cases" width="90" x="433" y="150"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.1.002" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="568" y="131"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
          <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!--
        Step 2: validation of the learner on the example set produced above.
        The operator contains two subprocesses: the first trains the model,
        the second applies it to the test set and measures performance.
      -->
      <operator activated="true" class="x_validation" compatibility="5.1.011" expanded="true" height="112" name="Validation" width="90" x="246" y="210">
        <parameter key="parallelize_training" value="true"/>
        <parameter key="parallelize_testing" value="true"/>
        <!-- Training subprocess: Naive Bayes learns from the "training" port and delivers the model. -->
        <process expanded="true" height="396" width="315">
          <operator activated="true" class="naive_bayes" compatibility="5.1.011" expanded="true" height="76" name="Naive Bayes" width="90" x="112" y="103"/>
          <connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
          <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
          <portSpacing port="source_training" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <!-- Testing subprocess: Apply Model labels the "test set"; Performance evaluates the labelled data. -->
        <process expanded="true" height="396" width="315">
          <operator activated="true" class="apply_model" compatibility="5.1.011" expanded="true" height="76" name="Apply Model" width="90" x="32" y="46">
            <list key="application_parameters"/>
          </operator>
          <operator activated="true" class="performance" compatibility="5.1.011" expanded="true" height="76" name="Performance" width="90" x="167" y="55"/>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_averagable 1" spacing="0"/>
          <portSpacing port="sink_averagable 2" spacing="0"/>
        </process>
      </operator>
      <!-- Step 3: write the model delivered by Validation to "model_file" using the Binary output type. -->
      <operator activated="true" class="write_model" compatibility="5.1.011" expanded="true" height="60" name="Write Model" width="90" x="447" y="120">
        <parameter key="model_file" value="C:\Users\Αlkis_!!\Desktop\DATA_MINIMG\model_file"/>
        <parameter key="output_type" value="Binary"/>
      </operator>
      <!--
        Top-level wiring. Results: 1 = word list, 2 = model passed through
        Write Model, 3 = performance from Validation.
        NOTE(review): the word list is only delivered to "result 1"; nothing
        in this process writes it to a file or repository. A separate scoring
        process presumably needs this same word list on the "word list" input
        of its document-processing operator so that new documents are mapped
        to the attributes the model was trained on. Confirm.
      -->
      <connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Validation" to_port="training"/>
      <connect from_op="Process Documents from Files" from_port="word list" to_port="result 1"/>
      <connect from_op="Validation" from_port="model" to_op="Write Model" to_port="input"/>
      <connect from_op="Validation" from_port="averagable 1" to_port="result 3"/>
      <connect from_op="Write Model" from_port="through" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>




. However, when we test a single document, with the same preprocessing steps, the accuracy is almost zero.

Is there an easy way to map the initial vector obtained from the training phase to the new vector that corresponds to the new, unseen document, without having to write code? If anyone wants to help, please let me know and I will send all the data by email.

Best regards

Answers

  • dan_agape
    dan_agape New Altair Community Member

    Hi,

    Just a couple of remarks.
    when we test a single document, with the same preprocessing steps, the accuracy is almost zero.
    Note that a document would correspond to a single row/example in the dataset, and it is rather improper to test a model with a single example. Normally evaluating a classification model with one example only leads to an accuracy of 1 or 0. To diminish the variance of the estimates for the accuracy (that is, to improve the estimate you get), more examples are to be used for testing, obviously.

    Another remark - you may also wish to try TF-IDF when building the vectors. This scheme is commonly used in text mining because it takes into account (in addition to the frequency) how specific tokens/words are to some of the documents to classify, so it often leads to better performance of the built model.

    Regards
    Dan
  • kersor
    kersor New Altair Community Member
    Dear Dan, thank you for the remark.

    However, the X-fold cross validation is performing what you suggest, obtaining a final model, which we save for real-time classification.

    Therefore, the new, single document is inserted to the classifier (2nd process) and the classifier reads the model trained above and predicts the label. The problem is that the vectors for training and testing are not the same.