unable to classify/learn text "sentences"

User: "lavramu"
New Altair Community Member
Updated by Jocelyn
Hi,

I am trying to classify some text using a learning algorithm. For this my input is text files. I would like to use the sentences of the texts as the unit (not words), hence I use the word list output of "Process Documents from Files". While loading the files I give the label/class name and then tokenize using linguistic sentences. Now the example set retains the label attribute when the output is the example set, but for the word list output I get all the sentences in the rows yet it loses the label attribute, so my process errors out before running with "Input example set must have special attribute 'label'" when I use the validation operator. Even feeding this into the WordList to Data operator does not help.

The word list output has the sentences, a new attribute called "attribute name" with the same sentences as the data, total occurrences, document occurrences, and a 0 or 1 under each of the two labels/classes I gave, indicating presence. (My classes are low/high.) So there is no longer an attribute named label with values low/high; instead I have two columns named low and high with values 0 or 1.

Hence it is erroring out. How do I retain the label attribute indicating the class to which each sentence belongs, so that I can use the validation operator for classification?

Please can you help.
Basically I want to classify sentences by reading files tokenizing them on sentences and feed them into a learning algo. Simple. Not words but the whole sentences.


<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner 5.3 process (quoted from the post above): reads labeled text
     files, tokenizes them into sentences, and attempts 5-fold cross-validation
     with Naive Bayes. -->
<process version="5.3.013">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
   <process expanded="true">
     <!-- Each directory entry below defines one class label (verylow / veryhigh);
          keep_text retains the original document text as an attribute of the
          example set output. -->
     <operator activated="true" class="text:process_document_from_file" compatibility="5.3.001" expanded="true" height="76" name="Process Documents from Files (2)" width="90" x="45" y="30">
       <list key="text_directories">
         <parameter key="verylow" value="C:\Users\Uma\Desktop\nvivo\VeryLow"/>
         <parameter key="veryhigh" value="C:\Users\Uma\Desktop\nvivo\VeryHigh"/>
       </list>
       <parameter key="keep_text" value="true"/>
       <!-- Inner document-processing chain: lowercase, strip boilerplate
            patterns, split into sentences, remove stopwords, stem. -->
       <process expanded="true">
         <operator activated="true" class="text:transform_cases" compatibility="5.3.001" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="30"/>
         <!-- Regex replacements that blank out export artifacts such as
              coverage references, internals markers, and page tags. -->
         <operator activated="true" class="text:replace_tokens" compatibility="5.3.001" expanded="true" height="60" name="Replace Tokens" width="90" x="180" y="30">
           <list key="replace_dictionary">
             <parameter key="reference.*coverage" value=" "/>
             <parameter key="&lt;internals.*]" value=" "/>
             <parameter key="&lt;page&gt;" value=" "/>
           </list>
         </operator>
         <!-- Tokenize on linguistic sentences rather than words, so each
              token is one whole sentence (the unit the author wants to
              classify). -->
         <operator activated="true" class="text:tokenize" compatibility="5.3.001" expanded="true" height="60" name="Tokenize" width="90" x="315" y="30">
           <parameter key="mode" value="linguistic sentences"/>
         </operator>
         <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="450" y="30"/>
         <operator activated="true" class="text:stem_porter" compatibility="5.3.001" expanded="true" height="60" name="Stem (Porter)" width="90" x="514" y="30"/>
         <connect from_port="document" to_op="Transform Cases" to_port="document"/>
         <connect from_op="Transform Cases" from_port="document" to_op="Replace Tokens" to_port="document"/>
         <connect from_op="Replace Tokens" from_port="document" to_op="Tokenize" to_port="document"/>
         <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
         <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
         <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <!-- NOTE(review): this is the root of the reported error. The process
          feeds the WORD LIST output (see the connect below) into the
          validation. A word list aggregates tokens across all documents and
          carries no label attribute, only per-class occurrence counts, which
          is why X-Validation complains about the missing special attribute
          'label'. The example set output port of "Process Documents from
          Files (2)", which does keep the label, is presumably what should
          drive Validation instead; confirm with the operator documentation. -->
     <operator activated="true" class="text:wordlist_to_data" compatibility="5.3.001" expanded="true" height="76" name="WordList to Data" width="90" x="179" y="30"/>
     <!-- 5-fold cross-validation: training subprocess fits Naive Bayes;
          testing subprocess applies the model and computes classification
          performance. Requires an input example set with a 'label' special
          attribute. -->
     <operator activated="true" class="x_validation" compatibility="5.3.013" expanded="true" height="112" name="Validation" width="90" x="447" y="210">
       <parameter key="number_of_validations" value="5"/>
       <process expanded="true">
         <operator activated="true" class="naive_bayes" compatibility="5.3.013" expanded="true" height="76" name="Naive Bayes" width="90" x="45" y="30"/>
         <connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
         <connect from_op="Naive Bayes" from_port="model" to_port="model"/>
         <portSpacing port="source_training" spacing="0"/>
         <portSpacing port="sink_model" spacing="0"/>
         <portSpacing port="sink_through 1" spacing="0"/>
       </process>
       <process expanded="true">
         <operator activated="true" class="apply_model" compatibility="5.3.013" expanded="true" height="76" name="Apply Model" width="90" x="112" y="30">
           <list key="application_parameters"/>
         </operator>
         <operator activated="true" class="performance_classification" compatibility="5.3.013" expanded="true" height="76" name="Performance" width="90" x="179" y="210">
           <parameter key="skip_undefined_labels" value="false"/>
           <parameter key="use_example_weights" value="false"/>
           <list key="class_weights"/>
         </operator>
         <connect from_port="model" to_op="Apply Model" to_port="model"/>
         <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
         <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
         <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
         <portSpacing port="source_model" spacing="0"/>
         <portSpacing port="source_test set" spacing="0"/>
         <portSpacing port="source_through 1" spacing="0"/>
         <portSpacing port="sink_averagable 1" spacing="0"/>
         <portSpacing port="sink_averagable 2" spacing="0"/>
       </process>
     </operator>
     <!-- Word list (not the example set) is what reaches Validation here. -->
     <connect from_op="Process Documents from Files (2)" from_port="word list" to_op="WordList to Data" to_port="word list"/>
     <connect from_op="WordList to Data" from_port="example set" to_op="Validation" to_port="training"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
   </process>
 </operator>
</process>

Find more posts tagged with