🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

"Text Classification with different terms"

User: "dport"
New Altair Community Member
Updated by Jocelyn
I would like to classify an example set based on a classification model generated from a related but different example set. The terms will not be identical. Is it reasonable to supply the word list from the model to the example set I wish to classify?

The model I am experimenting with is listed below. It seems to give pretty decent results but I have yet to give it a full check (this would require a lot of data preparation).

Any feedback appreciated!

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Text classification process (format version 5.0; the operator classes
  suggest a RapidMiner process with the text extension, confirm before reuse).

  Data flow, as wired by the <connect> elements at the end of the top-level
  process:
    Training branch:
      Retrieve (2) [team_x_risk_cats] to Nominal to Text (2)
        to Process Documents from Data to k-NN (training set)
    Scoring branch:
      Retrieve [team_x_risks_no_dups] to Nominal to Text
        to Process Documents from Data (2) to Apply Model (unlabelled data)
    Shared vocabulary:
      The "word list" output of Process Documents from Data is fed into the
      "word list" input of Process Documents from Data (2).
    Result:
      The "labelled data" output of Apply Model is delivered on "result 1".
-->
<process version="5.0">
 <context>
   <input>
     <location/>
   </input>
   <output>
     <location/>
     <location/>
   </output>
   <macros/>
 </context>
 <operator activated="true" class="process" expanded="true" name="Process">
   <process expanded="true" height="448" width="748">
     <!-- Scoring branch input: loads the repository entry team_x_risks_no_dups. -->
     <operator activated="true" class="retrieve" expanded="true" height="60" name="Retrieve" width="90" x="45" y="75">
       <parameter key="repository_entry" value="team_x_risks_no_dups"/>
     </operator>
     <!-- Nominal-to-text conversion for the scoring branch, restricted by a "subset" filter.
          NOTE(review): both the "attribute" and the "attributes" keys are set, with different
          values; which one takes effect under the "subset" filter type is not visible here,
          confirm that the intended attribute is the one being converted. -->
     <operator activated="true" class="nominal_to_text" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="75">
       <parameter key="attribute_filter_type" value="subset"/>
       <parameter key="attribute" value="risk_title risk_desc_risk_keywords_risk_factor_description"/>
       <parameter key="attributes" value="risk_all"/>
     </operator>
     <!-- Training branch input: loads the repository entry team_x_risk_cats. -->
     <operator activated="true" class="retrieve" expanded="true" height="60" name="Retrieve (2)" width="90" x="45" y="300">
       <parameter key="repository_entry" value="team_x_risk_cats"/>
     </operator>
     <!-- Nominal-to-text conversion for the training branch. No parameters are set, so the
          operator's defaults apply.
          NOTE(review): confirm the defaults do not also convert the label attribute. -->
     <operator activated="true" class="nominal_to_text" expanded="true" height="76" name="Nominal to Text (2)" width="90" x="179" y="300"/>
     <!-- Training-side document processing. Its example set trains k-NN and its word list is
          passed to "Process Documents from Data (2)". No pruning or keep_text parameters are
          set on this operator. -->
     <operator activated="true" class="text:process_document_from_data" expanded="true" height="76" name="Process Documents from Data" width="90" x="313" y="300">
       <list key="specify_weights"/>
       <process expanded="true">
         <!-- Token pipeline: tokenize, transform cases, filter English stopwords,
              filter tokens by length (min_chars = 3), generate term n-grams. -->
         <operator activated="true" class="text:tokenize" expanded="true" height="60" name="Tokenize" width="90" x="45" y="75"/>
         <operator activated="true" class="text:transform_cases" expanded="true" height="60" name="Transform Cases (2)" width="90" x="179" y="210"/>
         <operator activated="true" class="text:filter_stopwords_english" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="313" y="300"/>
         <operator activated="true" class="text:filter_by_length" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="458" y="288">
           <parameter key="min_chars" value="3"/>
         </operator>
         <operator activated="true" class="text:generate_n_grams_terms" expanded="true" height="60" name="Generate n-Grams (2)" width="90" x="715" y="120"/>
         <connect from_port="document" to_op="Tokenize" to_port="document"/>
         <connect from_op="Tokenize" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
         <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
         <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
         <connect from_op="Filter Tokens (2)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
         <connect from_op="Generate n-Grams (2)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <!-- Classifier: k-NN using the numerical measure CosineSimilarity. The number of
          neighbours is not set here, so the operator's default is used. -->
     <operator activated="true" class="k_nn" expanded="true" height="76" name="k-NN" width="90" x="447" y="300">
       <parameter key="measure_types" value="NumericalMeasures"/>
       <parameter key="numerical_measure" value="CosineSimilarity"/>
     </operator>
     <!-- Scoring-side document processing. Receives the training word list on its
          "word list" input and sends its example set to Apply Model.
          NOTE(review): unlike the training-side operator, this one sets keep_text = true and
          percentual pruning (prune_above_percent = 50.0). Confirm that the extra text column
          and the pruning are intended when a word list is supplied, and that the resulting
          attributes still match what the k-NN model was trained on. -->
     <operator activated="true" class="text:process_document_from_data" expanded="true" height="76" name="Process Documents from Data (2)" width="90" x="313" y="75">
       <parameter key="keep_text" value="true"/>
       <parameter key="prune_method" value="percentual"/>
       <parameter key="prune_above_percent" value="50.0"/>
       <list key="specify_weights"/>
       <process expanded="true">
         <!-- Same operator sequence as the training-side token pipeline: tokenize, transform
              cases, filter English stopwords, filter by length (min_chars = 3), term n-grams. -->
         <operator activated="true" class="text:tokenize" expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
         <operator activated="true" class="text:transform_cases" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="75"/>
         <operator activated="true" class="text:filter_stopwords_english" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="313" y="210"/>
         <operator activated="true" class="text:filter_by_length" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="120">
           <parameter key="min_chars" value="3"/>
         </operator>
         <operator activated="true" class="text:generate_n_grams_terms" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="514" y="30"/>
         <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
         <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases" to_port="document"/>
         <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
         <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
         <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
         <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <!-- Applies the k-NN model to the scoring-side example set. -->
     <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply Model" width="90" x="581" y="165">
       <list key="application_parameters"/>
     </operator>
     <!-- Top-level wiring: scoring branch, training branch, shared word list, model application. -->
     <connect from_op="Retrieve" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
     <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
     <connect from_op="Retrieve (2)" from_port="output" to_op="Nominal to Text (2)" to_port="example set input"/>
     <connect from_op="Nominal to Text (2)" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
     <connect from_op="Process Documents from Data" from_port="example set" to_op="k-NN" to_port="training set"/>
     <connect from_op="Process Documents from Data" from_port="word list" to_op="Process Documents from Data (2)" to_port="word list"/>
     <connect from_op="k-NN" from_port="model" to_op="Apply Model" to_port="model"/>
     <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
     <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
   </process>
 </operator>
</process>

Find more posts tagged with