[SOLVED] How to extract words based on wordlist

subhasisdasgupt
subhasisdasgupt New Altair Community Member
edited November 5 in Community Q&A
I have a custom corpus and I can easily create Document Term Matrix (DTM) using Process Document from Files operator. However, I am not in a position to extract a DTM using subset of words. I also tried to provide a wordlist to the optional wordlist input port but the output is not what I wanted. The process XML is shown below

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.015">
 <context>
   <input/>
   <output/>
   <macros/>
 </context>
 <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
   <process expanded="true">
     <operator activated="true" class="text:create_document" compatibility="5.3.002" expanded="true" height="60" name="Create Document" width="90" x="45" y="30">
       <parameter key="text" value="iphone galaxy htc xperia screen ram able i"/>
     </operator>
     <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="165">
       <list key="text_directories">
         <parameter key="S3" value="C:\nltk_data\corpora\reviewdata"/>
       </list>
       <parameter key="create_word_vector" value="false"/>
       <parameter key="add_meta_information" value="false"/>
       <parameter key="keep_text" value="true"/>
       <process expanded="true">
         <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
         <operator activated="false" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="30"/>
         <operator activated="false" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="313" y="30"/>
         <operator activated="false" class="text:filter_tokens_by_content" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Content)" width="90" x="447" y="30">
           <parameter key="condition" value="equals"/>
           <parameter key="string" value="&quot;battery|iphone&quot;"/>
         </operator>
         <connect from_port="document" to_op="Tokenize" to_port="document"/>
         <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
         <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <operator activated="true" class="set_role" compatibility="5.3.015" expanded="true" height="76" name="Set Role" width="90" x="179" y="165">
       <parameter key="attribute_name" value="text"/>
       <list key="set_additional_roles"/>
     </operator>
     <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="179" y="30">
       <process expanded="true">
         <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (3)" width="90" x="112" y="30"/>
         <connect from_port="document" to_op="Tokenize (3)" to_port="document"/>
         <connect from_op="Tokenize (3)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <operator activated="true" class="text:process_document_from_data" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="380" y="120">
       <parameter key="add_meta_information" value="false"/>
       <list key="specify_weights"/>
       <process expanded="true">
         <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="112" y="30"/>
         <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="246" y="30"/>
         <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
         <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
         <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
         <portSpacing port="source_document" spacing="0"/>
         <portSpacing port="sink_document 1" spacing="0"/>
         <portSpacing port="sink_document 2" spacing="0"/>
       </process>
     </operator>
     <connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
     <connect from_op="Process Documents from Files" from_port="example set" to_op="Set Role" to_port="example set input"/>
     <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
     <connect from_op="Process Documents" from_port="word list" to_op="Process Documents from Data" to_port="word list"/>
     <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
     <connect from_op="Process Documents from Data" from_port="word list" to_port="result 2"/>
     <portSpacing port="source_input 1" spacing="0"/>
     <portSpacing port="sink_result 1" spacing="0"/>
     <portSpacing port="sink_result 2" spacing="0"/>
     <portSpacing port="sink_result 3" spacing="0"/>
   </process>
 </operator>
</process>

Can anyone help me out?

Subhasis

Answers

  • MariusHelf
    MariusHelf New Altair Community Member
    Hi Subhasis,

    you don't need the Process Documents from Data operator, you can do everything in Process Documents from File. There you should also connect the wordlist from the first Process Documents.

    Furthermore the subprocess of both remaining Process Documents operators must be exactly the same, so if you want to transform cases you should do it in both.

    Best regards,
    Marius
  • subhasisdasgupt
    subhasisdasgupt New Altair Community Member
    Dear Marius,

    I initially did exactly the same thing that you have suggested. But the issue is that TFIDF score of each word in each document is coming out to be zero. Am I doing anything wrong in this process? Please advise.

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="5.3.015">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="text:create_document" compatibility="5.3.002" expanded="true" height="60" name="Create Document" width="90" x="45" y="30">
            <parameter key="text" value="iphone galaxy htc xperia screen ram able i"/>
          </operator>
          <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="179" y="30">
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (3)" width="90" x="112" y="30"/>
              <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (3)" width="90" x="313" y="30"/>
              <connect from_port="document" to_op="Tokenize (3)" to_port="document"/>
              <connect from_op="Tokenize (3)" from_port="document" to_op="Transform Cases (3)" to_port="document"/>
              <connect from_op="Transform Cases (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:process_document_from_file" compatibility="5.3.002" expanded="true" height="76" name="Process Documents from Files" width="90" x="380" y="30">
            <list key="text_directories">
              <parameter key="S3" value="C:\nltk_data\corpora\reviewdata"/>
            </list>
            <parameter key="add_meta_information" value="false"/>
            <parameter key="keep_text" value="true"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
              <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="30"/>
              <operator activated="false" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="313" y="30"/>
              <operator activated="false" class="text:filter_tokens_by_content" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Content)" width="90" x="447" y="30">
                <parameter key="condition" value="equals"/>
                <parameter key="string" value="&quot;battery|iphone&quot;"/>
              </operator>
              <connect from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Create Document" from_port="output" to_op="Process Documents" to_port="documents 1"/>
          <connect from_op="Process Documents" from_port="word list" to_op="Process Documents from Files" to_port="word list"/>
          <connect from_op="Process Documents from Files" from_port="example set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • MariusHelf
    MariusHelf New Altair Community Member
    Hi Subhasis,

    the problem is that the TF/IDF is calculated with respect to the first input data. As you know the TF/IDF for one term in one document is high if the term occurs frequently in one document, but, and that's the important thing, terms that appear in many documents get a lower TF/IDF value. If a term occurs in all documents its TF/IDF value is zero. Since you have only one input document, the value is zero for all terms.

    So if you want to stick to your current process setup you can only change the vector_creation from TF/IDF to e.g. term occurences to get a term count.

    Another option would be to omit the first Process Documents completely and instead add a Select Attributes after the second one to select only the interesting terms.

    Yet another option and probably the most elegant one is to filter tokens in process documents, as in the process below.

    Best regards,
    Marius
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <process version="6.0.002">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="text:create_document" compatibility="5.3.002" expanded="true" height="60" name="Create Document (2)" width="90" x="45" y="30">
            <parameter key="text" value="This is an iphone test data"/>
          </operator>
          <operator activated="true" class="text:create_document" compatibility="5.3.002" expanded="true" height="60" name="Create Document (3)" width="90" x="45" y="120">
            <parameter key="text" value="This is an iphone test data set with htc"/>
          </operator>
          <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="112" name="Process Documents (2)" width="90" x="380" y="30">
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize (2)" width="90" x="45" y="30"/>
              <operator activated="true" breakpoints="after" class="text:filter_tokens_by_content" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="246" y="30">
                <parameter key="condition" value="matches"/>
                <parameter key="regular_expression" value="iphone|galaxy|htc|xperia|screen|ram|able|i"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases (2)" width="90" x="393" y="30"/>
              <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
              <connect from_op="Filter Tokens (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Create Document (2)" from_port="output" to_op="Process Documents (2)" to_port="documents 1"/>
          <connect from_op="Create Document (3)" from_port="output" to_op="Process Documents (2)" to_port="documents 2"/>
          <connect from_op="Process Documents (2)" from_port="example set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>