<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  NOTE(review): the original file began with the stray line
  "A program to recognize and reward our most engaged community members"
  placed BEFORE the XML declaration, which made the document ill-formed
  (nothing may precede the declaration). The text is preserved here as a
  comment; it does not appear to describe this process (a text-mining
  pre-processing pipeline). TODO: confirm the intended file description.

  Pipeline summary (from the operator chain below): retrieve an example set,
  convert it to documents, then per document: extract HTML text content,
  split run-together CamelCase words, lower-case, remove stop phrases,
  tokenize, strip mojibake prefixes, remove English stopwords, drop tokens
  shorter than 3 chars, and write each document to a .txt file.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- NOTE(review): no repository location parameter is set on Retrieve;
           presumably configured interactively. Verify before running. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30"/>
      <operator activated="true" class="text:data_to_documents" compatibility="5.3.002" expanded="true" height="60" name="Data to Documents" width="90" x="246" y="30">
        <description>Need to select the attributes in the pro settings of the Data to Document </description>
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="OriginalTextFromDocument" value="1.0"/>
        </list>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Non Letters (2)" width="90" x="447" y="30">
        <description>Process the entire dataset as if it were DIT data and use the merged stop phrase list to remove the boiler plate</description>
        <parameter key="keep_text" value="true"/>
        <!-- Inner per-document processing chain. -->
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.002" expanded="true" height="60" name="Extract Content (3)" width="90" x="45" y="30"/>
          <!-- Inserts a space before each capital letter to split words that
               were concatenated when punctuation was lost. -->
          <operator activated="true" class="text:replace_tokens" compatibility="5.3.002" expanded="true" height="60" name="Split textText or TextText" width="90" x="180" y="30">
            <description>Uses the regex from rapidminer forum to split where capitialised letters are in the middle of words because punctuation is missing from the original text. 
It finds captialised words and replaces them with a space and the captured text</description>
            <list key="replace_dictionary">
              <parameter key="([A-Z])" value=" $1"/>
            </list>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="lower case (2)" width="90" x="315" y="30"/>
          <!-- NOTE(review): path below is machine-specific (user "Michael");
               a later operator uses a different profile ("michael.obrien").
               Confirm both paths resolve on the target machine. -->
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.3.002" expanded="true" height="76" name="Filter Stop Phrases" width="90" x="450" y="30">
            <parameter key="file" value="C:\Users\Michael\Google Drive\My Masters\RapidMinerRepo\Text Mining\Assignment\AdditionalFiles\MergedStopPhrases.txt"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize non letters (2)" width="90" x="585" y="30"/>
          <!-- NOTE(review): the regex key contains U+FFFD replacement
               characters, likely mojibake from a non-UTF-8 source file;
               kept byte-for-byte to preserve behavior. Strips 1-2 of those
               characters from the start of tokens. TODO: confirm against
               the original character sequence. -->
          <operator activated="true" class="text:replace_tokens" compatibility="5.3.002" expanded="true" height="60" name="Replace Regex" width="90" x="581" y="120">
            <description>Remove �� from the start of some words and sometime ���� and replace it with just the word found after itRegular Expression Replacement��{1,2}() $1</description>
            <list key="replace_dictionary">
              <parameter key="��{1,2}()" value="$1"/>
            </list>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Stop Eng (2)" width="90" x="313" y="187"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="447" y="255">
            <parameter key="min_chars" value="3"/>
          </operator>
          <!-- Writes each processed document to a file named by the %{a}
               macro (one .txt per document). -->
          <operator activated="true" class="text:write_document" compatibility="5.3.002" expanded="true" height="76" name="Write Document" width="90" x="581" y="255">
            <parameter key="file" value="C:\Users\michael.obrien\Google Drive\My Masters\Text and Web Mining\Text Mining Assignment\Python\MergedDataset\%{a}.txt"/>
          </operator>
          <connect from_port="document" to_op="Extract Content (3)" to_port="document"/>
          <connect from_op="Extract Content (3)" from_port="document" to_op="Split textText or TextText" to_port="document"/>
          <connect from_op="Split textText or TextText" from_port="document" to_op="lower case (2)" to_port="document"/>
          <connect from_op="lower case (2)" from_port="document" to_op="Filter Stop Phrases" to_port="document"/>
          <connect from_op="Filter Stop Phrases" from_port="document" to_op="Tokenize non letters (2)" to_port="document"/>
          <connect from_op="Tokenize non letters (2)" from_port="document" to_op="Replace Regex" to_port="document"/>
          <connect from_op="Replace Regex" from_port="document" to_op="Stop Eng (2)" to_port="document"/>
          <connect from_op="Stop Eng (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
          <connect from_op="Filter Tokens (2)" from_port="document" to_op="Write Document" to_port="document"/>
          <connect from_op="Write Document" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Retrieve" from_port="output" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Non Letters (2)" to_port="documents 1"/>
      <connect from_op="Process Non Letters (2)" from_port="word list" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>