Keep the date column after tokenization for text mining

Titzaaa
Titzaaa New Altair Community Member
edited November 5 in Community Q&A
Hi everyone,
When conducting sentiment analysis on my dataset, I receive a positivity/negativity score for each article (each row) by counting positive and negative words.
I now need to extend this result table with the date of each article (each row) from the original file, which contains not only the text but also date information.
How can I do that?
Here is my code:
<?xml version="1.0" encoding="UTF-8"?><process version="9.3.000">
  <!--
    RapidMiner process: loads the "ImmobilienZeitung" data, converts it to documents,
    preprocesses each document, applies a dictionary-based sentiment model built from
    the "GRESD" dictionary and the "Negationsliste" negation list, derives score
    attributes and writes the result to an Excel file.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.3.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Source data containing the article text ("Body"). -->
      <operator activated="true" class="retrieve" compatibility="9.3.000" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Masterarbeit/Data/ImmobilienZeitung"/>
      </operator>
      <!-- Converts only the single attribute "Body" from nominal to text. -->
      <operator activated="true" class="nominal_to_text" compatibility="9.3.000" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Body"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!--
        Turns the example set into a collection of documents.
        NOTE(review): non-text columns such as the date are presumably not carried
        along from this point on; this looks like the step where the date is lost. Confirm.
      -->
      <operator activated="true" class="text:data_to_documents" compatibility="8.2.000" expanded="true" height="68" name="Data to Documents" width="90" x="313" y="34">
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
      </operator>
      <!--
        Per-document preprocessing chain: tokenize, lower-case, remove German
        stopwords, then keep only tokens of 3 to 10000 characters.
      -->
      <operator activated="true" class="loop_collection" compatibility="9.3.000" expanded="true" height="82" name="Loop Collection" width="90" x="447" y="34">
        <parameter key="set_iteration_macro" value="false"/>
        <parameter key="macro_name" value="iteration"/>
        <parameter key="macro_start_value" value="1"/>
        <parameter key="unfold" value="false"/>
        <process expanded="true">
          <!--
            NOTE(review): with mode "non letters" the "characters", "language" and
            "max_token_length" parameters presumably have no effect. Confirm.
          -->
          <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="8.2.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="8.2.000" expanded="true" height="68" name="Filter Stopwords (2)" width="90" x="313" y="34">
            <parameter key="stop_word_list" value="Standard"/>
          </operator>
          <operator activated="true" class="text:filter_by_length" compatibility="8.2.000" expanded="true" height="68" name="Filter Tokens (2)" width="90" x="514" y="34">
            <parameter key="min_chars" value="3"/>
            <parameter key="max_chars" value="10000"/>
          </operator>
          <connect from_port="single" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
          <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
          <connect from_op="Filter Tokens (2)" from_port="document" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Sentiment dictionary; wired to the "exa" port of the sentiment learner below. -->
      <operator activated="true" class="retrieve" compatibility="9.3.000" expanded="true" height="68" name="Retrieve (2)" width="90" x="45" y="289">
        <parameter key="repository_entry" value="../Data/GRESD"/>
      </operator>
      <!-- Negation list; wired to the "neg" port of the sentiment learner below. -->
      <operator activated="true" class="retrieve" compatibility="9.3.000" expanded="true" height="68" name="Retrieve (3)" width="90" x="45" y="391">
        <parameter key="repository_entry" value="../Data/Negationsliste"/>
      </operator>
      <!--
        Builds the dictionary model: words come from attribute "Wort", their values
        from "Klassifizierung", negation words from "Negationen"; a symmetric
        negation window of size 5 is configured.
      -->
      <operator activated="true" class="operator_toolbox:dictionary_sentiment_learner" compatibility="2.0.001" expanded="true" height="82" name="Dictionary-Based Sentiment (Documents)" width="90" x="246" y="289">
        <parameter key="value_attribute" value="Klassifizierung"/>
        <parameter key="key_attribute" value="Wort"/>
        <parameter key="negation_attribute" value="Negationen"/>
        <parameter key="negation_window_size" value="5"/>
        <parameter key="use_symmetric_negation_window" value="true"/>
      </operator>
      <!-- Applies the dictionary model to the preprocessed documents; its "exa" output feeds the attribute generation. -->
      <operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="2.0.001" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="581" y="187">
        <list key="application_parameters"/>
      </operator>
      <!--
        Derives two ratio attributes and two flags from Positivity and Negativity:
        Pos_Score is 1 when Positivity exceeds Negativity*-1 (otherwise 0);
        Neg_Score is -1 when Negativity*-1 exceeds Positivity (otherwise 0).
        NOTE(review): the formulas negate Negativity, which presumably means the
        model reports it as a non-positive number. Confirm. If both values are 0,
        the ratio denominators are 0 as well.
      -->
      <operator activated="true" class="generate_attributes" compatibility="9.3.000" expanded="true" height="82" name="Generate Attributes" width="90" x="715" y="187">
        <list key="function_descriptions">
          <parameter key="#Pos_Wörter/(#Pos_Wörter+#Neg_Wörter)" value="Positivity/(Positivity-Negativity)"/>
          <parameter key="#Neg_Wörter/(#Pos_Wörter+#Neg_Wörter)" value="Negativity*-1/(Negativity*-1+Positivity)"/>
          <parameter key="Pos_Score" value="if(Positivity&gt;(Negativity*-1),1,0)"/>
          <parameter key="Neg_Score" value="if((Negativity*-1)&gt;Positivity,-1,0)"/>
        </list>
        <parameter key="keep_all" value="true"/>
      </operator>
      <!-- Combines both flags into one Sentiment_Score: 1 (positive), -1 (negative) or 0 (tie). -->
      <operator activated="true" class="generate_attributes" compatibility="9.3.000" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="849" y="187">
        <list key="function_descriptions">
          <parameter key="Sentiment_Score" value="if(Pos_Score&gt;0,1,if(Neg_Score&lt;0,-1,0))"/>
        </list>
        <parameter key="keep_all" value="true"/>
      </operator>
      <!-- Writes the scored table to an absolute local path (machine-specific). -->
      <operator activated="true" class="write_excel" compatibility="9.3.000" expanded="true" height="103" name="Write Excel" width="90" x="983" y="187">
        <parameter key="excel_file" value="D:\Franziska C. Weis\Masterarbeit\03 Datenanalyse\Rapid_Miner_Analysis_IZ.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <enumeration key="sheet_names"/>
        <parameter key="sheet_name" value="RapidMiner Data"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
        <parameter key="encoding" value="SYSTEM"/>
      </operator>
      <!-- Wiring: text branch (Retrieve to Loop Collection) and model branch (dictionary + negations) meet in Apply Model (Documents). -->
      <connect from_op="Retrieve" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>
      <connect from_op="Retrieve (2)" from_port="output" to_op="Dictionary-Based Sentiment (Documents)" to_port="exa"/>
      <connect from_op="Retrieve (3)" from_port="output" to_op="Dictionary-Based Sentiment (Documents)" to_port="neg"/>
      <connect from_op="Dictionary-Based Sentiment (Documents)" from_port="mod" to_op="Apply Model (Documents)" to_port="mod"/>
      <connect from_op="Apply Model (Documents)" from_port="exa" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
      <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Thanks a lot in advance!

Answers

  • lionelderkrikor
    lionelderkrikor New Altair Community Member
    Hi @Titzaaa,
    I would say that you have to use the Join operator.
    But to better understand the problem, can you share all your data?
    If not, can you at least provide a small sample of what you have and, based on that sample, what you want to obtain?

    Regards,

    Lionel
  • kayman
    kayman New Altair Community Member
    You could use the 'Set Role' operator for that. You are not restricted to the dropdown options; you can assign any role name to any attribute.

    So if you give your date field the role 'date' before you start your sentiment flow, it becomes a special attribute, and by default it will remain available for the rest of your process.
  • Titzaaa
    Titzaaa New Altair Community Member
    Thank you to both of you!
    Unfortunately, "Set Role" does not work: the column remains available only up to the point where the Loop Collection and the dictionary model are applied; afterwards it is no longer there.

    For the join operator I cannot find a key attribute.

    Please find attached the text I conduct the Sentiment Analysis with.

    Here again the code:
    <?xml version="1.0" encoding="UTF-8"?><process version="9.3.000">
      <!--
        RapidMiner process (second attempt): same sentiment pipeline, with a Set Role
        operator inserted directly after Retrieve to mark the "Datum" attribute with
        a custom role before the data is converted to documents.
      -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.3.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Source data containing the article text ("Body") and the date ("Datum"). -->
          <operator activated="true" class="retrieve" compatibility="9.3.000" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
            <parameter key="repository_entry" value="//Masterarbeit/Data/ImmobilienZeitung"/>
          </operator>
          <!-- Sentiment dictionary; wired to the "exa" port of the sentiment learner below. -->
          <operator activated="true" class="retrieve" compatibility="9.3.000" expanded="true" height="68" name="Retrieve (2)" width="90" x="45" y="289">
            <parameter key="repository_entry" value="../Data/GRESD"/>
          </operator>
          <!-- Negation list; wired to the "neg" port of the sentiment learner below. -->
          <operator activated="true" class="retrieve" compatibility="9.3.000" expanded="true" height="68" name="Retrieve (3)" width="90" x="45" y="391">
            <parameter key="repository_entry" value="../Data/Negationsliste"/>
          </operator>
          <!--
            Builds the dictionary model: words come from attribute "Wort", their values
            from "Klassifizierung", negation words from "Negationen"; a symmetric
            negation window of size 5 is configured.
          -->
          <operator activated="true" class="operator_toolbox:dictionary_sentiment_learner" compatibility="2.0.001" expanded="true" height="82" name="Dictionary-Based Sentiment (Documents)" width="90" x="246" y="289">
            <parameter key="value_attribute" value="Klassifizierung"/>
            <parameter key="key_attribute" value="Wort"/>
            <parameter key="negation_attribute" value="Negationen"/>
            <parameter key="negation_window_size" value="5"/>
            <parameter key="use_symmetric_negation_window" value="true"/>
          </operator>
          <!--
            Assigns the custom role "Datum" to the attribute "Datum".
            NOTE(review): the role only exists on the example set; the attribute
            presumably still does not pass through Data to Documents and
            Apply Model (Documents). Confirm.
          -->
          <operator activated="true" class="set_role" compatibility="9.3.000" expanded="true" height="82" name="Set Role" width="90" x="112" y="136">
            <parameter key="attribute_name" value="Datum"/>
            <parameter key="target_role" value="Datum"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Converts only the single attribute "Body" from nominal to text; special attributes are not included in the selection. -->
          <operator activated="true" class="nominal_to_text" compatibility="9.3.000" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="Body"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!--
            Turns the example set into a collection of documents.
            NOTE(review): this looks like the step after which the date column is
            no longer available. Confirm.
          -->
          <operator activated="true" class="text:data_to_documents" compatibility="8.2.000" expanded="true" height="68" name="Data to Documents" width="90" x="313" y="34">
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
          </operator>
          <!--
            Per-document preprocessing chain: tokenize, lower-case, remove German
            stopwords, then keep only tokens of 3 to 10000 characters.
          -->
          <operator activated="true" class="loop_collection" compatibility="9.3.000" expanded="true" height="82" name="Loop Collection" width="90" x="447" y="34">
            <parameter key="set_iteration_macro" value="false"/>
            <parameter key="macro_name" value="iteration"/>
            <parameter key="macro_start_value" value="1"/>
            <parameter key="unfold" value="false"/>
            <process expanded="true">
              <!--
                NOTE(review): with mode "non letters" the "characters", "language" and
                "max_token_length" parameters presumably have no effect. Confirm.
              -->
              <operator activated="true" class="text:tokenize" compatibility="8.2.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="45" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="8.2.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="179" y="34">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_german" compatibility="8.2.000" expanded="true" height="68" name="Filter Stopwords (2)" width="90" x="313" y="34">
                <parameter key="stop_word_list" value="Standard"/>
              </operator>
              <operator activated="true" class="text:filter_by_length" compatibility="8.2.000" expanded="true" height="68" name="Filter Tokens (2)" width="90" x="514" y="34">
                <parameter key="min_chars" value="3"/>
                <parameter key="max_chars" value="10000"/>
              </operator>
              <connect from_port="single" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
              <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
              <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
              <connect from_op="Filter Tokens (2)" from_port="document" to_port="output 1"/>
              <portSpacing port="source_single" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!-- Applies the dictionary model to the preprocessed documents; its "exa" output feeds the attribute generation. -->
          <operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="2.0.001" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="581" y="187">
            <list key="application_parameters"/>
          </operator>
          <!--
            Derives two ratio attributes and two flags from Positivity and Negativity:
            Pos_Score is 1 when Positivity exceeds Negativity*-1 (otherwise 0);
            Neg_Score is -1 when Negativity*-1 exceeds Positivity (otherwise 0).
            NOTE(review): the formulas negate Negativity, which presumably means the
            model reports it as a non-positive number. Confirm. If both values are 0,
            the ratio denominators are 0 as well.
          -->
          <operator activated="true" class="generate_attributes" compatibility="9.3.000" expanded="true" height="82" name="Generate Attributes" width="90" x="715" y="187">
            <list key="function_descriptions">
              <parameter key="#Pos_Wörter/(#Pos_Wörter+#Neg_Wörter)" value="Positivity/(Positivity-Negativity)"/>
              <parameter key="#Neg_Wörter/(#Pos_Wörter+#Neg_Wörter)" value="Negativity*-1/(Negativity*-1+Positivity)"/>
              <parameter key="Pos_Score" value="if(Positivity&gt;(Negativity*-1),1,0)"/>
              <parameter key="Neg_Score" value="if((Negativity*-1)&gt;Positivity,-1,0)"/>
            </list>
            <parameter key="keep_all" value="true"/>
          </operator>
          <!-- Combines both flags into one Sentiment_Score: 1 (positive), -1 (negative) or 0 (tie). -->
          <operator activated="true" class="generate_attributes" compatibility="9.3.000" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="849" y="187">
            <list key="function_descriptions">
              <parameter key="Sentiment_Score" value="if(Pos_Score&gt;0,1,if(Neg_Score&lt;0,-1,0))"/>
            </list>
            <parameter key="keep_all" value="true"/>
          </operator>
          <!-- Writes the scored table to an absolute local path (machine-specific). -->
          <operator activated="true" class="write_excel" compatibility="9.3.000" expanded="true" height="103" name="Write Excel" width="90" x="983" y="187">
            <parameter key="excel_file" value="D:\Franziska C. Weis\Masterarbeit\03 Datenanalyse\Rapid_Miner_Analysis_IZ.xlsx"/>
            <parameter key="file_format" value="xlsx"/>
            <enumeration key="sheet_names"/>
            <parameter key="sheet_name" value="RapidMiner Data"/>
            <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
            <parameter key="number_format" value="#.0"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Wiring: Retrieve now passes through Set Role before the text branch; the model branch is unchanged. -->
          <connect from_op="Retrieve" from_port="output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Retrieve (2)" from_port="output" to_op="Dictionary-Based Sentiment (Documents)" to_port="exa"/>
          <connect from_op="Retrieve (3)" from_port="output" to_op="Dictionary-Based Sentiment (Documents)" to_port="neg"/>
          <connect from_op="Dictionary-Based Sentiment (Documents)" from_port="mod" to_op="Apply Model (Documents)" to_port="mod"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
          <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
          <connect from_op="Loop Collection" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>
          <connect from_op="Apply Model (Documents)" from_port="exa" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
          <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Write Excel" to_port="input"/>
          <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>