RSS Classification

montaqi (New Altair Community Member)
edited November 2024 in Community Q&A
Hello,
    I am able to read an RSS URL and write an Excel sheet that contains all the RSS feed items, and I can classify each row as positive/negative based on my word lists. My question is: which operator do I use for filtering? For example, I only want to extract the items that fall into the negative category (with a confidence of at least 50%, for instance) and then generate n-grams for them. Also, how do I get an overall positive/negative classification for the RSS URL as a whole, instead of classifying each feed item separately? Thank you; I am not sure whether I have made myself clear...

My process so far, starting from an Excel sheet that contains one RSS feed item per row, is:

<process version="5.1.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
    <process expanded="true" height="476" width="815">
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="30">
        <list key="text_directories">
          <parameter key="positive" value="C:\Documents and Settings\TU001YU\Desktop\positive"/>
          <parameter key="negative" value="C:\Documents and Settings\TU001YU\Desktop\negative"/>
        </list>
        <process expanded="true" height="524" width="806">
          <operator activated="true" class="text:transform_cases" compatibility="5.1.001" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.1.001" expanded="true" height="60" name="Tokenize" width="90" x="447" y="30"/>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="naive_bayes_kernel" compatibility="5.1.006" expanded="true" height="76" name="Naive Bayes (Kernel)" width="90" x="447" y="30"/>
      <operator activated="true" class="read_excel" compatibility="5.1.006" expanded="true" height="60" name="Read Excel" width="90" x="45" y="300">
        <parameter key="excel_file" value="C:\Documents and Settings\TU001YU\Desktop\BBCNewsFeeds.xls"/>
        <parameter key="imported_cell_range" value="A1"/>
        <list key="annotations"/>
        <list key="data_set_meta_data_information"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="5.1.006" expanded="true" height="76" name="Nominal to Text" width="90" x="179" y="300"/>
      <operator activated="true" class="text:data_to_documents" compatibility="5.1.001" expanded="true" height="60" name="Data to Documents" width="90" x="313" y="300">
        <list key="specify_weights"/>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="5.1.001" expanded="true" height="94" name="Process Documents" width="90" x="447" y="165">
        <process expanded="true" height="524" width="806">
          <operator activated="true" class="text:transform_cases" compatibility="5.1.001" expanded="true" height="60" name="Transform Cases (2)" width="90" x="112" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.1.001" expanded="true" height="60" name="Tokenize (2)" width="90" x="246" y="30"/>
          <operator activated="true" class="text:stem_porter" compatibility="5.1.001" expanded="true" height="60" name="Stem (Porter)" width="90" x="380" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.1.001" expanded="true" height="60" name="Filter Stopwords (2)" width="90" x="514" y="30"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.1.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="648" y="30">
            <parameter key="min_chars" value="2"/>
            <parameter key="max_chars" value="99"/>
          </operator>
          <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
          <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="apply_model" compatibility="5.1.006" expanded="true" height="76" name="Apply Model" width="90" x="648" y="30">
        <list key="application_parameters"/>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Naive Bayes (Kernel)" to_port="training set"/>
      <connect from_op="Process Documents from Files" from_port="word list" to_op="Process Documents" to_port="word list"/>
      <connect from_op="Naive Bayes (Kernel)" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <connect from_op="Apply Model" from_port="model" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Answers

  • montaqi (New Altair Community Member)
    Could somebody help me please?
  • el_chief (New Altair Community Member)
    use the Data Transformation > Filtering > Filter Examples operator

    change Condition Class to attribute_value_filter, and make sure you read the documentation
  • montaqi (New Altair Community Member)
    Sorry, could you tell me what I should put in the "parameter string" for "attribute_value_filter", please?
    Neil McGuigan wrote:

    use the Data Transformation > Filtering > Filter Examples operator

    change Condition Class to attribute_value_filter, and make sure you read the documentation
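
For reference, a rough, untested sketch of what the suggestion above might look like in the process XML posted earlier. It assumes that Apply Model has already created a confidence(negative) attribute, and that the attribute_value_filter parameter string accepts a simple "attribute > value" comparison on it:

<operator activated="true" class="filter_examples" compatibility="5.1.006" expanded="true" height="76" name="Filter Examples" width="90" x="715" y="165">
  <parameter key="condition_class" value="attribute_value_filter"/>
  <parameter key="parameter_string" value="confidence(negative) &gt; 0.5"/>
</operator>

Wired between the labelled data output of Apply Model and the result port, this would pass through only the examples predicted as negative with more than 50% confidence, which could then feed a second Process Documents step containing a Generate n-Grams (Terms) operator. For an overall positive/negative verdict on the feed as a whole, one simple (also untested) option would be to aggregate the per-item confidences afterwards, for example with the Aggregate operator.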
