🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Tailoring Text Mining Clusters

kulturevultureUser: "kulturevulture"
New Altair Community Member
Updated by Jocelyn
Hello. I am trying to classify electronic notebook items into one of three defined categories (clusters). My process does run and I get a count of notebooks into each of 3 clusters. Is there a way to tailor the creation of the clusters since I know generally the identity of what I want each of the 3 clusters to be but not the actual words which will identify the cluster? I'm thinking of something like PCA where I can select the words which would define a cluster. Any ideas would be appreciated.
Here is my XML:<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.003" expanded="true" name="Process">
    <process expanded="true" height="386" width="415">
      <operator activated="true" class="read_excel" compatibility="5.2.003" expanded="true" height="60" name="Read Excel" width="90" x="45" y="30">
        <parameter key="excel_file" value="E:\R-2.14.0\bin\i386\ELN.xls"/>
        <parameter key="imported_cell_range" value="A1:C301"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="ELN.true.nominal.id"/>
          <parameter key="1" value="Title.true.text.attribute"/>
          <parameter key="2" value="Text.true.text.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
      </operator>
      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.001" expanded="true" height="76" name="Process Documents from Data" width="90" x="112" y="120">
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_below_absolute" value="1"/>
        <parameter key="prune_above_absolute" value="3"/>
        <list key="specify_weights"/>
        <process expanded="true" height="426" width="501">
          <operator activated="true" class="text:transform_cases" compatibility="5.2.001" expanded="true" height="60" name="Transform Cases" width="90" x="45" y="30"/>
          <operator activated="true" class="text:stem_snowball" compatibility="5.2.001" expanded="true" height="60" name="Stem (Snowball)" width="90" x="179" y="30"/>
          <operator activated="true" class="text:tokenize" compatibility="5.2.001" expanded="true" height="60" name="Tokenize" width="90" x="313" y="30"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.2.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="45" y="165"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.2.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="179" y="210">
            <parameter key="min_chars" value="2"/>
          </operator>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="5.2.001" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="313" y="255"/>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
          <connect from_op="Stem (Snowball)" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="k_means" compatibility="5.2.003" expanded="true" height="76" name="Clustering" width="90" x="45" y="300">
        <parameter key="add_as_label" value="true"/>
        <parameter key="k" value="3"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="5.2.003" expanded="true" height="76" name="Select Attributes" width="90" x="179" y="300"/>
      <operator activated="true" class="write_csv" compatibility="5.2.003" expanded="true" height="76" name="Write CSV" width="90" x="313" y="300"/>
      <!-- Top-level wiring of the process. -->
      <!-- Excel rows feed the document-processing operator. -->
      <connect from_op="Read Excel" from_port="output" to_op="Process Documents from Data" to_port="example set"/>
      <!-- The example set produced by document processing is the clustering input. -->
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Clustering" to_port="example set"/>
      <!-- The cluster model is delivered as process result 2. -->
      <connect from_op="Clustering" from_port="cluster model" to_port="result 2"/>
      <!-- The clustered example set goes through Select Attributes and on to Write CSV. -->
      <connect from_op="Clustering" from_port="clustered set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Write CSV" to_port="input"/>
      <!-- The "file" output port of Write CSV is delivered as process result 1. -->
      <connect from_op="Write CSV" from_port="file" to_port="result 1"/>
      <!-- Port spacing entries; all are 0. -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Find more posts tagged with

Comments

No comments on this post.