Altair RISE

A program to recognize and reward our most engaged community members

Nominate Yourself Now!

To classify customer queries using RapidMiner

I have a huge excel file, which has general attributes like BILLID, CLAIM ID, ITERATION ID, QUERY TEXT etc. I want to classify these queries to identify major bulk of problem areas. I tried employing Decision Tree, but it did not render any insights. Most of what I researched I stumbled upon using filter keywords, Tokenizing, Stemming to identify keyword frequency, But I couldn't use it for excel attribute, irrespective of that I am not sure that would help me in finding clusters/buckets of complete query statement rather than just keywords. Any help in finding right direction would be of much help, Thank You.

Find more posts tagged with

AI Studio

Accepted answers

All comments

Thomas_Ott

You need to set a label to classify your dataset and if you have some sort of text to process, you'd need the Text Processing extension too.

Attached is a sample process that I used to classify some HR data that also used open ended survey questions.

<?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" breakpoints="after" class="retrieve" compatibility="7.5.000" expanded="true" height="68" name="Retrieve EmployeeData" width="90" x="45" y="30">
        <parameter key="repository_entry" value="../data/EmployeeData"/>
      </operator>
      <operator activated="true" class="replace_missing_values" compatibility="7.5.000" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="30">
        <parameter key="value_type" value="nominal"/>
        <list key="columns"/>
        <parameter key="replenishment_value" value="MISSING"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="6.0.003" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="CanDoBetter"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="7.5.000" expanded="true" height="82" name="Set Role" width="90" x="447" y="30">
        <parameter key="attribute_name" value="ID"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles">
          <parameter key="Status" value="label"/>
        </list>
      </operator>
      <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="34">
        <parameter key="vector_creation" value="Binary Term Occurrences"/>
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_above_percent" value="50.0"/>
        <parameter key="prune_below_absolute" value="5"/>
        <parameter key="prune_above_absolute" value="2000"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.4.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="313" y="34">
            <parameter key="max_length" value="3"/>
          </operator>
          <operator activated="true" class="text:stem_porter" compatibility="7.4.001" expanded="true" height="68" name="Stem (Porter)" width="90" x="447" y="34"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="7.4.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="581" y="34"/>
          <operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="715" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="120">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="ID|Status|Benefit|DriveTime|Education|Est_Income|Facility|Marital|PositionType|SpousalBenefit|StockPlan|Training|TrainingRecieved|length_of_hire|bottled_wat|benefit|event|pay_mor|listen|pension|salari|social"/>
      </operator>
      <operator activated="true" class="real_to_integer" compatibility="6.0.003" expanded="true" height="82" name="Real to Integer" width="90" x="581" y="210">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="length_of_hire|Est_Income"/>
      </operator>
      <operator activated="true" class="numerical_to_binominal" compatibility="6.0.003" expanded="true" height="82" name="Numerical to Binominal" width="90" x="581" y="300">
        <parameter key="attribute_filter_type" value="value_type"/>
        <parameter key="value_type" value="real"/>
      </operator>
      <operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply" width="90" x="581" y="390"/>
      <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.5.000" expanded="true" height="82" name="Decision Tree (2)" width="90" x="782" y="85">
        <parameter key="maximal_depth" value="5"/>
        <parameter key="apply_prepruning" value="false"/>
      </operator>
      <operator activated="true" class="nominal_to_numerical" compatibility="7.1.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="581" y="525">
        <parameter key="attributes" value="Benefit|SpousalBenefit|StockPlan"/>
        <list key="comparison_groups"/>
      </operator>
      <operator activated="true" class="remap_binominals" compatibility="7.3.000" expanded="true" height="82" name="Remap Binominals" width="90" x="715" y="525">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Status"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="negative_value" value="past"/>
        <parameter key="positive_value" value="current"/>
      </operator>
      <operator activated="true" class="weight_by_svm" compatibility="7.5.000" expanded="true" height="82" name="Weight by SVM" width="90" x="849" y="525">
        <parameter key="normalize_weights" value="true"/>
      </operator>
      <operator activated="true" class="weights_to_data" compatibility="7.5.000" expanded="true" height="68" name="Weights to Data" width="90" x="782" y="210"/>
      <operator activated="true" class="sort" compatibility="7.5.000" expanded="true" height="82" name="Sort" width="90" x="916" y="210">
        <parameter key="attribute_name" value="Weight"/>
        <parameter key="sorting_direction" value="decreasing"/>
      </operator>
      <operator activated="true" class="filter_example_range" compatibility="7.5.000" expanded="true" height="82" name="Filter Example Range" width="90" x="1050" y="210">
        <parameter key="first_example" value="1"/>
        <parameter key="last_example" value="10"/>
      </operator>
      <connect from_op="Retrieve EmployeeData" from_port="output" to_op="Replace Missing Values" to_port="example set input"/>
      <connect from_op="Replace Missing Values" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Real to Integer" to_port="example set input"/>
      <connect from_op="Real to Integer" from_port="example set output" to_op="Numerical to Binominal" to_port="example set input"/>
      <connect from_op="Numerical to Binominal" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Decision Tree (2)" to_port="training set"/>
      <connect from_op="Decision Tree (2)" from_port="model" to_port="result 3"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Remap Binominals" to_port="example set input"/>
      <connect from_op="Remap Binominals" from_port="example set output" to_op="Weight by SVM" to_port="example set"/>
      <connect from_op="Weight by SVM" from_port="weights" to_op="Weights to Data" to_port="attribute weights"/>
      <connect from_op="Weights to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
      <connect from_op="Filter Example Range" from_port="example set output" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

Akshay_Gupta

I tried to replicate your process, but

1. It doesn't show any attribute in Remap Binomials operator,

2. The decision tree generated individual branches for each example, no relevant insights, I think I am not using it properly.

3. Word frquency looks irregular, For example at times it is showing count for "approv" "provide" or "support" instead of supportings.

Thank you for such a prompt reply, This is my first process @ Rapidminer. My expectation is to classify textual queries(thousands) into some handful of classes, For example "Tax related", "Incorrect/Empty details", "Supportings not Attached". Queries are remarks put up by officer to intiator or QTI(Query to Initiator). I am looking a way to automate these query handling proccesses in order to improve efficiency and productivity. Hence I am starting with the first step of identifying major problem/Query areas - classification.

I am sorry, If I sound immature, I have taken 3 weeks of internet scouring before posting here. This field is quite new in my environment. Hence I couldn't get much help from around, It would help in great deal, if you can share resources about some example processes from where I can learn by comparison.

Query Iteration.rmp

Thomas_Ott

Do you have some sample data to share too? Would make troubleshooting this easier.

Thanks!

Akshay_Gupta

Hi Thank you for your time and attention, PFA the data file. I hope it helps the context of my problem. These are queries related to payable, i want to automate query handling processes, hence I am trying to classify these queries into handful of focus areas and think about solutions from there.

QRY_ITERATION_T_Mini.zip

Thomas_Ott

So your data set has some data quality issues. I just Filtered them out by selecting only ER- records. Then I made some tweaks to the process and got a results.

Customer Class.png

See below for the XML.

<?xml version="1.0" encoding="UTF-8"?><process version="7.5.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.5.001" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="7.5.001" expanded="true" height="68" name="Retrieve QRY_ITERATION_T_Mini" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/QRY_ITERATION_T_Mini"/>
      </operator>
      <operator activated="true" class="filter_examples" compatibility="7.5.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="136">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="CLAIMID.contains.ER-"/>
        </list>
      </operator>
      <operator activated="true" class="replace_missing_values" compatibility="7.5.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="136">
        <parameter key="value_type" value="nominal"/>
        <list key="columns"/>
        <parameter key="replenishment_value" value="MISSING"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="6.0.003" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="136">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value="QUERYTEXT"/>
        <parameter key="attributes" value="REPLYTEXT|QUERYTEXT"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="7.5.001" expanded="true" height="82" name="Set Role" width="90" x="447" y="34">
        <parameter key="attribute_name" value="BILLID"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles">
          <parameter key="QUERYCATEGORY" value="label"/>
        </list>
      </operator>
      <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="34">
        <parameter key="vector_creation" value="Binary Term Occurrences"/>
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_above_percent" value="50.0"/>
        <parameter key="prune_below_absolute" value="5"/>
        <parameter key="prune_above_absolute" value="2000"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.4.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="313" y="34">
            <parameter key="max_length" value="3"/>
          </operator>
          <operator activated="true" class="text:stem_porter" compatibility="7.4.001" expanded="true" height="68" name="Stem (Porter)" width="90" x="447" y="34"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="7.4.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="581" y="34"/>
          <operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="715" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="7.5.001" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="238">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="att7|REPLYBY|QUERYDATE|QUERYBY|ITERATIONID|CLAIMID|BILLID"/>
        <parameter key="invert_selection" value="true"/>
      </operator>
      <operator activated="true" class="nominal_to_date" compatibility="7.5.001" expanded="true" height="82" name="Nominal to Date" width="90" x="447" y="340">
        <parameter key="attribute_name" value="CREATEDDATE"/>
        <parameter key="date_format" value="yyyy-MM-dd"/>
      </operator>
      <operator activated="true" class="real_to_integer" compatibility="6.0.003" expanded="true" height="82" name="Real to Integer" width="90" x="246" y="340">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="length_of_hire|Est_Income"/>
      </operator>
      <operator activated="true" class="numerical_to_binominal" compatibility="6.0.003" expanded="true" height="82" name="Numerical to Binominal" width="90" x="179" y="493">
        <parameter key="attribute_filter_type" value="value_type"/>
        <parameter key="value_type" value="real"/>
      </operator>
      <operator activated="true" class="multiply" compatibility="7.5.001" expanded="true" height="103" name="Multiply" width="90" x="380" y="544"/>
      <operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.5.001" expanded="true" height="82" name="Decision Tree (2)" width="90" x="782" y="85">
        <parameter key="maximal_depth" value="5"/>
        <parameter key="apply_prepruning" value="false"/>
      </operator>
      <operator activated="true" class="nominal_to_numerical" compatibility="7.1.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="581" y="525">
        <parameter key="attributes" value="Benefit|SpousalBenefit|StockPlan"/>
        <list key="comparison_groups"/>
      </operator>
      <operator activated="true" class="remap_binominals" compatibility="7.3.000" expanded="true" height="82" name="Remap Binominals" width="90" x="715" y="525">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Status"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="negative_value" value="past"/>
        <parameter key="positive_value" value="current"/>
      </operator>
      <operator activated="true" class="weight_by_svm" compatibility="7.5.001" expanded="true" height="82" name="Weight by SVM" width="90" x="849" y="525">
        <parameter key="normalize_weights" value="true"/>
      </operator>
      <operator activated="true" class="weights_to_data" compatibility="7.5.001" expanded="true" height="68" name="Weights to Data" width="90" x="782" y="210"/>
      <operator activated="true" class="sort" compatibility="7.5.001" expanded="true" height="82" name="Sort" width="90" x="916" y="210">
        <parameter key="attribute_name" value="Weight"/>
        <parameter key="sorting_direction" value="decreasing"/>
      </operator>
      <operator activated="true" class="filter_example_range" compatibility="7.5.001" expanded="true" height="82" name="Filter Example Range" width="90" x="1050" y="210">
        <parameter key="first_example" value="1"/>
        <parameter key="last_example" value="10"/>
      </operator>
      <connect from_op="Retrieve QRY_ITERATION_T_Mini" from_port="output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
      <connect from_op="Replace Missing Values" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Date" to_port="example set input"/>
      <connect from_op="Nominal to Date" from_port="example set output" to_op="Real to Integer" to_port="example set input"/>
      <connect from_op="Real to Integer" from_port="original" to_op="Numerical to Binominal" to_port="example set input"/>
      <connect from_op="Numerical to Binominal" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Decision Tree (2)" to_port="training set"/>
      <connect from_op="Decision Tree (2)" from_port="model" to_port="result 3"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Remap Binominals" to_port="example set input"/>
      <connect from_op="Remap Binominals" from_port="example set output" to_op="Weight by SVM" to_port="example set"/>
      <connect from_op="Weight by SVM" from_port="weights" to_op="Weights to Data" to_port="attribute weights"/>
      <connect from_op="Weights to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
      <connect from_op="Filter Example Range" from_port="example set output" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

Customer Class.png