To classify customer queries using RapidMiner

I have a huge excel file, which has general attributes like BILLID, CLAIM ID, ITERATION ID, QUERY TEXT etc. I want to classify these queries to identify major bulk of problem areas. I tried employing Decision Tree, but it did not render any insights. Most of what I researched I stumbled upon using filter keywords, Tokenizing, Stemming to identify keyword frequency, But I couldn't use it for excel attribute, irrespective of that I am not sure that would help me in finding clusters/buckets of complete query statement rather than just keywords. Any help in finding right direction would be of much help, Thank You.
Answers
-
You need to set a label to classify your dataset and if you have some sort of text to process, you'd need the Text Processing extension too.
Attached is a sample process that I used to classify some HR data that also used open ended survey questions.
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" breakpoints="after" class="retrieve" compatibility="7.5.000" expanded="true" height="68" name="Retrieve EmployeeData" width="90" x="45" y="30">
<parameter key="repository_entry" value="../data/EmployeeData"/>
</operator>
<operator activated="true" class="replace_missing_values" compatibility="7.5.000" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="30">
<parameter key="value_type" value="nominal"/>
<list key="columns"/>
<parameter key="replenishment_value" value="MISSING"/>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="6.0.003" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="30">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="CanDoBetter"/>
</operator>
<operator activated="true" class="set_role" compatibility="7.5.000" expanded="true" height="82" name="Set Role" width="90" x="447" y="30">
<parameter key="attribute_name" value="ID"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles">
<parameter key="Status" value="label"/>
</list>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="34">
<parameter key="vector_creation" value="Binary Term Occurrences"/>
<parameter key="prune_method" value="percentual"/>
<parameter key="prune_above_percent" value="50.0"/>
<parameter key="prune_below_absolute" value="5"/>
<parameter key="prune_above_absolute" value="2000"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
<operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="7.4.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="313" y="34">
<parameter key="max_length" value="3"/>
</operator>
<operator activated="true" class="text:stem_porter" compatibility="7.4.001" expanded="true" height="68" name="Stem (Porter)" width="90" x="447" y="34"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="7.4.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="581" y="34"/>
<operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="715" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
<connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="120">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="ID|Status|Benefit|DriveTime|Education|Est_Income|Facility|Marital|PositionType|SpousalBenefit|StockPlan|Training|TrainingRecieved|length_of_hire|bottled_wat|benefit|event|pay_mor|listen|pension|salari|social"/>
</operator>
<operator activated="true" class="real_to_integer" compatibility="6.0.003" expanded="true" height="82" name="Real to Integer" width="90" x="581" y="210">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="length_of_hire|Est_Income"/>
</operator>
<operator activated="true" class="numerical_to_binominal" compatibility="6.0.003" expanded="true" height="82" name="Numerical to Binominal" width="90" x="581" y="300">
<parameter key="attribute_filter_type" value="value_type"/>
<parameter key="value_type" value="real"/>
</operator>
<operator activated="true" class="multiply" compatibility="7.5.000" expanded="true" height="103" name="Multiply" width="90" x="581" y="390"/>
<operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.5.000" expanded="true" height="82" name="Decision Tree (2)" width="90" x="782" y="85">
<parameter key="maximal_depth" value="5"/>
<parameter key="apply_prepruning" value="false"/>
</operator>
<operator activated="true" class="nominal_to_numerical" compatibility="7.1.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="581" y="525">
<parameter key="attributes" value="Benefit|SpousalBenefit|StockPlan"/>
<list key="comparison_groups"/>
</operator>
<operator activated="true" class="remap_binominals" compatibility="7.3.000" expanded="true" height="82" name="Remap Binominals" width="90" x="715" y="525">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Status"/>
<parameter key="include_special_attributes" value="true"/>
<parameter key="negative_value" value="past"/>
<parameter key="positive_value" value="current"/>
</operator>
<operator activated="true" class="weight_by_svm" compatibility="7.5.000" expanded="true" height="82" name="Weight by SVM" width="90" x="849" y="525">
<parameter key="normalize_weights" value="true"/>
</operator>
<operator activated="true" class="weights_to_data" compatibility="7.5.000" expanded="true" height="68" name="Weights to Data" width="90" x="782" y="210"/>
<operator activated="true" class="sort" compatibility="7.5.000" expanded="true" height="82" name="Sort" width="90" x="916" y="210">
<parameter key="attribute_name" value="Weight"/>
<parameter key="sorting_direction" value="decreasing"/>
</operator>
<operator activated="true" class="filter_example_range" compatibility="7.5.000" expanded="true" height="82" name="Filter Example Range" width="90" x="1050" y="210">
<parameter key="first_example" value="1"/>
<parameter key="last_example" value="10"/>
</operator>
<connect from_op="Retrieve EmployeeData" from_port="output" to_op="Replace Missing Values" to_port="example set input"/>
<connect from_op="Replace Missing Values" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Real to Integer" to_port="example set input"/>
<connect from_op="Real to Integer" from_port="example set output" to_op="Numerical to Binominal" to_port="example set input"/>
<connect from_op="Numerical to Binominal" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Nominal to Numerical" to_port="example set input"/>
<connect from_op="Multiply" from_port="output 2" to_op="Decision Tree (2)" to_port="training set"/>
<connect from_op="Decision Tree (2)" from_port="model" to_port="result 3"/>
<connect from_op="Nominal to Numerical" from_port="example set output" to_op="Remap Binominals" to_port="example set input"/>
<connect from_op="Remap Binominals" from_port="example set output" to_op="Weight by SVM" to_port="example set"/>
<connect from_op="Weight by SVM" from_port="weights" to_op="Weights to Data" to_port="attribute weights"/>
<connect from_op="Weights to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Filter Example Range" from_port="example set output" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>0 -
I tried to replicate your process, but
1. It doesn't show any attribute in Remap Binomials operator,
2. The decision tree generated individual branches for each example, no relevant insights, I think I am not using it properly.
3. Word frquency looks irregular, For example at times it is showing count for "approv" "provide" or "support" instead of supportings.
Thank you for such a prompt reply, This is my first process @ Rapidminer. My expectation is to classify textual queries(thousands) into some handful of classes, For example "Tax related", "Incorrect/Empty details", "Supportings not Attached". Queries are remarks put up by officer to intiator or QTI(Query to Initiator). I am looking a way to automate these query handling proccesses in order to improve efficiency and productivity. Hence I am starting with the first step of identifying major problem/Query areas - classification.
I am sorry, If I sound immature, I have taken 3 weeks of internet scouring before posting here. This field is quite new in my environment. Hence I couldn't get much help from around, It would help in great deal, if you can share resources about some example processes from where I can learn by comparison.
0 -
Do you have some sample data to share too? Would make troubleshooting this easier.
Thanks!
0 -
Hi Thank you for your time and attention, PFA the data file. I hope it helps the context of my problem. These are queries related to payable, i want to automate query handling processes, hence I am trying to classify these queries into handful of focus areas and think about solutions from there.
0 -
So your data set has some data quality issues. I just Filtered them out by selecting only ER- records. Then I made some tweaks to the process and got a results.
See below for the XML.
<?xml version="1.0" encoding="UTF-8"?><process version="7.5.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.5.001" expanded="true" name="Process">
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="7.5.001" expanded="true" height="68" name="Retrieve QRY_ITERATION_T_Mini" width="90" x="45" y="34">
<parameter key="repository_entry" value="../data/QRY_ITERATION_T_Mini"/>
</operator>
<operator activated="true" class="filter_examples" compatibility="7.5.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="136">
<list key="filters_list">
<parameter key="filters_entry_key" value="CLAIMID.contains.ER-"/>
</list>
</operator>
<operator activated="true" class="replace_missing_values" compatibility="7.5.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="179" y="136">
<parameter key="value_type" value="nominal"/>
<list key="columns"/>
<parameter key="replenishment_value" value="MISSING"/>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="6.0.003" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="136">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attribute" value="QUERYTEXT"/>
<parameter key="attributes" value="REPLYTEXT|QUERYTEXT"/>
</operator>
<operator activated="true" class="set_role" compatibility="7.5.001" expanded="true" height="82" name="Set Role" width="90" x="447" y="34">
<parameter key="attribute_name" value="BILLID"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles">
<parameter key="QUERYCATEGORY" value="label"/>
</list>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="34">
<parameter key="vector_creation" value="Binary Term Occurrences"/>
<parameter key="prune_method" value="percentual"/>
<parameter key="prune_above_percent" value="50.0"/>
<parameter key="prune_below_absolute" value="5"/>
<parameter key="prune_above_absolute" value="2000"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
<operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="7.4.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="313" y="34">
<parameter key="max_length" value="3"/>
</operator>
<operator activated="true" class="text:stem_porter" compatibility="7.4.001" expanded="true" height="68" name="Stem (Porter)" width="90" x="447" y="34"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="7.4.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="581" y="34"/>
<operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="715" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
<connect from_op="Stem (Porter)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.5.001" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="238">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="att7|REPLYBY|QUERYDATE|QUERYBY|ITERATIONID|CLAIMID|BILLID"/>
<parameter key="invert_selection" value="true"/>
</operator>
<operator activated="true" class="nominal_to_date" compatibility="7.5.001" expanded="true" height="82" name="Nominal to Date" width="90" x="447" y="340">
<parameter key="attribute_name" value="CREATEDDATE"/>
<parameter key="date_format" value="yyyy-MM-dd"/>
</operator>
<operator activated="true" class="real_to_integer" compatibility="6.0.003" expanded="true" height="82" name="Real to Integer" width="90" x="246" y="340">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="length_of_hire|Est_Income"/>
</operator>
<operator activated="true" class="numerical_to_binominal" compatibility="6.0.003" expanded="true" height="82" name="Numerical to Binominal" width="90" x="179" y="493">
<parameter key="attribute_filter_type" value="value_type"/>
<parameter key="value_type" value="real"/>
</operator>
<operator activated="true" class="multiply" compatibility="7.5.001" expanded="true" height="103" name="Multiply" width="90" x="380" y="544"/>
<operator activated="true" class="concurrency:parallel_decision_tree" compatibility="7.5.001" expanded="true" height="82" name="Decision Tree (2)" width="90" x="782" y="85">
<parameter key="maximal_depth" value="5"/>
<parameter key="apply_prepruning" value="false"/>
</operator>
<operator activated="true" class="nominal_to_numerical" compatibility="7.1.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="581" y="525">
<parameter key="attributes" value="Benefit|SpousalBenefit|StockPlan"/>
<list key="comparison_groups"/>
</operator>
<operator activated="true" class="remap_binominals" compatibility="7.3.000" expanded="true" height="82" name="Remap Binominals" width="90" x="715" y="525">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="Status"/>
<parameter key="include_special_attributes" value="true"/>
<parameter key="negative_value" value="past"/>
<parameter key="positive_value" value="current"/>
</operator>
<operator activated="true" class="weight_by_svm" compatibility="7.5.001" expanded="true" height="82" name="Weight by SVM" width="90" x="849" y="525">
<parameter key="normalize_weights" value="true"/>
</operator>
<operator activated="true" class="weights_to_data" compatibility="7.5.001" expanded="true" height="68" name="Weights to Data" width="90" x="782" y="210"/>
<operator activated="true" class="sort" compatibility="7.5.001" expanded="true" height="82" name="Sort" width="90" x="916" y="210">
<parameter key="attribute_name" value="Weight"/>
<parameter key="sorting_direction" value="decreasing"/>
</operator>
<operator activated="true" class="filter_example_range" compatibility="7.5.001" expanded="true" height="82" name="Filter Example Range" width="90" x="1050" y="210">
<parameter key="first_example" value="1"/>
<parameter key="last_example" value="10"/>
</operator>
<connect from_op="Retrieve QRY_ITERATION_T_Mini" from_port="output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
<connect from_op="Replace Missing Values" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Date" to_port="example set input"/>
<connect from_op="Nominal to Date" from_port="example set output" to_op="Real to Integer" to_port="example set input"/>
<connect from_op="Real to Integer" from_port="original" to_op="Numerical to Binominal" to_port="example set input"/>
<connect from_op="Numerical to Binominal" from_port="example set output" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="Nominal to Numerical" to_port="example set input"/>
<connect from_op="Multiply" from_port="output 2" to_op="Decision Tree (2)" to_port="training set"/>
<connect from_op="Decision Tree (2)" from_port="model" to_port="result 3"/>
<connect from_op="Nominal to Numerical" from_port="example set output" to_op="Remap Binominals" to_port="example set input"/>
<connect from_op="Remap Binominals" from_port="example set output" to_op="Weight by SVM" to_port="example set"/>
<connect from_op="Weight by SVM" from_port="weights" to_op="Weights to Data" to_port="attribute weights"/>
<connect from_op="Weights to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Filter Example Range" from_port="example set output" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>0