"Issues with processing data and clustering operators"

Hi,
I am working on a RapidMiner project based on the Kaggle Walmart Customer Trip Type prediction competition, but instead of prediction I want to use a clustering algorithm to find the maximum and minimum sales by day, and the departments with the maximum and minimum sales. I am using the same dataset that was used in the Kaggle competition.
I am new to data analytics and am trying to understand the operators needed to reach my result, but I am unable to proceed with the process. Please have a look at the process flow in the attachment and let me know where I am going wrong.
Dataset: https://www.kaggle.com/c/walmart-recruiting-trip-type-classification/data
Regards,
Naman
Answers
-
@naman_sharma where is your clustering operator? Could you please share the process XML code or the .rmp file?
1 -
@dang Please see the attached .rmp file. I tried using k-means for clustering, but it's taking too much time to complete the process. In 2 hours it completed just 14% of the process.
0 -
@naman_sharma the process you shared has no clustering operator attached. Please attach the dataset you used; I don't want to answer a survey from Walmart to unlock the dataset.
0 -
-
@naman_sharma the process runs in about 40 seconds on my machine, so it might be a problem with memory or the type of license you have.
I'm not familiar with this dataset, but I noticed that the "VisitNumber" attribute is on a huge scale (from 5 to 20,000 or so). That'll skew the results a bit, and you might want to think about normalizing it if that makes sense.
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process (version 8.1.001).

  Active chain, as wired by the <connect> elements at the bottom:
    Read CSV, then Select Attributes, then Replace Missing Values,
    then Set Role, then X-Means, whose two outputs go to result 1 and result 2.

  "Open File" and "Sample" are present but deactivated (activated="false")
  and are not referenced by any <connect> element.
-->
<process version="8.1.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
    <process expanded="true">

      <!-- Deactivated and unconnected; kept from the original poster's machine. -->
      <operator activated="false" class="open_file" compatibility="8.1.001" expanded="true" height="68" name="Open File" width="90" x="45" y="136">
        <parameter key="filename" value="/Users/naman/Desktop/MISt/Sem 2/GLIS 630 Mining/Project/Walmart_TripType_Dataset.csv"/>
      </operator>

      <!--
        Loads the comma-separated data set as UTF-8. Row 0 is annotated as the
        "Name" row, and seven columns are declared in the metadata list below.
        The csv_file value is an absolute path on one specific machine and must
        be changed to the local location of the file before running.
      -->
      <operator activated="true" class="read_csv" compatibility="8.1.000" expanded="true" height="68" name="Read CSV" width="90" x="45" y="34">
        <parameter key="csv_file" value="C:\Users\TomOtt\Downloads\Walmart_TripType_Dataset.csv"/>
        <parameter key="column_separators" value=","/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="encoding" value="UTF-8"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="TripType.true.integer.attribute"/>
          <parameter key="1" value="VisitNumber.true.integer.attribute"/>
          <parameter key="2" value="Weekday.true.polynominal.attribute"/>
          <parameter key="3" value="Upc.true.real.attribute"/>
          <parameter key="4" value="ScanCount.true.integer.attribute"/>
          <parameter key="5" value="DepartmentDescription.true.polynominal.attribute"/>
          <parameter key="6" value="FinelineNumber.true.integer.attribute"/>
        </list>
      </operator>

      <!-- Single-attribute filter on "Upc" with the selection inverted, i.e. every attribute except Upc is kept. -->
      <operator activated="true" class="select_attributes" compatibility="8.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Upc"/>
        <parameter key="invert_selection" value="true"/>
      </operator>

      <!-- No per-column overrides are listed, so the operator's default replacement setting applies to all attributes. -->
      <operator activated="true" class="replace_missing_values" compatibility="8.1.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="313" y="34">
        <list key="columns"/>
      </operator>

      <!-- Gives DepartmentDescription the "label" role. -->
      <operator activated="true" class="set_role" compatibility="8.1.001" expanded="true" height="82" name="Set Role" width="90" x="447" y="34">
        <parameter key="attribute_name" value="DepartmentDescription"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>

      <!-- Clustering step. Only the measure type is set (MixedMeasures); every other parameter is left at its default. -->
      <operator activated="true" class="x_means" compatibility="8.1.001" expanded="true" height="82" name="X-Means" width="90" x="581" y="34">
        <parameter key="measure_types" value="MixedMeasures"/>
      </operator>

      <!--
        Deactivated and unconnected.
        NOTE(review): sample mode is "relative" but only sample_size (2000) is
        set; presumably "absolute" was intended. Confirm before enabling.
      -->
      <operator activated="false" class="sample" compatibility="8.1.001" expanded="true" height="82" name="Sample" width="90" x="581" y="136">
        <parameter key="sample" value="relative"/>
        <parameter key="sample_size" value="2000"/>
        <list key="sample_size_per_class"/>
        <list key="sample_ratio_per_class"/>
        <list key="sample_probability_per_class"/>
      </operator>

      <!-- Wiring of the active chain; the cluster model and the clustered example set are both delivered as results. -->
      <connect from_op="Read CSV" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Replace Missing Values" to_port="example set input"/>
      <connect from_op="Replace Missing Values" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="X-Means" to_port="example set"/>
      <connect from_op="X-Means" from_port="cluster model" to_port="result 1"/>
      <connect from_op="X-Means" from_port="clustered set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>