Classification of products

Hi there,
I have one problem to solve. I have a dataset where each row corresponds to a single product. There are a total of 93 numerical features, which represent counts of different events. There are nine categories for all products. My objective is to classify each product into one of the 9 categories. There are 61878 examples. I tried libSVM and k-NN in RapidMiner, but I got very high accuracy, about 99.9%, so something isn't done correctly. Does anybody know how to solve this?
Answers
-
Hello @994kaca, welcome to the community! Some quick requests so we can help you:
• Post your XML process here in this thread (see this post for instructions on How to Post on the Community)
• Attach your dataset if possible (use a fictionalized version if there are privacy concerns)
• Make sure you have all necessary extensions installed (see https://youtu.be/pjBqG3xtXx4)
Scott
[Edit - I moved your post from Radoop to Getting Started as this is the more appropriate place for your query. SG]
0 -
Hi,
I don't have an XML process. Can I attach my dataset?
0 -
The dataset is too large for this message. Here is a link where you can find the training set for this problem: https://www.kaggle.com/c/otto-group-product-classification-challenge/data
0 -
<?xml version="1.0" encoding="UTF-8"?><process version="8.2.001">
<context>
<input/>
<output/>
<macros/>
</context>
<!-- Top-level process: load the training data, draw a stratified sample, and cross-validate a k-NN model on that sample. -->
<operator activated="true" class="process" compatibility="8.2.001" expanded="true" name="Process">
  <process expanded="true">
    <!-- Loads the example set stored at the repository entry "../Data/train". -->
    <!-- NOTE(review): attribute roles come from the stored entry and are not visible here. Confirm that `id` carries the id role and `target` the label role; otherwise `id` would presumably be used as a regular feature by k-NN. -->
    <operator activated="true" class="retrieve" compatibility="8.2.001" expanded="true" height="68" name="Retrieve" width="90" x="45" y="34">
      <parameter key="repository_entry" value="../Data/train"/>
    </operator>
    <!-- Stratified sampling in "relative" mode with a local random seed enabled. -->
    <!-- NOTE(review): `sample_size` is set although `sample` is "relative"; presumably the relative mode reads `sample_ratio` (not set here) instead. Confirm against the operator documentation. -->
    <operator activated="true" class="sample_stratified" compatibility="8.2.001" expanded="true" height="82" name="Sample (Stratified)" width="90" x="179" y="34">
      <parameter key="sample" value="relative"/>
      <parameter key="sample_size" value="6187"/>
      <parameter key="use_local_random_seed" value="true"/>
    </operator>
    <!-- Cross validation: no parameters are overridden, so the operator's defaults apply. -->
    <operator activated="true" class="concurrency:cross_validation" compatibility="8.2.001" expanded="true" height="145" name="Cross Validation" width="90" x="313" y="34">
      <!-- Training subprocess: fit a k-NN model (operator defaults) on the training set. -->
      <process expanded="true">
        <operator activated="true" class="k_nn" compatibility="8.2.001" expanded="true" height="82" name="k-NN" width="90" x="112" y="34"/>
        <connect from_port="training set" to_op="k-NN" to_port="training set"/>
        <connect from_op="k-NN" from_port="model" to_port="model"/>
        <portSpacing port="source_training set" spacing="0"/>
        <portSpacing port="sink_model" spacing="0"/>
        <portSpacing port="sink_through 1" spacing="0"/>
      </process>
      <!-- Testing subprocess: apply the model to the test set and compute classification performance. -->
      <process expanded="true">
        <operator activated="true" class="apply_model" compatibility="8.2.001" expanded="true" height="82" name="Apply Model" width="90" x="45" y="34">
          <list key="application_parameters"/>
        </operator>
        <!-- Enables classification error plus weighted mean recall and precision as criteria. -->
        <operator activated="true" class="performance_classification" compatibility="8.2.001" expanded="true" height="82" name="Performance" width="90" x="179" y="34">
          <parameter key="classification_error" value="true"/>
          <parameter key="weighted_mean_recall" value="true"/>
          <parameter key="weighted_mean_precision" value="true"/>
          <list key="class_weights"/>
        </operator>
        <connect from_port="model" to_op="Apply Model" to_port="model"/>
        <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
        <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
        <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
        <connect from_op="Performance" from_port="example set" to_port="test set results"/>
        <portSpacing port="source_model" spacing="0"/>
        <portSpacing port="source_test set" spacing="0"/>
        <portSpacing port="source_through 1" spacing="0"/>
        <portSpacing port="sink_test set results" spacing="0"/>
        <portSpacing port="sink_performance 1" spacing="0"/>
        <portSpacing port="sink_performance 2" spacing="0"/>
      </process>
    </operator>
    <!-- Wiring: Retrieve -> Sample (Stratified) -> Cross Validation; the model, test results, example set and performance go to result ports 1 to 4. -->
    <connect from_op="Retrieve" from_port="output" to_op="Sample (Stratified)" to_port="example set input"/>
    <connect from_op="Sample (Stratified)" from_port="example set output" to_op="Cross Validation" to_port="example set"/>
    <connect from_op="Cross Validation" from_port="model" to_port="result 1"/>
    <connect from_op="Cross Validation" from_port="example set" to_port="result 3"/>
    <connect from_op="Cross Validation" from_port="test result set" to_port="result 2"/>
    <connect from_op="Cross Validation" from_port="performance 1" to_port="result 4"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    <portSpacing port="sink_result 5" spacing="0"/>
  </process>
</operator>
</process>
Here you are. I didn't understand your message at first, sorry. The problem is that the accuracy for this k-NN is 99.9%, which is too high, so something is wrong. Do you know where the problem is?
Thank you
0 -
ah thank you for the XML.
Beats the heck out of me how you get 99% accuracy. I get 71.49%.
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<!-- Top-level process: read the training CSV, draw a stratified sample, and cross-validate a k-NN model on that sample. -->
<operator activated="true" class="process" compatibility="9.0.001" expanded="true" name="Process">
  <process expanded="true">
    <!-- Reads a comma-separated, UTF-8 encoded file. The path below is an absolute path on the author's machine and must be adjusted to run elsewhere. -->
    <operator activated="true" class="read_csv" compatibility="9.0.001" expanded="true" height="68" name="Read CSV" width="90" x="45" y="34">
      <parameter key="csv_file" value="/Users/genzerconsulting/Desktop/all/train.csv"/>
      <parameter key="column_separators" value=","/>
      <parameter key="skip_comments" value="true"/>
      <parameter key="date_format" value="MMM d, yyyy h:mm:ss a z"/>
      <list key="annotations"/>
      <parameter key="encoding" value="UTF-8"/>
      <!-- Column meta data: column 0 is `id` with the id role, columns 1 to 93 are integer regular attributes, column 94 is the polynominal label `target`. -->
      <!-- NOTE(review): the value format appears to be name.selected.type.role; confirm against the Read CSV documentation. -->
      <list key="data_set_meta_data_information">
        <parameter key="0" value="id.true.integer.id"/>
        <parameter key="1" value="feat_1.true.integer.attribute"/>
        <parameter key="2" value="feat_2.true.integer.attribute"/>
        <parameter key="3" value="feat_3.true.integer.attribute"/>
        <parameter key="4" value="feat_4.true.integer.attribute"/>
        <parameter key="5" value="feat_5.true.integer.attribute"/>
        <parameter key="6" value="feat_6.true.integer.attribute"/>
        <parameter key="7" value="feat_7.true.integer.attribute"/>
        <parameter key="8" value="feat_8.true.integer.attribute"/>
        <parameter key="9" value="feat_9.true.integer.attribute"/>
        <parameter key="10" value="feat_10.true.integer.attribute"/>
        <parameter key="11" value="feat_11.true.integer.attribute"/>
        <parameter key="12" value="feat_12.true.integer.attribute"/>
        <parameter key="13" value="feat_13.true.integer.attribute"/>
        <parameter key="14" value="feat_14.true.integer.attribute"/>
        <parameter key="15" value="feat_15.true.integer.attribute"/>
        <parameter key="16" value="feat_16.true.integer.attribute"/>
        <parameter key="17" value="feat_17.true.integer.attribute"/>
        <parameter key="18" value="feat_18.true.integer.attribute"/>
        <parameter key="19" value="feat_19.true.integer.attribute"/>
        <parameter key="20" value="feat_20.true.integer.attribute"/>
        <parameter key="21" value="feat_21.true.integer.attribute"/>
        <parameter key="22" value="feat_22.true.integer.attribute"/>
        <parameter key="23" value="feat_23.true.integer.attribute"/>
        <parameter key="24" value="feat_24.true.integer.attribute"/>
        <parameter key="25" value="feat_25.true.integer.attribute"/>
        <parameter key="26" value="feat_26.true.integer.attribute"/>
        <parameter key="27" value="feat_27.true.integer.attribute"/>
        <parameter key="28" value="feat_28.true.integer.attribute"/>
        <parameter key="29" value="feat_29.true.integer.attribute"/>
        <parameter key="30" value="feat_30.true.integer.attribute"/>
        <parameter key="31" value="feat_31.true.integer.attribute"/>
        <parameter key="32" value="feat_32.true.integer.attribute"/>
        <parameter key="33" value="feat_33.true.integer.attribute"/>
        <parameter key="34" value="feat_34.true.integer.attribute"/>
        <parameter key="35" value="feat_35.true.integer.attribute"/>
        <parameter key="36" value="feat_36.true.integer.attribute"/>
        <parameter key="37" value="feat_37.true.integer.attribute"/>
        <parameter key="38" value="feat_38.true.integer.attribute"/>
        <parameter key="39" value="feat_39.true.integer.attribute"/>
        <parameter key="40" value="feat_40.true.integer.attribute"/>
        <parameter key="41" value="feat_41.true.integer.attribute"/>
        <parameter key="42" value="feat_42.true.integer.attribute"/>
        <parameter key="43" value="feat_43.true.integer.attribute"/>
        <parameter key="44" value="feat_44.true.integer.attribute"/>
        <parameter key="45" value="feat_45.true.integer.attribute"/>
        <parameter key="46" value="feat_46.true.integer.attribute"/>
        <parameter key="47" value="feat_47.true.integer.attribute"/>
        <parameter key="48" value="feat_48.true.integer.attribute"/>
        <parameter key="49" value="feat_49.true.integer.attribute"/>
        <parameter key="50" value="feat_50.true.integer.attribute"/>
        <parameter key="51" value="feat_51.true.integer.attribute"/>
        <parameter key="52" value="feat_52.true.integer.attribute"/>
        <parameter key="53" value="feat_53.true.integer.attribute"/>
        <parameter key="54" value="feat_54.true.integer.attribute"/>
        <parameter key="55" value="feat_55.true.integer.attribute"/>
        <parameter key="56" value="feat_56.true.integer.attribute"/>
        <parameter key="57" value="feat_57.true.integer.attribute"/>
        <parameter key="58" value="feat_58.true.integer.attribute"/>
        <parameter key="59" value="feat_59.true.integer.attribute"/>
        <parameter key="60" value="feat_60.true.integer.attribute"/>
        <parameter key="61" value="feat_61.true.integer.attribute"/>
        <parameter key="62" value="feat_62.true.integer.attribute"/>
        <parameter key="63" value="feat_63.true.integer.attribute"/>
        <parameter key="64" value="feat_64.true.integer.attribute"/>
        <parameter key="65" value="feat_65.true.integer.attribute"/>
        <parameter key="66" value="feat_66.true.integer.attribute"/>
        <parameter key="67" value="feat_67.true.integer.attribute"/>
        <parameter key="68" value="feat_68.true.integer.attribute"/>
        <parameter key="69" value="feat_69.true.integer.attribute"/>
        <parameter key="70" value="feat_70.true.integer.attribute"/>
        <parameter key="71" value="feat_71.true.integer.attribute"/>
        <parameter key="72" value="feat_72.true.integer.attribute"/>
        <parameter key="73" value="feat_73.true.integer.attribute"/>
        <parameter key="74" value="feat_74.true.integer.attribute"/>
        <parameter key="75" value="feat_75.true.integer.attribute"/>
        <parameter key="76" value="feat_76.true.integer.attribute"/>
        <parameter key="77" value="feat_77.true.integer.attribute"/>
        <parameter key="78" value="feat_78.true.integer.attribute"/>
        <parameter key="79" value="feat_79.true.integer.attribute"/>
        <parameter key="80" value="feat_80.true.integer.attribute"/>
        <parameter key="81" value="feat_81.true.integer.attribute"/>
        <parameter key="82" value="feat_82.true.integer.attribute"/>
        <parameter key="83" value="feat_83.true.integer.attribute"/>
        <parameter key="84" value="feat_84.true.integer.attribute"/>
        <parameter key="85" value="feat_85.true.integer.attribute"/>
        <parameter key="86" value="feat_86.true.integer.attribute"/>
        <parameter key="87" value="feat_87.true.integer.attribute"/>
        <parameter key="88" value="feat_88.true.integer.attribute"/>
        <parameter key="89" value="feat_89.true.integer.attribute"/>
        <parameter key="90" value="feat_90.true.integer.attribute"/>
        <parameter key="91" value="feat_91.true.integer.attribute"/>
        <parameter key="92" value="feat_92.true.integer.attribute"/>
        <parameter key="93" value="feat_93.true.integer.attribute"/>
        <parameter key="94" value="target.true.polynominal.label"/>
      </list>
      <parameter key="read_not_matching_values_as_missings" value="false"/>
    </operator>
    <!-- Stratified sampling in "relative" mode with a local random seed enabled. -->
    <!-- NOTE(review): `sample_size` is set although `sample` is "relative"; presumably the relative mode reads `sample_ratio` (not set here) instead. Confirm against the operator documentation. -->
    <operator activated="true" class="sample_stratified" compatibility="9.0.001" expanded="true" height="82" name="Sample (Stratified)" width="90" x="179" y="34">
      <parameter key="sample" value="relative"/>
      <parameter key="sample_size" value="6187"/>
      <parameter key="use_local_random_seed" value="true"/>
    </operator>
    <!-- Cross validation: no parameters are overridden, so the operator's defaults apply. -->
    <operator activated="true" class="concurrency:cross_validation" compatibility="9.0.001" expanded="true" height="145" name="Cross Validation" width="90" x="313" y="34">
      <!-- Training subprocess: fit a k-NN model (operator defaults) on the training set. -->
      <process expanded="true">
        <operator activated="true" class="k_nn" compatibility="9.0.001" expanded="true" height="82" name="k-NN" width="90" x="112" y="34"/>
        <connect from_port="training set" to_op="k-NN" to_port="training set"/>
        <connect from_op="k-NN" from_port="model" to_port="model"/>
        <portSpacing port="source_training set" spacing="0"/>
        <portSpacing port="sink_model" spacing="0"/>
        <portSpacing port="sink_through 1" spacing="0"/>
      </process>
      <!-- Testing subprocess: apply the model to the test set and compute classification performance. -->
      <process expanded="true">
        <operator activated="true" class="apply_model" compatibility="9.0.001" expanded="true" height="82" name="Apply Model" width="90" x="45" y="34">
          <list key="application_parameters"/>
        </operator>
        <!-- Enables classification error plus weighted mean recall and precision as criteria. -->
        <operator activated="true" class="performance_classification" compatibility="9.0.001" expanded="true" height="82" name="Performance" width="90" x="179" y="34">
          <parameter key="classification_error" value="true"/>
          <parameter key="weighted_mean_recall" value="true"/>
          <parameter key="weighted_mean_precision" value="true"/>
          <list key="class_weights"/>
        </operator>
        <connect from_port="model" to_op="Apply Model" to_port="model"/>
        <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
        <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
        <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
        <connect from_op="Performance" from_port="example set" to_port="test set results"/>
        <portSpacing port="source_model" spacing="0"/>
        <portSpacing port="source_test set" spacing="0"/>
        <portSpacing port="source_through 1" spacing="0"/>
        <portSpacing port="sink_test set results" spacing="0"/>
        <portSpacing port="sink_performance 1" spacing="0"/>
        <portSpacing port="sink_performance 2" spacing="0"/>
      </process>
    </operator>
    <!-- Wiring: Read CSV -> Sample (Stratified) -> Cross Validation; the model, test results, example set and performance go to result ports 1 to 4. -->
    <connect from_op="Read CSV" from_port="output" to_op="Sample (Stratified)" to_port="example set input"/>
    <connect from_op="Sample (Stratified)" from_port="example set output" to_op="Cross Validation" to_port="example set"/>
    <connect from_op="Cross Validation" from_port="model" to_port="result 1"/>
    <connect from_op="Cross Validation" from_port="example set" to_port="result 3"/>
    <connect from_op="Cross Validation" from_port="test result set" to_port="result 2"/>
    <connect from_op="Cross Validation" from_port="performance 1" to_port="result 4"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
    <portSpacing port="sink_result 2" spacing="0"/>
    <portSpacing port="sink_result 3" spacing="0"/>
    <portSpacing port="sink_result 4" spacing="0"/>
    <portSpacing port="sink_result 5" spacing="0"/>
  </process>
</operator>
</process>
Scott
0