Hi everybody,
I've got a dataset with 580,000 instances and about 15 nominal features (except for the label, which is binominal).
When I run the process below, it fails with a "GC overhead limit exceeded" error (followed by a long error message saying that it's not possible to clone sampleset, etc.).
Is this usual in RapidMiner, or is there something wrong?
Note: I have 6 gigabytes of memory available for RapidMiner. I guess the problem is with Bagging, as the process stops running when it reaches the 6th decision tree, but I don't know how to fix it.
thanks
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.2.008">
<!-- Process context: no inputs, outputs, or macros are declared. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="5.2.008" expanded="true" name="Process">
<process expanded="true" height="611" width="1016">
<!-- Retrieve: reads the repository entry named in "repository_entry". -->
<operator name="Retrieve"
          class="retrieve"
          activated="true"
          compatibility="5.2.008"
          expanded="true"
          x="45" y="210"
          width="90" height="60">
  <parameter key="repository_entry" value="descritized_GI_FROM_MI50"/>
</operator>
<!-- Set Role (2): gives the attribute "event" the role "label"; no additional roles are set. -->
<operator name="Set Role (2)"
          class="set_role"
          activated="true"
          compatibility="5.2.008"
          expanded="true"
          x="246" y="210"
          width="90" height="76">
  <parameter key="name" value="event"/>
  <parameter key="target_role" value="label"/>
  <list key="set_additional_roles"/>
</operator>
<!-- Retrieve (2): deactivated (activated="false"); points at repository entry "MI_67". -->
<operator name="Retrieve (2)"
          class="retrieve"
          activated="false"
          compatibility="5.2.008"
          expanded="true"
          x="45" y="480"
          width="90" height="60">
  <parameter key="repository_entry" value="MI_67"/>
</operator>
<!-- Discretize: deactivated (activated="false"). Configured to bin the single
     attribute "age" into 5 bins with short range names. -->
<operator name="Discretize"
          class="discretize_by_bins"
          activated="false"
          compatibility="5.2.008"
          expanded="true"
          x="246" y="30"
          width="90" height="94">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="age"/>
  <parameter key="number_of_bins" value="5"/>
  <parameter key="range_name_type" value="short"/>
</operator>
<!-- Sample (2): stratified sampling in "relative" mode. No ratio is given here,
     so whatever the operator's default ratio is applies (confirm in the GUI). -->
<operator name="Sample (2)"
          class="sample_stratified"
          activated="true"
          compatibility="5.2.008"
          expanded="true"
          x="380" y="210"
          width="90" height="76">
  <parameter key="sample" value="relative"/>
</operator>
<!-- Numerical to Polynominal: runs with all parameters left at their defaults. -->
<operator name="Numerical to Polynominal"
          class="numerical_to_polynominal"
          activated="true"
          compatibility="5.2.008"
          expanded="true"
          x="514" y="210"
          width="90" height="76"/>
<!-- Select Attributes (3): deactivated (activated="false"). Keeps the subset of
     attributes listed below. The list is pipe-separated; a doubled separator
     between "prev_readmissions" and "sex" left an empty entry in the list and
     has been reduced to a single pipe. -->
<operator name="Select Attributes (3)"
          class="select_attributes"
          activated="false"
          compatibility="5.2.008"
          expanded="true"
          x="514" y="480"
          width="90" height="76">
  <parameter key="attribute_filter_type" value="subset"/>
  <parameter key="attributes" value="event|age|d_04173|day_30_readmits|i_9985|los|num_diags|num_drugs|num_procs|p_14|prev_readmissions|sex"/>
</operator>
<operator activated="true" class="x_validation" compatibility="5.2.008" expanded="true" height="112" name="Validation (2)" width="90" x="782" y="210">
<process expanded="true" height="836" width="2399">
<!-- Naive Bayes (Kernel): deactivated (activated="false"). Configured without
     Laplace correction, "full" estimation mode, and a fixed bandwidth of 0.6. -->
<operator name="Naive Bayes (Kernel)"
          class="naive_bayes_kernel"
          activated="false"
          compatibility="5.2.008"
          expanded="true"
          x="514" y="30"
          width="90" height="76">
  <parameter key="laplace_correction" value="false"/>
  <parameter key="estimation_mode" value="full"/>
  <parameter key="bandwidth_selection" value="fix"/>
  <parameter key="bandwidth" value="0.6"/>
</operator>
<!-- Bagging: 40 iterations, each trained on a 0.4 sample of the training set,
     confidences not averaged. The inner learner is a single decision tree. -->
<operator name="Bagging"
          class="bagging"
          activated="true"
          compatibility="5.2.008"
          expanded="true"
          x="581" y="210"
          width="90" height="76">
  <parameter key="sample_ratio" value="0.4"/>
  <parameter key="iterations" value="40"/>
  <parameter key="average_confidences" value="false"/>
  <process expanded="true" height="677" width="1037">
    <!-- Decision Tree (2): Gini index criterion, maximal depth 10.
         NOTE(review): the original had no_pre_pruning="true" while also setting
         minimal_size_for_split, minimal_leaf_size and
         number_of_prepruning_alternatives. With pre-pruning switched off those
         limits are presumably ignored (confirm against the Decision Tree
         operator documentation for this version), so each of the 40 bagged
         trees can grow unconstrained up to the depth limit. That is a likely
         contributor to the reported out-of-memory failure. Pre-pruning is
         re-enabled here so the configured size limits take effect. -->
    <operator name="Decision Tree (2)"
              class="decision_tree"
              activated="true"
              compatibility="5.2.008"
              expanded="true"
              x="459" y="201"
              width="90" height="76">
      <parameter key="criterion" value="gini_index"/>
      <parameter key="minimal_size_for_split" value="320"/>
      <parameter key="minimal_leaf_size" value="160"/>
      <parameter key="maximal_depth" value="10"/>
      <parameter key="confidence" value="0.1"/>
      <parameter key="number_of_prepruning_alternatives" value="10"/>
      <parameter key="no_pre_pruning" value="false"/>
    </operator>
    <!-- Inner wiring: training set into the tree, tree model out of Bagging's subprocess. -->
    <connect from_port="training set" to_op="Decision Tree (2)" to_port="training set"/>
    <connect from_op="Decision Tree (2)" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
  </process>
</operator>
<!-- Decision Tree: deactivated (activated="false"); no parameters overridden. -->
<operator name="Decision Tree"
          class="decision_tree"
          activated="false"
          compatibility="5.2.008"
          expanded="true"
          x="380" y="390"
          width="90" height="76"/>
<!-- Wiring of this subprocess: the "training" source port feeds Bagging, and
     Bagging's model is delivered to the "model" sink port. -->
<connect from_port="training"
         to_op="Bagging" to_port="training set"/>
<connect from_op="Bagging" from_port="model"
         to_port="model"/>
<!-- Port layout hints. -->
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<process expanded="true" height="682" width="502">
<!-- Apply Model: no application parameters are set. -->
<operator name="Apply Model"
          class="apply_model"
          activated="true"
          compatibility="5.2.008"
          expanded="true"
          x="112" y="30"
          width="90" height="76">
  <list key="application_parameters"/>
</operator>
<!-- Performance (binominal classification): accuracy is switched off;
     AUC and f-measure are switched on. -->
<operator name="Performance"
          class="performance_binominal_classification"
          activated="true"
          compatibility="5.2.008"
          expanded="true"
          x="313" y="30"
          width="90" height="76">
  <parameter key="accuracy" value="false"/>
  <parameter key="AUC" value="true"/>
  <parameter key="f_measure" value="true"/>
</operator>
<!-- Wiring of this subprocess: the model and the test set go into Apply Model,
     its labelled output goes into Performance, and the resulting performance
     is delivered to the "averagable 1" sink port. -->
<connect from_port="model"
         to_op="Apply Model" to_port="model"/>
<connect from_port="test set"
         to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data"
         to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance"
         to_port="averagable 1"/>
<!-- Port layout hints. -->
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</operator>
<!-- Main chain: Retrieve, then Set Role (2), then Sample (2), then
     Numerical to Polynominal, then Validation (2). -->
<connect from_op="Retrieve" from_port="output"
         to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output"
         to_op="Sample (2)" to_port="example set input"/>
<connect from_op="Sample (2)" from_port="example set output"
         to_op="Numerical to Polynominal" to_port="example set input"/>
<connect from_op="Numerical to Polynominal" from_port="example set output"
         to_op="Validation (2)" to_port="training"/>
<!-- Results: the validation model goes to result 2, the performance to result 1. -->
<connect from_op="Validation (2)" from_port="model"
         to_port="result 2"/>
<connect from_op="Validation (2)" from_port="averagable 1"
         to_port="result 1"/>
<!-- Port layout hints. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>