Dear all,
I am working on a text mining use case with a data set of around 80,000 examples and 33 attributes (after applying TF-IDF, SVD, and different filter and wrapper methods), trying to predict around 100 classes. We tested different algorithms such as Naive Bayes, k-NN, Fast Large Margin and Gradient Boosted Trees and are quite satisfied with the results.
A big challenge while testing and evaluating the different algorithms was the computation time and the memory usage. Even on a quite powerful machine (128 GB memory and 8 cores, RapidMiner 7.6 professional license), the computation time for building and evaluating the models grew to as much as one week (for example, when testing Gradient Boosted Trees with an Evolutionary Parameter Optimization), memory usage rose to 100%, and the machine crashed even while testing Naive Bayes. Only after adding the Free Memory and Materialize Data operators to the inner and outer validation did the processes run stably, but they still take very long. Of course, with the high number of examples, attributes and classes, running the processes isn't simple or fast.
However, as far as I know, RapidMiner has implemented a new core to optimize computation time and memory usage, but it does not seem to help in our use case. Or are such long computation times and memory usage of up to 128 GB normal? Based on that, my questions are: Are there any mistakes in our process? If not, how could we optimize the process? How should the Free Memory and Materialize Data operators be used to optimize computation time and memory usage?
For better understanding, attached you will find the validation process for the Gradient Boosted Trees algorithm, including the filtering of the attributes by chi-squared and the parameter optimization.
Thanks in advance for your help.
Michel
<?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
<!-- Process context: no inputs, outputs or macros are declared for this process. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="split_validation" compatibility="7.6.001" expanded="true" height="124" name="Äußere Validierung Gradient Boosted Tree" width="90" x="581" y="136">
<process expanded="true">
<!-- Feature selection on the training split: chi-squared weights are computed for the
     attributes and then used to filter them. -->
<operator activated="true" class="weight_by_chi_squared_statistic" compatibility="7.6.001"
          expanded="true" height="82" name="Weight by Chi Squared Statistic (5)" width="90" x="45" y="34"/>
<!-- Weight threshold of 20000.0. The comparison relation is not set here, so the
     operator default applies. -->
<operator activated="true" class="select_by_weights" compatibility="7.6.001"
          expanded="true" height="103" name="Select by Weights (9)" width="90" x="45" y="238">
  <parameter key="weight" value="20000.0"/>
</operator>
<!-- Hands the filtered example set to two consumers: the parameter search and the
     final learner. -->
<operator activated="true" class="multiply" compatibility="7.6.001"
          expanded="true" height="103" name="Multiply (9)" width="90" x="179" y="34"/>
<!-- Evolutionary hyper-parameter search for the inner "Gradient Boosted Trees" learner.
     The search ranges are deliberately bounded. The previous ranges (up to 1000 trees,
     depth up to 300, learning rate up to 1.0) let the search propose extremely large
     ensembles. Per the H2O GBM documentation a multinomial model grows one tree per
     class in every boosting round, so with roughly 100 classes the model size and the
     training time multiply accordingly. Shallow trees combined with a small learning
     rate are the usual operating range for gradient boosting. -->
<operator activated="true" class="optimize_parameters_evolutionary" compatibility="7.6.001"
          expanded="true" height="103" name="Optimize Parameters (Evolutionary)" width="90" x="313" y="187">
  <list key="parameters">
    <parameter key="Gradient Boosted Trees.number_of_trees" value="[20;200]"/>
    <parameter key="Gradient Boosted Trees.maximal_depth" value="[2;20]"/>
    <parameter key="Gradient Boosted Trees.learning_rate" value="[0.01;0.3]"/>
  </list>
  <parameter key="selection_type" value="roulette wheel"/>
  <parameter key="use_local_random_seed" value="true"/>
  <process expanded="true">
    <!-- Every candidate parameter set is scored with one split validation that uses a
         local random seed. -->
    <operator activated="true" class="split_validation" compatibility="7.6.001"
              expanded="true" height="124" name="Validierung Optimize Gradient Modelvalidierung" width="90" x="447" y="34">
      <parameter key="use_local_random_seed" value="true"/>
      <process expanded="true">
        <!-- The optimizer sets number_of_trees, maximal_depth and learning_rate on this
             operator for every candidate, so the values stored here are only placeholders. -->
        <operator activated="true" class="h2o:gradient_boosted_trees" compatibility="7.6.001"
                  expanded="true" height="103" name="Gradient Boosted Trees" width="90" x="380" y="34">
          <parameter key="number_of_trees" value="636"/>
          <parameter key="maximal_depth" value="160"/>
          <parameter key="learning_rate" value="0.3528977870143133"/>
          <list key="expert_parameters"/>
        </operator>
        <!-- Deactivated helpers; they are wired only to each other and take no part in
             the data flow. -->
        <operator activated="false" class="free_memory" compatibility="7.6.001"
                  expanded="true" height="82" name="Free Memory (2)" width="90" x="45" y="85"/>
        <operator activated="false" class="materialize_data" compatibility="7.6.001"
                  expanded="true" height="82" name="Materialize Data (2)" width="90" x="179" y="85"/>
        <connect from_port="training" to_op="Gradient Boosted Trees" to_port="training set"/>
        <connect from_op="Gradient Boosted Trees" from_port="model" to_port="model"/>
        <connect from_op="Free Memory (2)" from_port="through 1" to_op="Materialize Data (2)" to_port="example set input"/>
        <portSpacing port="source_training" spacing="0"/>
        <portSpacing port="sink_model" spacing="0"/>
        <portSpacing port="sink_through 1" spacing="0"/>
      </process>
      <process expanded="true">
        <!-- Deactivated helpers; they are wired only to each other and take no part in
             the data flow. -->
        <operator activated="false" class="free_memory" compatibility="7.6.001"
                  expanded="true" height="82" name="Free Memory (19)" width="90" x="45" y="85"/>
        <operator activated="false" class="materialize_data" compatibility="7.6.001"
                  expanded="true" height="82" name="Materialize Data (19)" width="90" x="179" y="85"/>
        <operator activated="true" class="apply_model" compatibility="7.6.001"
                  expanded="true" height="82" name="Apply Model Gradient Boosted Trees" width="90" x="313" y="34">
          <list key="application_parameters"/>
        </operator>
        <!-- The performance vector produced here is what the optimizer receives on its
             "performance" port. -->
        <operator activated="true" class="performance_classification" compatibility="7.6.001"
                  expanded="true" height="82" name="Performance Gradient Boosted Trees" width="90" x="514" y="34">
          <parameter key="classification_error" value="true"/>
          <parameter key="soft_margin_loss" value="true"/>
          <list key="class_weights"/>
        </operator>
        <connect from_port="model" to_op="Apply Model Gradient Boosted Trees" to_port="model"/>
        <connect from_port="test set" to_op="Apply Model Gradient Boosted Trees" to_port="unlabelled data"/>
        <connect from_op="Free Memory (19)" from_port="through 1" to_op="Materialize Data (19)" to_port="example set input"/>
        <connect from_op="Apply Model Gradient Boosted Trees" from_port="labelled data" to_op="Performance Gradient Boosted Trees" to_port="labelled data"/>
        <connect from_op="Performance Gradient Boosted Trees" from_port="performance" to_port="averagable 1"/>
        <portSpacing port="source_model" spacing="0"/>
        <portSpacing port="source_test set" spacing="0"/>
        <portSpacing port="source_through 1" spacing="0"/>
        <portSpacing port="sink_averagable 1" spacing="0"/>
        <portSpacing port="sink_averagable 2" spacing="0"/>
      </process>
    </operator>
    <connect from_port="input 1" to_op="Validierung Optimize Gradient Modelvalidierung" to_port="training"/>
    <connect from_op="Validierung Optimize Gradient Modelvalidierung" from_port="averagable 1" to_port="performance"/>
    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_performance" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
  </process>
</operator>
<!-- Transfers the parameter set delivered by the optimizer to the final learner:
     name_map maps the optimized operator "Gradient Boosted Trees" onto
     "Learner Gradient Boosted Trees". -->
<operator activated="true" class="set_parameters" compatibility="7.6.001"
          expanded="true" height="82" name="Set Parameters (4)" width="90" x="514" y="187">
  <list key="name_map">
    <parameter key="Gradient Boosted Trees" value="Learner Gradient Boosted Trees"/>
  </list>
</operator>
<!-- Final model trained on the filtered training split. The parameter values stored
     here are presumably replaced at runtime by "Set Parameters (4)"; verify after a run. -->
<operator activated="true" class="h2o:gradient_boosted_trees" compatibility="7.6.001"
          expanded="true" height="103" name="Learner Gradient Boosted Trees" width="90" x="581" y="34">
  <parameter key="number_of_trees" value="1"/>
  <parameter key="maximal_depth" value="50"/>
  <parameter key="learning_rate" value="0.672469087120154"/>
  <list key="expert_parameters"/>
</operator>
<!-- Deactivated helpers; they are wired only to each other and take no part in the
     data flow. -->
<operator activated="false" class="free_memory" compatibility="7.6.001"
          expanded="true" height="82" name="Free Memory (3)" width="90" x="313" y="34"/>
<operator activated="false" class="materialize_data" compatibility="7.6.001"
          expanded="true" height="82" name="Materialize Data (3)" width="90" x="447" y="34"/>
<!-- Wiring of the outer training subprocess. -->
<!-- Training data flows into the chi-squared weighting, then into the weight-based selection. -->
<connect from_port="training" to_op="Weight by Chi Squared Statistic (5)" to_port="example set"/>
<connect from_op="Weight by Chi Squared Statistic (5)" from_port="weights" to_op="Select by Weights (9)" to_port="weights"/>
<connect from_op="Weight by Chi Squared Statistic (5)" from_port="example set" to_op="Select by Weights (9)" to_port="example set input"/>
<connect from_op="Select by Weights (9)" from_port="example set output" to_op="Multiply (9)" to_port="input"/>
<!-- The attribute weights leave the subprocess on "through 1" so the testing side can reuse them. -->
<connect from_op="Select by Weights (9)" from_port="weights" to_port="through 1"/>
<!-- One copy of the filtered data feeds the parameter search, the other the final learner. -->
<connect from_op="Multiply (9)" from_port="output 1" to_op="Optimize Parameters (Evolutionary)" to_port="input 1"/>
<connect from_op="Multiply (9)" from_port="output 2" to_op="Learner Gradient Boosted Trees" to_port="training set"/>
<connect from_op="Optimize Parameters (Evolutionary)" from_port="parameter" to_op="Set Parameters (4)" to_port="parameter set"/>
<connect from_op="Learner Gradient Boosted Trees" from_port="model" to_port="model"/>
<!-- Connection between the two deactivated helper operators. -->
<connect from_op="Free Memory (3)" from_port="through 1" to_op="Materialize Data (3)" to_port="example set input"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
<portSpacing port="sink_through 2" spacing="0"/>
</process>
<!-- Testing side of the outer validation: filter the test split with the forwarded
     attribute weights, apply the trained model and measure classification performance. -->
<process expanded="true">
  <!-- Uses the weights received on "through 1" with the same threshold value as on the
       training side. -->
  <operator activated="true" class="select_by_weights" compatibility="7.6.001"
            expanded="true" height="103" name="Select by Weights (10)" width="90" x="112" y="136">
    <parameter key="weight" value="20000.0"/>
  </operator>
  <operator activated="true" class="apply_model" compatibility="7.6.001"
            expanded="true" height="82" name="Apply Model Gradient Boosted Trees Außen" width="90" x="313" y="34">
    <list key="application_parameters"/>
  </operator>
  <operator activated="true" class="performance_classification" compatibility="7.6.001"
            expanded="true" height="82" name="Performance (6)" width="90" x="447" y="34">
    <parameter key="classification_error" value="true"/>
    <list key="class_weights"/>
  </operator>
  <connect from_port="model" to_op="Apply Model Gradient Boosted Trees Außen" to_port="model"/>
  <connect from_port="test set" to_op="Select by Weights (10)" to_port="example set input"/>
  <connect from_port="through 1" to_op="Select by Weights (10)" to_port="weights"/>
  <connect from_op="Select by Weights (10)" from_port="example set output" to_op="Apply Model Gradient Boosted Trees Außen" to_port="unlabelled data"/>
  <connect from_op="Apply Model Gradient Boosted Trees Außen" from_port="labelled data" to_op="Performance (6)" to_port="labelled data"/>
  <connect from_op="Performance (6)" from_port="performance" to_port="averagable 1"/>
  <portSpacing port="source_model" spacing="0"/>
  <portSpacing port="source_test set" spacing="0"/>
  <portSpacing port="source_through 1" spacing="0"/>
  <portSpacing port="source_through 2" spacing="0"/>
  <portSpacing port="sink_averagable 1" spacing="0"/>
  <portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<!-- The performance of the outer validation is delivered as the first result of the process. -->
<connect from_op="Äußere Validierung Gradient Boosted Tree" from_port="averagable 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>