Dear RapidMiner community,
my friends and I are currently working on a data mining project in RapidMiner Studio, in which we attempt to distinguish Twitter bots from genuine users. We have obtained an unbalanced dataset containing 1000 spambots and 1548 real users. We want to compare several classification methods, such as k-NN, Naive Bayes, etc., using a 10-fold Cross-Validation. To give you an overview of the current problem: to train our model on a balanced training set, we upsample the minority class, namely the bots, in the training part of the Cross-Validation process. However, as our objective is to test our model in a real-world setting, where we assume that approximately 90% of the accounts on Twitter are genuine users and 10% are spambots, we want to downsample the data in the testing part of the process so that it reflects this distribution.
Taking our 1548 genuine users as a baseline, a 90:10 sample would consist of 1548 genuine users and 172 bots overall, which we then divided by 10 (because of the 10 folds of the validation) for testing. However, we noticed that it was also possible to enter a larger number of users than should actually be available for testing, which we believe would mean that we test on data that the model was trained on. We want to avoid this error and would therefore kindly ask for help with the following question: Does placing a sampling operator in the test part of a Cross-Validation lead to an overlap of training and test data?
You can find our current process, using a k-NN classification as an example, below as XML as well as in an rmp file.
We kindly thank you in advance and send our best regards!
<?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
<!-- Process context: no input, output or macro entries are defined. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
<!-- Settings of the root process: log verbosity, process random seed (2001),
     mail notification (never sent, no address given) and encoding. -->
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<!-- Loads the data set from the repository entry given below
     (a path relative to the location of this process). -->
<operator activated="true" class="retrieve" compatibility="7.6.001" expanded="true"
          height="68" name="Retrieve BitteEinBot_final" width="90" x="45" y="34">
  <parameter key="repository_entry" value="../../../data/BitteEinBot_final"/>
</operator>
<!-- Gives the attribute isBot the role "label", i.e. makes it the
     classification target. No additional roles are set. -->
<operator activated="true" class="set_role" compatibility="7.6.001" expanded="true"
          height="82" name="Set Role" width="90" x="112" y="136">
  <parameter key="attribute_name" value="isBot"/>
  <parameter key="target_role" value="label"/>
  <list key="set_additional_roles"/>
</operator>
<!-- Restricts the example set to an explicit subset of attributes
     (attribute_filter_type = subset). The selection is not inverted.
     The long "attributes" value must stay on a single line: a line break
     inside an attribute value would alter the value. -->
<operator activated="true" class="select_attributes" compatibility="7.6.001" expanded="true"
          height="82" name="Select Attributes" width="90" x="246" y="187">
  <parameter key="attribute_filter_type" value="subset"/>
  <parameter key="attribute" value=""/>
  <parameter key="attributes" value="isBot|dist.url.share|hash.share.same|interval.var|tweets.hash.avg|tweets.mention.avg|tweets.mention.inTweet|tweets.perDay|tweets.replyRatio|tweets.share.HashInTweet|tweets.share.fast|tweets.share.web|tweets.share.withURL|user.URL.dup|user.fofo|user.followRate|user.hasPic|user.hasURL|user.name.buzz|user.name.dist|user.name.dup|user.nbrSources|user.rep|user.url.buzz|tweets.day|tweets.dist.url|tweets.evening|tweets.fast|tweets.hash.most|tweets.morning|tweets.night|tweets.replies|tweets.rt|tweets.total|tweets.withHash|tweets.withMention|tweets.withURL|user.followers|user.following|user.lifetweets|tweets.url.api|tweets.share.rt|tweets.share.api"/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="value_type" value="attribute_value"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="time"/>
  <parameter key="block_type" value="attribute_block"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="value_matrix_row_start"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="false"/>
</operator>
<!-- Converts nominal attributes to numerical ones using dummy coding.
     The filter type is "all" and special attributes are not included.
     NOTE(review): the "attributes" list below is presumably not consulted
     while attribute_filter_type is "all"; confirm in RapidMiner Studio.
     NOTE(review): isBot was given the label role upstream, so it is presumably
     left nominal here because include_special_attributes is false; confirm. -->
<operator activated="true" class="nominal_to_numerical" compatibility="7.6.001" expanded="true"
          height="103" name="Nominal to Numerical" width="90" x="313" y="34">
  <parameter key="return_preprocessing_model" value="false"/>
  <parameter key="create_view" value="false"/>
  <parameter key="attribute_filter_type" value="all"/>
  <parameter key="attribute" value=""/>
  <parameter key="attributes" value="isBot|user.hasPic|user.hasURL|user.name.buzz|user.url.buzz|tweets.url.api|tweets.share.web|tweets.share.api|user.name.dup|user.URL.dup"/>
  <parameter key="use_except_expression" value="false"/>
  <parameter key="value_type" value="nominal"/>
  <parameter key="use_value_type_exception" value="false"/>
  <parameter key="except_value_type" value="file_path"/>
  <parameter key="block_type" value="single_value"/>
  <parameter key="use_block_type_exception" value="false"/>
  <parameter key="except_block_type" value="single_value"/>
  <parameter key="invert_selection" value="false"/>
  <parameter key="include_special_attributes" value="false"/>
  <parameter key="coding_type" value="dummy coding"/>
  <parameter key="use_comparison_groups" value="false"/>
  <list key="comparison_groups"/>
  <parameter key="unexpected_value_handling" value="all 0 and warning"/>
  <parameter key="use_underscore_in_name" value="false"/>
</operator>
<!-- Cross validation with 10 folds, stratified sampling, a local random seed
     (1992) and parallel execution enabled.
     First sub-process: training (normalise, upsample the bots, train k-NN).
     Second sub-process: testing (downsample the test fold, apply the
     preprocessing model and the k-NN model, measure performance). -->
<operator activated="true" class="concurrency:cross_validation" compatibility="7.6.001"
          expanded="true" height="145" name="Cross Validation" width="90" x="514" y="34">
  <parameter key="split_on_batch_attribute" value="false"/>
  <parameter key="leave_one_out" value="false"/>
  <parameter key="number_of_folds" value="10"/>
  <parameter key="sampling_type" value="stratified sampling"/>
  <parameter key="use_local_random_seed" value="true"/>
  <parameter key="local_random_seed" value="1992"/>
  <parameter key="enable_parallel_execution" value="true"/>

  <!-- ===== Training sub-process ===== -->
  <process expanded="true">
    <!-- Z-transformation of the training data. The resulting preprocessing
         model is handed to the testing side through port "through 1"
         (see the connections at the end of this sub-process). -->
    <operator activated="true" class="normalize" compatibility="7.6.001" expanded="true"
              height="103" name="Normalize (2)" width="90" x="45" y="289">
      <parameter key="return_preprocessing_model" value="false"/>
      <parameter key="create_view" value="false"/>
      <parameter key="attribute_filter_type" value="all"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="numeric"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="real"/>
      <parameter key="block_type" value="value_series"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_series_end"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="method" value="Z-transformation"/>
      <parameter key="min" value="0.0"/>
      <parameter key="max" value="1.0"/>
      <parameter key="allow_negative_values" value="false"/>
    </operator>

    <!-- Splits the training data by label: examples with isBot = 1 leave on
         the "example set output" port, all others on "unmatched example set". -->
    <operator activated="true" class="filter_examples" compatibility="7.6.001" expanded="true"
              height="103" name="Filter Examples" width="90" x="45" y="34">
      <parameter key="parameter_expression" value=""/>
      <parameter key="condition_class" value="custom_filters"/>
      <parameter key="invert_filter" value="false"/>
      <list key="filters_list">
        <parameter key="filters_entry_key" value="isBot.equals.1"/>
      </list>
      <parameter key="filters_logic_and" value="true"/>
      <parameter key="filters_check_metadata" value="true"/>
    </operator>

    <!-- Duplicates the bot examples: one copy is sampled by "Sample (2)",
         the other copy goes to "Append" unchanged. -->
    <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true"
              height="103" name="Multiply" width="90" x="179" y="34"/>

    <!-- Upsampling step: draws an absolute sample of 450 examples of class 1
         from the bot copy. Together with the unsampled bot copy these extra
         examples enlarge the minority class in the training data.
         No local random seed is used (use_local_random_seed = false). -->
    <operator activated="true" class="sample" compatibility="7.6.001" expanded="true"
              height="82" name="Sample (2)" width="90" x="313" y="34">
      <parameter key="sample" value="absolute"/>
      <parameter key="balance_data" value="true"/>
      <parameter key="sample_size" value="100"/>
      <parameter key="sample_ratio" value="0.1"/>
      <parameter key="sample_probability" value="0.1"/>
      <list key="sample_size_per_class">
        <parameter key="1" value="450"/>
      </list>
      <list key="sample_ratio_per_class"/>
      <list key="sample_probability_per_class"/>
      <parameter key="use_local_random_seed" value="false"/>
      <parameter key="local_random_seed" value="1992"/>
    </operator>

    <!-- Merges the three inputs: sampled bots (port 1), genuine users (port 2)
         and the full bot copy (port 3). -->
    <operator activated="true" class="append" compatibility="7.6.001" expanded="true"
              height="124" name="Append" width="90" x="447" y="136">
      <parameter key="datamanagement" value="double_array"/>
      <parameter key="data_management" value="auto"/>
      <parameter key="merge_type" value="all"/>
    </operator>

    <!-- Fixes the binominal mapping of the label isBot: 0 is the negative
         value, 1 the positive value. include_special_attributes is true so
         that the label can be selected. -->
    <operator activated="true" class="remap_binominals" compatibility="7.6.001" expanded="true"
              height="82" name="Remap Binominals" width="90" x="581" y="187">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="isBot"/>
      <parameter key="attributes" value=""/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="binominal"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="binominal"/>
      <parameter key="block_type" value="value_matrix_start"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_start"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="true"/>
      <parameter key="negative_value" value="0"/>
      <parameter key="positive_value" value="1"/>
    </operator>

    <!-- k-NN learner: k = 8, weighted vote, numerical measures with
         Euclidean distance. The kernel and divergence parameters are
         presumably unused with this measure type; confirm. -->
    <operator activated="true" class="k_nn" compatibility="7.6.001" expanded="true"
              height="82" name="k-NN" width="90" x="581" y="34">
      <parameter key="k" value="8"/>
      <parameter key="weighted_vote" value="true"/>
      <parameter key="measure_types" value="NumericalMeasures"/>
      <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
      <parameter key="nominal_measure" value="NominalDistance"/>
      <parameter key="numerical_measure" value="EuclideanDistance"/>
      <parameter key="divergence" value="GeneralizedIDivergence"/>
      <parameter key="kernel_type" value="radial"/>
      <parameter key="kernel_gamma" value="1.0"/>
      <parameter key="kernel_sigma1" value="1.0"/>
      <parameter key="kernel_sigma2" value="0.0"/>
      <parameter key="kernel_sigma3" value="2.0"/>
      <parameter key="kernel_degree" value="3.0"/>
      <parameter key="kernel_shift" value="1.0"/>
      <parameter key="kernel_a" value="1.0"/>
      <parameter key="kernel_b" value="0.0"/>
    </operator>

    <!-- Wiring of the training side. Only the "training set" port feeds this
         sub-process; the model and the normalisation model are its outputs. -->
    <connect from_port="training set" to_op="Normalize (2)" to_port="example set input"/>
    <connect from_op="Normalize (2)" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
    <connect from_op="Normalize (2)" from_port="preprocessing model" to_port="through 1"/>
    <connect from_op="Filter Examples" from_port="example set output" to_op="Multiply" to_port="input"/>
    <connect from_op="Filter Examples" from_port="unmatched example set" to_op="Append" to_port="example set 2"/>
    <connect from_op="Multiply" from_port="output 1" to_op="Sample (2)" to_port="example set input"/>
    <connect from_op="Multiply" from_port="output 2" to_op="Append" to_port="example set 3"/>
    <connect from_op="Sample (2)" from_port="example set output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Append" from_port="merged set" to_op="Remap Binominals" to_port="example set input"/>
    <connect from_op="Remap Binominals" from_port="example set output" to_op="k-NN" to_port="training set"/>
    <connect from_op="k-NN" from_port="model" to_port="model"/>
    <portSpacing port="source_training set" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_through 1" spacing="0"/>
    <portSpacing port="sink_through 2" spacing="0"/>
  </process>

  <!-- ===== Testing sub-process ===== -->
  <process expanded="true">
    <!-- Downsamples the test fold to 155 examples of class 0 and 17 of class 1,
         i.e. roughly the intended 90:10 ratio. The only input of this operator
         is the "test set" port of this sub-process (see connections below), so
         it can only draw from the examples delivered as the current test fold.
         NOTE(review): the sample_ratio_per_class entries are presumably ignored
         because "sample" is set to absolute; confirm.
         NOTE(review): 1548 genuine users over 10 folds gives 154 or 155 per
         fold, so some folds may hold fewer than the 155 requested; verify how
         the operator behaves in that case. -->
    <operator activated="true" class="sample" compatibility="7.6.001" expanded="true"
              height="82" name="Sample" width="90" x="45" y="187">
      <parameter key="sample" value="absolute"/>
      <parameter key="balance_data" value="true"/>
      <parameter key="sample_size" value="100"/>
      <parameter key="sample_ratio" value="0.1"/>
      <parameter key="sample_probability" value="0.1"/>
      <list key="sample_size_per_class">
        <parameter key="0" value="155"/>
        <parameter key="1" value="17"/>
      </list>
      <list key="sample_ratio_per_class">
        <parameter key="0" value="0.9"/>
        <parameter key="1" value="0.1"/>
      </list>
      <list key="sample_probability_per_class"/>
      <parameter key="use_local_random_seed" value="false"/>
      <parameter key="local_random_seed" value="1992"/>
    </operator>

    <!-- Applies the normalisation model received on "through 1" to the
         sampled test data. -->
    <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true"
              height="82" name="Apply Preprocessing" width="90" x="179" y="136">
      <list key="application_parameters"/>
      <parameter key="create_view" value="false"/>
    </operator>

    <!-- Applies the trained k-NN model to the normalised test data. -->
    <operator activated="true" class="apply_model" compatibility="7.6.001" expanded="true"
              height="82" name="Apply Model" width="90" x="45" y="34">
      <list key="application_parameters"/>
      <parameter key="create_view" value="false"/>
    </operator>

    <!-- Binominal classification performance. Enabled criteria: the three AUC
         variants, f_measure and the four confusion counts (false/true
         positive/negative). All other criteria are switched off. -->
    <operator activated="true" class="performance_binominal_classification" compatibility="7.6.001"
              expanded="true" height="82" name="Performance" width="90" x="179" y="34">
      <parameter key="main_criterion" value="first"/>
      <parameter key="accuracy" value="false"/>
      <parameter key="classification_error" value="false"/>
      <parameter key="kappa" value="false"/>
      <parameter key="AUC (optimistic)" value="true"/>
      <parameter key="AUC" value="true"/>
      <parameter key="AUC (pessimistic)" value="true"/>
      <parameter key="precision" value="false"/>
      <parameter key="recall" value="false"/>
      <parameter key="lift" value="false"/>
      <parameter key="fallout" value="false"/>
      <parameter key="f_measure" value="true"/>
      <parameter key="false_positive" value="true"/>
      <parameter key="false_negative" value="true"/>
      <parameter key="true_positive" value="true"/>
      <parameter key="true_negative" value="true"/>
      <parameter key="sensitivity" value="false"/>
      <parameter key="specificity" value="false"/>
      <parameter key="youden" value="false"/>
      <parameter key="positive_predictive_value" value="false"/>
      <parameter key="negative_predictive_value" value="false"/>
      <parameter key="psep" value="false"/>
      <parameter key="skip_undefined_labels" value="true"/>
      <parameter key="use_example_weights" value="true"/>
    </operator>

    <!-- Wiring of the testing side: test set, then Sample, Apply Preprocessing,
         Apply Model and Performance, in that order. -->
    <connect from_port="model" to_op="Apply Model" to_port="model"/>
    <connect from_port="test set" to_op="Sample" to_port="example set input"/>
    <connect from_port="through 1" to_op="Apply Preprocessing" to_port="model"/>
    <connect from_op="Sample" from_port="example set output" to_op="Apply Preprocessing" to_port="unlabelled data"/>
    <connect from_op="Apply Preprocessing" from_port="labelled data" to_op="Apply Model" to_port="unlabelled data"/>
    <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
    <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
    <connect from_op="Performance" from_port="example set" to_port="test set results"/>
    <portSpacing port="source_model" spacing="0"/>
    <portSpacing port="source_test set" spacing="0"/>
    <portSpacing port="source_through 1" spacing="0"/>
    <portSpacing port="source_through 2" spacing="0"/>
    <portSpacing port="sink_test set results" spacing="0"/>
    <portSpacing port="sink_performance 1" spacing="0"/>
    <portSpacing port="sink_performance 2" spacing="0"/>
  </process>
</operator>
<!-- Top-level wiring: Retrieve, Set Role, Select Attributes, Nominal to
     Numerical and Cross Validation form one chain. The model, example set and
     "performance 1" outputs of the validation are delivered as results 1 to 3. -->
<connect from_op="Retrieve BitteEinBot_final" from_port="output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
<connect from_op="Nominal to Numerical" from_port="example set output" to_op="Cross Validation" to_port="example set"/>
<connect from_op="Cross Validation" from_port="model" to_port="result 1"/>
<connect from_op="Cross Validation" from_port="example set" to_port="result 2"/>
<connect from_op="Cross Validation" from_port="performance 1" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>