"Text Classification: Process Documents from Data very slow"
jabe
New Altair Community Member
Hello,
I am doing a text classification using weka naive bayes to label data. The labeling process works fine, however it slows down with every additional text that is processed.
I have 30 datasets with about 15,000 messages each. I created a loop process to load the datasets, process documents from data, apply the model and write the results. But with every iteration it takes more and more time for the "Process Documents from Data" operator to finish. I think the reason for this is that the word vector isn't reset after one iteration (dataset) is done. This means that the word vector becomes bigger and bigger, and it takes increasingly more time to process additional text.
Is there a way to delete the word vector after each dataset is processed?
Thanks for your help
Best,
Jabe
This is what the process looks like:
I am doing a text classification using weka naive bayes to label data. The labeling process works fine, however it slows down with every additional text that is processed.
I have 30 datasets with about 15,000 messages each. I created a loop process to load the datasets, process documents from data, apply the model and write the results. But with every iteration it takes more and more time for the "Process Documents from Data" operator to finish. I think the reason for this is that the word vector isn't reset after one iteration (dataset) is done. This means that the word vector becomes bigger and bigger, and it takes increasingly more time to process additional text.
Is there a way to delete the word vector after each dataset is processed?
Thanks for your help
Best,
Jabe
This is what the process looks like:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition (version 6.5.002): batch text classification.

  A Loop operator is configured for 31 iterations. Its body
    1. reads ID and TEXT from the DATA table where ID equals the iteration macro,
    2. retrieves a stored model and a stored word list from the local repository,
    3. converts nominal attributes to text and builds a TF-IDF word vector
       using the retrieved word list,
    4. applies the model, selects attributes, and
    5. writes the result to an Excel file named after the iteration macro.
-->
<process version="6.5.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>

  <!-- Root operator: global process settings. -->
  <operator activated="true" class="process" compatibility="6.5.002" expanded="true"
            name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <parameter key="parallelize_main_process" value="false"/>
    <process expanded="true">

      <!--
        Outer loop. The macro "iteration" starts at 1 and 31 iterations are configured.
        NOTE(review): presumably this queries IDs 1 through 31; confirm that this
        matches the actual number of datasets in the DATA table.
      -->
      <operator activated="true" class="loop" compatibility="6.5.002" expanded="true"
                height="76" name="Loop" width="90" x="179" y="30">
        <parameter key="set_iteration_macro" value="true"/>
        <parameter key="macro_name" value="iteration"/>
        <parameter key="macro_start_value" value="1"/>
        <parameter key="iterations" value="31"/>
        <parameter key="limit_time" value="false"/>
        <parameter key="timeout" value="1"/>
        <parameter key="parallelize_iteration" value="false"/>
        <process expanded="true">

          <!-- Loads the rows for the current iteration over the predefined "local" connection. -->
          <operator activated="true" class="jdbc_connectors:read_database" compatibility="6.5.002"
                    expanded="true" height="60" name="Read Database" width="90" x="45" y="345">
            <parameter key="define_connection" value="predefined"/>
            <parameter key="connection" value="local"/>
            <parameter key="database_system" value="MySQL"/>
            <parameter key="define_query" value="query"/>
            <parameter key="query" value="SELECT ID, TEXT FROM DATA where ID=%{iteration}"/>
            <parameter key="use_default_schema" value="true"/>
            <parameter key="prepare_statement" value="false"/>
            <enumeration key="parameters"/>
            <parameter key="datamanagement" value="double_array"/>
          </operator>

          <!--
            Stored model and word list from the local repository.
            NOTE(review): both Retrieve operators sit inside the loop body, so they
            appear to run once per iteration; check whether loading them once
            outside the loop is possible.
          -->
          <operator activated="true" class="retrieve" compatibility="6.5.002" expanded="true"
                    height="60" name="Retrieve Model" width="90" x="45" y="165">
            <parameter key="repository_entry" value="//Local Repository/data/model"/>
          </operator>
          <operator activated="true" class="retrieve" compatibility="6.5.002" expanded="true"
                    height="60" name="Retrieve Wordlist" width="90" x="45" y="255">
            <parameter key="repository_entry" value="//Local Repository/data/wordlist"/>
          </operator>

          <!--
            Converts nominal attributes to text. The filter type is "all".
            NOTE(review): "attributes" names CONTENT while the query returns ID and TEXT;
            presumably ignored because of the "all" filter type, but confirm.
          -->
          <operator activated="true" class="nominal_to_text" compatibility="6.5.002" expanded="true"
                    height="76" name="Nominal to Text" width="90" x="179" y="345">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="CONTENT"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>

          <!--
            Builds the TF-IDF word vector. The retrieved word list is wired into the
            "word list" port (see the connections at the end of the loop body).
            Pruning is disabled (prune_method is "none").
          -->
          <operator activated="true" class="text:process_document_from_data" compatibility="6.5.000"
                    expanded="true" height="76" name="Process Documents from Data (2)" width="90"
                    x="313" y="255">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="1.0"/>
            <parameter key="prune_above_percent" value="100.0"/>
            <parameter key="prune_below_absolute" value="4"/>
            <parameter key="prune_above_absolute" value="9999"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <parameter key="parallelize_vector_creation" value="false"/>

            <!-- Document subprocess: the preprocessing chain applied to the text. -->
            <process expanded="true">
              <operator activated="true" class="text:transform_cases" compatibility="6.5.000"
                        expanded="true" height="60" name="Transform Cases (3)" width="90"
                        x="45" y="30">
                <parameter key="transform_to" value="lower case"/>
              </operator>

              <!--
                Supplies the dictionary for "Stem WordNet (2)".
                NOTE(review): this operator lives inside the document subprocess;
                verify whether the dictionary is re-opened for every document.
              -->
              <operator activated="true" class="wordnet:open_wordnet_dictionary" compatibility="5.3.000"
                        expanded="true" height="60" name="Open WordNet Dictionary (3)" width="90"
                        x="179" y="345">
                <parameter key="resource_type" value="directory"/>
                <parameter key="directory" value="\dict"/>
              </operator>

              <!--
                Six token replacement steps, executed in the order given by the connections.
                NOTE(review): every replace dictionary below holds only the entry "text"
                mapped to a single space; the real patterns may have been lost. Confirm.
              -->
              <operator activated="true" class="text:replace_tokens" compatibility="6.5.000"
                        expanded="true" height="60" name="Rep Links/Accounts (2)" width="90"
                        x="179" y="30">
                <list key="replace_dictionary">
                  <parameter key="text" value=" "/>
                </list>
              </operator>
              <operator activated="true" class="text:replace_tokens" compatibility="6.5.000"
                        expanded="true" height="60" name="Rep Sonderzeichen (2)" width="90"
                        x="313" y="30">
                <list key="replace_dictionary">
                  <parameter key="text" value=" "/>
                </list>
              </operator>
              <operator activated="true" class="text:replace_tokens" compatibility="6.5.000"
                        expanded="true" height="60" name="Rep Companies (2)" width="90"
                        x="447" y="30">
                <list key="replace_dictionary">
                  <parameter key="text" value=" "/>
                </list>
              </operator>
              <operator activated="true" class="text:replace_tokens" compatibility="6.5.000"
                        expanded="true" height="60" name="Rep Abbrv (2)" width="90"
                        x="581" y="30">
                <list key="replace_dictionary">
                  <parameter key="text" value=" "/>
                </list>
              </operator>
              <operator activated="true" class="text:replace_tokens" compatibility="6.5.000"
                        expanded="true" height="60" name="Rep Words (2)" width="90"
                        x="715" y="30">
                <list key="replace_dictionary">
                  <parameter key="text" value=" "/>
                </list>
              </operator>
              <operator activated="true" class="text:replace_tokens" compatibility="6.5.000"
                        expanded="true" height="60" name="Rep Numbers (2)" width="90"
                        x="849" y="30">
                <list key="replace_dictionary">
                  <parameter key="text" value=" "/>
                </list>
              </operator>

              <!-- Tokenization in "non letters" mode. -->
              <operator activated="true" class="text:tokenize" compatibility="6.5.000"
                        expanded="true" height="60" name="Tokenize (2)" width="90"
                        x="45" y="210">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>

              <!-- Stop word removal based on the file smart_stopwords.txt (case-insensitive). -->
              <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="6.5.000"
                        expanded="true" height="76" name="Filter Stopwords (2)" width="90"
                        x="179" y="210">
                <parameter key="file" value="smart_stopwords.txt"/>
                <parameter key="case_sensitive" value="false"/>
                <parameter key="encoding" value="SYSTEM"/>
              </operator>

              <!-- WordNet stemming for nouns, verbs, adjectives and adverbs. -->
              <operator activated="true" class="wordnet:stem_wordnet" compatibility="5.3.000"
                        expanded="true" height="76" name="Stem WordNet (2)" width="90"
                        x="380" y="210">
                <parameter key="allow_ambiguity" value="false"/>
                <parameter key="keep_unmatched_stems" value="true"/>
                <parameter key="keep_unmatched_tokens" value="true"/>
                <parameter key="work_on_type_noun" value="true"/>
                <parameter key="work_on_type_verb" value="true"/>
                <parameter key="work_on_type_adjective" value="true"/>
                <parameter key="work_on_type_adverb" value="true"/>
              </operator>

              <!-- Length filter configured with min_chars 2 and max_chars 25. -->
              <operator activated="true" class="text:filter_by_length" compatibility="6.5.000"
                        expanded="true" height="60" name="Filter Tokens (2)" width="90"
                        x="514" y="210">
                <parameter key="min_chars" value="2"/>
                <parameter key="max_chars" value="25"/>
              </operator>

              <!-- Term n-grams with max_length 2. -->
              <operator activated="true" class="text:generate_n_grams_terms" compatibility="6.5.000"
                        expanded="true" height="60" name="Generate n-Grams (3)" width="90"
                        x="849" y="210">
                <parameter key="max_length" value="2"/>
              </operator>

              <!-- Wiring of the document subprocess. -->
              <connect from_port="document" to_op="Transform Cases (3)" to_port="document"/>
              <connect from_op="Transform Cases (3)" from_port="document" to_op="Rep Links/Accounts (2)" to_port="document"/>
              <connect from_op="Open WordNet Dictionary (3)" from_port="dictionary" to_op="Stem WordNet (2)" to_port="dictionary"/>
              <connect from_op="Rep Links/Accounts (2)" from_port="document" to_op="Rep Sonderzeichen (2)" to_port="document"/>
              <connect from_op="Rep Sonderzeichen (2)" from_port="document" to_op="Rep Companies (2)" to_port="document"/>
              <connect from_op="Rep Companies (2)" from_port="document" to_op="Rep Abbrv (2)" to_port="document"/>
              <connect from_op="Rep Abbrv (2)" from_port="document" to_op="Rep Words (2)" to_port="document"/>
              <connect from_op="Rep Words (2)" from_port="document" to_op="Rep Numbers (2)" to_port="document"/>
              <connect from_op="Rep Numbers (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
              <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
              <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Stem WordNet (2)" to_port="document"/>
              <connect from_op="Stem WordNet (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
              <connect from_op="Filter Tokens (2)" from_port="document" to_op="Generate n-Grams (3)" to_port="document"/>
              <connect from_op="Generate n-Grams (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>

          <!-- Applies the retrieved model to the vectorized example set. -->
          <operator activated="true" class="apply_model" compatibility="6.5.002" expanded="true"
                    height="76" name="Apply Model" width="90" x="514" y="165">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>

          <!--
            Selects the single attribute ARTICLE_ID.
            NOTE(review): the database query only names ID and TEXT; confirm that
            ARTICLE_ID exists in the example set at this point.
          -->
          <operator activated="true" class="select_attributes" compatibility="6.5.002" expanded="true"
                    height="76" name="Select Attributes" width="90" x="648" y="165">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="ARTICLE_ID"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>

          <!-- Writes one xlsx file per iteration; the file name is the iteration macro value. -->
          <operator activated="true" class="write_excel" compatibility="6.5.002" expanded="true"
                    height="76" name="Write Excel Company" width="90" x="782" y="165">
            <parameter key="excel_file" value="%{iteration}.xlsx"/>
            <parameter key="file_format" value="xlsx"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="sheet_name" value="RapidMiner Data"/>
            <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
            <parameter key="number_format" value="#.0"/>
          </operator>

          <!-- Wiring of the loop body. -->
          <connect from_op="Read Database" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Retrieve Model" from_port="output" to_op="Apply Model" to_port="model"/>
          <connect from_op="Retrieve Wordlist" from_port="output" to_op="Process Documents from Data (2)" to_port="word list"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
          <connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Write Excel Company" to_port="input"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>

      <!-- Wiring of the top-level process: the loop output goes to the first result port. -->
      <connect from_op="Loop" from_port="output 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:
0