Hello!
I am trying to create associations between words in a document corpus. After preprocessing (Process Documents from Files, occurrence vector), I apply a standard market basket analysis with FP-Growth and association rules. However, the FP-Growth operator never finishes because it
exceeds the available computing resources.
Now my question: how can I reduce the set of attributes to only the 30 most frequently occurring ones?
I tried "Loop Attributes" with a nested "Aggregate" operator, but I could not find the right settings to make it work.
After that, I wanted to apply a "Sort" operator and a "Filter Attributes" operator. Is this the right approach?
Can anyone help me with that?
The ExampleSet contains 20 examples and 10000+ attributes.
Preprocessing: lower case, tokenize, filter stopwords, filter tokens by length, and stemming (the process below uses Stem (Porter), not a Snowball stemmer):
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
<!-- Process context: the input, output and macros sections are all empty. -->
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
<process expanded="true">
<operator activated="true"
          breakpoints="after"
          class="text:process_document_from_file"
          compatibility="5.3.001"
          expanded="true"
          height="76"
          name="Process Documents from Files"
          width="90"
          x="45"
          y="165">
  <!-- Builds a "Term Occurrences" word vector from the text files found in the
       directory registered below under the key "textmining". -->
  <list key="text_directories">
    <parameter key="textmining" value="C:\Users\Marc\Desktop\SA2\Literatur\Text_Mining\HTML_txt"/>
  </list>
  <parameter key="vector_creation" value="Term Occurrences"/>
  <process expanded="true">
    <!-- Token pipeline, listed in execution order:
         Tokenize, Transform Cases, Filter Stopwords (English),
         Filter Tokens (by Length), Stem (Porter).
         Transform Cases runs before the stopword filter so that capitalised
         tokens such as "The" are already lower-cased when they are compared
         with the stopword list. NOTE(review): this assumes the built-in English
         stopword list is matched case-sensitively; confirm against the
         Text Processing extension documentation. -->
    <operator activated="true" class="text:tokenize" compatibility="5.3.001" expanded="true" height="60" name="Tokenize" width="90" x="179" y="30"/>
    <operator activated="true" class="text:transform_cases" compatibility="5.3.001" expanded="true" height="60" name="Transform Cases" width="90" x="179" y="120"/>
    <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.001" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="179" y="210"/>
    <operator activated="true" class="text:filter_by_length" compatibility="5.3.001" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="179" y="300"/>
    <operator activated="true" class="text:stem_porter" compatibility="5.3.001" expanded="true" height="60" name="Stem (Porter)" width="90" x="447" y="165"/>
    <!-- Wiring of the document port through the pipeline above. -->
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
    <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
    <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
    <connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<operator activated="true"
          breakpoints="after"
          class="loop_attributes"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Loop Attributes"
          width="90"
          x="246"
          y="165">
  <!-- Loops over the attributes of the incoming example set (special attributes
       included) and passes the example set through the nested Aggregate operator.
       NOTE(review): the aggregation_attributes list below is empty, so no
       aggregation function is configured; confirm whether this is intended. -->
  <parameter key="include_special_attributes" value="true"/>
  <process expanded="true">
    <operator activated="true"
              class="aggregate"
              compatibility="5.3.013"
              expanded="true"
              height="76"
              name="Aggregate"
              width="90"
              x="112"
              y="30">
      <list key="aggregation_attributes"/>
    </operator>
    <!-- The loop's example set goes into Aggregate and its output is handed back. -->
    <connect from_port="example set" to_op="Aggregate" to_port="example set input"/>
    <connect from_op="Aggregate" from_port="example set output" to_port="example set"/>
    <portSpacing port="source_example set" spacing="0"/>
    <portSpacing port="sink_example set" spacing="0"/>
    <portSpacing port="sink_result 1" spacing="0"/>
  </process>
</operator>
<!-- Turns the numerical attributes into binominal ones before the example set
     is handed to FP-Growth. No parameters are set, so operator defaults apply. -->
<operator activated="true"
          breakpoints="after"
          class="numerical_to_binominal"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Numerical to Binominal"
          width="90"
          x="514"
          y="165"/>
<operator activated="true"
          breakpoints="after"
          class="fp_growth"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="FP-Growth"
          width="90"
          x="648"
          y="75">
  <!-- Frequent item set mining settings: a minimum of 10 item sets is requested
       and the minimum support is 0.9. All other parameters keep their defaults. -->
  <parameter key="min_number_of_itemsets" value="10"/>
  <parameter key="min_support" value="0.9"/>
</operator>
<operator activated="true"
          class="create_association_rules"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Create Association Rules"
          width="90"
          x="648"
          y="210">
  <!-- Derives association rules from the frequent item sets; the minimum
       confidence is set to 0.5. -->
  <parameter key="min_confidence" value="0.5"/>
</operator>
<!-- Top-level wiring, in order: Process Documents from Files, Loop Attributes,
     Numerical to Binominal, FP-Growth, Create Association Rules, and finally
     the rules are delivered to the process result port "result 1". -->
<connect from_op="Process Documents from Files" from_port="example set" to_op="Loop Attributes" to_port="example set"/>
<connect from_op="Loop Attributes" from_port="example set" to_op="Numerical to Binominal" to_port="example set input"/>
<connect from_op="Numerical to Binominal" from_port="example set output" to_op="FP-Growth" to_port="example set"/>
<connect from_op="FP-Growth" from_port="frequent sets" to_op="Create Association Rules" to_port="item sets"/>
<connect from_op="Create Association Rules" from_port="rules" to_port="result 1"/>
<!-- Port spacing entries for the top-level process ports. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>