Looping Clusters and store them in Repository
Hi everybody,
My dataset consists of 4000 examples, 4 special attributes (ID, cluster, text and outlier), and 570 regular attributes from text processing. What I have done with the data so far was only to cluster it. Now I have 37 clusters and I want to store one example set for each cluster in my repository.
That's where my problem is: I think it should be possible with macros, the "Loop Clusters" operator and the "Store" operator, but I can't figure out how to set the parameters correctly.
I have attached a snippet of the data.
And the XML of my process so far:
<?xml version="1.0" encoding="UTF-8"?>
<!-- RapidMiner process: retrieve the clustered data, keep ID/label/text, give "label" the
     cluster role, then run Filter Examples and Store inside Loop Clusters. -->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="8.2.000" expanded="true" height="68" name="Retrieve Daten KAM clustered (opt.)" width="90" x="112" y="34">
        <parameter key="repository_entry" value="//Datenbearbeitung MA/Filter Outliers/Daten KAM clustered (opt.)"/>
      </operator>
      <!-- Keep only the three attributes needed downstream. -->
      <operator activated="true" class="select_attributes" compatibility="8.2.000" expanded="true" height="82" name="Select Attributes" width="90" x="246" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="ID|label|text"/>
      </operator>
      <!-- Loop Clusters is fed from this operator, so "label" is given the cluster role here. -->
      <operator activated="true" class="set_role" compatibility="8.2.000" expanded="true" height="82" name="Set Role" width="90" x="380" y="34">
        <parameter key="attribute_name" value="label"/>
        <parameter key="target_role" value="cluster"/>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="true" class="loop_clusters" compatibility="8.2.000" expanded="true" height="82" name="Loop Clusters" width="90" x="648" y="34">
        <process expanded="true">
          <!-- Filters the incoming cluster subset on label == %{myMacro_0}. -->
          <operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="34">
            <list key="filters_list">
              <parameter key="filters_entry_key" value="label.equals.%{myMacro_0}"/>
            </list>
          </operator>
          <!-- NOTE(review): repository_entry is a fixed name, so presumably every iteration
               writes to the same entry; confirm whether a per-cluster name is needed. -->
          <operator activated="true" class="store" compatibility="8.2.000" expanded="true" height="68" name="Store" width="90" x="648" y="34">
            <parameter key="repository_entry" value="999TEST"/>
          </operator>
          <connect from_port="cluster subset" to_op="Filter Examples" to_port="example set input"/>
          <connect from_op="Filter Examples" from_port="example set output" to_op="Store" to_port="input"/>
          <connect from_op="Store" from_port="through" to_port="out 1"/>
          <portSpacing port="source_cluster subset" spacing="0"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- The macro value contains literal double quotes; inside an attribute value they must be
           written as &quot; to keep the document well-formed.
           NOTE(review): this operator has no connections and is listed after Loop Clusters;
           confirm that myMacro_0 is already defined when Filter Examples runs. -->
      <operator activated="true" class="set_macros" compatibility="8.2.000" expanded="true" height="68" name="Set Macros" width="90" x="313" y="136">
        <list key="macros">
          <parameter key="myMacro_0" value="&quot;cluster_0&quot;"/>
        </list>
      </operator>
      <connect from_op="Retrieve Daten KAM clustered (opt.)" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Loop Clusters" to_port="example set"/>
      <connect from_op="Loop Clusters" from_port="out 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
My goal is to apply the "Extract Topics from Document (LDA)" operator on every cluster with number of topics = 1 so that I can see the top words for each cluster.
Thank you all in advance
flo
Best Answers
-
Hi,
Group into Collection and Loop Collection from Toolbox does it.
Let me know if you need help with LDA. It's somewhat my baby.
BR,
Martin
Edit: I guess you do not want to use LDA, but simple process documents or so.
2 -
Hi @flo,
have a look at the attached process. It should do what you want.
BR,
Martin
<?xml version="1.0" encoding="UTF-8"?>
<!-- RapidMiner process: build a term-occurrence vector from the "Review" text, cluster it,
     split the result into one example set per cluster value and reduce each one to its
     five highest-summed attributes. -->
<process version="8.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.001" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="8.2.001" expanded="true" height="68" name="Retrieve OpenRanks Reviews Beijing" width="90" x="45" y="34">
        <parameter key="repository_entry" value="data/OpenRanks Reviews Beijing"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="8.2.001" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Review"/>
      </operator>
      <!-- Term occurrences, percentual pruning below 5.0; inner chain: Transform Cases, Tokenize, English stopword filter. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
        <parameter key="vector_creation" value="Term Occurrences"/>
        <parameter key="add_meta_information" value="false"/>
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_below_percent" value="5.0"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="34"/>
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="246" y="34"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="514" y="85"/>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="concurrency:k_means" compatibility="8.2.001" expanded="true" height="82" name="Clustering" width="90" x="447" y="34"/>
      <!-- Groups the clustered set by the "cluster" attribute into a collection. -->
      <operator activated="true" class="operator_toolbox:group_into_collection" compatibility="1.3.000-SNAPSHOT" expanded="true" height="82" name="Group Into Collection" width="90" x="715" y="34">
        <parameter key="group_by_attribute" value="cluster"/>
      </operator>
      <operator activated="true" class="loop_collection" compatibility="8.2.001" expanded="true" height="82" name="Loop Collection" width="90" x="849" y="34">
        <process expanded="true">
          <!-- Store the "cluster" value of example 1 in the macro clusterId. -->
          <operator activated="true" class="extract_macro" compatibility="8.2.001" expanded="true" height="68" name="Extract Macro" width="90" x="45" y="34">
            <parameter key="macro" value="clusterId"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="cluster"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Inverted single selection: everything except "cluster" is kept. -->
          <operator activated="true" class="select_attributes" compatibility="8.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="112" y="136">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="cluster"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Default aggregation "sum" is applied; no explicit aggregation attributes are listed. -->
          <operator activated="true" class="aggregate" compatibility="8.2.001" expanded="true" height="82" name="Aggregate (2)" width="90" x="179" y="34">
            <parameter key="use_default_aggregation" value="true"/>
            <parameter key="default_aggregation_function" value="sum"/>
            <list key="aggregation_attributes"/>
          </operator>
          <operator activated="true" class="transpose" compatibility="8.2.001" expanded="true" height="82" name="Transpose" width="90" x="313" y="34"/>
          <operator activated="true" class="sort" compatibility="8.2.001" expanded="true" height="82" name="Sort" width="90" x="447" y="34">
            <parameter key="attribute_name" value="att_1"/>
            <parameter key="sorting_direction" value="decreasing"/>
          </operator>
          <operator activated="true" class="filter_example_range" compatibility="8.2.001" expanded="true" height="82" name="Filter Example Range" width="90" x="581" y="34">
            <parameter key="first_example" value="1"/>
            <parameter key="last_example" value="5"/>
            <description align="center" color="transparent" colored="false" width="126">Take Top5</description>
          </operator>
          <!-- Rewrites id values of the form sum(X) to X via the capture group. -->
          <operator activated="true" class="replace" compatibility="8.2.001" expanded="true" height="82" name="Replace" width="90" x="715" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="id"/>
            <parameter key="include_special_attributes" value="true"/>
            <parameter key="replace_what" value="sum\((.+)\)"/>
            <parameter key="replace_by" value="$1"/>
          </operator>
          <operator activated="true" class="rename" compatibility="8.2.001" expanded="true" height="82" name="Rename" width="90" x="849" y="34">
            <parameter key="old_name" value="att_1"/>
            <parameter key="new_name" value="sum"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Adds an attribute "cluster" built from the clusterId macro extracted above. -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.001" expanded="true" height="82" name="Generate Attributes" width="90" x="983" y="34">
            <list key="function_descriptions">
              <parameter key="cluster" value="%{clusterId}"/>
            </list>
          </operator>
          <connect from_port="single" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Extract Macro" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Aggregate (2)" to_port="example set input"/>
          <connect from_op="Aggregate (2)" from_port="example set output" to_op="Transpose" to_port="example set input"/>
          <connect from_op="Transpose" from_port="example set output" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
          <connect from_op="Filter Example Range" from_port="example set output" to_op="Replace" to_port="example set input"/>
          <connect from_op="Replace" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Retrieve OpenRanks Reviews Beijing" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Group Into Collection" to_port="exa"/>
      <connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <description align="center" color="yellow" colored="false" height="50" resized="true" width="481" x="271" y="235">Task: Calculate the top 5 most frequent words per cluster</description>
    </process>
  </operator>
</process>
Edit: Also have a look at this blog post: https://medium.com/@mSchmitz_/understanding-clustering-cf0117148ef4
I think this is closer to what you really want.
1
Answers
-
Hi,
Group into Collection and Loop Collection from Toolbox does it.
Let me know if you need help with LDA. It's somewhat my baby.
BR,
Martin
Edit: I guess you do not want to use LDA, but simple process documents or so.
2 -
Hi @mschmitz,
This topic inspires two questions about your (nice) baby, Martin:
Indeed, I executed the tutorial of this operator. As a reminder, in this tutorial we create and analyze 5 documents which are strictly the same:
- when number of topics = 5, all documents have the same topic :
- when number of topics = 10, the documents have different topics :
My first question is: why, in this last case with identical documents, don't we get the same topic (as in the first case)?
My second question is: how should we interpret the weight of words? Is it that the higher the weight, the more "characteristic" the word is of the topic / the more the word "explains" the topic?
Thank you,
Regards,
Lionel
0 -
Hey @lionelderkrikor,
this is totally artificial since the data is the same. The optimization uses some randomness at the start: it assigns a word to a topic and so on, hence the different names. If there were something in the data, this would change. I think you just get the priors out.
I have a topic extraction on TripAdvisor reviews somewhere. I thought I had posted a blog post on it, but I can't find it. @sgenzer, did I maybe just not post it?
BR,
Martin
1 -
Hi,
Thank you @mschmitz those operators were exactly what I was looking for.
Since I have each of the clusters in one Collection I thought I could use the "Extract Topics from Document" (with number of topics : 1) on those Collections to see the TOP words for each cluster....
But I have been thinking now:
What I did was to cluster my text data by k-means first and after that I did the LDA "Extract Topics from Document", so my question is:
Isn't that somewhat the same? I mean both operators separate texts into "clusters" or "topics", except LDA can give me the TOP x words for each topic.
Best regards
flo
0 -
Hi @flo,
exactly. LDA is somewhat like a clustering. It also groups your documents into k groups. The big difference is that LDA is a latent model.
This means:
- One topic has many associated words. One word can be part of one or more topics.
- Each document can be part of one or more topics.
This makes it different from normal clustering. I think what you want is just a Process Documents on each cluster, then use WordList to Data to get the frequency overview.
Best,
Martin
1 -
Hello
Excuse me here
Dear friend @flo
Did you perform the LDA algorithm on any cluster?
Thanks if you tell me
With respect
0 -
Hello @m_keshavarz_com,
I tried to perform LDA on the clusters but it didn't work (log 0.0000). But what I will try is just to get a wordlist from each cluster and sort them top down. That should hopefully deliver a similar result to the LDA.
Sorry, I can't help more than that ....
Best
flo
0 -
@mschmitz wrote:
[...]Which makes it different to normal clusterings. I think what you want is just a Process Documents on each cluster and use WordList to Data to get the frequency overview.
Best,
Martin
Hi @mschmitz
I hope I am not bothering you.
Thank you so far for your input - the process documents ( vector creation: term occurrences) on each cluster gives a good overview.
What I end up with is the following table:
My question now is: is there a way to show only the top 5 words per cluster (no occurrences) through some magic ETL which I don't know yet, or is there no other choice than to transpose this table and sort each cluster in decreasing order manually?
Best regards
flo
0 -
Hi @flo,
have a look at the attached process. It should do what you want.
BR,
Martin
<?xml version="1.0" encoding="UTF-8"?>
<!-- RapidMiner process: build a term-occurrence vector from the "Review" text, cluster it,
     split the result into one example set per cluster value and reduce each one to its
     five highest-summed attributes. -->
<process version="8.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.001" expanded="true" name="Process">
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="8.2.001" expanded="true" height="68" name="Retrieve OpenRanks Reviews Beijing" width="90" x="45" y="34">
        <parameter key="repository_entry" value="data/OpenRanks Reviews Beijing"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="8.2.001" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Review"/>
      </operator>
      <!-- Term occurrences, percentual pruning below 5.0; inner chain: Transform Cases, Tokenize, English stopword filter. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="313" y="34">
        <parameter key="vector_creation" value="Term Occurrences"/>
        <parameter key="add_meta_information" value="false"/>
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_below_percent" value="5.0"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="45" y="34"/>
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="246" y="34"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="514" y="85"/>
          <connect from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="concurrency:k_means" compatibility="8.2.001" expanded="true" height="82" name="Clustering" width="90" x="447" y="34"/>
      <!-- Groups the clustered set by the "cluster" attribute into a collection. -->
      <operator activated="true" class="operator_toolbox:group_into_collection" compatibility="1.3.000-SNAPSHOT" expanded="true" height="82" name="Group Into Collection" width="90" x="715" y="34">
        <parameter key="group_by_attribute" value="cluster"/>
      </operator>
      <operator activated="true" class="loop_collection" compatibility="8.2.001" expanded="true" height="82" name="Loop Collection" width="90" x="849" y="34">
        <process expanded="true">
          <!-- Store the "cluster" value of example 1 in the macro clusterId. -->
          <operator activated="true" class="extract_macro" compatibility="8.2.001" expanded="true" height="68" name="Extract Macro" width="90" x="45" y="34">
            <parameter key="macro" value="clusterId"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="cluster"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros"/>
          </operator>
          <!-- Inverted single selection: everything except "cluster" is kept. -->
          <operator activated="true" class="select_attributes" compatibility="8.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="112" y="136">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="cluster"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Default aggregation "sum" is applied; no explicit aggregation attributes are listed. -->
          <operator activated="true" class="aggregate" compatibility="8.2.001" expanded="true" height="82" name="Aggregate (2)" width="90" x="179" y="34">
            <parameter key="use_default_aggregation" value="true"/>
            <parameter key="default_aggregation_function" value="sum"/>
            <list key="aggregation_attributes"/>
          </operator>
          <operator activated="true" class="transpose" compatibility="8.2.001" expanded="true" height="82" name="Transpose" width="90" x="313" y="34"/>
          <operator activated="true" class="sort" compatibility="8.2.001" expanded="true" height="82" name="Sort" width="90" x="447" y="34">
            <parameter key="attribute_name" value="att_1"/>
            <parameter key="sorting_direction" value="decreasing"/>
          </operator>
          <operator activated="true" class="filter_example_range" compatibility="8.2.001" expanded="true" height="82" name="Filter Example Range" width="90" x="581" y="34">
            <parameter key="first_example" value="1"/>
            <parameter key="last_example" value="5"/>
            <description align="center" color="transparent" colored="false" width="126">Take Top5</description>
          </operator>
          <!-- Rewrites id values of the form sum(X) to X via the capture group. -->
          <operator activated="true" class="replace" compatibility="8.2.001" expanded="true" height="82" name="Replace" width="90" x="715" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="id"/>
            <parameter key="include_special_attributes" value="true"/>
            <parameter key="replace_what" value="sum\((.+)\)"/>
            <parameter key="replace_by" value="$1"/>
          </operator>
          <operator activated="true" class="rename" compatibility="8.2.001" expanded="true" height="82" name="Rename" width="90" x="849" y="34">
            <parameter key="old_name" value="att_1"/>
            <parameter key="new_name" value="sum"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Adds an attribute "cluster" built from the clusterId macro extracted above. -->
          <operator activated="true" class="generate_attributes" compatibility="8.2.001" expanded="true" height="82" name="Generate Attributes" width="90" x="983" y="34">
            <list key="function_descriptions">
              <parameter key="cluster" value="%{clusterId}"/>
            </list>
          </operator>
          <connect from_port="single" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Extract Macro" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Aggregate (2)" to_port="example set input"/>
          <connect from_op="Aggregate (2)" from_port="example set output" to_op="Transpose" to_port="example set input"/>
          <connect from_op="Transpose" from_port="example set output" to_op="Sort" to_port="example set input"/>
          <connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
          <connect from_op="Filter Example Range" from_port="example set output" to_op="Replace" to_port="example set input"/>
          <connect from_op="Replace" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Retrieve OpenRanks Reviews Beijing" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="clustered set" to_op="Group Into Collection" to_port="exa"/>
      <connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <description align="center" color="yellow" colored="false" height="50" resized="true" width="481" x="271" y="235">Task: Calculate the top 5 most frequent words per cluster</description>
    </process>
  </operator>
</process>
Edit: Also have a look at this blog post: https://medium.com/@mSchmitz_/understanding-clustering-cf0117148ef4
I think this is closer to what you really want.
1 -
Hi @mschmitz
Yes that was very much what I wanted to do. I have modified the process a little bit so that it shows me the TOP 5 words for each cluster in one example set more or less like this:
ClusterID TOP1 TOP2 TOP3 TOP4 TOP5
Cluster_1
Cluster_2
Cluster_3
Thank you very much.
Best regards
flo
0 -
Hello dear friends and forum professors, sorry... I also want to find the repeated words in each cluster and the centers of each cluster, but I do not know how. Can somebody tell me?
Also, I used the process from @mschmitz.
But for 8 clusters, it only gave the words for clusters 0, 1, 2, 5, 6 and 7.
It did not give the words for clusters 4 and 5.
What's wrong? Thanks for your help.
0 -
Hi dear friend @flo
Thank you very much for your help
For me, lda also had the result likelihood = 0 on clusters
I did not understand your sentence
Can you explain more?and how?
"But what I will try is just to get a wordlist from every cluster and sort them down the top. That would hopefully bring a similar result to the LDA."
Thanks a lot
0 -
Hi @m_keshavarz_com , @student_compute
maybe this can help you, if you are looking for the most frequent words for each cluster:
<?xml version="1.0" encoding="UTF-8"?><process version="8.2.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.2.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="8.2.001" expanded="true" height="68" name="Retrieve 01_KAM Text Alles Kategorie" width="90" x="45" y="85">
<parameter key="repository_entry" value="//Masterarbeit Final/01_Text Preprocessing/Daten/01_KAM Text Alles Kategorie"/>
</operator>
<operator activated="true" class="sample" compatibility="8.2.001" expanded="true" height="82" name="Sample" width="90" x="45" y="187">
<parameter key="sample" value="relative"/>
<parameter key="sample_size" value="40"/>
<parameter key="sample_ratio" value="0.5"/>
<list key="sample_size_per_class"/>
<list key="sample_ratio_per_class"/>
<list key="sample_probability_per_class"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="8.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="289">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="ID|Verkettet"/>
</operator>
<operator activated="true" class="set_role" compatibility="8.2.001" expanded="true" height="82" name="Set Role" width="90" x="45" y="391">
<parameter key="attribute_name" value="ID"/>
<parameter key="target_role" value="id"/>
<list key="set_additional_roles">
<parameter key="Verkettet" value="regular"/>
</list>
</operator>
<operator activated="true" class="nominal_to_text" compatibility="8.2.001" expanded="true" height="82" name="Nominal to Text" width="90" x="246" y="85">
<parameter key="attribute" value="Verkettet"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="246" y="187">
<parameter key="prune_method" value="percentual"/>
<parameter key="prune_below_percent" value="5.0"/>
<parameter key="prune_above_percent" value="50.0"/>
<list key="specify_weights"/>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34"/>
<operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="313" y="34"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="447" y="34"/>
<operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="648" y="34"/>
<operator activated="true" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="782" y="34"/>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
<connect from_op="Stem (Porter)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="concurrency:k_means" compatibility="8.2.001" expanded="true" height="82" name="Clustering" width="90" x="246" y="289">
<parameter key="k" value="5"/>
</operator>
<operator activated="true" class="operator_toolbox:group_into_collection" compatibility="1.2.000" expanded="true" height="82" name="Group Into Collection" width="90" x="246" y="391">
<parameter key="group_by_attribute" value="cluster"/>
</operator>
<operator activated="true" class="loop_collection" compatibility="8.2.001" expanded="true" height="82" name="Loop Collection" width="90" x="447" y="85">
<parameter key="set_iteration_macro" value="true"/>
<parameter key="unfold" value="true"/>
<process expanded="true">
<operator activated="true" class="extract_macro" compatibility="8.2.001" expanded="true" height="68" name="Extract Macro" width="90" x="45" y="136">
<parameter key="macro" value="clusterID"/>
<parameter key="macro_type" value="data_value"/>
<parameter key="attribute_name" value="cluster"/>
<parameter key="example_index" value="1"/>
<list key="additional_macros"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="8.2.001" expanded="true" height="82" name="Select Attributes (3)" width="90" x="179" y="136">
<parameter key="attribute" value="cluster"/>
</operator>
<operator activated="true" class="aggregate" compatibility="8.2.001" expanded="true" height="82" name="Aggregate" width="90" x="179" y="238">
<parameter key="use_default_aggregation" value="true"/>
<parameter key="default_aggregation_function" value="sum"/>
<list key="aggregation_attributes"/>
</operator>
<operator activated="true" class="transpose" compatibility="8.2.001" expanded="true" height="82" name="Transpose" width="90" x="179" y="340"/>
<operator activated="true" class="sort" compatibility="8.2.001" expanded="true" height="82" name="Sort" width="90" x="179" y="442">
<parameter key="attribute_name" value="att_1"/>
<parameter key="sorting_direction" value="decreasing"/>
</operator>
<operator activated="true" breakpoints="after" class="filter_example_range" compatibility="8.2.001" expanded="true" height="82" name="Filter Example Range" width="90" x="313" y="136">
<parameter key="first_example" value="1"/>
<parameter key="last_example" value="7"/>
</operator>
<operator activated="true" breakpoints="after" class="replace" compatibility="8.2.001" expanded="true" height="82" name="Replace" width="90" x="313" y="238">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="id"/>
<parameter key="include_special_attributes" value="true"/>
<parameter key="replace_what" value="sum\("/>
</operator>
<operator activated="true" breakpoints="after" class="replace" compatibility="8.2.001" expanded="true" height="82" name="Replace (2)" width="90" x="313" y="340">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="id"/>
<parameter key="include_special_attributes" value="true"/>
<parameter key="replace_what" value="\)"/>
</operator>
<operator activated="true" breakpoints="after" class="rename" compatibility="8.2.001" expanded="true" height="82" name="Rename" width="90" x="313" y="442">
<parameter key="old_name" value="att_1"/>
<parameter key="new_name" value="sum"/>
<list key="rename_additional_attributes">
<parameter key="id" value="word"/>
</list>
</operator>
<!-- Set Role (2): changes the role of "word" (no target_role is given, so the operator default applies;
     presumably "regular", TODO confirm) and additionally sets "sum" to regular. -->
<operator name="Set Role (2)" class="set_role"
          activated="true" breakpoints="after" compatibility="8.2.001" expanded="true"
          height="82" width="90" x="447" y="136">
  <parameter key="attribute_name" value="word"/>
  <list key="set_additional_roles">
    <parameter key="sum" value="regular"/>
  </list>
</operator>
<!-- Transpose (2): flips rows and columns again, this time on the output of "Set Role (2)"; default parameters. -->
<operator name="Transpose (2)" class="transpose"
          activated="true" breakpoints="after" compatibility="8.2.001" expanded="true"
          height="82" width="90" x="447" y="238"/>
<!-- Generate Attributes: adds an attribute "clusterid" whose expression is the macro reference %{clusterID}.
     The macro is defined outside this operator (presumably by "Extract Macro", TODO confirm). -->
<operator name="Generate Attributes" class="generate_attributes"
          activated="true" breakpoints="after" compatibility="8.2.001" expanded="true"
          height="82" width="90" x="447" y="340">
  <list key="function_descriptions">
    <parameter key="clusterid" value="%{clusterID}"/>
  </list>
</operator>
<!-- Set Role (3): gives "clusterid" the id role and turns the attribute named "id" into a regular attribute. -->
<operator name="Set Role (3)" class="set_role"
          activated="true" breakpoints="after" compatibility="8.2.001" expanded="true"
          height="82" width="90" x="447" y="442">
  <parameter key="attribute_name" value="clusterid"/>
  <parameter key="target_role" value="id"/>
  <list key="set_additional_roles">
    <parameter key="id" value="regular"/>
  </list>
</operator>
<!-- Filter Examples: single filter entry id.equals.word, i.e. it appears to keep only examples whose
     "id" attribute equals the value "word" (entry format read as attribute.condition.value, TODO confirm). -->
<operator name="Filter Examples" class="filter_examples"
          activated="true" breakpoints="after" compatibility="8.2.001" expanded="true"
          height="103" width="90" x="581" y="136">
  <list key="filters_list">
    <parameter key="filters_entry_key" value="id.equals.word"/>
  </list>
</operator>
<!-- Renaming att_x into TOP x: renames att_1 through att_7 to TOP_1 through TOP_7.
     NOTE(review): the seven entries appear to correspond to last_example="7" in "Filter Example Range";
     keep both in sync if the number of top words changes (to be confirmed). -->
<operator name="Renaming att_x into TOP x" class="rename"
          activated="true" breakpoints="after" compatibility="8.2.001" expanded="true"
          height="82" width="90" x="715" y="238">
  <parameter key="old_name" value="att_1"/>
  <parameter key="new_name" value="TOP_1"/>
  <list key="rename_additional_attributes">
    <parameter key="att_2" value="TOP_2"/>
    <parameter key="att_3" value="TOP_3"/>
    <parameter key="att_4" value="TOP_4"/>
    <parameter key="att_5" value="TOP_5"/>
    <parameter key="att_6" value="TOP_6"/>
    <parameter key="att_7" value="TOP_7"/>
  </list>
</operator>
<!-- Select Attributes (4): selects the single attribute "id" with the selection inverted,
     so every attribute except "id" is passed on. -->
<operator name="Select Attributes (4)" class="select_attributes"
          activated="true" breakpoints="after" compatibility="8.2.001" expanded="true"
          height="82" width="90" x="581" y="340">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="id"/>
  <parameter key="invert_selection" value="true"/>
</operator>
<!-- Wiring of this subprocess: the "single" input port feeds one linear chain,
     Extract Macro, Select Attributes (3), Aggregate, Transpose, Sort, Filter Example Range,
     Replace, Replace (2), Rename, Set Role (2), Transpose (2), Generate Attributes,
     Set Role (3), Filter Examples, Renaming att_x into TOP x, Select Attributes (4),
     whose result is delivered on "output 1". -->
<connect from_port="single" to_op="Extract Macro" to_port="example set"/>
<connect from_op="Extract Macro" from_port="example set" to_op="Select Attributes (3)" to_port="example set input"/>
<connect from_op="Select Attributes (3)" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
<connect from_op="Aggregate" from_port="example set output" to_op="Transpose" to_port="example set input"/>
<connect from_op="Transpose" from_port="example set output" to_op="Sort" to_port="example set input"/>
<connect from_op="Sort" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Filter Example Range" from_port="example set output" to_op="Replace" to_port="example set input"/>
<connect from_op="Replace" from_port="example set output" to_op="Replace (2)" to_port="example set input"/>
<connect from_op="Replace (2)" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
<connect from_op="Set Role (2)" from_port="example set output" to_op="Transpose (2)" to_port="example set input"/>
<connect from_op="Transpose (2)" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role (3)" to_port="example set input"/>
<connect from_op="Set Role (3)" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Renaming att_x into TOP x" to_port="example set input"/>
<connect from_op="Renaming att_x into TOP x" from_port="example set output" to_op="Select Attributes (4)" to_port="example set input"/>
<connect from_op="Select Attributes (4)" from_port="example set output" to_port="output 1"/>
<!-- Port spacing entries and the canvas annotation below carry layout/annotation data only. -->
<portSpacing port="source_single" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<description align="center" color="yellow" colored="false" height="63" resized="true" width="158" x="273" y="57">select number of TOP words you want to see</description>
</process>
</operator>
<!-- Append: receives "output 1" of "Loop Collection" on its first input and emits one merged example set. -->
<operator name="Append" class="append"
          activated="true" compatibility="8.2.001" expanded="true"
          height="82" width="90" x="447" y="187"/>
<!-- Top-level wiring: Retrieve 01_KAM Text Alles Kategorie, Sample, Select Attributes, Set Role,
     Nominal to Text, Process Documents from Data, Clustering, Group Into Collection,
     Loop Collection, Append; the merged set from Append is delivered on "result 1". -->
<connect from_op="Retrieve 01_KAM Text Alles Kategorie" from_port="output" to_op="Sample" to_port="example set input"/>
<connect from_op="Sample" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Clustering" to_port="example set"/>
<connect from_op="Clustering" from_port="clustered set" to_op="Group Into Collection" to_port="exa"/>
<connect from_op="Group Into Collection" from_port="col" to_op="Loop Collection" to_port="collection"/>
<connect from_op="Loop Collection" from_port="output 1" to_op="Append" to_port="example set 1"/>
<connect from_op="Append" from_port="merged set" to_port="result 1"/>
<!-- Port spacing entries below carry layout data only. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
Of course, thanks to @mschmitz for most of the process.
Best
flo
1 -
Hello, and thank you very much, dear friend! :smileyhappy:
But I would like to know: how can I find the most frequently repeated words of each cluster, and the center of each cluster?
Thank you so much for your kindness.