Hi guys, any idea how best to tweak the parameters when optimizing the LDA model? I'm playing about with this example using RSS news feeds and am not 100% sure whether the Optimize Parameters operator is working well enough for small numbers of topics. Do you think the dataset is simply not large enough?
Please note: increasing the start and end values for the number of topics gives better results. I want to build a generic usage example, so any suggested pointers would be welcome.
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.001">
<!-- Process context: the input, output and macros sections are all empty. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="8.1.001" expanded="true" name="Process">
<process expanded="true">
<!-- Get News Feeds: reads four BBC RSS feeds, appends them into one example set,
     duplicates Title as Title2, converts Link and Title2 to nominal, and splits the
     rows 0.7 / 0.3 with shuffled sampling onto ports "out 1" and "out 2". -->
<operator name="Get News Feeds" class="subprocess" activated="true"
          compatibility="8.1.001" expanded="true"
          height="103" width="90" x="45" y="34">
  <process expanded="true">

    <!-- Feed readers: one web:read_rss operator per BBC feed URL. -->
    <operator name="BBC Top Stories" class="web:read_rss" activated="true"
              compatibility="7.3.000" expanded="true"
              height="68" width="90" x="45" y="34">
      <parameter key="url" value="http://feeds.bbci.co.uk/news/rss.xml"/>
    </operator>
    <operator name="BBC Asia" class="web:read_rss" activated="true"
              compatibility="7.3.000" expanded="true"
              height="68" width="90" x="45" y="85">
      <parameter key="url" value="http://feeds.bbci.co.uk/news/world/asia/rss.xml"/>
    </operator>
    <operator name="BBC Business" class="web:read_rss" activated="true"
              compatibility="7.3.000" expanded="true"
              height="68" width="90" x="45" y="136">
      <parameter key="url" value="http://feeds.bbci.co.uk/news/business/rss.xml"/>
    </operator>
    <operator name="BBC Entertainment" class="web:read_rss" activated="true"
              compatibility="7.3.000" expanded="true"
              height="68" width="90" x="45" y="187">
      <parameter key="url" value="http://feeds.bbci.co.uk/news/entertainment_and_arts/rss.xml"/>
    </operator>

    <!-- Stack the four feeds into a single example set. -->
    <operator name="Append" class="append" activated="true"
              compatibility="8.1.001" expanded="true"
              height="145" width="90" x="179" y="34"/>

    <!-- Keep a second copy of the Title attribute under the name Title2. -->
    <operator name="Generate Copy" class="generate_copy" activated="true"
              compatibility="8.1.001" expanded="true"
              height="82" width="90" x="313" y="34">
      <parameter key="attribute_name" value="Title"/>
      <parameter key="new_name" value="Title2"/>
    </operator>

    <!-- Only the Link and Title2 attributes are converted to nominal. -->
    <operator name="Text to Nominal" class="text_to_nominal" activated="true"
              compatibility="8.1.001" expanded="true"
              height="82" width="90" x="447" y="34">
      <parameter key="attribute_filter_type" value="subset"/>
      <parameter key="attributes" value="Link|Title2"/>
      <description align="center" color="transparent" colored="false" width="126">Don't convert article link to document text.</description>
    </operator>

    <!-- Two partitions: 0.7 goes to "partition 1", 0.3 to "partition 2". -->
    <operator name="Split Data" class="split_data" activated="true"
              compatibility="8.1.001" expanded="true"
              height="103" width="90" x="581" y="34">
      <enumeration key="partitions">
        <parameter key="ratio" value="0.7"/>
        <parameter key="ratio" value="0.3"/>
      </enumeration>
      <parameter key="sampling_type" value="shuffled sampling"/>
      <description align="center" color="transparent" colored="false" width="126">Randomly sort the data from the feeds. Split into training &amp; testing.</description>
    </operator>

    <!-- Wiring: feeds into Append, then Generate Copy, Text to Nominal, Split Data, subprocess outputs. -->
    <connect from_op="BBC Top Stories" from_port="output" to_op="Append" to_port="example set 1"/>
    <connect from_op="BBC Asia" from_port="output" to_op="Append" to_port="example set 2"/>
    <connect from_op="BBC Business" from_port="output" to_op="Append" to_port="example set 3"/>
    <connect from_op="BBC Entertainment" from_port="output" to_op="Append" to_port="example set 4"/>
    <connect from_op="Append" from_port="merged set" to_op="Generate Copy" to_port="example set input"/>
    <connect from_op="Generate Copy" from_port="example set output" to_op="Text to Nominal" to_port="example set input"/>
    <connect from_op="Text to Nominal" from_port="example set output" to_op="Split Data" to_port="example set"/>
    <connect from_op="Split Data" from_port="partition 1" to_port="out 1"/>
    <connect from_op="Split Data" from_port="partition 2" to_port="out 2"/>

    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    <portSpacing port="sink_out 3" spacing="0"/>
  </process>
</operator>
<!-- Data to Documents (2): turns the example set arriving from "out 2" of
     Get News Feeds into documents; no attribute weights are specified. -->
<operator name="Data to Documents (2)" class="text:data_to_documents" activated="true"
          compatibility="8.1.000" expanded="true"
          height="68" width="90" x="179" y="289">
  <list key="specify_weights"/>
</operator>
<!-- Data to Documents: turns the example set arriving from "out 1" of
     Get News Feeds into documents; no attribute weights are specified. -->
<operator name="Data to Documents" class="text:data_to_documents" activated="true"
          compatibility="8.1.000" expanded="true"
          height="68" width="90" x="179" y="34">
  <list key="specify_weights"/>
</operator>
<!-- Loop Collection: runs the text-prep chain below on every document of the
     incoming collection. Chain order: tokenize, transform cases, filter tokens by
     length (min_chars = 2), filter English stopwords, generate n-grams. -->
<operator name="Loop Collection" class="loop_collection" activated="true"
          compatibility="8.1.001" expanded="true"
          height="82" width="90" x="313" y="34">
  <process expanded="true">
    <operator name="Tokenize" class="text:tokenize" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="45" y="34"/>
    <operator name="Transform Cases" class="text:transform_cases" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="179" y="34"/>
    <operator name="Filter Tokens (by Length)" class="text:filter_by_length" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="313" y="34">
      <parameter key="min_chars" value="2"/>
    </operator>
    <operator name="Filter Stopwords (English)" class="text:filter_stopwords_english" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="447" y="34"/>
    <operator name="Generate n-Grams (Terms)" class="text:generate_n_grams_terms" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="581" y="34"/>

    <!-- Linear chain from the loop's "single" source port to "output 1". -->
    <connect from_port="single" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
    <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
    <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
    <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="output 1"/>

    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
  </process>
  <description align="center" color="transparent" colored="false" width="126">Text Prep using Text Mining</description>
</operator>
<!-- Loop Collection (3): second text-prep loop, built from the same operator
     classes and settings as "Loop Collection". Chain order: tokenize, transform
     cases, filter tokens by length (min_chars = 2), filter English stopwords,
     generate n-grams. -->
<operator name="Loop Collection (3)" class="loop_collection" activated="true"
          compatibility="8.1.001" expanded="true"
          height="82" width="90" x="313" y="238">
  <process expanded="true">
    <operator name="Tokenize (3)" class="text:tokenize" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="45" y="34"/>
    <operator name="Transform Cases (2)" class="text:transform_cases" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="179" y="34"/>
    <operator name="Filter Tokens (3)" class="text:filter_by_length" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="313" y="34">
      <parameter key="min_chars" value="2"/>
    </operator>
    <operator name="Filter Stopwords (3)" class="text:filter_stopwords_english" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="447" y="34"/>
    <operator name="Generate n-Grams (2)" class="text:generate_n_grams_terms" activated="true"
              compatibility="8.1.000" expanded="true"
              height="68" width="90" x="581" y="34"/>

    <!-- Linear chain from the loop's "single" source port to "output 1". -->
    <connect from_port="single" to_op="Tokenize (3)" to_port="document"/>
    <connect from_op="Tokenize (3)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
    <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Tokens (3)" to_port="document"/>
    <connect from_op="Filter Tokens (3)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
    <connect from_op="Filter Stopwords (3)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
    <connect from_op="Generate n-Grams (2)" from_port="document" to_port="output 1"/>

    <portSpacing port="source_single" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
  </process>
  <description align="center" color="transparent" colored="false" width="126">Text Prep using Text Mining</description>
</operator>
<!-- Optimize Parameters (Grid): searches over LDA.number_of_topics using the grid
     spec [5;20;5;linear] (presumably min;max;steps;scale; confirm against the
     operator documentation). The inner LDA performance output is wired to the
     "performance" sink that the optimizer evaluates. -->
<operator name="Optimize Parameters (Grid)" class="concurrency:optimize_parameters_grid"
          activated="true" compatibility="8.1.001" expanded="true"
          height="187" width="90" x="514" y="34">
  <list key="parameters">
    <parameter key="LDA.number_of_topics" value="[5;20;5;linear]"/>
  </list>
  <process expanded="true">

    <!-- Topic model. number_of_topics is the parameter named in the grid above;
         a fixed local random seed (1997) is enabled. -->
    <operator name="LDA" class="operator_toolbox:lda" activated="true"
              compatibility="1.0.000" expanded="true"
              height="124" width="90" x="179" y="34">
      <parameter key="number_of_topics" value="20"/>
      <parameter key="iterations" value="100"/>
      <parameter key="use_local_random_seed" value="true"/>
      <parameter key="local_random_seed" value="1997"/>
    </operator>

    <!-- Placeholder model branch: a decision tree trained on generated direct
         mailing data is wired to the "model" sink, while the LDA model itself
         leaves through "output 3" (see the Decision Tree description). -->
    <operator name="Generate Direct Mailing Data" class="generate_direct_mailing_data"
              activated="true" compatibility="8.1.001" expanded="true"
              height="68" width="90" x="112" y="238"/>
    <operator name="Decision Tree" class="concurrency:parallel_decision_tree"
              activated="true" compatibility="8.1.001" expanded="true"
              height="103" width="90" x="246" y="238">
      <description align="center" color="transparent" colored="false" width="126">This is because the optimize operator doesn't recognize the LDA model type.</description>
    </operator>

    <!-- LDA ports: exa to output 1, top to output 2, mod to output 3, per to performance. -->
    <connect from_port="input 1" to_op="LDA" to_port="col"/>
    <connect from_op="LDA" from_port="exa" to_port="output 1"/>
    <connect from_op="LDA" from_port="top" to_port="output 2"/>
    <connect from_op="LDA" from_port="mod" to_port="output 3"/>
    <connect from_op="LDA" from_port="per" to_port="performance"/>
    <connect from_op="Generate Direct Mailing Data" from_port="output" to_op="Decision Tree" to_port="training set"/>
    <connect from_op="Decision Tree" from_port="model" to_port="model"/>

    <portSpacing port="source_input 1" spacing="0"/>
    <portSpacing port="source_input 2" spacing="0"/>
    <portSpacing port="sink_performance" spacing="0"/>
    <portSpacing port="sink_model" spacing="0"/>
    <portSpacing port="sink_output 1" spacing="0"/>
    <portSpacing port="sink_output 2" spacing="0"/>
    <portSpacing port="sink_output 3" spacing="0"/>
    <portSpacing port="sink_output 4" spacing="0"/>
  </process>
</operator>
<!-- Apply Model (Documents): receives the model from "output 3" of
     Optimize Parameters (Grid) and the documents from Loop Collection (3). -->
<operator name="Apply Model (Documents)" class="operator_toolbox:apply_model_documents"
          activated="true" compatibility="1.0.000" expanded="true"
          height="103" width="90" x="581" y="238"/>
<!-- Top-level wiring. "out 1" of Get News Feeds (the 0.7 partition) feeds
     Data to Documents; "out 2" (the 0.3 partition) feeds Data to Documents (2). -->
<connect from_op="Get News Feeds" from_port="out 1" to_op="Data to Documents" to_port="example set"/>
<connect from_op="Get News Feeds" from_port="out 2" to_op="Data to Documents (2)" to_port="example set"/>
<!-- Each document collection goes through its own text-prep loop. -->
<connect from_op="Data to Documents (2)" from_port="documents" to_op="Loop Collection (3)" to_port="collection"/>
<connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
<!-- Documents from the "out 1" branch drive the parameter optimization; documents
     from the "out 2" branch are the input of Apply Model (Documents). -->
<connect from_op="Loop Collection" from_port="output 1" to_op="Optimize Parameters (Grid)" to_port="input 1"/>
<connect from_op="Loop Collection (3)" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>
<!-- Optimizer outputs: performance to result 2, output 1 to result 3, output 2 to
     result 4; output 3 (wired from the LDA "mod" port inside the optimizer) goes
     to Apply Model (Documents). -->
<connect from_op="Optimize Parameters (Grid)" from_port="performance" to_port="result 2"/>
<connect from_op="Optimize Parameters (Grid)" from_port="output 1" to_port="result 3"/>
<connect from_op="Optimize Parameters (Grid)" from_port="output 2" to_port="result 4"/>
<connect from_op="Optimize Parameters (Grid)" from_port="output 3" to_op="Apply Model (Documents)" to_port="mod"/>
<!-- The "exa" output of Apply Model (Documents) is delivered as result 1. -->
<connect from_op="Apply Model (Documents)" from_port="exa" to_port="result 1"/>
<!-- Port spacing entries for the top-level process ports. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="84"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
</process>
</operator>
</process>