<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  NOTE(review): the original file began with the stray line
  "A program to recognize and reward our most engaged community members"
  placed BEFORE the XML declaration, which made the document ill-formed
  (nothing may precede the declaration). The text is preserved here as a
  comment; it does not appear to describe this process (a text-mining
  pre-processing pipeline). TODO: confirm the intended file description.

  Pipeline summary (from the operator chain below): retrieve an example set,
  convert it to documents, then per document: extract HTML text content,
  split run-together CamelCase words, lower-case, remove stop phrases,
  tokenize, strip mojibake prefixes, remove English stopwords, drop tokens
  shorter than 3 chars, and write each document to a .txt file.
-->
<process version="5.3.015">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.3.015" expanded="true" name="Process">
    <process expanded="true">
      <!-- NOTE(review): no repository location parameter is set on Retrieve;
           presumably configured interactively. Verify before running. -->
      <operator activated="true" class="retrieve" compatibility="5.3.015" expanded="true" height="60" name="Retrieve" width="90" x="45" y="30"/>
      <operator activated="true" class="text:data_to_documents" compatibility="5.3.002" expanded="true" height="60" name="Data to Documents" width="90" x="246" y="30">
        <description>Need to select the attributes in the pro settings of the Data to Document </description>
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="OriginalTextFromDocument" value="1.0"/>
        </list>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Non Letters (2)" width="90" x="447" y="30">
        <description>Process the entire dataset as if it were DIT data and use the merged stop phrase list to remove the boiler plate</description>
        <parameter key="keep_text" value="true"/>
        <!-- Inner per-document processing chain. -->
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="5.3.002" expanded="true" height="60" name="Extract Content (3)" width="90" x="45" y="30"/>
          <!-- Inserts a space before each capital letter to split words that
               were concatenated when punctuation was lost. -->
          <operator activated="true" class="text:replace_tokens" compatibility="5.3.002" expanded="true" height="60" name="Split textText or TextText" width="90" x="180" y="30">
            <description>Uses the regex from rapidminer forum to split where capitialised letters are in the middle of words because punctuation is missing from the original text. 
It finds captialised words and replaces them with a space and the captured text</description>
            <list key="replace_dictionary">
              <parameter key="([A-Z])" value=" $1"/>
            </list>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="lower case (2)" width="90" x="315" y="30"/>
          <!-- NOTE(review): path below is machine-specific (user "Michael");
               a later operator uses a different profile ("michael.obrien").
               Confirm both paths resolve on the target machine. -->
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.3.002" expanded="true" height="76" name="Filter Stop Phrases" width="90" x="450" y="30">
            <parameter key="file" value="C:\Users\Michael\Google Drive\My Masters\RapidMinerRepo\Text Mining\Assignment\AdditionalFiles\MergedStopPhrases.txt"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize non letters (2)" width="90" x="585" y="30"/>
          <!-- NOTE(review): the regex key contains U+FFFD replacement
               characters, likely mojibake from a non-UTF-8 source file;
               kept byte-for-byte to preserve behavior. Strips 1-2 of those
               characters from the start of tokens. TODO: confirm against
               the original character sequence. -->
          <operator activated="true" class="text:replace_tokens" compatibility="5.3.002" expanded="true" height="60" name="Replace Regex" width="90" x="581" y="120">
            <description>Remove �� from the start of some words and sometime ���� and replace it with just the word found after itRegular Expression Replacement��{1,2}() $1</description>
            <list key="replace_dictionary">
              <parameter key="��{1,2}()" value="$1"/>
            </list>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Stop Eng (2)" width="90" x="313" y="187"/>
          <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (2)" width="90" x="447" y="255">
            <parameter key="min_chars" value="3"/>
          </operator>
          <!-- Writes each processed document to a file named by the %{a}
               macro (one .txt per document). -->
          <operator activated="true" class="text:write_document" compatibility="5.3.002" expanded="true" height="76" name="Write Document" width="90" x="581" y="255">
            <parameter key="file" value="C:\Users\michael.obrien\Google Drive\My Masters\Text and Web Mining\Text Mining Assignment\Python\MergedDataset\%{a}.txt"/>
          </operator>
          <connect from_port="document" to_op="Extract Content (3)" to_port="document"/>
          <connect from_op="Extract Content (3)" from_port="document" to_op="Split textText or TextText" to_port="document"/>
          <connect from_op="Split textText or TextText" from_port="document" to_op="lower case (2)" to_port="document"/>
          <connect from_op="lower case (2)" from_port="document" to_op="Filter Stop Phrases" to_port="document"/>
          <connect from_op="Filter Stop Phrases" from_port="document" to_op="Tokenize non letters (2)" to_port="document"/>
          <connect from_op="Tokenize non letters (2)" from_port="document" to_op="Replace Regex" to_port="document"/>
          <connect from_op="Replace Regex" from_port="document" to_op="Stop Eng (2)" to_port="document"/>
          <connect from_op="Stop Eng (2)" from_port="document" to_op="Filter Tokens (2)" to_port="document"/>
          <connect from_op="Filter Tokens (2)" from_port="document" to_op="Write Document" to_port="document"/>
          <connect from_op="Write Document" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Retrieve" from_port="output" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Non Letters (2)" to_port="documents 1"/>
      <connect from_op="Process Non Letters (2)" from_port="word list" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>