A program to recognize and reward our most engaged community members
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: train a Naive Bayes model on tokenized documents, apply it
  to a second document set, then loop over the scored examples and run an
  external program once per example.

  Data flow (taken from the <connect> elements below):
    Process Documents (Training)    : example set  to  Naive Bayes (training set)
    Process Documents (Training)    : word list    to  Process Documents (Application)
    Process Documents (Application) : example set  to  Apply Model (unlabelled data)
    Naive Bayes                     : model        to  Apply Model (model)
    Apply Model                     : labelled data to Loop Examples, then result 1

  NOTE(review): the directory values are absolute paths on one user's machine;
  they must be adapted before the process can run elsewhere.
-->
<process version="5.2.009">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="5.2.009"
            expanded="true"
            name="Process">
    <process expanded="true" height="415" width="748">

      <!-- Training documents: one directory per class (class1 / class2). -->
      <operator activated="true"
                class="text:process_document_from_file"
                compatibility="5.2.005"
                expanded="true"
                height="76"
                name="Process Documents (Training)"
                width="90"
                x="45"
                y="120">
        <list key="text_directories">
          <parameter key="class1" value="C:\Users\mhelf\Documents\schulungen\4 - Text and Web Mining\Data\files - newsgroup"/>
          <parameter key="class2" value="C:\Users\mhelf\Documents\schulungen\4 - Text and Web Mining\Data\files - various encodings"/>
        </list>
        <!-- Per-document subprocess: the only step is Tokenize. -->
        <process expanded="true" height="546" width="658">
          <operator activated="true"
                    class="text:tokenize"
                    compatibility="5.2.005"
                    expanded="true"
                    height="60"
                    name="Tokenize"
                    width="90"
                    x="179"
                    y="30"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!--
        Application documents. Receives the word list produced by the training
        operator (see the top-level connects).
        NOTE(review): this operator points at the same two directories as the
        training operator; confirm that is intended.
      -->
      <operator activated="true"
                class="text:process_document_from_file"
                compatibility="5.2.005"
                expanded="true"
                height="76"
                name="Process Documents (Application)"
                width="90"
                x="45"
                y="300">
        <list key="text_directories">
          <parameter key="class1" value="C:\Users\mhelf\Documents\schulungen\4 - Text and Web Mining\Data\files - newsgroup"/>
          <parameter key="class2" value="C:\Users\mhelf\Documents\schulungen\4 - Text and Web Mining\Data\files - various encodings"/>
        </list>
        <process expanded="true">
          <operator activated="true"
                    class="text:tokenize"
                    compatibility="5.2.005"
                    expanded="true"
                    name="Tokenize (2)"/>
          <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <!-- The port name must stay on one line: a line break inside the
               attribute value is normalized to an extra space and the name
               would no longer read "document 1". -->
          <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <operator activated="true"
                class="naive_bayes"
                compatibility="5.2.009"
                expanded="true"
                height="76"
                name="Naive Bayes"
                width="90"
                x="313"
                y="30"/>

      <operator activated="true"
                class="apply_model"
                compatibility="5.2.009"
                expanded="true"
                height="76"
                name="Apply Model"
                width="90"
                x="447"
                y="120">
        <list key="application_parameters"/>
      </operator>

      <!-- Runs the nested subprocess for the scored example set. Inside, the
           macro %{example} is used as the example index. -->
      <operator activated="true"
                class="loop_examples"
                compatibility="5.2.009"
                expanded="true"
                height="76"
                name="Loop Examples"
                width="90"
                x="581"
                y="120">
        <process expanded="true" height="546" width="658">
          <!-- Macro "class" = value of attribute prediction(label) at index %{example}. -->
          <operator activated="true"
                    class="extract_macro"
                    compatibility="5.2.009"
                    expanded="true"
                    height="60"
                    name="Extract Macro"
                    width="90"
                    x="45"
                    y="30">
            <parameter key="macro" value="class"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="prediction(label)"/>
            <parameter key="example_index" value="%{example}"/>
          </operator>
          <!-- Macro "path" = value of attribute metadata_path at index %{example}. -->
          <operator activated="true"
                    class="extract_macro"
                    compatibility="5.2.009"
                    expanded="true"
                    height="60"
                    name="Extract Macro (2)"
                    width="90"
                    x="179"
                    y="30">
            <parameter key="macro" value="path"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="metadata_path"/>
            <parameter key="example_index" value="%{example}"/>
          </operator>
          <!--
            Runs an external command built from the two macros above.
            The double quotes around the arguments are part of the command and
            must be written as &quot; because the attribute itself is delimited
            by double quotes.
            NOTE(review): "your_system_command_to_move" and "destination_path"
            look like placeholders; replace them with a real command and target
            directory. The macro values are inserted into the command text
            as-is, so confirm that file paths cannot contain quote characters.
          -->
          <operator activated="true"
                    class="execute_program"
                    compatibility="5.2.009"
                    expanded="true"
                    height="76"
                    name="Execute Program"
                    width="90"
                    x="380"
                    y="30">
            <parameter key="command" value="your_system_command_to_move &quot;%{path}&quot; &quot;destination_path/%{class}&quot;"/>
          </operator>
          <connect from_port="example set" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Extract Macro" from_port="example set" to_op="Extract Macro (2)" to_port="example set"/>
          <connect from_op="Extract Macro (2)" from_port="example set" to_op="Execute Program" to_port="through 1"/>
          <connect from_op="Execute Program" from_port="through 1" to_port="example set"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="sink_example set" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
        </process>
      </operator>

      <!-- Top-level wiring. -->
      <connect from_op="Process Documents (Training)" from_port="example set" to_op="Naive Bayes" to_port="training set"/>
      <connect from_op="Process Documents (Training)" from_port="word list" to_op="Process Documents (Application)" to_port="word list"/>
      <connect from_op="Process Documents (Application)" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Naive Bayes" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_op="Loop Examples" to_port="example set"/>
      <connect from_op="Loop Examples" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Snippet 1 of 3: a single "Loop Files" operator (activated="false").
  It is configured to iterate over files (not subdirectories) below C:\trainpdfs\,
  recursively, and its nested subprocess passes each file object to
  "Read Document" with content type pdf.
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="false"
            class="loop_files"
            compatibility="5.2.008"
            expanded="true"
            height="76"
            name="Loop Files"
            width="90"
            x="45"
            y="75">
    <parameter key="directory" value="C:\trainpdfs\"/>
    <parameter key="filtered_string" value="file name (last part of the path)"/>
    <!-- Names of the macros that carry the current file name, path and parent path. -->
    <parameter key="file_name_macro" value="file_name"/>
    <parameter key="file_path_macro" value="file_path"/>
    <parameter key="parent_path_macro" value="parent_path"/>
    <parameter key="recursive" value="true"/>
    <parameter key="iterate_over_files" value="true"/>
    <parameter key="iterate_over_subdirs" value="false"/>
    <parameter key="parallelize_nested_process" value="false"/>
    <process expanded="true" height="650" width="1080">
      <operator activated="false"
                class="text:read_document"
                compatibility="5.2.004"
                expanded="true"
                height="60"
                name="Read Document"
                width="90"
                x="504"
                y="30">
        <parameter key="extract_text_only" value="true"/>
        <parameter key="use_file_extension_as_type" value="true"/>
        <parameter key="content_type" value="pdf"/>
        <parameter key="encoding" value="SYSTEM"/>
      </operator>
      <connect from_port="file object" to_op="Read Document" to_port="file"/>
      <connect from_op="Read Document" from_port="output" to_port="out 1"/>
      <portSpacing port="source_file object" spacing="0"/>
      <portSpacing port="source_in 1" spacing="0"/>
      <portSpacing port="sink_out 1" spacing="0"/>
      <portSpacing port="sink_out 2" spacing="0"/>
    </process>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Snippet 2 of 3: a single "Process Documents (2)" operator (activated="false").
  It is set to create TF-IDF word vectors with meta information, without
  keeping the text and without pruning; the nested subprocess only tokenizes.
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="false"
            class="text:process_documents"
            compatibility="5.2.004"
            expanded="true"
            height="94"
            name="Process Documents (2)"
            width="90"
            x="179"
            y="75">
    <parameter key="create_word_vector" value="true"/>
    <parameter key="vector_creation" value="TF-IDF"/>
    <parameter key="add_meta_information" value="true"/>
    <parameter key="keep_text" value="false"/>
    <parameter key="prune_method" value="none"/>
    <!-- NOTE(review): the key below is spelled "prunde_below_percent" while its
         siblings use "prune_"; confirm against the installed text extension
         before renaming. With prune_method set to none the pruning thresholds
         are presumably not used. -->
    <parameter key="prunde_below_percent" value="3.0"/>
    <parameter key="prune_above_percent" value="30.0"/>
    <parameter key="prune_below_rank" value="0.05"/>
    <parameter key="prune_above_rank" value="0.05"/>
    <parameter key="datamanagement" value="double_sparse_array"/>
    <parameter key="parallelize_vector_creation" value="false"/>
    <process expanded="true" height="632" width="1080">
      <operator activated="false"
                class="text:tokenize"
                compatibility="5.2.004"
                expanded="true"
                height="60"
                name="Tokenize"
                width="90"
                x="504"
                y="30">
        <parameter key="mode" value="non letters"/>
        <parameter key="characters" value=".:"/>
        <parameter key="language" value="English"/>
        <parameter key="max_token_length" value="3"/>
      </operator>
      <connect from_port="document" to_op="Tokenize" to_port="document"/>
      <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
      <portSpacing port="source_document" spacing="0"/>
      <portSpacing port="sink_document 1" spacing="0"/>
      <portSpacing port="sink_document 2" spacing="0"/>
    </process>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Snippet 3 of 3: a single "Set Role" operator that assigns the role "label"
  to the attribute named metadata_file.
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="set_role"
            compatibility="5.2.008"
            expanded="true"
            height="76"
            name="Set Role"
            width="90"
            x="175"
            y="266">
    <parameter key="name" value="metadata_file"/>
    <parameter key="target_role" value="label"/>
    <list key="set_additional_roles"/>
  </operator>
</process>