This process assigns a language to documents and RSS feeds. After tokenizing the text, it creates character trigrams, which are matched against the training labels. The trained model then scores new text and assigns it a language label.
Text that mixes languages (e.g., Spanish and English) can end up marked as either language, depending on how many training examples you use. You may need a large number of examples for your preferred language.
To mark text categories or sentiment instead, remove the n-gram operator and use topics (e.g., Finance, Sports, Entertainment) as the labels.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.0">
<context>
  <!-- Process context: no inputs, outputs, or macros are declared. -->
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="5.0.0" expanded="true" name="Root">
<!-- Describes the learner used in this process: the "SVM" operator
     (class support_vector_machine_libsvm, kernel_type linear). -->
<description>Using a linear SVM (LibSVM) classifier.</description>
<process expanded="true" height="611" width="949">
<!-- Training source: reads id, title and lang_train from "textfile" over the
     "rsstext" connection, keeping only rows whose lang_train is not null.
     The double quotes inside the SQL are escaped as &quot; so the attribute
     value is well-formed XML; the parsed SQL text is unchanged. -->
<operator name="Read DB - Train" class="read_database" compatibility="5.0.10"
          activated="true" expanded="true"
          height="60" width="90" x="45" y="30">
  <list key="data_set_meta_data_information"/>
  <parameter key="attribute_names_already_defined" value="true"/>
  <parameter key="connection" value="rsstext"/>
  <parameter key="query"
             value="SELECT &quot;id&quot;, &quot;title&quot; ,&quot;lang_train&quot; FROM &quot;textfile&quot; WHERE lang_train is not null"/>
</operator>
<!-- Gives the "id" attribute of the training data the id role. -->
<operator name="ID Train" class="set_role" compatibility="5.0.10"
          activated="true" expanded="true"
          height="76" width="90" x="179" y="30">
  <parameter key="name" value="id"/>
  <parameter key="target_role" value="id"/>
</operator>
<!-- Gives the "lang_train" attribute the label role, making it the target
     that the learner is trained on. -->
<operator name="Label Train" class="set_role" compatibility="5.0.10"
          activated="true" expanded="true"
          height="76" width="90" x="45" y="120">
  <parameter key="name" value="lang_train"/>
  <parameter key="target_role" value="label"/>
</operator>
<!-- Converts the nominal "title" attribute to text for the training data.
     NOTE(review): attribute_filter_type is "single", so "attribute" (title)
     is presumably the only selector in effect; the "attributes" value names
     columns that the training query does not select and looks like a
     leftover. Confirm before removing it. -->
<operator name="NomText Train" class="nominal_to_text" compatibility="5.0.10"
          activated="true" expanded="true"
          height="76" width="90" x="179" y="120">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="title"/>
  <parameter key="attributes" value="posttitle|postdesc"/>
</operator>
<!-- Processes the training documents. Its "example set" output feeds the SVM
     and its "word list" output is passed on to "ProcessDocs Apply".
     Inner chain per document: Transform Cases, Tokenize, then character
     n-gram generation. The inner operators set no parameters here, so their
     defaults apply. -->
<operator name="ProcessDocs Train" class="text:process_document_from_data" compatibility="5.0.6"
          activated="true" expanded="true"
          height="76" width="90" x="313" y="210">
  <list key="specify_weights"/>
  <process expanded="true" height="565" width="827">
    <operator name="Transform Cases" class="text:transform_cases" compatibility="5.0.6"
              activated="true" expanded="true"
              height="60" width="90" x="112" y="30"/>
    <operator name="Tokenize" class="text:tokenize" compatibility="5.0.6"
              activated="true" expanded="true"
              height="60" width="90" x="246" y="30"/>
    <operator name="Generate n-Grams (Characters)" class="text:generate_n_grams_characters" compatibility="5.0.7"
              activated="true" expanded="true"
              height="60" width="90" x="380" y="30"/>
    <!-- Wiring of the inner chain, from the document source to the first sink. -->
    <connect from_port="document" to_op="Transform Cases" to_port="document"/>
    <connect from_op="Transform Cases" from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_op="Generate n-Grams (Characters)" to_port="document"/>
    <connect from_op="Generate n-Grams (Characters)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Learner: LibSVM support vector machine with a linear kernel and no
     explicit class weights. Trained on the output of "ProcessDocs Train". -->
<operator name="SVM" class="support_vector_machine_libsvm" compatibility="5.0.10"
          activated="true" expanded="true"
          height="76" width="90" x="447" y="210">
  <parameter key="kernel_type" value="linear"/>
  <list key="class_weights"/>
</operator>
<!-- Scoring source: reads id and title for every row of "textfile" over the
     "rsstext" connection (no WHERE clause).
     The double quotes inside the SQL are escaped as &quot; so the attribute
     value is well-formed XML; the parsed SQL text, including its trailing
     space, is unchanged. -->
<operator name="Read DB - Apply" class="read_database" compatibility="5.0.10"
          activated="true" expanded="true"
          height="60" width="90" x="45" y="345">
  <list key="data_set_meta_data_information"/>
  <parameter key="attribute_names_already_defined" value="true"/>
  <parameter key="connection" value="rsstext"/>
  <parameter key="query"
             value="SELECT &quot;id&quot;, &quot;title&quot; FROM &quot;textfile&quot; "/>
</operator>
<!-- Gives the "id" attribute of the data to be scored the id role. -->
<operator name="ID Apply" class="set_role" compatibility="5.0.10"
          activated="true" expanded="true"
          height="76" width="90" x="45" y="435">
  <parameter key="name" value="id"/>
  <parameter key="target_role" value="id"/>
</operator>
<!-- Converts the nominal "title" attribute to text for the data to be scored,
     with the same settings as "NomText Train".
     NOTE(review): attribute_filter_type is "single", so "attribute" (title)
     is presumably the only selector in effect; the "attributes" value names
     columns that the apply query does not select and looks like a leftover.
     Confirm before removing it. -->
<operator name="NomText Apply" class="nominal_to_text" compatibility="5.0.10"
          activated="true" expanded="true"
          height="76" width="90" x="179" y="435">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="title"/>
  <parameter key="attributes" value="posttitle|postdesc"/>
</operator>
<!-- Processes the documents to be scored. It receives the word list produced
     by "ProcessDocs Train" and its "example set" output goes to "Apply Model".
     The inner chain has the same operators as the training side:
     Transform Cases, Tokenize, then character n-gram generation, all with
     default parameters. -->
<operator name="ProcessDocs Apply" class="text:process_document_from_data" compatibility="5.0.6"
          activated="true" expanded="true"
          height="76" width="90" x="380" y="345">
  <list key="specify_weights"/>
  <process expanded="true" height="657" width="827">
    <operator name="Transform Cases (2)" class="text:transform_cases" compatibility="5.0.6"
              activated="true" expanded="true"
              height="60" width="90" x="45" y="30"/>
    <operator name="Tokenize (2)" class="text:tokenize" compatibility="5.0.6"
              activated="true" expanded="true"
              height="60" width="90" x="180" y="30"/>
    <operator name="Generate n-Grams (2)" class="text:generate_n_grams_characters" compatibility="5.0.7"
              activated="true" expanded="true"
              height="60" width="90" x="503" y="30"/>
    <!-- Wiring of the inner chain, from the document source to the first sink. -->
    <connect from_port="document" to_op="Transform Cases (2)" to_port="document"/>
    <connect from_op="Transform Cases (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_op="Generate n-Grams (2)" to_port="document"/>
    <connect from_op="Generate n-Grams (2)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Applies the model from the "SVM" operator to the output of
     "ProcessDocs Apply"; no application parameters are set. -->
<operator name="Apply Model" class="apply_model" compatibility="5.0.10"
          activated="true" expanded="true"
          height="76" width="90" x="648" y="255">
  <list key="application_parameters"/>
</operator>
<!-- Writes the labelled data to table "langupdate" over the "rsstext"
     connection. overwrite_mode is "overwrite"; presumably this replaces any
     existing table contents on each run. Confirm against the operator
     documentation. -->
<operator name="Write Database" class="write_database" compatibility="5.0.10"
          activated="true" expanded="true"
          height="60" width="90" x="782" y="255">
  <parameter key="connection" value="rsstext"/>
  <parameter key="table_name" value="langupdate"/>
  <parameter key="overwrite_mode" value="overwrite"/>
</operator>
<!-- Training branch: read labelled rows, set the id and label roles, convert
     the title to text, process the documents, then train the SVM. -->
<connect from_op="Read DB - Train" from_port="output" to_op="ID Train" to_port="example set input"/>
<connect from_op="ID Train" from_port="example set output" to_op="Label Train" to_port="example set input"/>
<connect from_op="Label Train" from_port="example set output" to_op="NomText Train" to_port="example set input"/>
<connect from_op="NomText Train" from_port="example set output" to_op="ProcessDocs Train" to_port="example set"/>
<connect from_op="ProcessDocs Train" from_port="example set" to_op="SVM" to_port="training set"/>
<!-- The training word list is handed to the apply-side document processing. -->
<connect from_op="ProcessDocs Train" from_port="word list" to_op="ProcessDocs Apply" to_port="word list"/>
<connect from_op="SVM" from_port="model" to_op="Apply Model" to_port="model"/>
<!-- Apply branch: read the rows to score, set the id role, convert the title
     to text, process the documents, then apply the trained model. -->
<connect from_op="Read DB - Apply" from_port="output" to_op="ID Apply" to_port="example set input"/>
<connect from_op="ID Apply" from_port="example set output" to_op="NomText Apply" to_port="example set input"/>
<connect from_op="NomText Apply" from_port="example set output" to_op="ProcessDocs Apply" to_port="example set"/>
<connect from_op="ProcessDocs Apply" from_port="example set" to_op="Apply Model" to_port="unlabelled data"/>
<!-- Output: the labelled data is written to the database and also exposed as
     the first process result. -->
<connect from_op="Apply Model" from_port="labelled data" to_op="Write Database" to_port="input"/>
<connect from_op="Write Database" from_port="through" to_port="result 1"/>
<!-- Port spacing entries for the process source and result ports. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="216"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>