Cleaning Twitter data

I'm new to RapidMiner, and I am struggling to understand how the Filter operators can be used to clean up Twitter feeds. I am importing the tweets from a CSV file, and I am trying to build a sub-process inside the Process Documents operator to remove Twitter handles (@), "RT", and hashtags. For example, I have tried using Filter Tokens (by Content), setting the condition to "contains" with the string "@". Although the process runs without errors, I cannot see in the results that the Twitter handles were removed. Can anybody please advise on how to go about cleaning up the data?
Answers
-
When you load in the tweets from CSV they will come in as a Nominal data type. To use Filter Tokens (by Content), you would need to convert those tweets into a Text data type via a Nominal to Text operator.
Here's a sample using the Search Twitter operator that does some cleaning.
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Sample RapidMiner process: fetch tweets, clean them, build a word list,
  and export the most frequent terms to Excel.
  Pipeline: Search Twitter -> Select Attributes -> Replace (hashtags)
            -> Nominal to Text -> Process Documents from Data
            -> WordList to Data -> Sort -> Write Excel
-->
<process version="7.5.000">
  <context>
    <input/>
    <output/>
    <macros>
      <macro>
        <key>keywords</key>
        <value>Donald Trump</value>
      </macro>
    </macros>
  </context>
  <operator activated="true" class="process" compatibility="7.5.000" expanded="true" name="Process">
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Pull up to 1000 English-language tweets matching the %{keywords} macro. -->
      <operator activated="true" class="social_media:search_twitter" compatibility="7.3.000" expanded="true" height="68" name="Search Twitter" width="90" x="45" y="34">
        <parameter key="connection" value="ThomasOtt"/>
        <parameter key="query" value="%{keywords}"/>
        <parameter key="limit" value="1000"/>
        <parameter key="language" value="en"/>
      </operator>
      <!-- Keep only the columns needed downstream. -->
      <operator activated="true" class="select_attributes" compatibility="7.5.000" expanded="true" height="82" name="Select Attributes" width="90" x="179" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="Text|Id|Retweet-Count"/>
      </operator>
      <!-- Prefix each hashtag with "hashtag_" so it survives tokenization as a
           distinct term. "#(\w+)" matches one hashtag word at a time; the
           original "#(.*)" was greedy and swallowed everything from the first
           "#" to the end of the tweet, mangling tweets with mid-text or
           multiple hashtags. -->
      <operator activated="true" class="replace" compatibility="7.5.000" expanded="true" height="82" name="Replace" width="90" x="313" y="34">
        <parameter key="replace_what" value="#(\w+)"/>
        <parameter key="replace_by" value="hashtag_$1"/>
      </operator>
      <!-- Tweets arrive as Nominal; text-processing operators require Text. -->
      <operator activated="true" class="nominal_to_text" compatibility="7.5.000" expanded="true" height="82" name="Nominal to Text" width="90" x="447" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Text"/>
      </operator>
      <operator activated="true" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="34">
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_above_percent" value="50.0"/>
        <list key="specify_weights"/>
        <!-- Inner document-cleaning chain. NOTE(review): stopword filtering and
             the "link" content filter run AFTER n-gram generation here, so
             n-grams may still contain stopwords or "link" fragments; consider
             moving those filters before Generate n-Grams if that matters. -->
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34"/>
          <operator activated="true" class="text:filter_by_length" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="313" y="34"/>
          <!-- Collapse URL fragments into a single "link" token... -->
          <operator activated="true" class="text:replace_tokens" compatibility="7.4.001" expanded="true" height="68" name="Replace Tokens" width="90" x="447" y="34">
            <list key="replace_dictionary">
              <parameter key="https" value="link"/>
              <parameter key="http" value="link"/>
            </list>
          </operator>
          <operator activated="true" class="text:generate_n_grams_terms" compatibility="7.4.001" expanded="true" height="68" name="Generate n-Grams (Terms)" width="90" x="581" y="34"/>
          <!-- ...then drop those "link" tokens (invert condition = keep
               everything that does NOT contain "link"). -->
          <operator activated="true" class="text:filter_tokens_by_content" compatibility="7.4.001" expanded="true" height="68" name="Filter Tokens (by Content)" width="90" x="715" y="34">
            <parameter key="string" value="link"/>
            <parameter key="invert condition" value="true"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="7.4.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="849" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Replace Tokens" to_port="document"/>
          <connect from_op="Replace Tokens" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
          <connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Tokens (by Content)" to_port="document"/>
          <connect from_op="Filter Tokens (by Content)" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Turn the word list into an example set, sort by total frequency,
           and write the result to Excel. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="7.4.001" expanded="true" height="82" name="WordList to Data" width="90" x="715" y="85"/>
      <operator activated="true" class="sort" compatibility="7.5.000" expanded="true" height="82" name="Sort" width="90" x="849" y="85">
        <parameter key="attribute_name" value="total"/>
        <parameter key="sorting_direction" value="decreasing"/>
      </operator>
      <operator activated="true" class="write_excel" compatibility="7.5.000" expanded="true" height="82" name="Write Excel" width="90" x="983" y="85">
        <parameter key="excel_file" value="C:\Users\ThomasOtt\Desktop\Important Twitter Words for %{keywords}.xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
      </operator>
      <connect from_op="Search Twitter" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Replace" to_port="example set input"/>
      <connect from_op="Replace" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Sort" to_port="example set input"/>
      <connect from_op="Sort" from_port="example set output" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>