Delete hyphens after reading pdf-files
I'm very new to RapidMiner. I'm reading german pdf-files and tokenizing them, which is working fine... However, the pdf-files contain hyphens that seperate a fair amount of words in to two parts, like the following example:
"die Bedeutung der finan-
ziellen Interessen der Union"
I'm trying to dehyphenate the broken text to:
"die Bedeutung der finanziellen Interessen der Union"
I'm using the replace tokens operator to joines the lines and remove the '-'. It works when I try it with examples within the operator, but when I play the process the words are all still broken and nothing seems to have been replaced. Maybe someone from the community can help? Thanks in advance! Here is my process:
<?xml version="1.0" encoding="UTF-8"?><process version="9.10.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Process" origin="GENERATED_TUTORIAL">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="9.10.001" expanded="true" height="68" name="Retrieve BPW_Dictionary_zusammengefasst" width="90" x="179" y="187">
<parameter key="repository_entry" value="BPW_Dictionary_zusammengefasst"/>
</operator>
<operator activated="true" class="operator_toolbox:dictionary_sentiment_learner" compatibility="2.12.000" expanded="true" height="103" name="Dictionary-Based Sentiment (Documents)" width="90" x="380" y="187">
<parameter key="value_attribute" value="C"/>
<parameter key="key_attribute" value="A"/>
<parameter key="negation_attribute" value=""/>
<parameter key="negation_window_size" value="1"/>
<parameter key="negation_strength" value=""/>
<parameter key="use_symmetric_negation_window" value="false"/>
<parameter key="use_intensifier" value="false"/>
<parameter key="intensifier_word" value=""/>
<parameter key="intensifier_value" value=""/>
<parameter key="use_symmetric_intensifier_window" value="false"/>
</operator>
<operator activated="true" class="concurrency:loop_files" compatibility="9.10.001" expanded="true" height="82" name="Loop Files" width="90" x="179" y="34">
<parameter key="directory" value="D:/Masterarbeit/Daten_RapidMiner/Lageberichte_txt/2010"/>
<parameter key="filter_type" value="glob"/>
<parameter key="recursive" value="true"/>
<parameter key="enable_macros" value="false"/>
<parameter key="macro_for_file_name" value="file_name"/>
<parameter key="macro_for_file_type" value="file_type"/>
<parameter key="macro_for_folder_name" value="folder_name"/>
<parameter key="reuse_results" value="false"/>
<parameter key="enable_parallel_execution" value="true"/>
<process expanded="true">
<operator activated="true" class="text:read_document" compatibility="9.4.000" expanded="true" height="68" name="Read Document" width="90" x="45" y="34">
<parameter key="extract_text_only" value="true"/>
<parameter key="use_file_extension_as_type" value="true"/>
<parameter key="content_type" value="pdf"/>
<parameter key="encoding" value="SYSTEM"/>
</operator>
<operator activated="true" class="retrieve" compatibility="9.10.001" expanded="true" height="68" name="Retrieve Stopwords_zusammengefasst" width="90" x="715" y="187">
<parameter key="repository_entry" value="Stopwords_zusammengefasst"/>
</operator>
<operator activated="true" class="text:replace_tokens" compatibility="9.4.000" expanded="true" height="68" name="Replace Tokens" width="90" x="179" y="34">
<list key="replace_dictionary">
<parameter key="\n" value=" "/>
</list>
</operator>
<operator activated="true" class="text:replace_tokens" compatibility="9.4.000" expanded="true" height="68" name="Replace Tokens (2)" width="90" x="313" y="34">
<list key="replace_dictionary">
<parameter key="()" value="$1"/>
</list>
</operator>
<operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="447" y="34">
<parameter key="mode" value="non letters"/>
<parameter key="characters" value=".:"/>
<parameter key="language" value="German"/>
<parameter key="max_token_length" value="3"/>
</operator>
<operator activated="true" class="text:filter_by_length" compatibility="9.4.000" expanded="true" height="68" name="Filter Tokens (by Length) (2)" width="90" x="581" y="34">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="999"/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="9.4.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="715" y="34">
<parameter key="transform_to" value="lower case"/>
</operator>
<operator activated="true" class="operator_toolbox:filter_tokens_using_exampleset" compatibility="2.12.000" expanded="true" height="82" name="Filter Tokens Using ExampleSet" width="90" x="916" y="34">
<parameter key="attribute" value="A"/>
<parameter key="case_sensitive" value="true"/>
<parameter key="invert_filter" value="false"/>
</operator>
<connect from_port="file object" to_op="Read Document" to_port="file"/>
<connect from_op="Read Document" from_port="output" to_op="Replace Tokens" to_port="document"/>
<connect from_op="Retrieve Stopwords_zusammengefasst" from_port="output" to_op="Filter Tokens Using ExampleSet" to_port="example set"/>
<connect from_op="Replace Tokens" from_port="document" to_op="Replace Tokens (2)" to_port="document"/>
<connect from_op="Replace Tokens (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
<connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (by Length) (2)" to_port="document"/>
<connect from_op="Filter Tokens (by Length) (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
<connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Tokens Using ExampleSet" to_port="document"/>
<connect from_op="Filter Tokens Using ExampleSet" from_port="document" to_port="output 1"/>
<portSpacing port="source_file object" spacing="0"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="2.12.000" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="581" y="34">
<list key="application_parameters"/>
</operator>
<connect from_port="input 1" to_op="Loop Files" to_port="input 1"/>
<connect from_op="Retrieve BPW_Dictionary_zusammengefasst" from_port="output" to_op="Dictionary-Based Sentiment (Documents)" to_port="exa"/>
<connect from_op="Dictionary-Based Sentiment (Documents)" from_port="mod" to_op="Apply Model (Documents)" to_port="mod"/>
<connect from_op="Loop Files" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>
<connect from_op="Apply Model (Documents)" from_port="exa" to_port="result 1"/>
<connect from_op="Apply Model (Documents)" from_port="doc" to_port="result 2"/>
<connect from_op="Apply Model (Documents)" from_port="mod" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<background height="232" location="//Samples/Tutorials/Basics/02/tutorial2" width="1502" x="26" y="47"/>