Hi there,
I'm very new to RapidMiner. I'm reading German PDF files and tokenizing them, which is working fine. However, the PDF files contain hyphens that separate a fair number of words into two parts, as in the following example:
"die Bedeutung der finan-
ziellen Interessen der Union"
I'm trying to dehyphenate the broken text to:
"die Bedeutung der finanziellen Interessen der Union"
I'm using the Replace Tokens operator to join the lines and remove the '-'. It works when I try it with examples within the operator, but when I run the process the words are all still broken and nothing seems to have been replaced. Maybe someone from the community can help? Thanks in advance! Here is my process:
<?xml version="1.0" encoding="UTF-8"?><process version="9.10.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Process" origin="GENERATED_TUTORIAL">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="9.10.001" expanded="true" height="68" name="Retrieve BPW_Dictionary_zusammengefasst" width="90" x="179" y="187">
        <parameter key="repository_entry" value="BPW_Dictionary_zusammengefasst"/>
      </operator>
      <operator activated="true" class="operator_toolbox:dictionary_sentiment_learner" compatibility="2.12.000" expanded="true" height="103" name="Dictionary-Based Sentiment (Documents)" width="90" x="380" y="187">
        <parameter key="value_attribute" value="C"/>
        <parameter key="key_attribute" value="A"/>
        <parameter key="negation_attribute" value=""/>
        <parameter key="negation_window_size" value="1"/>
        <parameter key="negation_strength" value=""/>
        <parameter key="use_symmetric_negation_window" value="false"/>
        <parameter key="use_intensifier" value="false"/>
        <parameter key="intensifier_word" value=""/>
        <parameter key="intensifier_value" value=""/>
        <parameter key="use_symmetric_intensifier_window" value="false"/>
      </operator>
      <operator activated="true" class="concurrency:loop_files" compatibility="9.10.001" expanded="true" height="82" name="Loop Files" width="90" x="179" y="34">
        <parameter key="directory" value="D:/Masterarbeit/Daten_RapidMiner/Lageberichte_txt/2010"/>
        <parameter key="filter_type" value="glob"/>
        <parameter key="recursive" value="true"/>
        <parameter key="enable_macros" value="false"/>
        <parameter key="macro_for_file_name" value="file_name"/>
        <parameter key="macro_for_file_type" value="file_type"/>
        <parameter key="macro_for_folder_name" value="folder_name"/>
        <parameter key="reuse_results" value="false"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="text:read_document" compatibility="9.4.000" expanded="true" height="68" name="Read Document" width="90" x="45" y="34">
            <parameter key="extract_text_only" value="true"/>
            <parameter key="use_file_extension_as_type" value="true"/>
            <parameter key="content_type" value="pdf"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <operator activated="true" class="retrieve" compatibility="9.10.001" expanded="true" height="68" name="Retrieve Stopwords_zusammengefasst" width="90" x="715" y="187">
            <parameter key="repository_entry" value="Stopwords_zusammengefasst"/>
          </operator>
          <operator activated="true" class="text:replace_tokens" compatibility="9.4.000" expanded="true" height="68" name="Replace Tokens" width="90" x="179" y="34">
            <list key="replace_dictionary">
              <parameter key="\n" value=" "/>
            </list>
          </operator>
          <operator activated="true" class="text:replace_tokens" compatibility="9.4.000" expanded="true" height="68" name="Replace Tokens (2)" width="90" x="313" y="34">
            <list key="replace_dictionary">
              <parameter key="()" value="$1"/>
            </list>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="447" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="German"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="true" class="text:filter_by_length" compatibility="9.4.000" expanded="true" height="68" name="Filter Tokens (by Length) (2)" width="90" x="581" y="34">
            <parameter key="min_chars" value="3"/>
            <parameter key="max_chars" value="999"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="9.4.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="715" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="operator_toolbox:filter_tokens_using_exampleset" compatibility="2.12.000" expanded="true" height="82" name="Filter Tokens Using ExampleSet" width="90" x="916" y="34">
            <parameter key="attribute" value="A"/>
            <parameter key="case_sensitive" value="true"/>
            <parameter key="invert_filter" value="false"/>
          </operator>
          <!-- Wiring inside "Loop Files": the file object goes to Read Document, then through
               Replace Tokens, Replace Tokens (2), Tokenize (2), Filter Tokens (by Length) (2),
               Transform Cases (2) and Filter Tokens Using ExampleSet, whose document is
               delivered on "output 1". The retrieved "Stopwords_zusammengefasst" entry feeds
               the "example set" port of Filter Tokens Using ExampleSet. -->
          <connect from_port="file object" to_op="Read Document" to_port="file"/>
          <connect from_op="Read Document" from_port="output" to_op="Replace Tokens" to_port="document"/>
          <connect from_op="Retrieve Stopwords_zusammengefasst" from_port="output" to_op="Filter Tokens Using ExampleSet" to_port="example set"/>
          <connect from_op="Replace Tokens" from_port="document" to_op="Replace Tokens (2)" to_port="document"/>
          <connect from_op="Replace Tokens (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (by Length) (2)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length) (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Tokens Using ExampleSet" to_port="document"/>
          <connect from_op="Filter Tokens Using ExampleSet" from_port="document" to_port="output 1"/>
          <portSpacing port="source_file object" spacing="0"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="2.12.000" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="581" y="34">
        <list key="application_parameters"/>
      </operator>
      <!-- Top-level wiring: the retrieved dictionary feeds the sentiment learner, whose model
           goes to Apply Model (Documents); the documents from Loop Files go to its "doc" port.
           The "exa", "doc" and "mod" outputs of Apply Model (Documents) are delivered as
           results 1 to 3. The remaining elements are port spacing and canvas background. -->
      <connect from_port="input 1" to_op="Loop Files" to_port="input 1"/>
      <connect from_op="Retrieve BPW_Dictionary_zusammengefasst" from_port="output" to_op="Dictionary-Based Sentiment (Documents)" to_port="exa"/>
      <connect from_op="Dictionary-Based Sentiment (Documents)" from_port="mod" to_op="Apply Model (Documents)" to_port="mod"/>
      <connect from_op="Loop Files" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>
      <connect from_op="Apply Model (Documents)" from_port="exa" to_port="result 1"/>
      <connect from_op="Apply Model (Documents)" from_port="doc" to_port="result 2"/>
      <connect from_op="Apply Model (Documents)" from_port="mod" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <background height="232" location="//Samples/Tutorials/Basics/02/tutorial2" width="1502" x="26" y="47"/>