Altair RISE

A program to recognize and reward our most engaged community members
Nominate Yourself Now!

Delete hyphens after reading pdf-files

User: "tobow"
New Altair Community Member
Updated by Jocelyn
Hi there,

I'm very new to RapidMiner. I'm reading German PDF files and tokenizing them, which is working fine... However, the PDF files contain hyphens that separate a fair number of words into two parts, like the following example:

"die Bedeutung der finan-

ziellen Interessen der Union"

I'm trying to dehyphenate the broken text to:

"die Bedeutung der finanziellen Interessen der Union"

I'm using the Replace Tokens operator to join the lines and remove the '-'. It works when I try it with examples within the operator, but when I run the process the words are all still broken and nothing seems to have been replaced. Maybe someone from the community can help? Thanks in advance! Here is my process:

<?xml version="1.0" encoding="UTF-8"?><process version="9.10.001">

  <context>

    <input/>

    <output/>

    <macros/>

  </context>

  <operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Process" origin="GENERATED_TUTORIAL">

    <parameter key="logverbosity" value="init"/>

    <parameter key="random_seed" value="2001"/>

    <parameter key="send_mail" value="never"/>

    <parameter key="notification_email" value=""/>

    <parameter key="process_duration_for_mail" value="30"/>

    <parameter key="encoding" value="SYSTEM"/>

    <process expanded="true">

      <operator activated="true" class="retrieve" compatibility="9.10.001" expanded="true" height="68" name="Retrieve BPW_Dictionary_zusammengefasst" width="90" x="179" y="187">

        <parameter key="repository_entry" value="BPW_Dictionary_zusammengefasst"/>

      </operator>

      <operator activated="true" class="operator_toolbox:dictionary_sentiment_learner" compatibility="2.12.000" expanded="true" height="103" name="Dictionary-Based Sentiment (Documents)" width="90" x="380" y="187">

        <parameter key="value_attribute" value="C"/>

        <parameter key="key_attribute" value="A"/>

        <parameter key="negation_attribute" value=""/>

        <parameter key="negation_window_size" value="1"/>

        <parameter key="negation_strength" value=""/>

        <parameter key="use_symmetric_negation_window" value="false"/>

        <parameter key="use_intensifier" value="false"/>

        <parameter key="intensifier_word" value=""/>

        <parameter key="intensifier_value" value=""/>

        <parameter key="use_symmetric_intensifier_window" value="false"/>

      </operator>

      <operator activated="true" class="concurrency:loop_files" compatibility="9.10.001" expanded="true" height="82" name="Loop Files" width="90" x="179" y="34">

        <parameter key="directory" value="D:/Masterarbeit/Daten_RapidMiner/Lageberichte_txt/2010"/>

        <parameter key="filter_type" value="glob"/>

        <parameter key="recursive" value="true"/>

        <parameter key="enable_macros" value="false"/>

        <parameter key="macro_for_file_name" value="file_name"/>

        <parameter key="macro_for_file_type" value="file_type"/>

        <parameter key="macro_for_folder_name" value="folder_name"/>

        <parameter key="reuse_results" value="false"/>

        <parameter key="enable_parallel_execution" value="true"/>

        <process expanded="true">

          <operator activated="true" class="text:read_document" compatibility="9.4.000" expanded="true" height="68" name="Read Document" width="90" x="45" y="34">

            <parameter key="extract_text_only" value="true"/>

            <parameter key="use_file_extension_as_type" value="true"/>

            <parameter key="content_type" value="pdf"/>

            <parameter key="encoding" value="SYSTEM"/>

          </operator>

          <operator activated="true" class="retrieve" compatibility="9.10.001" expanded="true" height="68" name="Retrieve Stopwords_zusammengefasst" width="90" x="715" y="187">

            <parameter key="repository_entry" value="Stopwords_zusammengefasst"/>

          </operator>

          <operator activated="true" class="text:replace_tokens" compatibility="9.4.000" expanded="true" height="68" name="Replace Tokens" width="90" x="179" y="34">

            <list key="replace_dictionary">

              <parameter key="\n" value=" "/>

            </list>

          </operator>

          <operator activated="true" class="text:replace_tokens" compatibility="9.4.000" expanded="true" height="68" name="Replace Tokens (2)" width="90" x="313" y="34">

            <list key="replace_dictionary">

              <parameter key="()­" value="$1"/>

            </list>

          </operator>

          <operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="447" y="34">

            <parameter key="mode" value="non letters"/>

            <parameter key="characters" value=".:"/>

            <parameter key="language" value="German"/>

            <parameter key="max_token_length" value="3"/>

          </operator>

          <operator activated="true" class="text:filter_by_length" compatibility="9.4.000" expanded="true" height="68" name="Filter Tokens (by Length) (2)" width="90" x="581" y="34">

            <parameter key="min_chars" value="3"/>

            <parameter key="max_chars" value="999"/>

          </operator>

          <operator activated="true" class="text:transform_cases" compatibility="9.4.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="715" y="34">

            <parameter key="transform_to" value="lower case"/>

          </operator>

          <operator activated="true" class="operator_toolbox:filter_tokens_using_exampleset" compatibility="2.12.000" expanded="true" height="82" name="Filter Tokens Using ExampleSet" width="90" x="916" y="34">

            <parameter key="attribute" value="A"/>

            <parameter key="case_sensitive" value="true"/>

            <parameter key="invert_filter" value="false"/>

          </operator>

          <connect from_port="file object" to_op="Read Document" to_port="file"/>

          <connect from_op="Read Document" from_port="output" to_op="Replace Tokens" to_port="document"/>

          <connect from_op="Retrieve Stopwords_zusammengefasst" from_port="output" to_op="Filter Tokens Using ExampleSet" to_port="example set"/>

          <connect from_op="Replace Tokens" from_port="document" to_op="Replace Tokens (2)" to_port="document"/>

          <connect from_op="Replace Tokens (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>

          <connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (by Length) (2)" to_port="document"/>

          <connect from_op="Filter Tokens (by Length) (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>

          <connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Tokens Using ExampleSet" to_port="document"/>

          <connect from_op="Filter Tokens Using ExampleSet" from_port="document" to_port="output 1"/>

          <portSpacing port="source_file object" spacing="0"/>

          <portSpacing port="source_input 1" spacing="0"/>

          <portSpacing port="source_input 2" spacing="0"/>

          <portSpacing port="sink_output 1" spacing="0"/>

          <portSpacing port="sink_output 2" spacing="0"/>

        </process>

      </operator>

      <operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="2.12.000" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="581" y="34">

        <list key="application_parameters"/>

      </operator>

      <connect from_port="input 1" to_op="Loop Files" to_port="input 1"/>

      <connect from_op="Retrieve BPW_Dictionary_zusammengefasst" from_port="output" to_op="Dictionary-Based Sentiment (Documents)" to_port="exa"/>

      <connect from_op="Dictionary-Based Sentiment (Documents)" from_port="mod" to_op="Apply Model (Documents)" to_port="mod"/>

      <connect from_op="Loop Files" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>

      <connect from_op="Apply Model (Documents)" from_port="exa" to_port="result 1"/>

      <connect from_op="Apply Model (Documents)" from_port="doc" to_port="result 2"/>

      <connect from_op="Apply Model (Documents)" from_port="mod" to_port="result 3"/>

      <portSpacing port="source_input 1" spacing="0"/>

      <portSpacing port="source_input 2" spacing="0"/>

      <portSpacing port="sink_result 1" spacing="0"/>

      <portSpacing port="sink_result 2" spacing="0"/>

      <portSpacing port="sink_result 3" spacing="0"/>

      <portSpacing port="sink_result 4" spacing="0"/>

      <background height="232" location="//Samples/Tutorials/Basics/02/tutorial2" width="1502" x="26" y="47"/>


Sort by:
1 - 1 of 11
    User: "BalazsBaranyRM"
    New Altair Community Member
    Accepted Answer
    Hi!

    Try some combination of \r\n or \n\r instead of just \n.
    \n is the "Unix line ending", just a newline character.
    \r\n is Carriage Return + Newline, the Windows tradition.

    It depends on your documents and how they are processed. 

    Regards,
    Balázs