Delete hyphens after reading pdf-files

User: "tobow"

New Altair Community Member

Updated Nov 5, 2024 by Jocelyn

Hi there,

I'm very new to RapidMiner. I'm reading German PDF files and tokenizing them, which is working fine... However, the PDF files contain hyphens that separate a fair number of words into two parts, like the following example:

"die Bedeutung der finan-

ziellen Interessen der Union"

I'm trying to dehyphenate the broken text to:

"die Bedeutung der finanziellen Interessen der Union"

I'm using the Replace Tokens operator to join the lines and remove the '-'. It works when I try it with examples within the operator, but when I run the process the words are all still broken and nothing seems to have been replaced. Maybe someone from the community can help? Thanks in advance! Here is my process:

<?xml version="1.0" encoding="UTF-8"?><process version="9.10.001">

<context>

<input/>

<output/>

<macros/>

</context>

<operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Process" origin="GENERATED_TUTORIAL">

<parameter key="logverbosity" value="init"/>

<parameter key="random_seed" value="2001"/>

<parameter key="send_mail" value="never"/>

<parameter key="notification_email" value=""/>

<parameter key="process_duration_for_mail" value="30"/>

<parameter key="encoding" value="SYSTEM"/>

<process expanded="true">

<!-- Loads the repository entry "BPW_Dictionary_zusammengefasst"; its output is wired to the "exa" port of "Dictionary-Based Sentiment (Documents)". -->
<operator activated="true" class="retrieve" compatibility="9.10.001" expanded="true" height="68" name="Retrieve BPW_Dictionary_zusammengefasst" width="90" x="179" y="187">

<parameter key="repository_entry" value="BPW_Dictionary_zusammengefasst"/>

</operator>

<!-- Builds the dictionary-based sentiment model from the retrieved dictionary: attribute "A" is configured as the key and attribute "C" as the value. Negation and intensifier handling are left unconfigured (empty attribute/word values, use_intensifier="false"). The "mod" output feeds "Apply Model (Documents)". -->
<operator activated="true" class="operator_toolbox:dictionary_sentiment_learner" compatibility="2.12.000" expanded="true" height="103" name="Dictionary-Based Sentiment (Documents)" width="90" x="380" y="187">

<parameter key="value_attribute" value="C"/>

<parameter key="key_attribute" value="A"/>

<parameter key="negation_attribute" value=""/>

<parameter key="negation_window_size" value="1"/>

<parameter key="negation_strength" value=""/>

<parameter key="use_symmetric_negation_window" value="false"/>

<parameter key="use_intensifier" value="false"/>

<parameter key="intensifier_word" value=""/>

<parameter key="intensifier_value" value=""/>

<parameter key="use_symmetric_intensifier_window" value="false"/>

</operator>

<operator activated="true" class="concurrency:loop_files" compatibility="9.10.001" expanded="true" height="82" name="Loop Files" width="90" x="179" y="34">

<parameter key="directory" value="D:/Masterarbeit/Daten_RapidMiner/Lageberichte_txt/2010"/>

<parameter key="filter_type" value="glob"/>

<parameter key="recursive" value="true"/>

<parameter key="enable_macros" value="false"/>

<parameter key="macro_for_file_name" value="file_name"/>

<parameter key="macro_for_file_type" value="file_type"/>

<parameter key="macro_for_folder_name" value="folder_name"/>

<parameter key="reuse_results" value="false"/>

<parameter key="enable_parallel_execution" value="true"/>

<process expanded="true">

<!-- Reads the file object supplied by the enclosing loop and passes the resulting document to "Replace Tokens". content_type is "pdf" while use_file_extension_as_type is "true"; NOTE(review): presumably the file extension then decides the type actually used (confirm in the operator documentation). encoding="SYSTEM" relies on the platform default encoding. -->
<operator activated="true" class="text:read_document" compatibility="9.4.000" expanded="true" height="68" name="Read Document" width="90" x="45" y="34">

<parameter key="extract_text_only" value="true"/>

<parameter key="use_file_extension_as_type" value="true"/>

<parameter key="content_type" value="pdf"/>

<parameter key="encoding" value="SYSTEM"/>

</operator>

<!-- Loads the repository entry "Stopwords_zusammengefasst"; its output is wired to the "example set" port of "Filter Tokens Using ExampleSet". -->
<operator activated="true" class="retrieve" compatibility="9.10.001" expanded="true" height="68" name="Retrieve Stopwords_zusammengefasst" width="90" x="715" y="187">

<parameter key="repository_entry" value="Stopwords_zusammengefasst"/>

</operator>

<!-- First replacement step: every match of the regular expression \n (a line feed) is replaced by a single space, so that lines are joined before the hyphen is handled. \n matches only the line feed itself; a carriage return that precedes it (Windows-style line ending) is not matched and stays in the text. -->
<operator activated="true" class="text:replace_tokens" compatibility="9.4.000" expanded="true" height="68" name="Replace Tokens" width="90" x="179" y="34">

<list key="replace_dictionary">

<parameter key="\n" value=" "/>

</list>

</operator>

<!-- Second replacement step, intended to remove the hyphen of a word broken across lines. NOTE(review): as posted, the pattern "()" is an empty capturing group, which can only match the empty string, and "$1" re-inserts that empty capture, so this entry leaves the text unchanged. The actual pattern may have been lost when the process was pasted; confirm the regex in the original process. -->
<operator activated="true" class="text:replace_tokens" compatibility="9.4.000" expanded="true" height="68" name="Replace Tokens (2)" width="90" x="313" y="34">

<list key="replace_dictionary">

<parameter key="()" value="$1"/>

</list>

</operator>

<!-- Splits the document into tokens using mode "non letters". NOTE(review): in this mode a hyphen presumably acts as a token boundary too, so a word whose hyphen was not removed upstream ends up as two tokens (confirm). The "characters", "language" and "max_token_length" values are presumably only consulted by other tokenization modes (verify against the Tokenize documentation). -->
<operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="447" y="34">

<parameter key="mode" value="non letters"/>

<parameter key="characters" value=".:"/>

<parameter key="language" value="German"/>

<parameter key="max_token_length" value="3"/>

</operator>

<!-- Length filter on the tokens, configured with min_chars="3" and max_chars="999". NOTE(review): presumably tokens shorter than 3 characters are dropped, which would also discard short fragments of words that are still split (confirm). -->
<operator activated="true" class="text:filter_by_length" compatibility="9.4.000" expanded="true" height="68" name="Filter Tokens (by Length) (2)" width="90" x="581" y="34">

<parameter key="min_chars" value="3"/>

<parameter key="max_chars" value="999"/>

</operator>

<!-- Converts the tokens to lower case before they are compared with the stop-word list in the next operator. -->
<operator activated="true" class="text:transform_cases" compatibility="9.4.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="715" y="34">

<parameter key="transform_to" value="lower case"/>

</operator>

<!-- Filters the tokens against attribute "A" of the example set delivered by "Retrieve Stopwords_zusammengefasst"; the result is the loop's "output 1". The comparison is case sensitive and the tokens arrive lower-cased, so NOTE(review): entries in "A" presumably need to be lower case as well to match (verify the stop-word data). -->
<operator activated="true" class="operator_toolbox:filter_tokens_using_exampleset" compatibility="2.12.000" expanded="true" height="82" name="Filter Tokens Using ExampleSet" width="90" x="916" y="34">

<parameter key="attribute" value="A"/>

<parameter key="case_sensitive" value="true"/>

<parameter key="invert_filter" value="false"/>

</operator>

<connect from_port="file object" to_op="Read Document" to_port="file"/>

<connect from_op="Read Document" from_port="output" to_op="Replace Tokens" to_port="document"/>

<connect from_op="Retrieve Stopwords_zusammengefasst" from_port="output" to_op="Filter Tokens Using ExampleSet" to_port="example set"/>

<connect from_op="Replace Tokens" from_port="document" to_op="Replace Tokens (2)" to_port="document"/>

<connect from_op="Replace Tokens (2)" from_port="document" to_op="Tokenize (2)" to_port="document"/>

<connect from_op="Tokenize (2)" from_port="document" to_op="Filter Tokens (by Length) (2)" to_port="document"/>

<connect from_op="Filter Tokens (by Length) (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>

<connect from_op="Transform Cases (2)" from_port="document" to_op="Filter Tokens Using ExampleSet" to_port="document"/>

<connect from_op="Filter Tokens Using ExampleSet" from_port="document" to_port="output 1"/>

<portSpacing port="source_file object" spacing="0"/>

<portSpacing port="source_input 1" spacing="0"/>

<portSpacing port="source_input 2" spacing="0"/>

<portSpacing port="sink_output 1" spacing="0"/>

<portSpacing port="sink_output 2" spacing="0"/>

</process>

</operator>

<!-- Applies the model from "Dictionary-Based Sentiment (Documents)" (port "mod") to the documents produced by "Loop Files" (port "doc"); its "exa", "doc" and "mod" outputs are delivered as process results 1 to 3. No application parameters are set. -->
<operator activated="true" class="operator_toolbox:apply_model_documents" compatibility="2.12.000" expanded="true" height="103" name="Apply Model (Documents)" width="90" x="581" y="34">

<list key="application_parameters"/>

</operator>

<connect from_port="input 1" to_op="Loop Files" to_port="input 1"/>

<connect from_op="Retrieve BPW_Dictionary_zusammengefasst" from_port="output" to_op="Dictionary-Based Sentiment (Documents)" to_port="exa"/>

<connect from_op="Dictionary-Based Sentiment (Documents)" from_port="mod" to_op="Apply Model (Documents)" to_port="mod"/>

<connect from_op="Loop Files" from_port="output 1" to_op="Apply Model (Documents)" to_port="doc"/>

<connect from_op="Apply Model (Documents)" from_port="exa" to_port="result 1"/>

<connect from_op="Apply Model (Documents)" from_port="doc" to_port="result 2"/>

<connect from_op="Apply Model (Documents)" from_port="mod" to_port="result 3"/>

<portSpacing port="source_input 1" spacing="0"/>

<portSpacing port="source_input 2" spacing="0"/>

<portSpacing port="sink_result 1" spacing="0"/>

<portSpacing port="sink_result 2" spacing="0"/>

<portSpacing port="sink_result 3" spacing="0"/>

<portSpacing port="sink_result 4" spacing="0"/>

<background height="232" location="//Samples/Tutorials/Basics/02/tutorial2" width="1502" x="26" y="47"/>

Find more posts tagged with

Text Mining + NLP

Sentiment Analysis

Sort by:

1 - 1 of 11

User: "BalazsBaranyRM"

New Altair Community Member

Accepted Answer

Hi!

Try some combination of \r\n or \n\r instead of just \n.
\n is the "Unix line ending", just a newline character.
\r\n is Carriage Return + Newline, the Windows tradition.

It depends on your documents and how they are processed.

Regards,
Balázs

View in context

Quick Links