tokenize
bmartin
New Altair Community Member
Hi
I am using rapidminer to try to tokenize a column in a database which contains text data.
I want to keep the ID with the Text column so instead of:
ID TEXT
12 I love data mining
it would appear as
ID TOKEN_TEXT
12 I
12 love
12 data
12 mining
Can I do this with the 'Process Documents from Data' operator? Its output is either the word list (with no ID, even though I have set the role of ID as ID) or an example set containing the ID. But I need both together!
Is there a way of doing this?
Note: reason for doing this is so I can then join to a list of words that tell me the sentiment (if any) related to each word.
Thanks in advance
I am using rapidminer to try to tokenize a column in a database which contains text data.
I want to keep the ID with the Text column so instead of:
ID TEXT
12 I love data mining
it would appear as
ID TOKEN_TEXT
12 I
12 love
12 data
12 mining
Can I do this with the 'Process Documents from Data' operator? Its output is either the word list (with no ID, even though I have set the role of ID as ID) or an example set containing the ID. But I need both together!
Is there a way of doing this?
Note: reason for doing this is so I can then join to a list of words that tell me the sentiment (if any) related to each word.
Thanks in advance
Tagged:
0
Answers
-
Hi,
did you consider using the "Split" operator? Since your task does not seem to include text processing tasks, I would not use the tokenize approach, since word lists and vectors have different aims than just dividing words.
If splitting is not enough, you can add the "De-Pivot" operator to create a table form similar to the one you posted as an example. Here is a little process to illustrate the use of both operators:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
Regards
<process version="5.1.008">
  <!--
    Example RapidMiner process: builds a one-row example set (ID = 12,
    Text = "I love data mining"), splits the Text attribute on whitespace,
    and then de-pivots the resulting Text_N attributes.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.008" expanded="true" name="Process">
    <process expanded="true" height="566" width="748">

      <!-- Step 1: subprocess that creates the sample input data. -->
      <operator activated="true" class="subprocess" compatibility="5.1.008" expanded="true" height="76" name="Subprocess" width="90" x="45" y="30">
        <process expanded="true" height="607" width="705">
          <!-- Generate a single example with two nominal attributes. -->
          <operator activated="true" class="generate_nominal_data" compatibility="5.1.008" expanded="true" height="60" name="Generate Nominal Data" width="90" x="45" y="30">
            <parameter key="number_examples" value="1"/>
            <parameter key="number_of_attributes" value="2"/>
          </operator>
          <!-- Overwrite att1 of example 1 with the sample ID value. -->
          <operator activated="true" class="set_data" compatibility="5.1.008" expanded="true" height="76" name="Set Data" width="90" x="180" y="30">
            <parameter key="example_index" value="1"/>
            <parameter key="attribute_name" value="att1"/>
            <parameter key="value" value="12"/>
            <list key="additional_values"/>
          </operator>
          <!-- Overwrite att2 of example 1 with the sample sentence. -->
          <operator activated="true" class="set_data" compatibility="5.1.008" expanded="true" height="76" name="Set Data (2)" width="90" x="315" y="30">
            <parameter key="example_index" value="1"/>
            <parameter key="attribute_name" value="att2"/>
            <parameter key="value" value="I love data mining"/>
            <list key="additional_values"/>
          </operator>
          <!-- Rename att1 to ID. -->
          <operator activated="true" class="rename" compatibility="5.1.008" expanded="true" height="76" name="Rename" width="90" x="450" y="30">
            <parameter key="old_name" value="att1"/>
            <parameter key="new_name" value="ID"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!-- Rename att2 to Text. -->
          <operator activated="true" class="rename" compatibility="5.1.008" expanded="true" height="76" name="Rename (2)" width="90" x="585" y="30">
            <parameter key="old_name" value="att2"/>
            <parameter key="new_name" value="Text"/>
            <list key="rename_additional_attributes"/>
          </operator>
          <!--
            Inverted single-attribute selection on "label" (special attributes
            included); presumably this drops the label attribute added by the
            generator. TODO confirm against the generator's output.
          -->
          <operator activated="true" class="select_attributes" compatibility="5.1.008" expanded="true" height="76" name="Select Attributes" width="90" x="45" y="120">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="label"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Wiring inside the subprocess: a linear chain ending at port "out 1". -->
          <connect from_op="Generate Nominal Data" from_port="output" to_op="Set Data" to_port="example set input"/>
          <connect from_op="Set Data" from_port="example set output" to_op="Set Data (2)" to_port="example set input"/>
          <connect from_op="Set Data (2)" from_port="example set output" to_op="Rename" to_port="example set input"/>
          <connect from_op="Rename" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
          <connect from_op="Rename (2)" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="90"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>

      <!-- Step 2: split the Text attribute using the whitespace pattern \s. -->
      <operator activated="true" class="split" compatibility="5.1.008" expanded="true" height="76" name="Split" width="90" x="179" y="30">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Text"/>
        <parameter key="split_pattern" value="\s"/>
      </operator>

      <!--
        Step 3: de-pivot the attributes whose names match Text_\d+ into a
        series named "text", with "index" as the index attribute.
      -->
      <operator activated="true" class="de_pivot" compatibility="5.1.008" expanded="true" height="76" name="De-Pivot" width="90" x="313" y="30">
        <list key="attribute_name">
          <parameter key="text" value="Text_\d+"/>
        </list>
        <parameter key="index_attribute" value="index"/>
      </operator>

      <!-- Top-level wiring: Subprocess, then Split, then De-Pivot, then "result 1". -->
      <connect from_op="Subprocess" from_port="out 1" to_op="Split" to_port="example set input"/>
      <connect from_op="Split" from_port="example set output" to_op="De-Pivot" to_port="example set input"/>
      <connect from_op="De-Pivot" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Matthias0 -
Many thanks for your reply.
The SPLIT operator correctly separates the string as I expected.
However, I have large volumes of source data and this seems to take a long time to run, so I may look at moving this functionality back into the database (Oracle).
Many thanks
Brian0