Dear all,
I want to compare two nominal productdescriptions by using the "data to similarity" operator.
My Dataset looks like that: id; ai_description_mod; gp_description_mod; [and some other uninteresting attributes...]
My idea was to "loop attributes" of the exampleset.
The Subprocess of "loop attributes" should multiply the input. after forking the input, in one line input "descriptionA" is selected and renamed to "description", in the other one "descriptionB" is selected and renamed to "description". After that the two examples are put together by "append" operator.
Then I go on like it is done in that tutorial: http://vancouverdata.blogspot.de/2010/11/text-analytics-with-rapidminer-part-4.html by "Process documents from data", "Tokenize", "Transform Cases" and after all "Data to similarity".
Unfortunately "Data to similarity" is done over alle Examples, not only the two description with the same id, I wanted to compare. Later I just want to work on with the similarity-value.3
This is my code:
</operator>
<operator activated="true" class="loop_examples" compatibility="7.4.000" expanded="true" height="82" name="Loop Examples" width="90" x="1117" y="34">
<parameter key="iteration_macro" value="example"/>
<process expanded="true">
<operator activated="true" class="multiply" compatibility="7.4.000" expanded="true" height="103" name="Multiply" width="90" x="45" y="34"/>
<operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectAi_desc" width="90" x="179" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="ai_description_mod"/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
</operator>
<operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename" width="90" x="313" y="34">
<parameter key="old_name" value="ai_description_mod"/>
<parameter key="new_name" value="description"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="7.4.000" expanded="true" height="82" name="selectGp_desc" width="90" x="179" y="136">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="gp_description_mod"/>
<parameter key="attributes" value=""/>
<parameter key="use_except_expression" value="false"/>
<parameter key="value_type" value="attribute_value"/>
<parameter key="use_value_type_exception" value="false"/>
<parameter key="except_value_type" value="time"/>
<parameter key="block_type" value="attribute_block"/>
<parameter key="use_block_type_exception" value="false"/>
<parameter key="except_block_type" value="value_matrix_row_start"/>
<parameter key="invert_selection" value="false"/>
<parameter key="include_special_attributes" value="false"/>
</operator>
<operator activated="true" class="rename" compatibility="7.4.000" expanded="true" height="82" name="Rename (2)" width="90" x="313" y="136">
<parameter key="old_name" value="gp_description_mod"/>
<parameter key="new_name" value="description"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="append" compatibility="7.4.000" expanded="true" height="103" name="Append" width="90" x="447" y="85">
<parameter key="datamanagement" value="double_array"/>
<parameter key="data_management" value="auto"/>
<parameter key="merge_type" value="all"/>
</operator>
<operator activated="true" breakpoints="after" class="text:process_document_from_data" compatibility="7.4.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="581" y="85">
<parameter key="create_word_vector" value="true"/>
<parameter key="vector_creation" value="TF-IDF"/>
<parameter key="add_meta_information" value="true"/>
<parameter key="keep_text" value="false"/>
<parameter key="prune_method" value="none"/>
<parameter key="prune_below_percent" value="3.0"/>
<parameter key="prune_above_percent" value="30.0"/>
<parameter key="prune_below_rank" value="0.05"/>
<parameter key="prune_above_rank" value="0.95"/>
<parameter key="datamanagement" value="double_sparse_array"/>
<parameter key="select_attributes_and_weights" value="true"/>
<list key="specify_weights">
<parameter key="description" value="1.0"/>
</list>
<process expanded="true">
<operator activated="true" class="text:tokenize" compatibility="7.4.001" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
<parameter key="mode" value="non letters"/>
<parameter key="characters" value=".:"/>
<parameter key="language" value="English"/>
<parameter key="max_token_length" value="3"/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="7.4.001" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34">
<parameter key="transform_to" value="lower case"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" breakpoints="after" class="data_to_similarity" compatibility="7.4.000" expanded="true" height="82" name="Data to Similarity" width="90" x="715" y="85">
<parameter key="measure_types" value="NumericalMeasures"/>
<parameter key="mixed_measure" value="MixedEuclideanDistance"/>
<parameter key="nominal_measure" value="NominalDistance"/>
<parameter key="numerical_measure" value="CosineSimilarity"/>
<parameter key="divergence" value="GeneralizedIDivergence"/>
<parameter key="kernel_type" value="radial"/>
<parameter key="kernel_gamma" value="1.0"/>
<parameter key="kernel_sigma1" value="1.0"/>
<parameter key="kernel_sigma2" value="0.0"/>
<parameter key="kernel_sigma3" value="2.0"/>
<parameter key="kernel_degree" value="3.0"/>
<parameter key="kernel_shift" value="1.0"/>
<parameter key="kernel_a" value="1.0"/>
<parameter key="kernel_b" value="0.0"/>
</operator>
<connect from_port="example set" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_op="selectAi_desc" to_port="example set input"/>
<connect from_op="Multiply" from_port="output 2" to_op="selectGp_desc" to_port="example set input"/>
<connect from_op="selectAi_desc" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Append" to_port="example set 1"/>
<connect from_op="selectGp_desc" from_port="example set output" to_op="Rename (2)" to_port="example set input"/>
<connect from_op="Rename (2)" from_port="example set output" to_op="Append" to_port="example set 2"/>
<connect from_op="Append" from_port="merged set" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
</process>
</operator>
Has anyone an idea how to do that?
Yours
Johannes