Hey everybody,
I hope you can help me a second time: I thought I had solved a tricky problem, but it turns out not to be so easy.
I want to compare an Input-Document to the Documents of a Collection to find the most similar document of the collection.
--> Via k-means clustering I divide the collection documents into groups and compare them with the input document to find the most similar cluster (centroid vector).
--> via "join" I extract the documents from the most similar cluster
--> Via "Cross Distances" with "only top k" I should then be able to find the document of the collection that is most similar to the input document (input = req, documents = ref).
At first it all seemed to work, but then I found that "Cross Distances" does not return the most similar documents but the documents with the smallest IDs (which are the first entries in the cluster).
I have no idea why this operator is not working properly. The columns of req and ref are the same (except for the regular ones, which are the vector entries and therefore differ in number and weight). For the processing of both the document collection and the input document I use the same inner process.
I very much hope someone has a good idea, because it is very urgent for me to solve this problem.
All the best!!
Note: the "Read" operator loads the output of the "Create Document ... Process Documents ..." part; that part is only included so that you can see how I process the input document.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="5.3.013">
<!-- Process context: the input, output and macros sections are all empty. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="5.3.013" expanded="true" name="Process">
<process expanded="true">
<!-- Builds the input document from the inline "text" parameter; its output is
     wired to Documents to Data (2).
     The text contains literal double quotes around Contact Us. Inside a
     double-quoted attribute value they have to be written as &quot;,
     otherwise the attribute value ends early and the file is not well-formed.
     The value must stay on a single line: line breaks inside an attribute
     value are turned into spaces by the parser. -->
<operator activated="true"
          class="text:create_document"
          compatibility="5.3.001"
          expanded="true"
          height="60"
          name="Create Document (2)"
          width="90"
          x="246"
          y="345">
  <parameter key="text" value=" CHICAGO, March 16 (Reuters) - U.S. farmers were expected to plant 91.291 million acres (36.95 million hectares) of corn, 77.193 million acres of soybeans and 57.435 million acres of wheat, according to a survey by research and brokerage firm Allendale Inc. The U.S. Agriculture Department's latest estimate calls for predicted corn acreage at 92 million, soybean acreage at 78 million and wheat acreage at 57 million. (Reporting by Mark Weinraub; Editing by Marguerita Choy) ((mark.weinraub@thomsonreuters.com; +1 312 408 8587; Reuters Messaging: mark.weinraub.@thomsonreuters.com)) ((For help: Click &quot;Contact Us&quot; in your desk top, click here [HELP] or call 1-800-738-8377 for Reuters Products and 1-888-463-3383 for Thomson products; For client training: training.americas@thomsonreuters.com ; +1 646-223-5546)) Keywords: US CROPS/ACREAGE "/>
</operator>
<!-- Turns the created document into an example set (its "example set" port
     feeds Generate ID (2)). The text attribute is named "text".
     NOTE(review): label_attribute is set to "1"; confirm that an attribute
     with that name is really intended. -->
<operator activated="true"
          class="text:documents_to_data"
          compatibility="5.3.001"
          expanded="true"
          height="76"
          name="Documents to Data (2)"
          width="90"
          x="380"
          y="345">
  <parameter key="text_attribute" value="text"/>
  <parameter key="label_attribute" value="1"/>
</operator>
<!-- Generate ID step for the input document's example set; no parameters are
     set, so all operator defaults apply. -->
<operator activated="true"
          class="generate_id"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Generate ID (2)"
          width="90"
          x="514"
          y="345"/>
<!-- Reads an ExampleSet object from a file; its output is the data the
     cluster model is applied to (see the connection to Apply Model).
     NOTE(review): object_file is an absolute path inside one Windows user
     profile, so the process only runs where that exact file exists; confirm
     or replace with a repository entry. -->
<operator activated="true"
          class="read"
          compatibility="5.3.013"
          expanded="true"
          height="60"
          name="Read"
          width="90"
          x="45"
          y="300">
  <parameter key="object_file" value="C:\Users\Susann\Documents\Repository_Ausgang\Input2"/>
  <parameter key="io_object" value="ExampleSet"/>
</operator>
<!-- Vectorises the input document: Term Frequency vectors, original text
     kept, attribute "text" weighted 1.0.
     Inner pipeline, in order: tokenize, transform cases, filter English
     stopwords, Porter stemming, term n-grams up to length 3.
     NOTE(review): no connect element in this process feeds a word list into
     this operator, so the term attributes are presumably derived from this
     one document only; confirm they line up with the attributes of the
     reference set used by Cross Distances. -->
<operator activated="true"
          class="text:process_document_from_data"
          compatibility="5.3.001"
          expanded="true"
          height="76"
          name="Process Documents from Data (2)"
          width="90"
          x="648"
          y="345">
  <parameter key="vector_creation" value="Term Frequency"/>
  <parameter key="keep_text" value="true"/>
  <parameter key="select_attributes_and_weights" value="true"/>
  <list key="specify_weights">
    <parameter key="text" value="1.0"/>
  </list>
  <process expanded="true">
    <operator activated="true"
              class="text:tokenize"
              compatibility="5.3.001"
              expanded="true"
              height="60"
              name="Tokenize (2)"
              width="90"
              x="45"
              y="30"/>
    <operator activated="true"
              class="text:transform_cases"
              compatibility="5.3.001"
              expanded="true"
              height="60"
              name="Transform Cases (3)"
              width="90"
              x="179"
              y="30"/>
    <operator activated="true"
              class="text:filter_stopwords_english"
              compatibility="5.3.001"
              expanded="true"
              height="60"
              name="Filter Stopwords (3)"
              width="90"
              x="313"
              y="30"/>
    <operator activated="true"
              class="text:stem_porter"
              compatibility="5.3.001"
              expanded="true"
              height="60"
              name="Stem (3)"
              width="90"
              x="447"
              y="30"/>
    <operator activated="true"
              class="text:generate_n_grams_terms"
              compatibility="5.3.001"
              expanded="true"
              height="60"
              name="Generate n-Grams (Terms)"
              width="90"
              x="581"
              y="30">
      <parameter key="max_length" value="3"/>
    </operator>
    <!-- Linear chain from the document source port to sink port "document 1". -->
    <connect from_port="document" to_op="Tokenize (2)" to_port="document"/>
    <connect from_op="Tokenize (2)" from_port="document" to_op="Transform Cases (3)" to_port="document"/>
    <connect from_op="Transform Cases (3)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
    <connect from_op="Filter Stopwords (3)" from_port="document" to_op="Stem (3)" to_port="document"/>
    <connect from_op="Stem (3)" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
    <connect from_op="Generate n-Grams (Terms)" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
</operator>
<!-- Retrieves the repository entry "Test_Output"; its output starts the
     collection branch (Sample, Select Attributes, Clustering). -->
<operator activated="true"
          class="retrieve"
          compatibility="5.3.013"
          expanded="true"
          height="60"
          name="Retrieve Test_Output"
          width="90"
          x="45"
          y="30">
  <parameter key="repository_entry" value="Test_Output"/>
</operator>
<!-- Sampling step on the retrieved collection. Only the three per-class lists
     are present and all of them are empty; every other parameter is left at
     the operator default. -->
<operator activated="true"
          class="sample"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Sample"
          width="90"
          x="179"
          y="30">
  <list key="sample_size_per_class"/>
  <list key="sample_ratio_per_class"/>
  <list key="sample_probability_per_class"/>
</operator>
<!-- Single-attribute filter on TREND with inverted selection, i.e. everything
     except TREND is passed on to Clustering; special attributes are included
     in the filter. -->
<operator activated="true"
          class="select_attributes"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Select Attributes"
          width="90"
          x="313"
          y="30">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="TREND"/>
  <parameter key="invert_selection" value="true"/>
  <parameter key="include_special_attributes" value="true"/>
</operator>
<!-- k-means clustering of the collection with k = 10 and numerical measures.
     Its cluster model goes to Apply Model, its clustered set to Join. -->
<operator activated="true"
          class="k_means"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Clustering"
          width="90"
          x="447"
          y="30">
  <parameter key="k" value="10"/>
  <parameter key="measure_types" value="NumericalMeasures"/>
</operator>
<!-- Applies the cluster model from Clustering to the example set delivered by
     Read; no application parameters are set. -->
<operator activated="true"
          class="apply_model"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Apply Model"
          width="90"
          x="179"
          y="165">
  <list key="application_parameters"/>
</operator>
<!-- Joins the clustered collection (left) with the clustered input document
     (right) on the attribute "cluster" instead of the ID attribute.
     join_type is not set, so the operator default applies.
     NOTE(review): with remove_double_attributes=false the joined rows
     presumably also carry the attributes of the right-hand (input document)
     side; confirm that these do not end up in the reference vectors handed
     to Cross Distances. -->
<operator activated="true"
          class="join"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Join"
          width="90"
          x="313"
          y="165">
  <parameter key="remove_double_attributes" value="false"/>
  <parameter key="use_id_attribute_as_key" value="false"/>
  <list key="key_attributes">
    <parameter key="cluster" value="cluster"/>
  </list>
</operator>
<!-- Single-attribute filter on "cluster" with inverted selection, i.e. the
     cluster attribute is dropped from the join result before it is used as
     the reference set; special attributes are included in the filter. -->
<operator activated="true"
          class="select_attributes"
          compatibility="5.3.013"
          expanded="true"
          height="76"
          name="Select Attributes (2)"
          width="90"
          x="447"
          y="165">
  <parameter key="attribute_filter_type" value="single"/>
  <parameter key="attribute" value="cluster"/>
  <parameter key="invert_selection" value="true"/>
  <parameter key="include_special_attributes" value="true"/>
</operator>
<!-- Compares the request set (vectorised input document) with the reference
     set (documents of the matching cluster) using CosineSimilarity from the
     numerical measures, with only_top_k enabled and k = 1.
     NOTE(review): the measure presumably needs both sets to expose the same
     regular attributes (same names, same order); confirm that the term
     attributes of request and reference set actually match, otherwise the
     computed values may be meaningless. -->
<operator activated="true"
          class="cross_distances"
          compatibility="5.3.013"
          expanded="true"
          height="94"
          name="Cross Distances"
          width="90"
          x="648"
          y="165">
  <parameter key="measure_types" value="NumericalMeasures"/>
  <parameter key="numerical_measure" value="CosineSimilarity"/>
  <parameter key="only_top_k" value="true"/>
  <parameter key="k" value="1"/>
</operator>
<!-- Top-level wiring of the process. The order of the connect elements is
     left untouched. -->
<!-- Input document branch: created document, example set, ID, term vector. -->
<connect from_op="Create Document (2)" from_port="output" to_op="Documents to Data (2)" to_port="documents 1"/>
<connect from_op="Documents to Data (2)" from_port="example set" to_op="Generate ID (2)" to_port="example set input"/>
<connect from_op="Generate ID (2)" from_port="example set output" to_op="Process Documents from Data (2)" to_port="example set"/>
<!-- The example set loaded by Read is the unlabelled data for the cluster model. -->
<connect from_op="Read" from_port="output" to_op="Apply Model" to_port="unlabelled data"/>
<!-- The vectorised input document is the request set of Cross Distances. -->
<connect from_op="Process Documents from Data (2)" from_port="example set" to_op="Cross Distances" to_port="request set"/>
<!-- Collection branch: retrieve, sample, drop TREND, cluster. -->
<connect from_op="Retrieve Test_Output" from_port="output" to_op="Sample" to_port="example set input"/>
<connect from_op="Sample" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Clustering" to_port="example set"/>
<!-- The cluster model is applied to the input; the clustered collection (left)
     and the clustered input (right) are joined, then "cluster" is removed. -->
<connect from_op="Clustering" from_port="cluster model" to_op="Apply Model" to_port="model"/>
<connect from_op="Clustering" from_port="clustered set" to_op="Join" to_port="left"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Join" to_port="right"/>
<connect from_op="Join" from_port="join" to_op="Select Attributes (2)" to_port="example set input"/>
<!-- The filtered join result is the reference set of Cross Distances. -->
<connect from_op="Select Attributes (2)" from_port="example set output" to_op="Cross Distances" to_port="reference set"/>
<!-- Process results: distance result, then the request and reference sets as
     passed through by Cross Distances. -->
<connect from_op="Cross Distances" from_port="result set" to_port="result 1"/>
<connect from_op="Cross Distances" from_port="request set" to_port="result 2"/>
<connect from_op="Cross Distances" from_port="reference set" to_port="result 3"/>
<!-- Port spacing entries for the process source and result sink ports. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>