Hi,
I am taking the liberty of creating a dedicated topic for a question that was not answered in a previous topic.
In that previous topic, the goal was to calculate the similarity between "employee characteristics" and "a position".
I decided to use the Cross Distances operator, but I got weird results:
The calculated similarity is always the same, regardless of the "position" and the "employee characteristics".
I performed some tests without success, and this problem keeps running through my mind.
NB: I used the Read Excel operator to import my example sets.
You can find my process here:
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.0.001">
  <!--
    RapidMiner process: scores every employee row against the position row(s)
    with Cross Distances, then joins the scores to the employee names.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
    <process expanded="true">

      <!-- Employees: cells A1:F5 of Employees.xlsx; row 0 is annotated as the
           name row. Six columns are declared, Id_employee carrying the id role. -->
      <operator activated="true" class="read_excel" compatibility="8.0.001" expanded="true"
                height="68" name="Employees" width="90" x="45" y="85">
        <parameter key="excel_file" value="C:\Users\Lionel\Documents\Formations_DataScience\Rapidminer\Tests_Rapidminer\HR_Sourcing\Employees.xlsx"/>
        <parameter key="imported_cell_range" value="A1:F5"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Id_employee.true.integer.id"/>
          <parameter key="1" value="name.true.polynominal.attribute"/>
          <parameter key="2" value="skills.true.polynominal.attribute"/>
          <parameter key="3" value="department.true.polynominal.attribute"/>
          <parameter key="4" value="language.true.polynominal.attribute"/>
          <parameter key="5" value="experience.true.integer.attribute"/>
        </list>
      </operator>

      <!-- Duplicates the employee set: output 1 feeds the scoring branch,
           output 2 feeds the name lookup branch (see the connections below). -->
      <operator activated="true" class="multiply" compatibility="8.0.001" expanded="true"
                height="103" name="Multiply" width="90" x="179" y="85"/>

      <!-- Scoring branch: keeps the id plus the four descriptive columns;
           wired to the "reference set" port of Cross Distances. -->
      <operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true"
                height="82" name="Select Attributes" width="90" x="313" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="Id_employee|department|experience|language|skills"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>

      <!-- Lookup branch: keeps only name and Id_employee; wired to the
           "left" port of Join. -->
      <operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true"
                height="82" name="Select Attributes (3)" width="90" x="313" y="136">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="name|Id_employee"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>

      <!-- Position: same workbook as Employees, sheet_number 2, cells A1:E2;
           row 0 is annotated as the name row. Five columns are declared,
           Id_position carrying the id role. -->
      <operator activated="true" class="read_excel" compatibility="8.0.001" expanded="true"
                height="68" name="Position" width="90" x="45" y="238">
        <parameter key="excel_file" value="C:\Users\Lionel\Documents\Formations_DataScience\Rapidminer\Tests_Rapidminer\HR_Sourcing\Employees.xlsx"/>
        <parameter key="sheet_number" value="2"/>
        <parameter key="imported_cell_range" value="A1:E2"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Id_position.true.integer.id"/>
          <parameter key="1" value="skills.true.polynominal.attribute"/>
          <parameter key="2" value="department.true.polynominal.attribute"/>
          <parameter key="3" value="language.true.polynominal.attribute"/>
          <parameter key="4" value="experience.true.integer.attribute"/>
        </list>
      </operator>

      <!-- Keeps the position id plus the same four descriptive columns as the
           employee scoring branch; wired to the "request set" port of
           Cross Distances. -->
      <operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true"
                height="82" name="Select Attributes (2)" width="90" x="179" y="238">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="department|experience|language|skills|Id_position"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>

      <!-- Cosine similarity between request (position) and reference
           (employees). A breakpoint is set before this operator.
           NOTE(review): CosineSimilarity is selected from NumericalMeasures,
           while skills, department and language are declared polynominal in
           both readers and experience is the only integer column. This
           mismatch is presumably related to the constant similarity that was
           observed; confirm against the Cross Distances documentation. -->
      <operator activated="true" breakpoints="before" class="cross_distances" compatibility="8.0.001"
                expanded="true" height="103" name="Cross Distances" width="90" x="447" y="85">
        <parameter key="measure_types" value="NumericalMeasures"/>
        <parameter key="numerical_measure" value="CosineSimilarity"/>
        <parameter key="compute_similarities" value="true"/>
      </operator>

      <!-- Renames the result columns: document to Employee, request to
           position, distance to similarity. -->
      <operator activated="true" class="rename" compatibility="8.0.001" expanded="true"
                height="82" name="Rename" width="90" x="581" y="85">
        <parameter key="old_name" value="document"/>
        <parameter key="new_name" value="Employee"/>
        <list key="rename_additional_attributes">
          <parameter key="request" value="position"/>
          <parameter key="distance" value="similarity"/>
        </list>
      </operator>

      <!-- Gives the renamed Employee column the id role. -->
      <operator activated="true" class="set_role" compatibility="8.0.001" expanded="true"
                height="82" name="Set Role (3)" width="90" x="715" y="85">
        <parameter key="attribute_name" value="Employee"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>

      <!-- Joins the name lookup (left) with the scored rows (right). No key
           attributes are listed; presumably the id-role attributes of both
           inputs serve as the join key (verify in the Join operator). -->
      <operator activated="true" class="join" compatibility="8.0.001" expanded="true"
                height="82" name="Join" width="90" x="849" y="136">
        <list key="key_attributes"/>
      </operator>

      <!-- Wiring: employee branches -->
      <connect from_op="Employees" from_port="output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes (3)" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Cross Distances" to_port="reference set"/>
      <connect from_op="Select Attributes (3)" from_port="example set output" to_op="Join" to_port="left"/>

      <!-- Wiring: position branch -->
      <connect from_op="Position" from_port="output" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Cross Distances" to_port="request set"/>

      <!-- Wiring: scoring results and process outputs -->
      <connect from_op="Cross Distances" from_port="result set" to_op="Rename" to_port="example set input"/>
      <connect from_op="Cross Distances" from_port="request set" to_port="result 3"/>
      <connect from_op="Cross Distances" from_port="reference set" to_port="result 1"/>
      <connect from_op="Rename" from_port="example set output" to_op="Set Role (3)" to_port="example set input"/>
      <connect from_op="Set Role (3)" from_port="example set output" to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join" to_port="result 2"/>

      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>
My (fictitious) example sets can be downloaded from this link:
https://drive.google.com/open?id=18JFovsp_pk7l-1SNx-oeywdwzVSeG-r0
Is it a bug? If not, can you tell me what I missed or forgot?
Thank you for your responses,
Regards,
Lionel