Turn values from JSON row into attribute
ogjtech
New Altair Community Member
In my ExampleSet (data from an Excel sheet) I have one attribute named 'amenities' that holds massive JSON arrays of values, like so:
{TV,"Wireless Internet","Air conditioning","Pets live on this property",Cat(s),"Indoor fireplace",Heating,"Smoke detector","Carbon monoxide detector",Essentials}
As you can see from the example above, many of these rows share the same values. I am trying to run a polynomial regression, so every attribute must contain only numerical values. I want to turn each distinct value in these JSON rows into its own attribute, holding 1 if the value appears in the record's amenities attribute and 0 otherwise, then remove the amenities attribute from the ExampleSet and run the regression.
Normally I would just place a Nominal to Numerical operator, but the rows consist of endlessly different values in no particular order.
I have browsed the internet all over for tutorials and answers but nothing has seemed to do the trick for me.
Attached is the process I have so far and the dataset, doing some data prepping for the other attributes in the Excel file and a ready made polynomial regression operator for the end result.
<?xml version="1.0" encoding="UTF-8"?>
<!-- RapidMiner process: retrieve the airbnb data, filter the examples, prepare the
     attributes in a subprocess, split 70/30, train a polynomial regression on the
     first partition and apply the model to the second. -->
<process version="9.3.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Load the stored ExampleSet from the local repository. -->
      <operator activated="true" class="retrieve" compatibility="9.3.001" expanded="true" height="68" name="Retrieve airbnb" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Local Repository/data/airbnb"/>
      </operator>
      <!-- Keep only examples that satisfy every condition below (filters_logic_and is true). -->
      <operator activated="true" class="filter_examples" compatibility="9.3.001" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="34">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="city.equals.NYC"/>
          <parameter key="filters_entry_key" value="number_of_reviews.ge.5"/>
          <parameter key="filters_entry_key" value="city.is_not_missing."/>
          <parameter key="filters_entry_key" value="number_of_reviews.is_not_missing."/>
          <parameter key="filters_entry_key" value="property_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="room_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="amenities.is_not_missing."/>
          <parameter key="filters_entry_key" value="accommodates.is_not_missing."/>
          <parameter key="filters_entry_key" value="bathrooms.is_not_missing."/>
          <parameter key="filters_entry_key" value="bed_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="cancellation_policy.is_not_missing."/>
          <parameter key="filters_entry_key" value="cleaning_fee.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_has_profile_pic.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_identity_verified.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_response_rate.is_not_missing."/>
          <parameter key="filters_entry_key" value="instant_bookable.is_not_missing."/>
          <parameter key="filters_entry_key" value="neighbourhood.is_not_missing."/>
          <parameter key="filters_entry_key" value="number_of_reviews.is_not_missing."/>
          <parameter key="filters_entry_key" value="review_scores_rating.is_not_missing."/>
          <parameter key="filters_entry_key" value="bedrooms.is_not_missing."/>
          <parameter key="filters_entry_key" value="beds.is_not_missing."/>
          <parameter key="filters_entry_key" value="review_scores_rating.ge.80"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>
      <!-- Data preparation: select the modelling attributes, mark the label,
           dummy-code the nominal attributes and drop the listed dummy columns. -->
      <operator activated="true" class="subprocess" compatibility="9.3.001" expanded="true" height="82" name="Subprocess" width="90" x="313" y="34">
        <process expanded="true">
          <!-- Keep only the attributes named in the subset. -->
          <operator activated="true" class="select_attributes" compatibility="9.3.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="accommodates|amenities|bathrooms|bed_type|bedrooms|beds|cancellation_policy|cleaning_fee|host_has_profile_pic|host_identity_verified|host_response_rate|instant_bookable|number_of_reviews|property_type|review_scores_rating|room_type"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- review_scores_rating becomes the label, i.e. the regression target. -->
          <operator activated="true" class="set_role" compatibility="9.3.001" expanded="true" height="82" name="Set Role" width="90" x="246" y="34">
            <parameter key="attribute_name" value="review_scores_rating"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Dummy-code the listed nominal attributes.
               NOTE(review): amenities is part of this subset, so each distinct whole
               amenities string is coded as one value; the individual amenities inside
               the string are not separated by this operator. -->
          <operator activated="true" class="nominal_to_numerical" compatibility="9.3.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="581" y="34">
            <parameter key="return_preprocessing_model" value="false"/>
            <parameter key="create_view" value="false"/>
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="bed_type|cancellation_policy|property_type|room_type|amenities|cleaning_fee|host_has_profile_pic|host_identity_verified|instant_bookable"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="coding_type" value="dummy coding"/>
            <parameter key="use_comparison_groups" value="false"/>
            <list key="comparison_groups"/>
            <parameter key="unexpected_value_handling" value="all 0 and warning"/>
            <parameter key="use_underscore_in_name" value="true"/>
          </operator>
          <!-- invert_selection is true, so the dummy columns listed here are removed
               and everything else is kept. The ampersand in the "Bed & Breakfast"
               column name has to be written as an entity reference inside the
               attribute value, otherwise the file is not well-formed XML. -->
          <operator activated="true" class="select_attributes" compatibility="9.3.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="849" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="bed_type_Airbed|bed_type_Couch|cancellation_policy_long_term|cancellation_policy_super_strict_30|cancellation_policy_super_strict_60|property_type_Bed &amp; Breakfast|property_type_Boat|property_type_Boutique hotel|property_type_Bungalow|property_type_Cabin|property_type_Camper/RV|property_type_Castle|property_type_Cave|property_type_Chalet|property_type_Dorm|property_type_Earth House|property_type_Guest suite|property_type_Guesthouse|property_type_Hostel|property_type_In-law|property_type_Serviced apartment|property_type_Tent|property_type_Timeshare|property_type_Tipi|property_type_Train|property_type_Treehouse|property_type_Vacation home|property_type_Villa|property_type_Yurt|room_type_Shared room"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <connect from_port="in 1" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
          <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Two partitions with ratios 0.7 and 0.3, shuffled sampling. -->
      <operator activated="true" class="split_data" compatibility="9.3.001" expanded="true" height="103" name="Split Data" width="90" x="447" y="34">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.7"/>
          <parameter key="ratio" value="0.3"/>
        </enumeration>
        <parameter key="sampling_type" value="shuffled sampling"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Trained on partition 1 (see the connections below). -->
      <operator activated="true" class="polynomial_regression" compatibility="9.3.001" expanded="true" height="82" name="Polynomial Regression" width="90" x="648" y="34">
        <parameter key="max_iterations" value="5000"/>
        <parameter key="replication_factor" value="1"/>
        <parameter key="max_degree" value="5"/>
        <parameter key="min_coefficient" value="-100.0"/>
        <parameter key="max_coefficient" value="100.0"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Applies the trained model to partition 2. -->
      <operator activated="true" class="apply_model" compatibility="9.3.001" expanded="true" height="82" name="Apply Model" width="90" x="715" y="238">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <connect from_op="Retrieve airbnb" from_port="output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Subprocess" to_port="in 1"/>
      <connect from_op="Subprocess" from_port="out 1" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Polynomial Regression" to_port="training set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Polynomial Regression" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:
1
Best Answer
-
Hi,
when doing this kind of operation, you usually split your data into different tables. The one that you'll de-pivot and pivot will only have an Id and the attribute you're splitting up. Then you work on this table and join back the results.
For Pivot you need to select the Id as the "group by" attribute and the Amenity as the "column grouping attribute". Then you calculate the count of records, the result will be 1 or missing. You replace the missings by 0 for example, and you might want to rename the attributes automatically so that they don't have the redundant part anymore.
<?xml version="1.0" encoding="UTF-8"?>
<!-- RapidMiner process: read the airbnb Excel sheet, filter the examples, turn the
     amenities string into one 0/1 column per amenity inside the subprocess, join
     that back to the remaining attributes by id, then split 70/30 and run the
     polynomial regression. -->
<process version="9.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.2.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Reads sheet 1 of the Excel file; the path is local to the answer's author
           and has to be adapted. -->
      <operator activated="true" class="read_excel" compatibility="9.2.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
        <parameter key="excel_file" value="/home/barany/Downloads/airbnb.xlsx"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="id.true.integer.attribute"/>
          <parameter key="1" value="property_type.true.polynominal.attribute"/>
          <parameter key="2" value="room_type.true.polynominal.attribute"/>
          <parameter key="3" value="amenities.true.polynominal.attribute"/>
          <parameter key="4" value="accommodates.true.integer.attribute"/>
          <parameter key="5" value="bathrooms.true.real.attribute"/>
          <parameter key="6" value="bed_type.true.polynominal.attribute"/>
          <parameter key="7" value="cancellation_policy.true.polynominal.attribute"/>
          <parameter key="8" value="cleaning_fee.true.polynominal.attribute"/>
          <parameter key="9" value="city.true.polynominal.attribute"/>
          <parameter key="10" value="description.true.polynominal.attribute"/>
          <parameter key="11" value="first_review.true.date.attribute"/>
          <parameter key="12" value="host_has_profile_pic.true.polynominal.attribute"/>
          <parameter key="13" value="host_identity_verified.true.polynominal.attribute"/>
          <parameter key="14" value="host_response_rate.true.real.attribute"/>
          <parameter key="15" value="host_since.true.date.attribute"/>
          <parameter key="16" value="instant_bookable.true.polynominal.attribute"/>
          <parameter key="17" value="last_review.true.date.attribute"/>
          <parameter key="18" value="latitude.true.real.attribute"/>
          <parameter key="19" value="longitude.true.real.attribute"/>
          <parameter key="20" value="name.true.polynominal.attribute"/>
          <parameter key="21" value="neighbourhood.true.polynominal.attribute"/>
          <parameter key="22" value="number_of_reviews.true.integer.attribute"/>
          <parameter key="23" value="review_scores_rating.true.integer.attribute"/>
          <parameter key="24" value="thumbnail_url.true.polynominal.attribute"/>
          <parameter key="25" value="zipcode.true.integer.attribute"/>
          <parameter key="26" value="bedrooms.true.integer.attribute"/>
          <parameter key="27" value="beds.true.integer.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- Keep only examples that satisfy every condition below (filters_logic_and is true). -->
      <operator activated="true" class="filter_examples" compatibility="9.2.001" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="34">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="city.equals.NYC"/>
          <parameter key="filters_entry_key" value="number_of_reviews.ge.5"/>
          <parameter key="filters_entry_key" value="city.is_not_missing."/>
          <parameter key="filters_entry_key" value="number_of_reviews.is_not_missing."/>
          <parameter key="filters_entry_key" value="property_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="room_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="amenities.is_not_missing."/>
          <parameter key="filters_entry_key" value="accommodates.is_not_missing."/>
          <parameter key="filters_entry_key" value="bathrooms.is_not_missing."/>
          <parameter key="filters_entry_key" value="bed_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="cancellation_policy.is_not_missing."/>
          <parameter key="filters_entry_key" value="cleaning_fee.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_has_profile_pic.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_identity_verified.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_response_rate.is_not_missing."/>
          <parameter key="filters_entry_key" value="instant_bookable.is_not_missing."/>
          <parameter key="filters_entry_key" value="neighbourhood.is_not_missing."/>
          <parameter key="filters_entry_key" value="number_of_reviews.is_not_missing."/>
          <parameter key="filters_entry_key" value="review_scores_rating.is_not_missing."/>
          <parameter key="filters_entry_key" value="bedrooms.is_not_missing."/>
          <parameter key="filters_entry_key" value="beds.is_not_missing."/>
          <parameter key="filters_entry_key" value="review_scores_rating.ge.80"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>
      <operator activated="true" class="subprocess" compatibility="9.2.001" expanded="true" height="82" name="Subprocess" width="90" x="313" y="34">
        <process expanded="true">
          <!-- Branch point: the "example set output" port carries only id and amenities
               into the amenity pipeline; the "original" port carries the unchanged
               input to Set Role. -->
          <operator activated="true" class="select_attributes" compatibility="9.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="amenities|id"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- The replace_what pattern is a character class matching the braces and the
               double quote. The quote has to be written as an entity reference because
               the attribute value itself is delimited by double quotes. No replacement
               text is set here; presumably the matches are removed (confirm against the
               operator's default). -->
          <operator activated="true" class="replace" compatibility="9.2.001" expanded="true" height="82" name="Replace" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="amenities"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="replace_what" value="[{}&quot;]"/>
          </operator>
          <!-- Split the amenities attribute on the comma. -->
          <operator activated="true" class="split" compatibility="9.2.001" expanded="true" height="82" name="Split" width="90" x="313" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="amenities"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="split_pattern" value=","/>
            <parameter key="split_mode" value="ordered_split"/>
          </operator>
          <!-- Fold every attribute matching amenities.* into a single Amenity attribute,
               with nr as the index attribute; missing values are not kept. -->
          <operator activated="true" class="de_pivot" compatibility="9.2.001" expanded="true" height="82" name="De-Pivot" width="90" x="447" y="34">
            <list key="attribute_name">
              <parameter key="Amenity" value="amenities.*"/>
            </list>
            <parameter key="index_attribute" value="nr"/>
            <parameter key="create_nominal_index" value="false"/>
            <parameter key="keep_missings" value="false"/>
          </operator>
          <!-- Group by id, one column per Amenity value, aggregating nr with count. -->
          <operator activated="true" class="blending:pivot" compatibility="9.2.001" expanded="true" height="82" name="Pivot" width="90" x="581" y="34">
            <parameter key="group_by_attributes" value="id"/>
            <parameter key="column_grouping_attribute" value="Amenity"/>
            <list key="aggregation_attributes">
              <parameter key="nr" value="count"/>
            </list>
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="default_aggregation_function" value="first"/>
          </operator>
          <!-- Missing values in all attributes are replaced by zero. -->
          <operator activated="true" class="replace_missing_values" compatibility="9.2.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="715" y="34">
            <parameter key="return_preprocessing_model" value="false"/>
            <parameter key="create_view" value="false"/>
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="default" value="zero"/>
            <list key="columns"/>
          </operator>
          <!-- Strips the part of each attribute name that matches the replace_what pattern
               (the redundant prefix mentioned in the answer text). -->
          <operator activated="true" class="rename_by_replacing" compatibility="9.2.001" expanded="true" height="82" name="Rename by Replacing" width="90" x="849" y="34">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="replace_what" value="count.nr._ *"/>
          </operator>
          <operator activated="true" class="set_role" compatibility="9.2.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="187">
            <parameter key="attribute_name" value="review_scores_rating"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
            <description align="center" color="transparent" colored="false" width="126">Make review_score_rating prediction value</description>
          </operator>
          <!-- Deactivated (activated is false) and not connected in this process. -->
          <operator activated="false" breakpoints="after" class="nominal_to_numerical" compatibility="9.2.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="380" y="238">
            <parameter key="return_preprocessing_model" value="false"/>
            <parameter key="create_view" value="false"/>
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="bed_type|cancellation_policy|property_type|room_type|amenities|cleaning_fee|host_has_profile_pic|host_identity_verified|instant_bookable"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="coding_type" value="dummy coding"/>
            <parameter key="use_comparison_groups" value="false"/>
            <list key="comparison_groups"/>
            <parameter key="unexpected_value_handling" value="all 0 and warning"/>
            <parameter key="use_underscore_in_name" value="true"/>
            <description align="center" color="transparent" colored="false" width="126">Change all textual attributes to booleans</description>
          </operator>
          <!-- invert_selection is true, so the columns listed here are removed. The
               attribute list is kept on one line: a line break inside the value would be
               turned into an extra space and change a column name. -->
          <operator activated="true" class="select_attributes" compatibility="9.2.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="514" y="187">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="bed_type_Airbed|bed_type_Couch|cancellation_policy_long_term|cancellation_policy_super_strict_30|cancellation_policy_super_strict_60|property_type_Bed &amp; Breakfast|property_type_Boat|property_type_Boutique hotel|property_type_Bungalow|property_type_Cabin|property_type_Camper/RV|property_type_Castle|property_type_Cave|property_type_Chalet|property_type_Dorm|property_type_Earth House|property_type_Guest suite|property_type_Guesthouse|property_type_Hostel|property_type_In-law|property_type_Serviced apartment|property_type_Tent|property_type_Timeshare|property_type_Tipi|property_type_Train|property_type_Treehouse|property_type_Vacation home|property_type_Villa|property_type_Yurt|room_type_Shared room"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="false"/>
            <description align="center" color="transparent" colored="false" width="126">Filter out (almost)<br/>never used labels<br/></description>
          </operator>
          <!-- Inner join on id: the amenity columns arrive on the left port, the remaining
               attributes on the right port. -->
          <operator activated="true" breakpoints="after" class="concurrency:join" compatibility="9.2.001" expanded="true" height="82" name="Join" width="90" x="849" y="187">
            <parameter key="remove_double_attributes" value="true"/>
            <parameter key="join_type" value="inner"/>
            <parameter key="use_id_attribute_as_key" value="false"/>
            <list key="key_attributes">
              <parameter key="id" value="id"/>
            </list>
            <parameter key="keep_both_join_attributes" value="false"/>
          </operator>
          <connect from_port="in 1" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Replace" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="original" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Replace" from_port="example set output" to_op="Split" to_port="example set input"/>
          <connect from_op="Split" from_port="example set output" to_op="De-Pivot" to_port="example set input"/>
          <connect from_op="De-Pivot" from_port="example set output" to_op="Pivot" to_port="input"/>
          <connect from_op="Pivot" from_port="output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_op="Rename by Replacing" to_port="example set input"/>
          <connect from_op="Rename by Replacing" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Join" to_port="right"/>
          <connect from_op="Join" from_port="join" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Data preparation<br/></description>
      </operator>
      <!-- Two partitions with ratios 0.7 and 0.3, shuffled sampling. -->
      <operator activated="true" class="split_data" compatibility="9.2.001" expanded="true" height="103" name="Split Data" width="90" x="447" y="34">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.7"/>
          <parameter key="ratio" value="0.3"/>
        </enumeration>
        <parameter key="sampling_type" value="shuffled sampling"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <operator activated="true" class="polynomial_regression" compatibility="9.2.001" expanded="true" height="82" name="Polynomial Regression" width="90" x="648" y="34">
        <parameter key="max_iterations" value="5000"/>
        <parameter key="replication_factor" value="1"/>
        <parameter key="max_degree" value="5"/>
        <parameter key="min_coefficient" value="-100.0"/>
        <parameter key="max_coefficient" value="100.0"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <operator activated="true" class="apply_model" compatibility="9.2.001" expanded="true" height="82" name="Apply Model" width="90" x="715" y="238">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Subprocess" to_port="in 1"/>
      <connect from_op="Subprocess" from_port="out 1" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Polynomial Regression" to_port="training set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Polynomial Regression" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Regards,
Balázs10
Answers
-
Hi!
I guess you're trying to process the "amenities" attribute. It seems similar to JSON but it's not.
Just converting the entire field to numerical won't work. What you want to do is:
1. Strip the { and } characters from the field (e.g. with the Replace operator)
2. Split the amenities attribute on the , character
3. De-Pivot the newly generated amenities_1-n attributes to get a list of amenities for each object
Example:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: read the airbnb Excel sheet, filter the rows, then inside
  the "Data preparation" subprocess split the 'amenities' string into separate
  attributes and de-pivot them into one row per amenity.
  Breakpoints are set after Split, De-Pivot, Set Role and Nominal to Numerical.
-->
<process version="9.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.2.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- NOTE(review): excel_file is an absolute path on the author's machine; adjust before running. -->
      <operator activated="true" class="read_excel" compatibility="9.2.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
        <parameter key="excel_file" value="/home/barany/Downloads/airbnb.xlsx"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="id.true.integer.attribute"/>
          <parameter key="1" value="property_type.true.polynominal.attribute"/>
          <parameter key="2" value="room_type.true.polynominal.attribute"/>
          <parameter key="3" value="amenities.true.polynominal.attribute"/>
          <parameter key="4" value="accommodates.true.integer.attribute"/>
          <parameter key="5" value="bathrooms.true.real.attribute"/>
          <parameter key="6" value="bed_type.true.polynominal.attribute"/>
          <parameter key="7" value="cancellation_policy.true.polynominal.attribute"/>
          <parameter key="8" value="cleaning_fee.true.polynominal.attribute"/>
          <parameter key="9" value="city.true.polynominal.attribute"/>
          <parameter key="10" value="description.true.polynominal.attribute"/>
          <parameter key="11" value="first_review.true.date.attribute"/>
          <parameter key="12" value="host_has_profile_pic.true.polynominal.attribute"/>
          <parameter key="13" value="host_identity_verified.true.polynominal.attribute"/>
          <parameter key="14" value="host_response_rate.true.real.attribute"/>
          <parameter key="15" value="host_since.true.date.attribute"/>
          <parameter key="16" value="instant_bookable.true.polynominal.attribute"/>
          <parameter key="17" value="last_review.true.date.attribute"/>
          <parameter key="18" value="latitude.true.real.attribute"/>
          <parameter key="19" value="longitude.true.real.attribute"/>
          <parameter key="20" value="name.true.polynominal.attribute"/>
          <parameter key="21" value="neighbourhood.true.polynominal.attribute"/>
          <parameter key="22" value="number_of_reviews.true.integer.attribute"/>
          <parameter key="23" value="review_scores_rating.true.integer.attribute"/>
          <parameter key="24" value="thumbnail_url.true.polynominal.attribute"/>
          <parameter key="25" value="zipcode.true.integer.attribute"/>
          <parameter key="26" value="bedrooms.true.integer.attribute"/>
          <parameter key="27" value="beds.true.integer.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- All filter entries are combined with AND (filters_logic_and = true). -->
      <operator activated="true" class="filter_examples" compatibility="9.2.001" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="34">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="city.equals.NYC"/>
          <parameter key="filters_entry_key" value="number_of_reviews.ge.5"/>
          <parameter key="filters_entry_key" value="city.is_not_missing."/>
          <parameter key="filters_entry_key" value="number_of_reviews.is_not_missing."/>
          <parameter key="filters_entry_key" value="property_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="room_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="amenities.is_not_missing."/>
          <parameter key="filters_entry_key" value="accommodates.is_not_missing."/>
          <parameter key="filters_entry_key" value="bathrooms.is_not_missing."/>
          <parameter key="filters_entry_key" value="bed_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="cancellation_policy.is_not_missing."/>
          <parameter key="filters_entry_key" value="cleaning_fee.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_has_profile_pic.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_identity_verified.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_response_rate.is_not_missing."/>
          <parameter key="filters_entry_key" value="instant_bookable.is_not_missing."/>
          <parameter key="filters_entry_key" value="neighbourhood.is_not_missing."/>
          <parameter key="filters_entry_key" value="number_of_reviews.is_not_missing."/>
          <parameter key="filters_entry_key" value="review_scores_rating.is_not_missing."/>
          <parameter key="filters_entry_key" value="bedrooms.is_not_missing."/>
          <parameter key="filters_entry_key" value="beds.is_not_missing."/>
          <parameter key="filters_entry_key" value="review_scores_rating.ge.80"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>
      <operator activated="true" class="subprocess" compatibility="9.2.001" expanded="true" height="82" name="Subprocess" width="90" x="313" y="34">
        <process expanded="true">
          <!-- Keep only the 'amenities' and 'name' attributes. -->
          <operator activated="true" class="select_attributes" compatibility="9.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="amenities|name"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Step 1: match the curly braces in 'amenities' (regex [{}]); no replacement text is set,
               so the matches are presumably removed. -->
          <operator activated="true" class="replace" compatibility="9.2.001" expanded="true" height="82" name="Replace" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="amenities"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="replace_what" value="[{}]"/>
          </operator>
          <!-- Step 2: split 'amenities' on commas into the amenities_1-n attributes. -->
          <operator activated="true" breakpoints="after" class="split" compatibility="9.2.001" expanded="true" height="82" name="Split" width="90" x="313" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="amenities"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="split_pattern" value=","/>
            <parameter key="split_mode" value="ordered_split"/>
          </operator>
          <!-- Step 3: de-pivot every attribute matching amenities.* into a single 'Amenity'
               attribute with index attribute 'nr'; missing values are not kept. -->
          <operator activated="true" breakpoints="after" class="de_pivot" compatibility="9.2.001" expanded="true" height="82" name="De-Pivot" width="90" x="447" y="34">
            <list key="attribute_name">
              <parameter key="Amenity" value="amenities.*"/>
            </list>
            <parameter key="index_attribute" value="nr"/>
            <parameter key="create_nominal_index" value="false"/>
            <parameter key="keep_missings" value="false"/>
          </operator>
          <!-- NOTE(review): Select Attributes above keeps only amenities|name, so
               review_scores_rating and the attributes referenced by the following operators
               appear not to be present on this path; confirm before running past the
               De-Pivot breakpoint. -->
          <operator activated="true" breakpoints="after" class="set_role" compatibility="9.2.001" expanded="true" height="82" name="Set Role" width="90" x="581" y="34">
            <parameter key="attribute_name" value="review_scores_rating"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
            <description align="center" color="transparent" colored="false" width="126">Make review_score_rating prediction value</description>
          </operator>
          <operator activated="true" breakpoints="after" class="nominal_to_numerical" compatibility="9.2.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="715" y="34">
            <parameter key="return_preprocessing_model" value="false"/>
            <parameter key="create_view" value="false"/>
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="bed_type|cancellation_policy|property_type|room_type|amenities|cleaning_fee|host_has_profile_pic|host_identity_verified|instant_bookable"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="coding_type" value="dummy coding"/>
            <parameter key="use_comparison_groups" value="false"/>
            <list key="comparison_groups"/>
            <parameter key="unexpected_value_handling" value="all 0 and warning"/>
            <parameter key="use_underscore_in_name" value="true"/>
            <description align="center" color="transparent" colored="false" width="126">Change all textual attributes to booleans</description>
          </operator>
          <!-- invert_selection = true: the listed dummy attributes are removed, everything else is kept. -->
          <operator activated="true" class="select_attributes" compatibility="9.2.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="849" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="bed_type_Airbed|bed_type_Couch|cancellation_policy_long_term|cancellation_policy_super_strict_30|cancellation_policy_super_strict_60|property_type_Bed &amp; Breakfast|property_type_Boat|property_type_Boutique hotel|property_type_Bungalow|property_type_Cabin|property_type_Camper/RV|property_type_Castle|property_type_Cave|property_type_Chalet|property_type_Dorm|property_type_Earth House|property_type_Guest suite|property_type_Guesthouse|property_type_Hostel|property_type_In-law|property_type_Serviced apartment|property_type_Tent|property_type_Timeshare|property_type_Tipi|property_type_Train|property_type_Treehouse|property_type_Vacation home|property_type_Villa|property_type_Yurt|room_type_Shared room"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="false"/>
            <description align="center" color="transparent" colored="false" width="126">Filter out (almost)&lt;br&gt;never used labels&lt;br/&gt;</description>
          </operator>
          <connect from_port="in 1" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Replace" to_port="example set input"/>
          <connect from_op="Replace" from_port="example set output" to_op="Split" to_port="example set input"/>
          <connect from_op="Split" from_port="example set output" to_op="De-Pivot" to_port="example set input"/>
          <connect from_op="De-Pivot" from_port="example set output" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
          <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Data preparation&lt;br/&gt;</description>
      </operator>
      <!-- 70 % / 30 % shuffled split: partition 1 trains the model, partition 2 is scored by Apply Model. -->
      <operator activated="true" class="split_data" compatibility="9.2.001" expanded="true" height="103" name="Split Data" width="90" x="447" y="34">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.7"/>
          <parameter key="ratio" value="0.3"/>
        </enumeration>
        <parameter key="sampling_type" value="shuffled sampling"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <operator activated="true" class="polynomial_regression" compatibility="9.2.001" expanded="true" height="82" name="Polynomial Regression" width="90" x="648" y="34">
        <parameter key="max_iterations" value="5000"/>
        <parameter key="replication_factor" value="1"/>
        <parameter key="max_degree" value="5"/>
        <parameter key="min_coefficient" value="-100.0"/>
        <parameter key="max_coefficient" value="100.0"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <operator activated="true" class="apply_model" compatibility="9.2.001" expanded="true" height="82" name="Apply Model" width="90" x="715" y="238">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Subprocess" to_port="in 1"/>
      <connect from_op="Subprocess" from_port="out 1" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Polynomial Regression" to_port="training set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Polynomial Regression" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
After that, you could Pivot again for example. That will give you the 1/0 attributes for each amenity.
Regards,
Balázs
1 -
Hi Balázs, thanks for the reply! This has helped me turn the amenities into separate text values; however, I am still confused as to how to turn them into the 0/1 attributes like you said. I tried using the Pivot operator as you suggested, but all I'm getting out of anything I try is weird attributes with a missing value. I have also tried the Union operator to append the newly generated set to the original, then comparing the amenity value to the long string in the amenities attribute and using that as the boolean attribute for each amenity. How do I get a simple 0/1 value with the Pivot operator? I get an error when I try to run anything on the amenity attribute because it is not a numeric value. Thanks again and in advance!
Regards,
Jeroen
1
-
Hi,
when doing this kind of operation, you usually split your data into different tables. The one that you'll de-pivot and pivot will only have an Id and the attribute you're splitting up. Then you work on this table and join back the results.
For Pivot you need to select the Id as the "group by" attribute and the Amenity as the "column grouping attribute". Then you calculate the count of records, the result will be 1 or missing. You replace the missings by 0 for example, and you might want to rename the attributes automatically so that they don't have the redundant part anymore.
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: read the airbnb Excel sheet, filter the rows, then inside
  the "Data preparation" subprocess work on two branches:
    * id + amenities: Replace, Split, De-Pivot, Pivot (count per id and Amenity),
      Replace Missing Values (zero), Rename by Replacing;
    * the original example set: Set Role, Select Attributes (2).
  Both branches are joined on 'id' and passed on to Split Data, Polynomial
  Regression and Apply Model.
-->
<process version="9.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.2.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- NOTE(review): excel_file is an absolute path on the author's machine; adjust before running. -->
      <operator activated="true" class="read_excel" compatibility="9.2.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
        <parameter key="excel_file" value="/home/barany/Downloads/airbnb.xlsx"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="id.true.integer.attribute"/>
          <parameter key="1" value="property_type.true.polynominal.attribute"/>
          <parameter key="2" value="room_type.true.polynominal.attribute"/>
          <parameter key="3" value="amenities.true.polynominal.attribute"/>
          <parameter key="4" value="accommodates.true.integer.attribute"/>
          <parameter key="5" value="bathrooms.true.real.attribute"/>
          <parameter key="6" value="bed_type.true.polynominal.attribute"/>
          <parameter key="7" value="cancellation_policy.true.polynominal.attribute"/>
          <parameter key="8" value="cleaning_fee.true.polynominal.attribute"/>
          <parameter key="9" value="city.true.polynominal.attribute"/>
          <parameter key="10" value="description.true.polynominal.attribute"/>
          <parameter key="11" value="first_review.true.date.attribute"/>
          <parameter key="12" value="host_has_profile_pic.true.polynominal.attribute"/>
          <parameter key="13" value="host_identity_verified.true.polynominal.attribute"/>
          <parameter key="14" value="host_response_rate.true.real.attribute"/>
          <parameter key="15" value="host_since.true.date.attribute"/>
          <parameter key="16" value="instant_bookable.true.polynominal.attribute"/>
          <parameter key="17" value="last_review.true.date.attribute"/>
          <parameter key="18" value="latitude.true.real.attribute"/>
          <parameter key="19" value="longitude.true.real.attribute"/>
          <parameter key="20" value="name.true.polynominal.attribute"/>
          <parameter key="21" value="neighbourhood.true.polynominal.attribute"/>
          <parameter key="22" value="number_of_reviews.true.integer.attribute"/>
          <parameter key="23" value="review_scores_rating.true.integer.attribute"/>
          <parameter key="24" value="thumbnail_url.true.polynominal.attribute"/>
          <parameter key="25" value="zipcode.true.integer.attribute"/>
          <parameter key="26" value="bedrooms.true.integer.attribute"/>
          <parameter key="27" value="beds.true.integer.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- All filter entries are combined with AND (filters_logic_and = true). -->
      <operator activated="true" class="filter_examples" compatibility="9.2.001" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="34">
        <parameter key="parameter_expression" value=""/>
        <parameter key="condition_class" value="custom_filters"/>
        <parameter key="invert_filter" value="false"/>
        <list key="filters_list">
          <parameter key="filters_entry_key" value="city.equals.NYC"/>
          <parameter key="filters_entry_key" value="number_of_reviews.ge.5"/>
          <parameter key="filters_entry_key" value="city.is_not_missing."/>
          <parameter key="filters_entry_key" value="number_of_reviews.is_not_missing."/>
          <parameter key="filters_entry_key" value="property_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="room_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="amenities.is_not_missing."/>
          <parameter key="filters_entry_key" value="accommodates.is_not_missing."/>
          <parameter key="filters_entry_key" value="bathrooms.is_not_missing."/>
          <parameter key="filters_entry_key" value="bed_type.is_not_missing."/>
          <parameter key="filters_entry_key" value="cancellation_policy.is_not_missing."/>
          <parameter key="filters_entry_key" value="cleaning_fee.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_has_profile_pic.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_identity_verified.is_not_missing."/>
          <parameter key="filters_entry_key" value="host_response_rate.is_not_missing."/>
          <parameter key="filters_entry_key" value="instant_bookable.is_not_missing."/>
          <parameter key="filters_entry_key" value="neighbourhood.is_not_missing."/>
          <parameter key="filters_entry_key" value="number_of_reviews.is_not_missing."/>
          <parameter key="filters_entry_key" value="review_scores_rating.is_not_missing."/>
          <parameter key="filters_entry_key" value="bedrooms.is_not_missing."/>
          <parameter key="filters_entry_key" value="beds.is_not_missing."/>
          <parameter key="filters_entry_key" value="review_scores_rating.ge.80"/>
        </list>
        <parameter key="filters_logic_and" value="true"/>
        <parameter key="filters_check_metadata" value="true"/>
      </operator>
      <operator activated="true" class="subprocess" compatibility="9.2.001" expanded="true" height="82" name="Subprocess" width="90" x="313" y="34">
        <process expanded="true">
          <!-- Amenity branch: keep only 'amenities' and 'id'. The operator's 'original'
               port feeds the second branch (see the connections below). -->
          <operator activated="true" class="select_attributes" compatibility="9.2.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="amenities|id"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Match curly braces and double quotes in 'amenities' (regex [{}"]); no replacement
               text is set, so the matches are presumably removed. The quote must be written as
               &quot; inside the attribute value. -->
          <operator activated="true" class="replace" compatibility="9.2.001" expanded="true" height="82" name="Replace" width="90" x="179" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="amenities"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="replace_what" value="[{}&quot;]"/>
          </operator>
          <!-- Split 'amenities' on commas into one attribute per list position. -->
          <operator activated="true" class="split" compatibility="9.2.001" expanded="true" height="82" name="Split" width="90" x="313" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="amenities"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="split_pattern" value=","/>
            <parameter key="split_mode" value="ordered_split"/>
          </operator>
          <!-- De-pivot every attribute matching amenities.* into a single 'Amenity' attribute
               with index attribute 'nr'; missing values are not kept. -->
          <operator activated="true" class="de_pivot" compatibility="9.2.001" expanded="true" height="82" name="De-Pivot" width="90" x="447" y="34">
            <list key="attribute_name">
              <parameter key="Amenity" value="amenities.*"/>
            </list>
            <parameter key="index_attribute" value="nr"/>
            <parameter key="create_nominal_index" value="false"/>
            <parameter key="keep_missings" value="false"/>
          </operator>
          <!-- Pivot back: group by 'id', one column per 'Amenity' value, aggregating count of 'nr'. -->
          <operator activated="true" class="blending:pivot" compatibility="9.2.001" expanded="true" height="82" name="Pivot" width="90" x="581" y="34">
            <parameter key="group_by_attributes" value="id"/>
            <parameter key="column_grouping_attribute" value="Amenity"/>
            <list key="aggregation_attributes">
              <parameter key="nr" value="count"/>
            </list>
            <parameter key="use_default_aggregation" value="false"/>
            <parameter key="default_aggregation_function" value="first"/>
          </operator>
          <!-- Replace the missing counts by zero so each amenity column holds 0 or 1. -->
          <operator activated="true" class="replace_missing_values" compatibility="9.2.001" expanded="true" height="103" name="Replace Missing Values" width="90" x="715" y="34">
            <parameter key="return_preprocessing_model" value="false"/>
            <parameter key="create_view" value="false"/>
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="default" value="zero"/>
            <list key="columns"/>
          </operator>
          <!-- Strip the redundant aggregation prefix from the pivoted attribute names
               (regex count.nr._ *). -->
          <operator activated="true" class="rename_by_replacing" compatibility="9.2.001" expanded="true" height="82" name="Rename by Replacing" width="90" x="849" y="34">
            <parameter key="attribute_filter_type" value="all"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="replace_what" value="count.nr._ *"/>
          </operator>
          <!-- Second branch: works on the original example set. -->
          <operator activated="true" class="set_role" compatibility="9.2.001" expanded="true" height="82" name="Set Role" width="90" x="179" y="187">
            <parameter key="attribute_name" value="review_scores_rating"/>
            <parameter key="target_role" value="label"/>
            <list key="set_additional_roles"/>
            <description align="center" color="transparent" colored="false" width="126">Make review_score_rating prediction value</description>
          </operator>
          <!-- Deactivated (activated="false") and not connected in this version of the process. -->
          <operator activated="false" breakpoints="after" class="nominal_to_numerical" compatibility="9.2.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="380" y="238">
            <parameter key="return_preprocessing_model" value="false"/>
            <parameter key="create_view" value="false"/>
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="bed_type|cancellation_policy|property_type|room_type|amenities|cleaning_fee|host_has_profile_pic|host_identity_verified|instant_bookable"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
            <parameter key="coding_type" value="dummy coding"/>
            <parameter key="use_comparison_groups" value="false"/>
            <list key="comparison_groups"/>
            <parameter key="unexpected_value_handling" value="all 0 and warning"/>
            <parameter key="use_underscore_in_name" value="true"/>
            <description align="center" color="transparent" colored="false" width="126">Change all textual attributes to booleans</description>
          </operator>
          <!-- invert_selection = true: the listed attributes are removed, everything else is kept. -->
          <operator activated="true" class="select_attributes" compatibility="9.2.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="514" y="187">
            <parameter key="attribute_filter_type" value="subset"/>
            <parameter key="attribute" value=""/>
            <parameter key="attributes" value="bed_type_Airbed|bed_type_Couch|cancellation_policy_long_term|cancellation_policy_super_strict_30|cancellation_policy_super_strict_60|property_type_Bed &amp; Breakfast|property_type_Boat|property_type_Boutique hotel|property_type_Bungalow|property_type_Cabin|property_type_Camper/RV|property_type_Castle|property_type_Cave|property_type_Chalet|property_type_Dorm|property_type_Earth House|property_type_Guest suite|property_type_Guesthouse|property_type_Hostel|property_type_In-law|property_type_Serviced apartment|property_type_Tent|property_type_Timeshare|property_type_Tipi|property_type_Train|property_type_Treehouse|property_type_Vacation home|property_type_Villa|property_type_Yurt|room_type_Shared room"/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="attribute_value"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="time"/>
            <parameter key="block_type" value="attribute_block"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="value_matrix_row_start"/>
            <parameter key="invert_selection" value="true"/>
            <parameter key="include_special_attributes" value="false"/>
            <description align="center" color="transparent" colored="false" width="126">Filter out (almost)&lt;br&gt;never used labels&lt;br/&gt;</description>
          </operator>
          <!-- Inner join of the 0/1 amenity table (left) with the remaining attributes (right) on 'id'. -->
          <operator activated="true" breakpoints="after" class="concurrency:join" compatibility="9.2.001" expanded="true" height="82" name="Join" width="90" x="849" y="187">
            <parameter key="remove_double_attributes" value="true"/>
            <parameter key="join_type" value="inner"/>
            <parameter key="use_id_attribute_as_key" value="false"/>
            <list key="key_attributes">
              <parameter key="id" value="id"/>
            </list>
            <parameter key="keep_both_join_attributes" value="false"/>
          </operator>
          <connect from_port="in 1" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_op="Replace" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="original" to_op="Set Role" to_port="example set input"/>
          <connect from_op="Replace" from_port="example set output" to_op="Split" to_port="example set input"/>
          <connect from_op="Split" from_port="example set output" to_op="De-Pivot" to_port="example set input"/>
          <connect from_op="De-Pivot" from_port="example set output" to_op="Pivot" to_port="input"/>
          <connect from_op="Pivot" from_port="output" to_op="Replace Missing Values" to_port="example set input"/>
          <connect from_op="Replace Missing Values" from_port="example set output" to_op="Rename by Replacing" to_port="example set input"/>
          <connect from_op="Rename by Replacing" from_port="example set output" to_op="Join" to_port="left"/>
          <connect from_op="Set Role" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
          <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Join" to_port="right"/>
          <connect from_op="Join" from_port="join" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
        <description align="center" color="transparent" colored="false" width="126">Data preparation&lt;br/&gt;</description>
      </operator>
      <!-- 70 % / 30 % shuffled split: partition 1 trains the model, partition 2 is scored by Apply Model. -->
      <operator activated="true" class="split_data" compatibility="9.2.001" expanded="true" height="103" name="Split Data" width="90" x="447" y="34">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.7"/>
          <parameter key="ratio" value="0.3"/>
        </enumeration>
        <parameter key="sampling_type" value="shuffled sampling"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <operator activated="true" class="polynomial_regression" compatibility="9.2.001" expanded="true" height="82" name="Polynomial Regression" width="90" x="648" y="34">
        <parameter key="max_iterations" value="5000"/>
        <parameter key="replication_factor" value="1"/>
        <parameter key="max_degree" value="5"/>
        <parameter key="min_coefficient" value="-100.0"/>
        <parameter key="max_coefficient" value="100.0"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <operator activated="true" class="apply_model" compatibility="9.2.001" expanded="true" height="82" name="Apply Model" width="90" x="715" y="238">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Subprocess" to_port="in 1"/>
      <connect from_op="Subprocess" from_port="out 1" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Polynomial Regression" to_port="training set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model" to_port="unlabelled data"/>
      <connect from_op="Polynomial Regression" from_port="model" to_op="Apply Model" to_port="model"/>
      <connect from_op="Apply Model" from_port="labelled data" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Regards,
Balázs10 -
Thank you, with some tinkering that did the trick for me! I just had to change some stuff up to lighten the load on the actual regression process, but I've come a long way now. Thanks again for all the help!
Regards,
Jeroen
2