Turn values from a JSON row into attributes
My ExampleSet (data from an Excel sheet) has one attribute named 'amenities' that contains large JSON arrays of values, like so:
{TV,"Wireless Internet","Air conditioning","Pets live on this property",Cat(s),"Indoor fireplace",Heating,"Smoke detector","Carbon monoxide detector",Essentials}
As the example above shows, many of these rows share the same values. I am trying to run a polynomial regression process, so every attribute must contain only numerical values. I want to turn each distinct value found in these JSON rows into its own attribute, holding 1 if the value appears in that record's amenities row and 0 if it does not. Afterwards I want to exclude the amenities attribute from the ExampleSet and run the regression.
Normally I would just use a Nominal to Numerical operator, but each row contains a different combination of values, in no particular order.
I have searched the internet for tutorials and answers, but nothing has worked for me.
Attached are the dataset and the process I have so far. The process does some data preparation for the other attributes in the Excel file and ends with a ready-made Polynomial Regression operator.
<?xml version="1.0" encoding="UTF-8"?><process version="9.3.001">
<!-- Process context: no inputs, outputs, or macros are declared here. -->
<context>
  <input/>
  <output/>
  <macros/>
</context>
<operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
<!-- Settings of the root "Process" operator: log verbosity, random seed, mail notification, and encoding. -->
<parameter key="logverbosity" value="init"/>
<!-- NOTE(review): Split Data and Polynomial Regression both set use_local_random_seed="false", so they presumably rely on this seed; confirm. -->
<parameter key="random_seed" value="2001"/>
<!-- Mail notification is switched off (send_mail="never"); the address is left empty. -->
<parameter key="send_mail" value="never"/>
<parameter key="notification_email" value=""/>
<parameter key="process_duration_for_mail" value="30"/>
<parameter key="encoding" value="SYSTEM"/>
<process expanded="true">
<!-- Retrieve: reads the "airbnb" entry stored under data/ in the Local Repository. -->
<operator activated="true"
          class="retrieve"
          compatibility="9.3.001"
          expanded="true"
          height="68"
          name="Retrieve airbnb"
          width="90"
          x="45"
          y="34">
  <parameter key="repository_entry" value="//Local Repository/data/airbnb"/>
</operator>
<!--
  Filter Examples: keeps a row only when every entry in filters_list holds
  (filters_logic_and="true"). The entries require city = NYC, at least
  5 reviews, a review score of at least 80, and a non-missing value in each
  listed attribute. Each condition is listed exactly once.
-->
<operator activated="true" class="filter_examples" compatibility="9.3.001" expanded="true" height="103" name="Filter Examples" width="90" x="179" y="34">
  <parameter key="parameter_expression" value=""/>
  <parameter key="condition_class" value="custom_filters"/>
  <parameter key="invert_filter" value="false"/>
  <list key="filters_list">
    <!-- Value conditions. -->
    <parameter key="filters_entry_key" value="city.equals.NYC"/>
    <parameter key="filters_entry_key" value="number_of_reviews.ge.5"/>
    <!-- Missing-value conditions. -->
    <parameter key="filters_entry_key" value="city.is_not_missing."/>
    <parameter key="filters_entry_key" value="number_of_reviews.is_not_missing."/>
    <parameter key="filters_entry_key" value="property_type.is_not_missing."/>
    <parameter key="filters_entry_key" value="room_type.is_not_missing."/>
    <parameter key="filters_entry_key" value="amenities.is_not_missing."/>
    <parameter key="filters_entry_key" value="accommodates.is_not_missing."/>
    <parameter key="filters_entry_key" value="bathrooms.is_not_missing."/>
    <parameter key="filters_entry_key" value="bed_type.is_not_missing."/>
    <parameter key="filters_entry_key" value="cancellation_policy.is_not_missing."/>
    <parameter key="filters_entry_key" value="cleaning_fee.is_not_missing."/>
    <parameter key="filters_entry_key" value="host_has_profile_pic.is_not_missing."/>
    <parameter key="filters_entry_key" value="host_identity_verified.is_not_missing."/>
    <parameter key="filters_entry_key" value="host_response_rate.is_not_missing."/>
    <parameter key="filters_entry_key" value="instant_bookable.is_not_missing."/>
    <parameter key="filters_entry_key" value="neighbourhood.is_not_missing."/>
    <parameter key="filters_entry_key" value="review_scores_rating.is_not_missing."/>
    <parameter key="filters_entry_key" value="bedrooms.is_not_missing."/>
    <parameter key="filters_entry_key" value="beds.is_not_missing."/>
    <!-- Value condition on the attribute that is later given the label role. -->
    <parameter key="filters_entry_key" value="review_scores_rating.ge.80"/>
  </list>
  <parameter key="filters_logic_and" value="true"/>
  <parameter key="filters_check_metadata" value="true"/>
</operator>
<!--
  Subprocess: data preparation chain wired as
  in 1 -> Select Attributes -> Set Role -> Nominal to Numerical
       -> Select Attributes (2) -> out 1.
-->
<operator activated="true" class="subprocess" compatibility="9.3.001" expanded="true" height="82" name="Subprocess" width="90" x="313" y="34">
  <process expanded="true">
    <!-- Step 1: keep only the attributes named in the "attributes" subset. -->
    <operator activated="true" class="select_attributes" compatibility="9.3.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
      <parameter key="attribute_filter_type" value="subset"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value="accommodates|amenities|bathrooms|bed_type|bedrooms|beds|cancellation_policy|cleaning_fee|host_has_profile_pic|host_identity_verified|host_response_rate|instant_bookable|number_of_reviews|property_type|review_scores_rating|room_type"/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="attribute_value"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="time"/>
      <parameter key="block_type" value="attribute_block"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_row_start"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
    </operator>
    <!-- Step 2: give review_scores_rating the label role. -->
    <operator activated="true" class="set_role" compatibility="9.3.001" expanded="true" height="82" name="Set Role" width="90" x="246" y="34">
      <parameter key="attribute_name" value="review_scores_rating"/>
      <parameter key="target_role" value="label"/>
      <list key="set_additional_roles"/>
    </operator>
    <!--
      Step 3: dummy-code the listed nominal attributes. New columns are named
      attribute_value (use_underscore_in_name="true"), e.g. bed_type_Airbed.
      NOTE(review): "amenities" is in this list, so it is presumably coded as
      one column per distinct whole amenities string rather than one column
      per individual amenity; confirm whether that is intended.
    -->
    <operator activated="true" class="nominal_to_numerical" compatibility="9.3.001" expanded="true" height="103" name="Nominal to Numerical" width="90" x="581" y="34">
      <parameter key="return_preprocessing_model" value="false"/>
      <parameter key="create_view" value="false"/>
      <parameter key="attribute_filter_type" value="subset"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value="bed_type|cancellation_policy|property_type|room_type|amenities|cleaning_fee|host_has_profile_pic|host_identity_verified|instant_bookable"/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="nominal"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="file_path"/>
      <parameter key="block_type" value="single_value"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="single_value"/>
      <parameter key="invert_selection" value="false"/>
      <parameter key="include_special_attributes" value="false"/>
      <parameter key="coding_type" value="dummy coding"/>
      <parameter key="use_comparison_groups" value="false"/>
      <list key="comparison_groups"/>
      <parameter key="unexpected_value_handling" value="all 0 and warning"/>
      <parameter key="use_underscore_in_name" value="true"/>
    </operator>
    <!--
      Step 4: invert_selection="true", so the dummy columns listed below are
      removed and everything else is kept. The ampersand in the
      "Bed and Breakfast" column name is written as an entity reference; a raw
      ampersand inside an attribute value makes the file not well-formed.
    -->
    <operator activated="true" class="select_attributes" compatibility="9.3.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="849" y="34">
      <parameter key="attribute_filter_type" value="subset"/>
      <parameter key="attribute" value=""/>
      <parameter key="attributes" value="bed_type_Airbed|bed_type_Couch|cancellation_policy_long_term|cancellation_policy_super_strict_30|cancellation_policy_super_strict_60|property_type_Bed &amp; Breakfast|property_type_Boat|property_type_Boutique hotel|property_type_Bungalow|property_type_Cabin|property_type_Camper/RV|property_type_Castle|property_type_Cave|property_type_Chalet|property_type_Dorm|property_type_Earth House|property_type_Guest suite|property_type_Guesthouse|property_type_Hostel|property_type_In-law|property_type_Serviced apartment|property_type_Tent|property_type_Timeshare|property_type_Tipi|property_type_Train|property_type_Treehouse|property_type_Vacation home|property_type_Villa|property_type_Yurt|room_type_Shared room"/>
      <parameter key="use_except_expression" value="false"/>
      <parameter key="value_type" value="attribute_value"/>
      <parameter key="use_value_type_exception" value="false"/>
      <parameter key="except_value_type" value="time"/>
      <parameter key="block_type" value="attribute_block"/>
      <parameter key="use_block_type_exception" value="false"/>
      <parameter key="except_block_type" value="value_matrix_row_start"/>
      <parameter key="invert_selection" value="true"/>
      <parameter key="include_special_attributes" value="false"/>
    </operator>
    <!-- Wiring of the four steps, from the subprocess input to its output. -->
    <connect from_port="in 1" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
    <connect from_op="Set Role" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
    <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
    <connect from_op="Select Attributes (2)" from_port="example set output" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="source_in 2" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
  </process>
</operator>
<!-- Split Data: shuffled sampling into two partitions with ratios 0.7 and 0.3. -->
<operator activated="true"
          class="split_data"
          compatibility="9.3.001"
          expanded="true"
          height="103"
          name="Split Data"
          width="90"
          x="447"
          y="34">
  <enumeration key="partitions">
    <parameter key="ratio" value="0.7"/>
    <parameter key="ratio" value="0.3"/>
  </enumeration>
  <parameter key="sampling_type" value="shuffled sampling"/>
  <!-- use_local_random_seed is "false"; a local_random_seed value of 1992 is still stored. -->
  <parameter key="use_local_random_seed" value="false"/>
  <parameter key="local_random_seed" value="1992"/>
</operator>
<!--
  Polynomial Regression learner: max_iterations 5000, replication_factor 1,
  max_degree 5, coefficients limited to the range -100.0 to 100.0.
-->
<operator activated="true"
          class="polynomial_regression"
          compatibility="9.3.001"
          expanded="true"
          height="82"
          name="Polynomial Regression"
          width="90"
          x="648"
          y="34">
  <parameter key="max_iterations" value="5000"/>
  <parameter key="replication_factor" value="1"/>
  <parameter key="max_degree" value="5"/>
  <parameter key="min_coefficient" value="-100.0"/>
  <parameter key="max_coefficient" value="100.0"/>
  <!-- use_local_random_seed is "false"; a local_random_seed value of 1992 is still stored. -->
  <parameter key="use_local_random_seed" value="false"/>
  <parameter key="local_random_seed" value="1992"/>
</operator>
<!-- Apply Model: no application parameters are set; create_view is "false". -->
<operator activated="true"
          class="apply_model"
          compatibility="9.3.001"
          expanded="true"
          height="82"
          name="Apply Model"
          width="90"
          x="715"
          y="238">
  <list key="application_parameters"/>
  <parameter key="create_view" value="false"/>
</operator>
<!-- Data preparation path: Retrieve airbnb -> Filter Examples -> Subprocess -> Split Data. -->
<connect from_op="Retrieve airbnb" from_port="output"
         to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output"
         to_op="Subprocess" to_port="in 1"/>
<connect from_op="Subprocess" from_port="out 1"
         to_op="Split Data" to_port="example set"/>
<!-- Partition 1 goes to the learner as the training set; partition 2 goes to Apply Model. -->
<connect from_op="Split Data" from_port="partition 1"
         to_op="Polynomial Regression" to_port="training set"/>
<connect from_op="Split Data" from_port="partition 2"
         to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Polynomial Regression" from_port="model"
         to_op="Apply Model" to_port="model"/>
<!-- The labelled data from Apply Model is delivered to the process port "result 1". -->
<connect from_op="Apply Model" from_port="labelled data"
         to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>