How to use a Macro on Extract Information
Hi! I'm looking for some help with the Extract Information operator combined with macros. I have built a crawling web service with RapidMiner Server that extracts the prices of products from different pages.
The Layout is Simple.
The only thing that changes is the RegEx used to extract the information from each page.
I tried to create an ExampleSet with the domain and the rules for each field, in order to keep it simple to add new domains to be crawled, but when I use a macro in the query expression nothing happens.
Has anybody tried this approach? How could I use Set Parameters from ExampleSet with the Extract Information operator?
The Layout is Simple.
The only thing that changes is the RegEx used to extract the information from each page.
I tried to create an ExampleSet with the domain and the rules for each field, in order to keep it simple to add new domains to be crawled, but when I use a macro in the query expression nothing happens.
Has anybody tried this approach? How could I use Set Parameters from ExampleSet with the Extract Information operator?
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: product price extraction from a single web page.

  Flow, as wired by the connect elements below:
    1. "Get Page" fetches the URL held in the process macro %{url}
       (defined in the context section).
    2. "Process Documents" runs "elpalacio (2)" (Extract Information) on the
       fetched document; four regular expressions populate the attributes
       nombre, precio_n, precio_d and antes.
    3. "Generate Attributes" fills precio_n from precio_d, and precio_d from
       precio_n, whenever one of them is missing.
    4. "Parse Numbers" converts precio_n and precio_d to numbers, using "." as
       decimal character and "," as grouping character.
    5. "Select Attributes" keeps precio_d, precio_n and nombre.

  The regular expressions match literal HTML, so inside the attribute values
  every "<" is written as &lt;, every ">" as &gt; and every double quote as
  &quot;. A raw "<" or an unescaped double quote inside a double-quoted
  attribute value makes the document not well-formed.

  NOTE(review): in the three price expressions, "[$]\S" consumes exactly one
  non-whitespace character after the dollar sign before the capture group
  starts. Confirm that this is intended and that "\s" was not meant.
-->
<process version="9.4.001">
  <context>
    <input/>
    <output/>
    <macros>
      <macro>
        <key>url</key>
        <value>https://www.elpalaciodehierro.com/charm-chile-39861202.html</value>
      </macro>
    </macros>
  </context>
  <operator activated="true" class="process" compatibility="9.4.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="web:get_webpage" compatibility="9.0.000" expanded="true" height="68" name="Get Page" width="90" x="112" y="34">
        <parameter key="url" value="%{url}"/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="connection_timeout" value="10000"/>
        <parameter key="read_timeout" value="10000"/>
        <parameter key="follow_redirects" value="true"/>
        <parameter key="accept_cookies" value="all"/>
        <parameter key="cookie_scope" value="thread"/>
        <parameter key="request_method" value="GET"/>
        <list key="query_parameters"/>
        <list key="request_properties"/>
        <parameter key="override_encoding" value="false"/>
        <parameter key="encoding" value="SYSTEM"/>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="8.2.000" expanded="true" height="103" name="Process Documents" width="90" x="246" y="34">
        <parameter key="create_word_vector" value="false"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <process expanded="true">
          <operator activated="true" class="text:extract_information" compatibility="8.2.000" expanded="true" height="68" name="elpalacio (2)" width="90" x="313" y="34">
            <parameter key="query_type" value="Regular Expression"/>
            <list key="string_machting_queries"/>
            <parameter key="attribute_type" value="Nominal"/>
            <!-- One entry per extracted attribute: key = attribute name, value = escaped regular expression. -->
            <list key="regular_expression_queries">
              <parameter key="nombre" value="&lt;div class=&quot;product-name &quot;&gt;\s{1,} &lt;span class=&quot;h1&quot; &gt;(.*)&lt;/span&gt;\s{1,}&lt;/div&gt;"/>
              <parameter key="precio_n" value="&lt;span class=&quot;price&quot;&gt;[$]\S([0-9,.]{1,})&lt;/span&gt;"/>
              <parameter key="precio_d" value=" &lt;span class=&quot;ls-price-now-price price&quot;.*&quot;&gt;\s{1,}[$]\S([0-9,.]{1,})\s{1,} &lt;/span&gt;"/>
              <parameter key="antes" value="&lt;span class=&quot;ls-price-bef-price price&quot;\sid=&quot;old-price-[0-9]{1,}&quot;&gt;\s{1,}[$]\S([0-9,.]{1,})\s{1,}&lt;/span&gt;\s{1,}&lt;/p&gt;"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="true"/>
            <parameter key="assume_html" value="true"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
          </operator>
          <connect from_port="document" to_op="elpalacio (2)" to_port="document"/>
          <connect from_op="elpalacio (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="generate_attributes" compatibility="9.4.001" expanded="true" height="82" name="Generate Attributes" width="90" x="380" y="34">
        <!-- Each price attribute falls back to the other one when it is missing. -->
        <list key="function_descriptions">
          <parameter key="precio_n" value="if(missing(precio_n),precio_d,precio_n)"/>
          <parameter key="precio_d" value="if(missing(precio_d),precio_n,precio_d)"/>
        </list>
        <parameter key="keep_all" value="true"/>
      </operator>
      <operator activated="true" class="parse_numbers" compatibility="9.4.001" expanded="true" height="82" name="Parse Numbers" width="90" x="514" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="precio_n|precio_d"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
        <parameter key="decimal_character" value="."/>
        <parameter key="grouped_digits" value="true"/>
        <parameter key="grouping_character" value=","/>
        <parameter key="infinity_representation" value=""/>
        <parameter key="unparsable_value_handling" value="fail"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="9.4.001" expanded="true" height="82" name="Select Attributes" width="90" x="648" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="precio_d|precio_n|nombre"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <connect from_op="Get Page" from_port="output" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Parse Numbers" to_port="example set input"/>
      <connect from_op="Parse Numbers" from_port="example set output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>