web

Unknown
edited November 5 in Community Q&A
Hello all,

I am new to data/web/text mining and to RapidMiner, and I have a question about the following problem:

I have text in an Excel sheet with many rows and some columns. For one or two specific columns, and for each row, I want to run a query on a website and retrieve the results (links) for each query, then follow the links and retrieve the text from those websites. At the end, I want to compare the text retrieved from each website with a text file. The comparison should ignore words like "is", "and", "for", etc. I need to know which words are the same or similar, e.g. house, domicile, indoor, etc.

Is this possible with RapidMiner or at all?

Thanks in advance









Answers

  • Anyone?

    I don't understand how "Enrich Data By Webservice" works.

    What am I supposed to enter in "query type" and in the two "Edit List" fields?

    I cannot find any examples.
  • TobiasMalbrecht
    TobiasMalbrecht New Altair Community Member
    Hi,

    I attached a simple example below that shows how to query Wikipedia for well-known novels and retrieve the first paragraph as a description. Hope that helps.

    Best,
    Tobias

    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!--
      Example RapidMiner process: builds a small example set of Wikipedia page
      titles (attribute QUERY), requests one page per example with the
      "Enrich Data by Webservice" operator (named MashUp), extracts the first
      paragraph of the article body into the attribute EXPLANATION via XPath,
      and finally cleans the extracted text with a chain of Replace operators.
    -->
    <process version="5.3.005">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="5.3.005" expanded="true" name="Root">
        <process expanded="true">
          <!-- Three generators, each producing one QUERY value (a Wikipedia page title). -->
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="30">
            <list key="attribute_values">
              <parameter key="QUERY" value="&quot;Moby_Dick&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="45" y="120">
            <list key="attribute_values">
              <parameter key="QUERY" value="&quot;The_Adventures_of_Tom_Sawyer&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <operator activated="true" class="generate_data_user_specification" compatibility="5.3.005" expanded="true" height="60" name="Generate Data by User Specification (3)" width="90" x="45" y="210">
            <list key="attribute_values">
              <parameter key="QUERY" value="&quot;Treasure_Island&quot;"/>
            </list>
            <list key="set_additional_roles"/>
          </operator>
          <!-- Merge the three generated example sets into one that is fed to MashUp. -->
          <operator activated="true" class="append" compatibility="5.3.005" expanded="true" height="112" name="Append" width="90" x="179" y="30"/>
          <!--
            Web request step. query_type is XPath, so only the xpath_queries list
            is filled; the other query lists stay empty. The single XPath query
            stores the first paragraph below the element with id 'mw-content-text'
            in a new attribute named EXPLANATION.
          -->
          <operator activated="true" class="web:enrich_data_by_webservice" compatibility="5.3.001" expanded="true" height="60" name="MashUp" width="90" x="313" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="EXPLANATION" value="//h:div[@id='mw-content-text']/h:p[1]"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <!--
              The parsed URL must contain the attribute reference <%QUERY%>, which
              the operator replaces with each example's QUERY value. It is therefore
              escaped exactly once here; a double-escaped form would leave the
              literal entity text in the URL and nothing would be substituted.
            -->
            <parameter key="url" value="http://en.wikipedia.org/wiki/&lt;%QUERY%&gt;"/>
            <!-- Pause between requests; presumably milliseconds (confirm in the operator help). -->
            <parameter key="delay" value="1000"/>
            <list key="request_properties"/>
          </operator>
          <!-- Text clean-up of the EXPLANATION attribute, applied in the order wired below. -->
          <operator activated="true" class="subprocess" compatibility="5.3.005" expanded="true" height="76" name="CleanUp" width="90" x="447" y="30">
            <process expanded="true">
              <!-- Matches markup tags and line breaks; no replace_by is set, so matches are presumably removed. -->
              <operator activated="true" class="replace" compatibility="5.3.005" expanded="true" height="76" name="Replace" width="90" x="45" y="30">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="EXPLANATION"/>
                <parameter key="replace_what" value="&lt;.*?&gt;|\n"/>
              </operator>
              <!-- Collapses runs of two or more whitespace characters into one space. -->
              <operator activated="true" class="replace" compatibility="5.3.005" expanded="true" height="76" name="Replace (2)" width="90" x="179" y="30">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="EXPLANATION"/>
                <parameter key="replace_what" value="\s{2,}"/>
                <parameter key="replace_by" value=" "/>
              </operator>
              <!-- Drops whitespace in front of , ; . and closing brackets, keeping the character itself ($1). -->
              <operator activated="true" class="replace" compatibility="5.3.005" expanded="true" height="76" name="Replace (3)" width="90" x="313" y="30">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="EXPLANATION"/>
                <parameter key="replace_what" value="\s+([\,;\.\)\]\}])"/>
                <parameter key="replace_by" value="$1"/>
              </operator>
              <!-- Drops whitespace directly after an opening bracket, keeping the bracket itself ($1). -->
              <operator activated="true" class="replace" compatibility="5.3.005" expanded="true" height="76" name="Replace (4)" width="90" x="447" y="30">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="EXPLANATION"/>
                <parameter key="replace_what" value="([\(\[\{])\s+"/>
                <parameter key="replace_by" value="$1"/>
              </operator>
              <!-- Replaces bracketed numbers such as [1], with any surrounding whitespace, by a single space. -->
              <operator activated="true" class="replace" compatibility="5.3.005" expanded="true" height="76" name="Replace (5)" width="90" x="581" y="30">
                <parameter key="attribute_filter_type" value="single"/>
                <parameter key="attribute" value="EXPLANATION"/>
                <parameter key="replace_what" value="\s*\[[0-9]+\]\s*"/>
                <parameter key="replace_by" value=" "/>
              </operator>
              <!-- Puts the QUERY attribute before EXPLANATION in the result. -->
              <operator activated="true" class="order_attributes" compatibility="5.3.005" expanded="true" height="76" name="Reorder Attributes" width="90" x="715" y="30">
                <parameter key="attribute_ordering" value="QUERY|EXPLANATION"/>
              </operator>
              <connect from_port="in 1" to_op="Replace" to_port="example set input"/>
              <connect from_op="Replace" from_port="example set output" to_op="Replace (2)" to_port="example set input"/>
              <connect from_op="Replace (2)" from_port="example set output" to_op="Replace (3)" to_port="example set input"/>
              <connect from_op="Replace (3)" from_port="example set output" to_op="Replace (4)" to_port="example set input"/>
              <connect from_op="Replace (4)" from_port="example set output" to_op="Replace (5)" to_port="example set input"/>
              <connect from_op="Replace (5)" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
              <connect from_op="Reorder Attributes" from_port="example set output" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>
          <!-- Top-level wiring: generators -> Append -> MashUp -> CleanUp -> process result. -->
          <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
          <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
          <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
          <connect from_op="Append" from_port="merged set" to_op="MashUp" to_port="Example Set"/>
          <connect from_op="MashUp" from_port="ExampleSet" to_op="CleanUp" to_port="in 1"/>
          <connect from_op="CleanUp" from_port="out 1" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
  • Hi,

    Thanks for your quick response, but I'm more confused than before.
    How can I use this XML code in RapidMiner?
    I want to do e.g. something like this:

    1. Go to http://www.yellowpages.com.au/
    2. Enter something in one of the form fields, e.g. Jonathan
    3. Click on Search
    4. From the results, click on one of them
    5. And then get e.g. the address and the phone number
    6. Put address and phone number in a file

    Can RapidMiner do this and how?

    I suppose I have to use the "Enrich Data by Webservice" operator, but what kind of query do I need in my case?
    I don't want to enter the queries manually in the list; I want to read them from an existing list.
    What's the meaning of the input in this operator?

    Regards