🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Problem with Xpath query? Processing documents from web

User: "pix123"
New Altair Community Member
Updated by Jocelyn
Hi there,

I am trying to extract documents from a movie review site. When I run the process below I get 0 results but can't figure out the problem, can anyone help? Thanks.

<?xml version="1.0" encoding="UTF-8"?><process version="9.0.003"><br>&nbsp; <context><br>&nbsp;&nbsp;&nbsp; <input/><br>&nbsp;&nbsp;&nbsp; <output/><br>&nbsp;&nbsp;&nbsp; <macros/><br>&nbsp; </context><br>&nbsp; <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process"><br>&nbsp;&nbsp;&nbsp; <process expanded="true"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <operator activated="true" class="concurrency:loop" compatibility="9.0.003" expanded="true" height="82" name="Loop" width="90" x="313" y="238"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <parameter key="number_of_iterations" value="10"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <process expanded="true"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <operator activated="true" class="web:process_web_modern" compatibility="9.0.000" expanded="true" height="68" name="Process Documents from Web" width="90" x="179" y="85"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <parameter key="url" value="https://www.rottentomatoes.com/m/chef_2014/reviews/"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="crawling_rules"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <process expanded="true"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <operator activated="true" class="text:cut_document" compatibility="8.1.000" expanded="true" height="68" name="Cut Document" width="90" x="246" y="34"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <parameter key="query_type" value="XPath"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="string_machting_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="regular_expression_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="regular_region_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="xpath_queries"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <parameter key="seg" value="//h:table[@class='table table-striped']/h:tr"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </list><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="namespaces"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="index_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="jsonpath_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <process expanded="true"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <connect from_port="segment" to_port="document 1"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="source_segment" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="sink_document 1" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="sink_document 2" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </process><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </operator><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <operator activated="true" class="text:extract_information" compatibility="8.1.000" expanded="true" height="68" name="Extract Information" width="90" x="447" y="34"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <parameter key="query_type" value="XPath"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="string_machting_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="regular_expression_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="regular_region_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="xpath_queries"><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <parameter key="text" value="//h:p/text|)"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </list><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="namespaces"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="index_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <list key="jsonpath_queries"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </operator><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <connect from_port="document" to_op="Cut Document" to_port="document"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <connect from_op="Cut Document" from_port="documents" to_port="document 1"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="source_document" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="sink_document 1" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="sink_document 2" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </process><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </operator><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <connect from_op="Process Documents from Web" from_port="example set" to_port="output 1"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="source_input 1" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="sink_output 1" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="sink_output 2" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </process><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; </operator><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <connect from_op="Loop" from_port="output 1" to_port="result 1"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="source_input 1" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="sink_result 1" spacing="0"/><br>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; <portSpacing port="sink_result 2" spacing="0"/><br>&nbsp;&nbsp;&nbsp; </process><br>&nbsp; </operator><br></process>





Find more posts tagged with

Sort by:
1 - 1 of 11
    User: "kayman"
    New Altair Community Member
    Accepted Answer
    Bit hard to explain, but what you do is as follows :

    You select the reviews with a loop logic, the translation of the xpath used is a bit like 'give me the text of every review that has a class called 'the_review:neutral

    But then you take the xpath for the first match of each attribute, but this doesn't give the right result as every review has this data on a different location, relative to the actual review, so you do not map these together. With the current structure you loose all relation between the data, and what you need is more like 

    For every div containing a review, get me the parameters (reviewer, date etc) that are part of this div.

    (told you it was hard to explain :wink:)

    Long story short, I'm not sure you can get this with the typical xpath extractor, but you can use xpath directly with the xslt operators.

    I've attached an example, it's a bit more complex but still relatively easy to adapt.

    The logic is to create a proper xml from the htm first (the code is not xhtml) and then use dedicated xpath, this returns a nice table with your data ready to use

    <?xml version="1.0" encoding="UTF-8"?><process version="9.0.003">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
        <process expanded="true">
          <operator activated="true" class="concurrency:loop" compatibility="9.0.003" expanded="true" height="82" name="Loop" width="90" x="179" y="34">
            <parameter key="number_of_iterations" value="10"/>
            <parameter key="enable_parallel_execution" value="false"/>
            <process expanded="true">
              <operator activated="true" class="web:get_webpage" compatibility="9.0.000" expanded="true" height="68" name="Get Page" width="90" x="45" y="289">
                <parameter key="url" value="https://www.rottentomatoes.com/m/chef_2014/reviews/?page=%{iteration}"/>
                <list key="query_parameters"/>
                <list key="request_properties"/>
              </operator>
              <operator activated="true" class="text:html_to_xml" compatibility="8.1.000" expanded="true" height="68" name="HTML to XML" width="90" x="179" y="289"/>
              <operator activated="true" class="text:replace_tokens" compatibility="8.1.000" expanded="true" height="68" name="Replace Tokens" width="90" x="313" y="289">
                <list key="replace_dictionary">
                  <parameter key="(?s)^.*?&lt;html.*?&gt;" value="&lt;html&gt;"/>
                </list>
              </operator>
              <operator activated="true" class="text:create_document" compatibility="8.1.000" expanded="true" height="68" name="Create Document" width="90" x="179" y="646">
                <parameter key="text" value="&lt;?xml version=&quot;1.0&quot; encoding=&quot;UTF-8&quot;?&gt;&#10;&lt;xsl:stylesheet version=&quot;1.0&quot; xmlns:xsl=&quot;http://www.w3.org/1999/XSL/Transform&quot;&gt;&#10;&#9;&lt;xsl:output method=&quot;xml&quot; version=&quot;1.0&quot; encoding=&quot;UTF-8&quot; indent=&quot;yes&quot;/&gt;&#10;&#9;&lt;xsl:template match=&quot;/&quot;&gt;&#10;&#9;&#9;&lt;root&gt;&#10;&#9;&#9;&#9;&lt;xsl:for-each select=&quot;//div[@class='row review_table_row']&quot;&gt;&#10;&#9;&#9;&#9;&#9;&lt;!--&lt;xsl:copy-of select=&quot;.&quot;/&gt;--&gt;&#10;&#9;&#9;&#9;&#9;&lt;row &#10;&#9;&#9;&#9;&#9;critic=&quot;{normalize-space(.//div[contains(@class,'critic_name')]/a[1])}&quot; &#10;&#9;&#9;&#9;&#9;publisher=&quot;{normalize-space(.//div[contains(@class,'critic_name')]/a[2]/em)}&quot; &#10;&#9;&#9;&#9;&#9;date=&quot;{normalize-space(.//div[contains(@class,'review_date')])}&quot; &#10;&#9;&#9;&#9;&#9;review=&quot;{normalize-space(.//div[@class='the_review'])}&quot; &#10;&#9;&#9;&#9;&#9;score=&quot;{normalize-space(.//div[@class='small subtle'][contains(.,'Original Score')])}&quot;/&gt;&#10;&#9;&#9;&#9;&lt;/xsl:for-each&gt;&#10;&#9;&#9;&lt;/root&gt;&#10;&#9;&lt;/xsl:template&gt;&#10;&lt;/xsl:stylesheet&gt;"/>
              </operator>
              <operator activated="true" class="text:process_xslt" compatibility="8.1.000" expanded="true" height="82" name="Process XSLT" width="90" x="313" y="544"/>
              <operator activated="true" class="text:cut_document" compatibility="8.1.000" expanded="true" height="68" name="Cut Document" width="90" x="447" y="544">
                <parameter key="query_type" value="Regular Region"/>
                <list key="string_machting_queries">
                  <parameter key="review" value="&lt;row./&gt;"/>
                </list>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries">
                  <parameter key="review" value="&lt;row./&gt;"/>
                </list>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <list key="index_queries"/>
                <list key="jsonpath_queries"/>
                <process expanded="true">
                  <operator activated="true" class="text:extract_information" compatibility="8.1.000" expanded="true" height="68" name="Extract Information" width="90" x="246" y="34">
                    <parameter key="query_type" value="XPath"/>
                    <list key="string_machting_queries"/>
                    <list key="regular_expression_queries"/>
                    <list key="regular_region_queries"/>
                    <list key="xpath_queries">
                      <parameter key="Critic Name" value="//@critic"/&gt;
                      <parameter key="Reviews" value="//@review"/&gt;
                      <parameter key="Date Posted" value="//@date"/&gt;
                      <parameter key="Publisher" value="//@publisher"/&gt;
                      <parameter key="Score" value="//@score"/&gt;
                    </list>
                    <list key="namespaces"/>
                    <parameter key="ignore_CDATA" value="false"/>
                    <parameter key="assume_html" value="false"/>
                    <list key="index_queries"/>
                    <list key="jsonpath_queries"/>
                  </operator>
                  <connect from_port="segment" to_op="Extract Information" to_port="document"/>
                  <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_segment" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <operator activated="true" class="text:documents_to_data" compatibility="8.1.000" expanded="true" height="82" name="Documents to Data" width="90" x="581" y="544">
                <parameter key="text_attribute" value="content"/>
              </operator>
              <operator activated="true" class="select_attributes" compatibility="9.0.003" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="391">
                <parameter key="attribute_filter_type" value="subset"/>
                <parameter key="attributes" value="content|query_key"/>
                <parameter key="invert_selection" value="true"/>
              </operator>
              <connect from_op="Get Page" from_port="output" to_op="HTML to XML" to_port="document"/>
              <connect from_op="HTML to XML" from_port="document" to_op="Replace Tokens" to_port="document"/>
              <connect from_op="Replace Tokens" from_port="document" to_op="Process XSLT" to_port="document"/>
              <connect from_op="Create Document" from_port="output" to_op="Process XSLT" to_port="xslt document"/>
              <connect from_op="Process XSLT" from_port="document" to_op="Cut Document" to_port="document"/>
              <connect from_op="Cut Document" from_port="documents" to_op="Documents to Data" to_port="documents 1"/>
              <connect from_op="Documents to Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
              <connect from_op="Select Attributes" from_port="example set output" to_port="output 1"/>
              <portSpacing port="source_input 1" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="store" compatibility="9.0.003" expanded="true" height="68" name="Store" width="90" x="313" y="34">
            <parameter key="repository_entry" value="New Output of Web Pages/RT Reviews"/>
          </operator>
          <connect from_op="Loop" from_port="output 1" to_op="Store" to_port="input"/>
          <connect from_op="Store" from_port="through" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>