/* Primary navigation: remove the nav's padding and lay out every list inside
   it as a horizontal, start-aligned flex row with a 30px gap between items. */
nav[aria-label="Primary Navigation"] {
  padding: 0;

  & ul {
    list-style: none;
    width: 100%;
    display: flex;
    flex-direction: row;
    justify-content: start;
    align-items: start;
    gap: 30px;
    padding: 0;

    /* List items inside the nav carry no margin. */
    & li {
      margin: 0;
    }

    /* Items of lists nested inside another list: no bullet markers. */
    & ul li {
      list-style: none;
    }
  }
}

Siemens Community Catalyst Program

The Siemens Community Catalyst program was co-created with our community to acknowledge technology leaders who consistently contribute to the Siemens Community. Nominations are accepted on a rolling basis.

Nominate Now

Extract multiple paragraphs from xhtml

paavopdf

Hi there, I'm currently trying to extract multiple paragraphs from XHTML. I am using the "Generate Extract" operator with XPath as the query type for this.

The xPath query should look like:

//xhtml:p/text()

and the namespace like:

http://www.w3.org/1999/xhtml

But I receive only a small fragment of the first paragraph (it seems to be only the first word, up to the first space).

Here is my process:

<?xml version="1.0" encoding="UTF-8"?><process version="7.3.001">
  <!-- RapidMiner process: Retrieve, then Generate ID, then Generate Extract (XPath),
       then Select Attributes; the final example set is delivered on port "result 1". -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.3.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Input: the repository entry named below; the Generate Extract operator
           further down reads its "html" attribute. -->
      <operator activated="true" class="retrieve" compatibility="7.3.001" expanded="true" height="68" name="Retrieve Fussball-pages_sample" width="90" x="45" y="34">
        <parameter key="repository_entry" value="//Local Repository/SpiegelMining/Fussball-pages_sample"/>
      </operator>
      <!-- Generate ID with non-nominal ids and offset 0. -->
      <operator activated="true" class="generate_id" compatibility="7.3.001" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34">
        <parameter key="create_nominal_ids" value="false"/>
        <parameter key="offset" value="0"/>
      </operator>
      <!-- Generate Extract on attribute "html" with query_type "XPath".
           Several query lists are stored in this operator (regular expression, regular
           region, XPath); presumably only the list matching query_type is evaluated.
           TODO confirm against the operator documentation. -->
      <operator activated="true" class="text:generate_extract" compatibility="7.3.000" expanded="true" height="68" name="Generate Extract" width="90" x="313" y="34">
        <parameter key="source_attribute" value="html"/>
        <parameter key="query_type" value="XPath"/>
        <list key="string_machting_queries"/>
        <parameter key="attribute_type" value="Nominal"/>
        <list key="regular_expression_queries">
          <parameter key="title" value=".*(title).*"/>
        </list>
        <list key="regular_region_queries">
          <parameter key="title" value="&lt;title&gt;.&lt;/title&gt;"/>
          <parameter key="publish_time" value="&lt;time class=&quot;timeformat&quot; itemprop=&quot;datePublished&quot;.&gt;"/>
          <parameter key="paragraphs" value="&lt;p&gt;.&lt;/p&gt;"/>
        </list>
        <!-- The only XPath query: new attribute "paragraphs" from the text-node children
             of every p element in the namespace bound to prefix "xhtml" below.
             NOTE(review): text() can match many nodes; how the operator reduces several
             matched nodes to one Nominal value is not visible here and should be confirmed. -->
        <list key="xpath_queries">
          <parameter key="paragraphs" value="//xhtml:p/text()"/>
        </list>
        <!-- Prefix to namespace URI bindings used by the XPath queries. -->
        <list key="namespaces">
          <parameter key="xhtml" value="http://www.w3.org/1999/xhtml"/>
        </list>
        <parameter key="ignore_CDATA" value="false"/>
        <parameter key="assume_html" value="true"/>
        <list key="index_queries"/>
        <list key="jsonpath_queries"/>
        <parameter key="value_seperator" value="; "/>
      </operator>
      <!-- Keeps the subset paragraphs|title|time|publish_time. With query_type "XPath" only
           "paragraphs" has an XPath query in this process; the other three names presumably
           have to exist in the retrieved data already. TODO confirm. -->
      <operator activated="true" class="select_attributes" compatibility="7.3.001" expanded="true" height="82" name="Select Attributes" width="90" x="447" y="34">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="paragraphs|title|time|publish_time"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- Wiring: a single linear chain ending at the process output "result 1". -->
      <connect from_op="Retrieve Fussball-pages_sample" from_port="output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Generate Extract" to_port="Example Set"/>
      <connect from_op="Generate Extract" from_port="Example Set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Any help is welcome

Find more posts tagged with

AI Studio

Accepted answers

Edin_Klapic

Hi,

The XML you provided does not have a simple structure. If you check it in a text editor, you will see that some of the information is not within a <p> element but within a <li> element.

I used Read XML to hopefully achieve the result you wanted.

My solution extracts the article-body, the title and the timestamp when the article was published.

Check it out and let me know if it helps

Best regards,

Edin

Btw: The Text Processing and Web Mining extensions are required.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.1.001">
  <!-- RapidMiner process with two outputs:
       "result 1" carries the value renamed to "Timestamp" (Filter Examples, Replace, Rename);
       "result 2" carries the appended "text" attributes produced inside Loop Collection. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Read XML from a hard-coded local Windows path (adjust before running elsewhere).
           Three XPath expressions define the attributes: the h2 with class
           "article-title lp-article-title", the div with itemprop "articleBody" and the
           span with class "article-function-date". Prefix "xhtml" is bound in the
           namespaces list; use_default_namespace is false. -->
      <operator activated="true" class="advanced_file_connectors:read_xml" compatibility="7.1.001" expanded="true" height="68" name="Read XML" width="90" x="45" y="34">
        <parameter key="file" value="C:\Users\EdinKlapic\Downloads\blatter.xml"/>
        <parameter key="xpath_for_examples" value="*"/>
        <enumeration key="xpaths_for_attributes">
          <parameter key="xpath_for_attribute" value="//xhtml:h2[@class='article-title lp-article-title']"/>
          <parameter key="xpath_for_attribute" value="//xhtml:div[@itemprop='articleBody']"/>
          <parameter key="xpath_for_attribute" value="//xhtml:span[@class='article-function-date']"/>
        </enumeration>
        <list key="namespaces">
          <parameter key="xhtml" value="http://www.w3.org/1999/xhtml"/>
        </list>
        <parameter key="use_default_namespace" value="false"/>
        <list key="annotations"/>
        <list key="data_set_meta_data_information"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="7.1.001" expanded="true" height="82" name="Transpose" width="90" x="179" y="34"/>
      <!-- Single-attribute filter on "id" with invert_selection true (special attributes
           included): everything except "id" is kept. -->
      <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="313" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="id"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Splits the examples on whether att_1 starts with the opening tag of the
           article-function-date span. Per the connections below, matches go to Replace
           and the unmatched examples go to Nominal to Text. -->
      <operator activated="true" class="filter_examples" compatibility="7.1.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="136">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="att_1.starts_with.&lt;span class=&quot;article-function-date&quot;"/>
        </list>
      </operator>
      <!-- Replaces the whole value by capture group 1, the text between the quotes that
           follow datetime= ; (?s) lets "." also match line breaks.
           NOTE(review): both ".*" parts are greedy, so if further quoted attributes follow
           datetime the capture may run past its closing quote. Confirm against real data. -->
      <operator activated="true" class="replace" compatibility="7.1.001" expanded="true" height="82" name="Replace" width="90" x="179" y="136">
        <parameter key="replace_what" value="(?s).*datetime=&quot;(.*)&quot;\s.*"/>
        <parameter key="replace_by" value="$1"/>
      </operator>
      <operator activated="true" class="rename" compatibility="7.1.001" expanded="true" height="82" name="Rename" width="90" x="313" y="136">
        <parameter key="old_name" value="att_1"/>
        <parameter key="new_name" value="Timestamp"/>
        <list key="rename_additional_attributes"/>
      </operator>
      <!-- Text branch (unmatched examples): att_1 is converted to text and the examples are
           turned into documents for the loop below. -->
      <operator activated="true" class="nominal_to_text" compatibility="7.1.001" expanded="true" height="82" name="Nominal to Text" width="90" x="45" y="289">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="att_1"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="7.1.001" expanded="true" height="68" name="Data to Documents (2)" width="90" x="179" y="289">
        <list key="specify_weights"/>
      </operator>
      <!-- Per document: Extract Content (web extension), back to an example set with the
           text stored in attribute "text", then keep only that attribute. -->
      <operator activated="true" class="loop_collection" compatibility="7.1.001" expanded="true" height="82" name="Loop Collection" width="90" x="313" y="289">
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.1.001" expanded="true" height="68" name="Extract Content (3)" width="90" x="45" y="34"/>
          <operator activated="true" class="text:documents_to_data" compatibility="7.1.001" expanded="true" height="82" name="Documents to Data" width="90" x="179" y="34">
            <parameter key="text_attribute" value="text"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="text"/>
          </operator>
          <connect from_port="single" to_op="Extract Content (3)" to_port="document"/>
          <connect from_op="Extract Content (3)" from_port="document" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Merges the loop output into one example set, delivered on "result 2". -->
      <operator activated="true" class="append" compatibility="7.1.001" expanded="true" height="82" name="Append" width="90" x="447" y="289"/>
      <!-- Wiring: shared head (Read XML, Transpose, Select Attributes (2), Filter Examples),
           then the timestamp branch to "result 1" and the text branch to "result 2". -->
      <connect from_op="Read XML" from_port="output" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Transpose" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Replace" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="unmatched example set" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Replace" from_port="example set output" to_op="Rename" to_port="example set input"/>
      <connect from_op="Rename" from_port="example set output" to_port="result 1"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents (2)" to_port="example set"/>
      <connect from_op="Data to Documents (2)" from_port="documents" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="105"/>
      <portSpacing port="sink_result 2" spacing="84"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Read Blatter XML.rmp

All comments

MartinLiebig

Hi paavo,

any chance you could provide an example XML?

~Martin

paavopdf

See blatter_artikel.xml.

see attached file (20k characters are not enough)

blatter_artikel.xml

Edin_Klapic

Hi,

The XML you provided does not have a simple structure. If you check it in a text editor, you will see that some of the information is not within a <p> element but within a <li> element.

I used Read XML to hopefully achieve the result you wanted.

My solution extracts the article-body, the title and the timestamp when the article was published.

Check it out and let me know if it helps

Best regards,

Edin

Btw: The Text Processing and Web Mining extensions are required.

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<process version="7.1.001">
  <!-- RapidMiner process with two outputs:
       "result 1" carries the value renamed to "Timestamp" (Filter Examples, Replace, Rename);
       "result 2" carries the appended "text" attributes produced inside Loop Collection. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.1.001" expanded="true" name="Process">
    <process expanded="true">
      <!-- Read XML from a hard-coded local Windows path (adjust before running elsewhere).
           Three XPath expressions define the attributes: the h2 with class
           "article-title lp-article-title", the div with itemprop "articleBody" and the
           span with class "article-function-date". Prefix "xhtml" is bound in the
           namespaces list; use_default_namespace is false. -->
      <operator activated="true" class="advanced_file_connectors:read_xml" compatibility="7.1.001" expanded="true" height="68" name="Read XML" width="90" x="45" y="34">
        <parameter key="file" value="C:\Users\EdinKlapic\Downloads\blatter.xml"/>
        <parameter key="xpath_for_examples" value="*"/>
        <enumeration key="xpaths_for_attributes">
          <parameter key="xpath_for_attribute" value="//xhtml:h2[@class='article-title lp-article-title']"/>
          <parameter key="xpath_for_attribute" value="//xhtml:div[@itemprop='articleBody']"/>
          <parameter key="xpath_for_attribute" value="//xhtml:span[@class='article-function-date']"/>
        </enumeration>
        <list key="namespaces">
          <parameter key="xhtml" value="http://www.w3.org/1999/xhtml"/>
        </list>
        <parameter key="use_default_namespace" value="false"/>
        <list key="annotations"/>
        <list key="data_set_meta_data_information"/>
      </operator>
      <operator activated="true" class="transpose" compatibility="7.1.001" expanded="true" height="82" name="Transpose" width="90" x="179" y="34"/>
      <!-- Single-attribute filter on "id" with invert_selection true (special attributes
           included): everything except "id" is kept. -->
      <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes (2)" width="90" x="313" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="id"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <!-- Splits the examples on whether att_1 starts with the opening tag of the
           article-function-date span. Per the connections below, matches go to Replace
           and the unmatched examples go to Nominal to Text. -->
      <operator activated="true" class="filter_examples" compatibility="7.1.001" expanded="true" height="103" name="Filter Examples" width="90" x="45" y="136">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="att_1.starts_with.&lt;span class=&quot;article-function-date&quot;"/>
        </list>
      </operator>
      <!-- Replaces the whole value by capture group 1, the text between the quotes that
           follow datetime= ; (?s) lets "." also match line breaks.
           NOTE(review): both ".*" parts are greedy, so if further quoted attributes follow
           datetime the capture may run past its closing quote. Confirm against real data. -->
      <operator activated="true" class="replace" compatibility="7.1.001" expanded="true" height="82" name="Replace" width="90" x="179" y="136">
        <parameter key="replace_what" value="(?s).*datetime=&quot;(.*)&quot;\s.*"/>
        <parameter key="replace_by" value="$1"/>
      </operator>
      <operator activated="true" class="rename" compatibility="7.1.001" expanded="true" height="82" name="Rename" width="90" x="313" y="136">
        <parameter key="old_name" value="att_1"/>
        <parameter key="new_name" value="Timestamp"/>
        <list key="rename_additional_attributes"/>
      </operator>
      <!-- Text branch (unmatched examples): att_1 is converted to text and the examples are
           turned into documents for the loop below. -->
      <operator activated="true" class="nominal_to_text" compatibility="7.1.001" expanded="true" height="82" name="Nominal to Text" width="90" x="45" y="289">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="att_1"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="7.1.001" expanded="true" height="68" name="Data to Documents (2)" width="90" x="179" y="289">
        <list key="specify_weights"/>
      </operator>
      <!-- Per document: Extract Content (web extension), back to an example set with the
           text stored in attribute "text", then keep only that attribute. -->
      <operator activated="true" class="loop_collection" compatibility="7.1.001" expanded="true" height="82" name="Loop Collection" width="90" x="313" y="289">
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.1.001" expanded="true" height="68" name="Extract Content (3)" width="90" x="45" y="34"/>
          <operator activated="true" class="text:documents_to_data" compatibility="7.1.001" expanded="true" height="82" name="Documents to Data" width="90" x="179" y="34">
            <parameter key="text_attribute" value="text"/>
          </operator>
          <operator activated="true" class="select_attributes" compatibility="7.1.001" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="34">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="text"/>
          </operator>
          <connect from_port="single" to_op="Extract Content (3)" to_port="document"/>
          <connect from_op="Extract Content (3)" from_port="document" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
          <connect from_op="Select Attributes" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_single" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>
      <!-- Merges the loop output into one example set, delivered on "result 2". -->
      <operator activated="true" class="append" compatibility="7.1.001" expanded="true" height="82" name="Append" width="90" x="447" y="289"/>
      <!-- Wiring: shared head (Read XML, Transpose, Select Attributes (2), Filter Examples),
           then the timestamp branch to "result 1" and the text branch to "result 2". -->
      <connect from_op="Read XML" from_port="output" to_op="Transpose" to_port="example set input"/>
      <connect from_op="Transpose" from_port="example set output" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Replace" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="unmatched example set" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Replace" from_port="example set output" to_op="Rename" to_port="example set input"/>
      <connect from_op="Rename" from_port="example set output" to_port="result 1"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents (2)" to_port="example set"/>
      <connect from_op="Data to Documents (2)" from_port="documents" to_op="Loop Collection" to_port="collection"/>
      <connect from_op="Loop Collection" from_port="output 1" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="105"/>
      <portSpacing port="sink_result 2" spacing="84"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>

Read Blatter XML.rmp

paavopdf

Thank you very much, this should really help create my solution.