🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Read HTML Table - Extension Operator

User: "btibert"
New Altair Community Member
Updated by Jocelyn
I was expecting the following URL to parse properly:

https://www.hockey-reference.com/leagues/NHL_2019_skaters.html

However, the operator did not find any tables on the page.  The tutorial process does properly parse tables from wikipedia, but fails on the page above.
That said, this is my go-to reference for my students as the tables are easily parsed in R and Python.  For example:

import pandas as pd
tables = pd.read_html("https://www.hockey-reference.com/leagues/NHL_2019_skaters.html")
skaters = tables[0]
skaters.head().

Yes, there has to be some cleanup on the columns and data types, but that is part of the exercise and why I like using this reference.  I figured it would be even more powerful as a training exercise in RM given the amount of data prep that is necessary.

Any helps or tips on how to configure this operator would be much appreciated!

Find more posts tagged with

Sort by:
1 - 1 of 11
    User: "Marco_Barradas"
    Altair Employee
    Accepted Answer
    Updated by Marco_Barradas
    @btibert I found another solution just change the path of the HTML file that is created
    @ey it would be great if we could connect the file connector of the Write Document operator to the Read HTML Table.
     
    Sorry for not commenting, but I'm on a rush but I wanted to help.

    <?xml version="1.0" encoding="UTF-8"?><process version="9.3.001">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="web:get_webpage" compatibility="9.0.000" expanded="true" height="68" name="Get Page" width="90" x="45" y="187">
            <parameter key="url" value="https://www.hockey-reference.com/leagues/NHL_2019_skaters.html"/>
            <parameter key="random_user_agent" value="true"/>
            <parameter key="connection_timeout" value="10000"/>
            <parameter key="read_timeout" value="10000"/>
            <parameter key="follow_redirects" value="true"/>
            <parameter key="accept_cookies" value="all"/>
            <parameter key="cookie_scope" value="thread"/>
            <parameter key="request_method" value="GET"/>
            <list key="query_parameters"/>
            <list key="request_properties"/>
            <parameter key="override_encoding" value="false"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="9.3.001" expanded="true" height="124" name="Multiply" width="90" x="179" y="136"/>
          <operator activated="true" class="text:cut_document" compatibility="8.2.000" expanded="true" height="68" name="Table Headers" width="90" x="179" y="340">
            <parameter key="query_type" value="Regular Region"/>
            <list key="string_machting_queries"/>
            <parameter key="attribute_type" value="Nominal"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries">
              <parameter key="Header" value="&lt;thead&gt;.&lt;/thead&gt;"/>
            </list>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="true"/>
            <parameter key="assume_html" value="true"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
            <process expanded="true">
              <operator activated="true" breakpoints="before" class="text:cut_document" compatibility="8.2.000" expanded="true" height="68" name="Cut Document (2)" width="90" x="380" y="34">
                <parameter key="query_type" value="String Matching"/>
                <list key="string_machting_queries">
                  <parameter key="header" value="&lt;tr&gt;.&lt;/tr&gt;"/>
                </list>
                <parameter key="attribute_type" value="Nominal"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <parameter key="ignore_CDATA" value="true"/>
                <parameter key="assume_html" value="true"/>
                <list key="index_queries"/>
                <list key="jsonpath_queries"/>
                <process expanded="true">
                  <connect from_port="segment" to_port="document 1"/>
                  <portSpacing port="source_segment" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="segment" to_op="Cut Document (2)" to_port="document"/>
              <connect from_op="Cut Document (2)" from_port="documents" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:cut_document" compatibility="8.2.000" expanded="true" height="68" name="Rows of the table" width="90" x="313" y="238">
            <parameter key="query_type" value="Regular Region"/>
            <list key="string_machting_queries"/>
            <parameter key="attribute_type" value="Nominal"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries">
              <parameter key="Row" value="&lt;tr &gt;.&lt;/tr&gt;"/>
            </list>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <parameter key="ignore_CDATA" value="true"/>
            <parameter key="assume_html" value="true"/>
            <list key="index_queries"/>
            <list key="jsonpath_queries"/>
            <process expanded="true">
              <operator activated="true" class="text:extract_information" compatibility="8.2.000" expanded="true" height="68" name="Get_Data (2)" width="90" x="380" y="34">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries"/>
                <parameter key="attribute_type" value="Nominal"/>
                <list key="regular_expression_queries">
                  <parameter key="Rank" value="&lt;.*data-stat=&quot;ranker&quot;.*&gt;(\d+)+&lt;/th&gt;"/>
                  <parameter key="Player" value="&lt;.*data-stat=&quot;player&quot;.*&gt;([\w\s]+)&lt;/a&gt;&lt;/td&gt;&lt;td.*data-stat=&quot;age&quot; &gt;"/>
                  <parameter key="Age" value="&lt;td.*data-stat=&quot;age&quot;.*&gt;(\d+)&lt;/td&gt;&lt;td.*&quot;team_id&quot;.*&gt;"/>
                </list>
                <list key="regular_region_queries">
                  <parameter key="Rank" value="&lt;\.*data-stat=&quot;ranker&quot;\.*&gt;.&lt;/th&gt;"/>
                  <parameter key="Player" value="&lt;\.*data-stat=&quot;player&quot;\.*&gt;.&lt;/th&gt;"/>
                </list>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <parameter key="ignore_CDATA" value="true"/>
                <parameter key="assume_html" value="true"/>
                <list key="index_queries"/>
                <list key="jsonpath_queries"/>
              </operator>
              <connect from_port="segment" to_op="Get_Data (2)" to_port="document"/>
              <connect from_op="Get_Data (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:process_documents" compatibility="8.2.000" expanded="true" height="103" name="With RegEx" width="90" x="313" y="34">
            <parameter key="create_word_vector" value="false"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="false"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <process expanded="true">
              <operator activated="true" class="text:cut_document" compatibility="8.2.000" expanded="true" height="68" name="Cut Rows" width="90" x="313" y="34">
                <parameter key="query_type" value="Regular Region"/>
                <list key="string_machting_queries"/>
                <parameter key="attribute_type" value="Nominal"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries">
                  <parameter key="Row" value="&lt;tr &gt;.&lt;/tr&gt;"/>
                </list>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <parameter key="ignore_CDATA" value="true"/>
                <parameter key="assume_html" value="true"/>
                <list key="index_queries"/>
                <list key="jsonpath_queries"/>
                <process expanded="true">
                  <operator activated="true" class="text:extract_information" compatibility="8.2.000" expanded="true" height="68" name="Get_Data" width="90" x="246" y="34">
                    <parameter key="query_type" value="Regular Expression"/>
                    <list key="string_machting_queries"/>
                    <parameter key="attribute_type" value="Nominal"/>
                    <list key="regular_expression_queries">
                      <parameter key="Rank" value="&lt;.*data-stat=&quot;ranker&quot;.*&gt;(\d+)+&lt;/th&gt;"/>
                      <parameter key="Player" value="&lt;.*data-stat=&quot;player&quot;.*&gt;([\w\s]+)&lt;/a&gt;&lt;/td&gt;&lt;td.*data-stat=&quot;age&quot; &gt;"/>
                      <parameter key="Age" value="&lt;td.*data-stat=&quot;age&quot;.*&gt;(\d+)&lt;/td&gt;&lt;td.*&quot;team_id&quot;.*&gt;"/>
                    </list>
                    <list key="regular_region_queries">
                      <parameter key="Rank" value="&lt;\.*data-stat=&quot;ranker&quot;\.*&gt;.&lt;/th&gt;"/>
                      <parameter key="Player" value="&lt;\.*data-stat=&quot;player&quot;\.*&gt;.&lt;/th&gt;"/>
                    </list>
                    <list key="xpath_queries"/>
                    <list key="namespaces"/>
                    <parameter key="ignore_CDATA" value="true"/>
                    <parameter key="assume_html" value="true"/>
                    <list key="index_queries"/>
                    <list key="jsonpath_queries"/>
                    <description align="center" color="transparent" colored="false" width="126">With regular expressions and capture groups ( ) we get the data we want</description>
                  </operator>
                  <connect from_port="segment" to_op="Get_Data" to_port="document"/>
                  <connect from_op="Get_Data" from_port="document" to_port="document 1"/>
                  <portSpacing port="source_segment" spacing="0"/>
                  <portSpacing port="sink_document 1" spacing="0"/>
                  <portSpacing port="sink_document 2" spacing="0"/>
                  <description align="center" color="yellow" colored="false" height="94" resized="true" width="362" x="140" y="233">(\d+) gets all the digits that are on the row&lt;br&gt;([\w\s]+) gets all the text and spaces that &lt;br/&gt;we need from the row</description>
                </process>
                <description align="center" color="orange" colored="true" width="126">With the Regular Region we are getting everything between &amp;lt;tr&amp;gt;&amp;lt;/tr&amp;gt; tags</description>
              </operator>
              <connect from_port="document" to_op="Cut Rows" to_port="document"/>
              <connect from_op="Cut Rows" from_port="documents" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="text:create_document" compatibility="8.2.000" expanded="true" height="68" name="table openning tag" width="90" x="313" y="136">
            <parameter key="text" value="&lt;table&gt;"/>
            <parameter key="add label" value="false"/>
            <parameter key="label_type" value="nominal"/>
          </operator>
          <operator activated="true" class="text:create_document" compatibility="8.2.000" expanded="true" height="68" name="table clossing tag" width="90" x="313" y="493">
            <parameter key="text" value="&lt;/table&gt;"/>
            <parameter key="add label" value="false"/>
            <parameter key="label_type" value="nominal"/>
          </operator>
          <operator activated="true" class="text:combine_documents" compatibility="8.2.000" expanded="true" height="145" name="Combine Documents" width="90" x="514" y="289"/>
          <operator activated="true" class="text:write_document" compatibility="8.2.000" expanded="true" height="82" name="Creates a file with the HTML table" width="90" x="648" y="289">
            <parameter key="overwrite" value="true"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <operator activated="true" class="write_file" compatibility="9.3.001" expanded="true" height="68" name="Write File to your computer" width="90" x="782" y="289">
            <parameter key="resource_type" value="file"/>
            <parameter key="filename" value="C:\Users\mbarradas\Desktop\table.html"/>
            <parameter key="mime_type" value="application/octet-stream"/>
          </operator>
          <operator activated="true" class="web_table_extraction:html2exampleset_operator" compatibility="1.0.000" expanded="true" height="68" name="Extracts the table from the file created" width="90" x="648" y="136">
            <parameter key="resource_type" value="file"/>
            <parameter key="file name" value="C:\Users\mbarradas\Desktop\table.html"/>
          </operator>
          <connect from_op="Get Page" from_port="output" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="With RegEx" to_port="documents 1"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Rows of the table" to_port="document"/>
          <connect from_op="Multiply" from_port="output 3" to_op="Table Headers" to_port="document"/>
          <connect from_op="Table Headers" from_port="documents" to_op="Combine Documents" to_port="documents 2"/>
          <connect from_op="Rows of the table" from_port="documents" to_op="Combine Documents" to_port="documents 3"/>
          <connect from_op="With RegEx" from_port="example set" to_port="result 1"/>
          <connect from_op="table openning tag" from_port="output" to_op="Combine Documents" to_port="documents 1"/>
          <connect from_op="table clossing tag" from_port="output" to_op="Combine Documents" to_port="documents 4"/>
          <connect from_op="Combine Documents" from_port="document" to_op="Creates a file with the HTML table" to_port="document"/>
          <connect from_op="Creates a file with the HTML table" from_port="file" to_op="Write File to your computer" to_port="file"/>
          <connect from_op="Extracts the table from the file created" from_port="collection of html data tables as example sets" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>