Read an Excel table with 300+ URLs and get page information

Naveen_Vimalan
Naveen_Vimalan New Altair Community Member
edited November 5 in Community Q&A
I would like to get information such as the response code, response message, content type, etc. for the URLs in my Excel table. My process chain is: Read Excel -> Store -> Handle Exception (Get Pages) -> Store. For some reason, the result contains only the URL instead of all the information I want. Hopefully someone can help out.

This is the code:
<?xml version="1.0" encoding="UTF-8"?><process version="9.9.000">
  <!-- Process context: the input, output and macros sections are all empty here. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.9.000" expanded="true" name="Process">
    <!-- Parameters of the root "Process" operator: log verbosity, random seed,
         mail notification settings (never sent) and the text encoding. -->
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Reads sheet number 1 of the Excel workbook, taking attribute names from the
           first row. The meta data list declares one column, "Links", which the
           downstream Get Pages operator uses as its link attribute.
           NOTE(review): the meta data value appears to encode name.selected.type.role;
           confirm against the Read Excel documentation. -->
      <operator activated="true" class="read_excel" compatibility="9.9.000" expanded="true" height="68" name="Read Excel" width="90" x="112" y="136">
        <parameter key="excel_file" value="/Users/XXX/datamining/excel/Leuphana.xlsx"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="German (Germany)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Links.true.file_path.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
      </operator>
      <!-- Saves the imported link table to the repository entry ../data/Leuphana_Links;
           the data continues downstream via this operator's "through" port. -->
      <operator activated="true" class="store" compatibility="9.9.000" expanded="true" height="68" name="Store" width="90" x="246" y="136">
        <parameter key="repository_entry" value="../data/Leuphana_Links"/>
      </operator>
      <operator activated="true" class="handle_exception" compatibility="9.9.000" expanded="true" height="82" name="Handle Exception" width="90" x="380" y="136">
        <parameter key="add_details_to_log" value="false"/>
        <process expanded="true">
          <operator activated="true" class="web:retrieve_webpages" compatibility="9.7.000" expanded="true" height="68" name="Get Pages" width="90" x="179" y="34">
            <parameter key="link_attribute" value="Links"/>
            <parameter key="page_attribute" value="Inhalt"/>
            <parameter key="random_user_agent" value="true"/>
            <parameter key="connection_timeout" value="10000"/>
            <parameter key="read_timeout" value="10000"/>
            <parameter key="follow_redirects" value="true"/>
            <parameter key="accept_cookies" value="all"/>
            <parameter key="cookie_scope" value="thread"/>
            <parameter key="request_method" value="POST"/>
            <parameter key="delay" value="none"/>
            <parameter key="delay_amount" value="1000"/>
            <parameter key="min_delay_amount" value="0"/>
            <parameter key="max_delay_amount" value="1000"/>
          </operator>
          <connect from_port="in 1" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
        <process expanded="true">
          <connect from_port="in 1" to_port="out 1"/>
          <portSpacing port="source_in 1" spacing="0"/>
          <portSpacing port="source_in 2" spacing="0"/>
          <portSpacing port="sink_out 1" spacing="0"/>
          <portSpacing port="sink_out 2" spacing="0"/>
        </process>
      </operator>
      <!-- Saves whatever Handle Exception delivers to the repository entry
           ../data/Leuphana_Result. -->
      <operator activated="true" class="store" compatibility="9.9.000" expanded="true" height="68" name="Store (2)" width="90" x="514" y="136">
        <parameter key="repository_entry" value="../data/Leuphana_Result"/>
      </operator>
      <!-- Top-level wiring: Read Excel, then Store, then Handle Exception, then Store (2).
           No connection to a process result port is made here, so the outcome is only
           available through the stored repository entry. -->
      <connect from_op="Read Excel" from_port="output" to_op="Store" to_port="input"/>
      <connect from_op="Store" from_port="through" to_op="Handle Exception" to_port="in 1"/>
      <connect from_op="Handle Exception" from_port="out 1" to_op="Store (2)" to_port="input"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process> 

Best Answer

  • YYH
    YYH
    Altair Employee
    Answer ✓
    Hi @Naveen_Vimalan,

    I used your Excel file as input for the URL links and got 325 results and 8 errors (see the attached screenshot for the error message). The errors mostly come from bad URL links that contain a regex (why a regex?).

    Process with loop and Get Page attached for your reference.

    Cheers,
    YY

Answers