create wordlist for a list of URLs

davidellis
davidellis New Altair Community Member
edited November 5 in Community Q&A
I have a list of URLs that I want to scrape to create word lists. I can easily build one combined word list for all the URLs, but how do I load a whole batch of URLs, get a separate word list for each one, and export the results to Excel?
Here is the simple process that produces the combined word list for all URLs:

<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process from the question: reads a CSV file of URLs, passes it to
  Get Pages, then to Process Documents from Data, and delivers that operator's
  "word list" output on result port 1 (one word list for all pages combined).

  Cleaned up from the pasted version: the "-" tree-view markers that preceded
  the start tags were removed and the quote character in quotes_character is
  now escaped, so the document is well-formed XML again.
-->
<process version="9.9.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.9.002" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Source table; Get Pages below takes its links from the NEWURL attribute. -->
      <operator activated="true" class="read_csv" compatibility="9.9.002" expanded="true" height="68" name="Read CSV" width="90" x="112" y="85">
        <parameter key="csv_file" value="C:/Users/david/Desktop/rapidminer test.csv"/>
        <parameter key="column_separators" value=";"/>
        <parameter key="trim_lines" value="false"/>
        <parameter key="use_quotes" value="true"/>
        <!-- A literal double quote must be written as an entity inside an attribute value. -->
        <parameter key="quotes_character" value="&quot;"/>
        <parameter key="escape_character" value="\"/>
        <parameter key="skip_comments" value="false"/>
        <parameter key="comment_characters" value="#"/>
        <parameter key="starting_row" value="1"/>
        <parameter key="parse_numbers" value="true"/>
        <parameter key="decimal_character" value="."/>
        <parameter key="grouped_digits" value="false"/>
        <parameter key="grouping_character" value=","/>
        <parameter key="infinity_representation" value=""/>
        <parameter key="date_format" value=""/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information"/>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
      </operator>
      <!--
        NOTE(review): no page_attribute is set on this operator. The reply in this
        thread identifies the missing attribute name for the retrieved HTML as the
        problem; it is left unset here so the question still shows the original setup.
      -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="9.3.001" expanded="true" height="68" name="Get Pages" width="90" x="246" y="85">
        <parameter key="link_attribute" value="NEWURL"/>
        <parameter key="random_user_agent" value="false"/>
        <parameter key="connection_timeout" value="10000"/>
        <parameter key="read_timeout" value="10000"/>
        <parameter key="follow_redirects" value="true"/>
        <parameter key="accept_cookies" value="none"/>
        <parameter key="cookie_scope" value="global"/>
        <parameter key="request_method" value="GET"/>
        <parameter key="delay" value="none"/>
        <parameter key="delay_amount" value="1000"/>
        <parameter key="min_delay_amount" value="0"/>
        <parameter key="max_delay_amount" value="1000"/>
      </operator>
      <!-- Builds the TF-IDF word vector and the word list from the fetched pages. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="9.3.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="85">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="100000000"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <!-- Per-document chain: Extract Content, Tokenize, Transform Cases, Filter Stopwords (English). -->
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="9.3.001" expanded="true" height="68" name="Extract Content" width="90" x="112" y="136">
            <parameter key="extract_content" value="true"/>
            <parameter key="minimum_text_block_length" value="5"/>
            <parameter key="override_content_type_information" value="true"/>
            <parameter key="neglegt_span_tags" value="true"/>
            <parameter key="neglect_p_tags" value="true"/>
            <parameter key="neglect_b_tags" value="true"/>
            <parameter key="neglect_i_tags" value="true"/>
            <parameter key="neglect_br_tags" value="true"/>
            <parameter key="ignore_non_html_tags" value="true"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="112" y="238">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="9.3.001" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="238">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="9.3.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="380" y="238"/>
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Read CSV" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:

Answers

  • Marco_Barradas
    Marco_Barradas
    Altair Employee
    edited October 2021
    Hi @davidellis

    I fixed the process. Please check if this will help you. 
    You needed to define the name of the attribute that is going to store your HTML (the page_attribute parameter of Get Pages, set to HTML in the process below).

    If you want to get a word list for each of the URLs, you'll need to wrap everything inside a Loop Values operator. That way you will get one list for each URL you visit. 
    Then you can append everything and write it back to an Excel file.

    <?xml version="1.0" encoding="UTF-8"?>
    <!--
      RapidMiner process from the reply: an inline example set with a NEWURL
      column feeds Get Pages, whose output goes to Process Documents from Data;
      that operator's "word list" output is delivered on result port 1.
    -->
    <process version="9.10.000">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.10.000" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!--
            Deactivated (activated="false") and not wired to any other operator;
            kept so the original CSV source can be re-enabled and reconnected.
          -->
          <operator activated="false" class="read_csv" compatibility="9.10.000" expanded="true" height="68" name="Read CSV" width="90" x="45" y="238">
            <parameter key="csv_file" value="C:/Users/david/Desktop/rapidminer test.csv"/>
            <parameter key="column_separators" value=";"/>
            <parameter key="trim_lines" value="false"/>
            <parameter key="use_quotes" value="true"/>
            <!--
              Restored to the double quote (entity-escaped). The pasted value was
              empty even though use_quotes is true, which would leave no quote
              character defined once this operator is re-enabled.
            -->
            <parameter key="quotes_character" value="&quot;"/>
            <parameter key="escape_character" value="\"/>
            <parameter key="skip_comments" value="false"/>
            <parameter key="comment_characters" value="#"/>
            <parameter key="starting_row" value="1"/>
            <parameter key="parse_numbers" value="true"/>
            <parameter key="decimal_character" value="."/>
            <parameter key="grouped_digits" value="false"/>
            <parameter key="grouping_character" value=","/>
            <parameter key="infinity_representation" value=""/>
            <parameter key="date_format" value=""/>
            <parameter key="first_row_as_names" value="true"/>
            <list key="annotations"/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information"/>
            <parameter key="read_not_matching_values_as_missings" value="true"/>
          </operator>
          <!-- Stand-in data source: input_csv_text holds a NEWURL header line followed by three sample URLs. -->
          <operator activated="true" class="utility:create_exampleset" compatibility="9.10.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="112" y="85">
            <parameter key="generator_type" value="comma separated text"/>
            <parameter key="number_of_examples" value="100"/>
            <parameter key="use_stepsize" value="false"/>
            <list key="function_descriptions"/>
            <parameter key="add_id_attribute" value="false"/>
            <list key="numeric_series_configuration"/>
            <list key="date_series_configuration"/>
            <list key="date_series_configuration (interval)"/>
            <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="input_csv_text" value="NEWURL&#10;https://cincodias.elpais.com/cincodias/2021/10/19/fortunas/1634630126_347370.html&#10;https://www.vidaextra.com/xbox-series-x/se-abren-reservas-para-mini-nevera-xbox-espana-otros-paises-han-volado-unidades-minutos&#10;https://us.marca.com/claro/futbol/real-madrid/2021/10/19/616eccf6268e3efb4b8b4593.html"/>
            <parameter key="column_separator" value=","/>
            <parameter key="parse_all_as_nominal" value="false"/>
            <parameter key="decimal_point_character" value="."/>
            <parameter key="trim_attribute_names" value="true"/>
          </operator>
          <!-- Links are read from NEWURL; page_attribute names the attribute (HTML) that stores each page, the fix described in the reply. -->
          <operator activated="true" class="web:retrieve_webpages" compatibility="9.7.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="85">
            <parameter key="link_attribute" value="NEWURL"/>
            <parameter key="page_attribute" value="HTML"/>
            <parameter key="random_user_agent" value="true"/>
            <parameter key="connection_timeout" value="10000"/>
            <parameter key="read_timeout" value="10000"/>
            <parameter key="follow_redirects" value="true"/>
            <parameter key="accept_cookies" value="none"/>
            <parameter key="cookie_scope" value="global"/>
            <parameter key="request_method" value="GET"/>
            <parameter key="delay" value="none"/>
            <parameter key="delay_amount" value="1000"/>
            <parameter key="min_delay_amount" value="0"/>
            <parameter key="max_delay_amount" value="1000"/>
          </operator>
          <!-- Builds the TF-IDF word vector and the word list from the fetched pages. -->
          <operator activated="true" class="text:process_document_from_data" compatibility="9.3.001" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="85">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="TF-IDF"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="true"/>
            <parameter key="prune_method" value="absolute"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_absolute" value="2"/>
            <parameter key="prune_above_absolute" value="100000000"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
            <!-- Per-document chain: Extract Content, Tokenize, Transform Cases, Filter Stopwords (English). -->
            <process expanded="true">
              <operator activated="true" class="web:extract_html_text_content" compatibility="9.7.000" expanded="true" height="68" name="Extract Content" width="90" x="112" y="136">
                <parameter key="extract_content" value="true"/>
                <parameter key="minimum_text_block_length" value="5"/>
                <parameter key="override_content_type_information" value="true"/>
                <parameter key="neglegt_span_tags" value="true"/>
                <parameter key="neglect_p_tags" value="true"/>
                <parameter key="neglect_b_tags" value="true"/>
                <parameter key="neglect_i_tags" value="true"/>
                <parameter key="neglect_br_tags" value="true"/>
                <parameter key="ignore_non_html_tags" value="true"/>
              </operator>
              <operator activated="true" class="text:tokenize" compatibility="9.3.001" expanded="true" height="68" name="Tokenize" width="90" x="112" y="238">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="9.3.001" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="238">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="9.3.001" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="380" y="238"/>
              <connect from_port="document" to_op="Extract Content" to_port="document"/>
              <connect from_op="Extract Content" from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_op="Create ExampleSet" from_port="output" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
          <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>