I have a list of URLs that I want to scrape in order to create a word list. I can easily do this for all the URLs combined, but how do I load a whole batch of URLs, get a separate word list for each one, and export the results to Excel?
Here is the simple process that produces the combined word list for all URLs:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Process definition: Read CSV, then Get Pages, then Process Documents from Data.
  The "word list" output of the last operator is delivered to result port 1,
  so a single word list is produced for all retrieved pages together.

  NOTE(review): this text appears to have been copied from a browser's
  collapsible XML tree view. The stray "-" toggle markers in front of start
  tags have been removed and indentation restored so the document is
  well-formed again.
-->
<process version="9.9.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator name="Process" expanded="true" compatibility="9.9.002" class="process" activated="true">
    <parameter value="init" key="logverbosity"/>
    <parameter value="2001" key="random_seed"/>
    <parameter value="never" key="send_mail"/>
    <parameter value="" key="notification_email"/>
    <parameter value="30" key="process_duration_for_mail"/>
    <parameter value="SYSTEM" key="encoding"/>
    <process expanded="true">
      <!-- Step 1: load the input table from a semicolon-separated CSV file; the first row supplies the column names. -->
      <operator name="Read CSV" expanded="true" compatibility="9.9.002" class="read_csv" activated="true" y="85" x="112" width="90" height="68">
        <parameter value="C:/Users/david/Desktop/rapidminer test.csv" key="csv_file"/>
        <parameter value=";" key="column_separators"/>
        <parameter value="false" key="trim_lines"/>
        <parameter value="true" key="use_quotes"/>
        <!-- The quote character is a double quote; it must be written as an entity because the attribute value itself is delimited by double quotes. -->
        <parameter value="&quot;" key="quotes_character"/>
        <parameter value="\" key="escape_character"/>
        <parameter value="false" key="skip_comments"/>
        <parameter value="#" key="comment_characters"/>
        <parameter value="1" key="starting_row"/>
        <parameter value="true" key="parse_numbers"/>
        <parameter value="." key="decimal_character"/>
        <parameter value="false" key="grouped_digits"/>
        <parameter value="," key="grouping_character"/>
        <parameter value="" key="infinity_representation"/>
        <parameter value="" key="date_format"/>
        <parameter value="true" key="first_row_as_names"/>
        <list key="annotations"/>
        <parameter value="SYSTEM" key="time_zone"/>
        <parameter value="English (United States)" key="locale"/>
        <parameter value="SYSTEM" key="encoding"/>
        <parameter value="false" key="read_all_values_as_polynominal"/>
        <list key="data_set_meta_data_information"/>
        <parameter value="true" key="read_not_matching_values_as_missings"/>
      </operator>
      <!-- Step 2: retrieve web pages using the links found in the NEWURL attribute (GET requests, redirects followed, 10000 connection/read timeouts). -->
      <operator name="Get Pages" expanded="true" compatibility="9.3.001" class="web:retrieve_webpages" activated="true" y="85" x="246" width="90" height="68">
        <parameter value="NEWURL" key="link_attribute"/>
        <parameter value="false" key="random_user_agent"/>
        <parameter value="10000" key="connection_timeout"/>
        <parameter value="10000" key="read_timeout"/>
        <parameter value="true" key="follow_redirects"/>
        <parameter value="none" key="accept_cookies"/>
        <parameter value="global" key="cookie_scope"/>
        <parameter value="GET" key="request_method"/>
        <parameter value="none" key="delay"/>
        <parameter value="1000" key="delay_amount"/>
        <parameter value="0" key="min_delay_amount"/>
        <parameter value="1000" key="max_delay_amount"/>
      </operator>
      <!-- Step 3: build a TF-IDF word vector from the retrieved data, with absolute pruning (below 2, above 100000000). -->
      <operator name="Process Documents from Data" expanded="true" compatibility="9.3.001" class="text:process_document_from_data" activated="true" y="85" x="447" width="90" height="82">
        <parameter value="true" key="create_word_vector"/>
        <parameter value="TF-IDF" key="vector_creation"/>
        <parameter value="true" key="add_meta_information"/>
        <parameter value="true" key="keep_text"/>
        <parameter value="absolute" key="prune_method"/>
        <parameter value="3.0" key="prune_below_percent"/>
        <parameter value="30.0" key="prune_above_percent"/>
        <parameter value="2" key="prune_below_absolute"/>
        <parameter value="100000000" key="prune_above_absolute"/>
        <parameter value="0.05" key="prune_below_rank"/>
        <parameter value="0.95" key="prune_above_rank"/>
        <parameter value="double_sparse_array" key="datamanagement"/>
        <parameter value="auto" key="data_management"/>
        <parameter value="false" key="select_attributes_and_weights"/>
        <list key="specify_weights"/>
        <!-- Per-document pipeline: Extract Content, Tokenize, Transform Cases, Filter Stopwords (English). -->
        <process expanded="true">
          <operator name="Extract Content" expanded="true" compatibility="9.3.001" class="web:extract_html_text_content" activated="true" y="136" x="112" width="90" height="68">
            <parameter value="true" key="extract_content"/>
            <parameter value="5" key="minimum_text_block_length"/>
            <parameter value="true" key="override_content_type_information"/>
            <!-- NOTE(review): "neglegt_span_tags" is spelled this way in the original export; the key is left untouched because the consuming application reads it by name. -->
            <parameter value="true" key="neglegt_span_tags"/>
            <parameter value="true" key="neglect_p_tags"/>
            <parameter value="true" key="neglect_b_tags"/>
            <parameter value="true" key="neglect_i_tags"/>
            <parameter value="true" key="neglect_br_tags"/>
            <parameter value="true" key="ignore_non_html_tags"/>
          </operator>
          <operator name="Tokenize" expanded="true" compatibility="9.3.001" class="text:tokenize" activated="true" y="238" x="112" width="90" height="68">
            <parameter value="non letters" key="mode"/>
            <parameter value=".:" key="characters"/>
            <parameter value="English" key="language"/>
            <parameter value="3" key="max_token_length"/>
          </operator>
          <operator name="Transform Cases" expanded="true" compatibility="9.3.001" class="text:transform_cases" activated="true" y="238" x="246" width="90" height="68">
            <parameter value="lower case" key="transform_to"/>
          </operator>
          <operator name="Filter Stopwords (English)" expanded="true" compatibility="9.3.001" class="text:filter_stopwords_english" activated="true" y="238" x="380" width="90" height="68"/>
          <!-- Wiring of the inner pipeline: incoming document through each operator in turn, then out on "document 1". -->
          <connect to_port="document" to_op="Extract Content" from_port="document"/>
          <connect to_port="document" to_op="Tokenize" from_port="document" from_op="Extract Content"/>
          <connect to_port="document" to_op="Transform Cases" from_port="document" from_op="Tokenize"/>
          <connect to_port="document" to_op="Filter Stopwords (English)" from_port="document" from_op="Transform Cases"/>
          <connect to_port="document 1" from_port="document" from_op="Filter Stopwords (English)"/>
          <portSpacing spacing="0" port="source_document"/>
          <portSpacing spacing="0" port="sink_document 1"/>
          <portSpacing spacing="0" port="sink_document 2"/>
        </process>
      </operator>
      <!-- Top-level wiring: only the "word list" output of the last operator is connected to a result port. -->
      <connect to_port="Example Set" to_op="Get Pages" from_port="output" from_op="Read CSV"/>
      <connect to_port="example set" to_op="Process Documents from Data" from_port="Example Set" from_op="Get Pages"/>
      <connect to_port="result 1" from_port="word list" from_op="Process Documents from Data"/>
      <portSpacing spacing="0" port="source_input 1"/>
      <portSpacing spacing="0" port="sink_result 1"/>
      <portSpacing spacing="0" port="sink_result 2"/>
    </process>
  </operator>
</process>