🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Exporting Word frequency to excel

User: "AnumBse16"
New Altair Community Member
Updated by Jocelyn
Hi!
I'm trying to implement the attached paper.I'm not getting the frequency list like the one in the picture.My process is given below, kindly guide
@bhupendra_patil @sgenzer

<?xml version="1.0" encoding="UTF-8"?><process version="9.2.000">
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.2.000" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="read_excel" compatibility="9.2.000" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
        <parameter key="excel_file" value="D:\uni stuff\6th semester\AI\project\Links.xlsx"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Link.true.polynominal.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <operator activated="true" class="web:retrieve_webpages" compatibility="9.0.000" expanded="true" height="68" name="Get Pages" width="90" x="179" y="34">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="random_user_agent" value="false"/>
        <parameter key="connection_timeout" value="10000"/>
        <parameter key="read_timeout" value="10000"/>
        <parameter key="follow_redirects" value="true"/>
        <parameter key="accept_cookies" value="none"/>
        <parameter key="cookie_scope" value="global"/>
        <parameter key="request_method" value="GET"/>
        <parameter key="delay" value="none"/>
        <parameter key="delay_amount" value="1000"/>
        <parameter key="min_delay_amount" value="0"/>
        <parameter key="max_delay_amount" value="1000"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="246" y="187">
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents" width="90" x="380" y="85">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="Term Frequency"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="9.0.000" expanded="true" height="68" name="Extract Content" width="90" x="112" y="34">
            <parameter key="extract_content" value="true"/>
            <parameter key="minimum_text_block_length" value="5"/>
            <parameter key="override_content_type_information" value="true"/>
            <parameter key="neglegt_span_tags" value="true"/>
            <parameter key="neglect_p_tags" value="true"/>
            <parameter key="neglect_b_tags" value="true"/>
            <parameter key="neglect_i_tags" value="true"/>
            <parameter key="neglect_br_tags" value="true"/>
            <parameter key="ignore_non_html_tags" value="true"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="246" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="380" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="447" y="136"/>
          <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="581" y="289">
            <parameter key="min_chars" value="4"/>
            <parameter key="max_chars" value="25"/>
          </operator>
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="581" y="136"/>
      <operator activated="true" class="write_excel" compatibility="9.2.000" expanded="true" height="82" name="Write Excel" width="90" x="648" y="238">
        <parameter key="excel_file" value="D:\uni stuff\6th semester\AI\project\result.xlsx"/>
        <parameter key="file_format" value="xlsx"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="sheet_name" value="RapidMiner Data"/>
        <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
        <parameter key="number_format" value="#.0"/>
      </operator>
      <connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

Find more posts tagged with

Sort by:
1 - 8 of 81
    User: "sgenzer"
    Altair Employee
    hi @AnumBse16 ah interesting! I took a look and for starters you did not include the "Data to Similarity" operator as they did in the paper - that is what will give you the similarity chart.

    <?xml version="1.0" encoding="UTF-8"?><process version="9.3.000-BETA">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.3.000-BETA" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <operator activated="true" class="read_excel" compatibility="9.3.000-BETA" expanded="true" height="68" name="Read Excel" width="90" x="45" y="34">
            <parameter key="excel_file" value="D:\uni stuff\6th semester\AI\project\Links.xlsx"/>
            <parameter key="sheet_selection" value="sheet number"/>
            <parameter key="sheet_number" value="1"/>
            <parameter key="imported_cell_range" value="A1"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="first_row_as_names" value="true"/>
            <list key="annotations"/>
            <parameter key="date_format" value=""/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information">
              <parameter key="0" value="Link.true.polynominal.attribute"/>
            </list>
            <parameter key="read_not_matching_values_as_missings" value="false"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <operator activated="true" class="web:retrieve_webpages" compatibility="9.0.000" expanded="true" height="68" name="Get Pages" width="90" x="179" y="34">
            <parameter key="link_attribute" value="Link"/>
            <parameter key="random_user_agent" value="false"/>
            <parameter key="connection_timeout" value="10000"/>
            <parameter key="read_timeout" value="10000"/>
            <parameter key="follow_redirects" value="true"/>
            <parameter key="accept_cookies" value="none"/>
            <parameter key="cookie_scope" value="global"/>
            <parameter key="request_method" value="GET"/>
            <parameter key="delay" value="none"/>
            <parameter key="delay_amount" value="1000"/>
            <parameter key="min_delay_amount" value="0"/>
            <parameter key="max_delay_amount" value="1000"/>
          </operator>
          <operator activated="true" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="246" y="187">
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
          </operator>
          <operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents" width="90" x="380" y="85">
            <parameter key="create_word_vector" value="true"/>
            <parameter key="vector_creation" value="Term Frequency"/>
            <parameter key="add_meta_information" value="true"/>
            <parameter key="keep_text" value="true"/>
            <parameter key="prune_method" value="none"/>
            <parameter key="prune_below_percent" value="3.0"/>
            <parameter key="prune_above_percent" value="30.0"/>
            <parameter key="prune_below_rank" value="0.05"/>
            <parameter key="prune_above_rank" value="0.95"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <process expanded="true">
              <operator activated="true" class="web:extract_html_text_content" compatibility="9.0.000" expanded="true" height="68" name="Extract Content" width="90" x="112" y="34">
                <parameter key="extract_content" value="true"/>
                <parameter key="minimum_text_block_length" value="5"/>
                <parameter key="override_content_type_information" value="true"/>
                <parameter key="neglegt_span_tags" value="true"/>
                <parameter key="neglect_p_tags" value="true"/>
                <parameter key="neglect_b_tags" value="true"/>
                <parameter key="neglect_i_tags" value="true"/>
                <parameter key="neglect_br_tags" value="true"/>
                <parameter key="ignore_non_html_tags" value="true"/>
              </operator>
              <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="246" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="380" y="34">
                <parameter key="transform_to" value="lower case"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="447" y="136"/>
              <operator activated="true" class="text:filter_by_length" compatibility="8.1.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="581" y="289">
                <parameter key="min_chars" value="4"/>
                <parameter key="max_chars" value="25"/>
              </operator>
              <connect from_port="document" to_op="Extract Content" to_port="document"/>
              <connect from_op="Extract Content" from_port="document" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
              <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
              <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_document" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <operator activated="true" class="data_to_similarity" compatibility="9.3.000-BETA" expanded="true" height="82" name="Data to Similarity" width="90" x="581" y="85">
            <parameter key="measure_types" value="MixedMeasures"/>
            <parameter key="mixed_measure" value="MixedEuclideanDistance"/>
            <parameter key="nominal_measure" value="NominalDistance"/>
            <parameter key="numerical_measure" value="EuclideanDistance"/>
            <parameter key="divergence" value="GeneralizedIDivergence"/>
            <parameter key="kernel_type" value="radial"/>
            <parameter key="kernel_gamma" value="1.0"/>
            <parameter key="kernel_sigma1" value="1.0"/>
            <parameter key="kernel_sigma2" value="0.0"/>
            <parameter key="kernel_sigma3" value="2.0"/>
            <parameter key="kernel_degree" value="3.0"/>
            <parameter key="kernel_shift" value="1.0"/>
            <parameter key="kernel_a" value="1.0"/>
            <parameter key="kernel_b" value="0.0"/>
          </operator>
          <operator activated="true" class="text:wordlist_to_data" compatibility="8.1.000" expanded="true" height="82" name="WordList to Data" width="90" x="581" y="238"/>
          <operator activated="true" class="write_excel" compatibility="9.2.001" expanded="true" height="103" name="Write Excel" width="90" x="715" y="340">
            <parameter key="excel_file" value="D:\uni stuff\6th semester\AI\project\result.xlsx"/>
            <parameter key="file_format" value="xlsx"/>
            <enumeration key="sheet_names"/>
            <parameter key="sheet_name" value="RapidMiner Data"/>
            <parameter key="date_format" value="yyyy-MM-dd HH:mm:ss"/>
            <parameter key="number_format" value="#.0"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
          <connect from_op="Get Pages" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
          <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
          <connect from_op="Process Documents" from_port="example set" to_op="Data to Similarity" to_port="example set"/>
          <connect from_op="Process Documents" from_port="word list" to_op="WordList to Data" to_port="word list"/>
          <connect from_op="Data to Similarity" from_port="similarity" to_port="result 1"/>
          <connect from_op="WordList to Data" from_port="example set" to_op="Write Excel" to_port="input"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
    



    I can't really test the rest of the process as you don't post the xlsx files that are needed... :wink:

    Scott

    User: "AnumBse16"
    New Altair Community Member
    OP
    @sgenzer i'm trying to have what is in figure 9 and fiigure 10 of the document. Is data to similarity operator needed for it? I'm using the links in attached xlsx
    User: "sgenzer"
    Altair Employee
    oh all you want is the word lists? I can get them fine IF I do two things to your process:

    1. In Get Pages, you should turn ON 'accept cookies'  ;)



    2. Get out of the EU as there is a GDPR splash screen on time.com before you proceed.

    Results:




    Scott

    User: "AnumBse16"
    New Altair Community Member
    OP
    I want to have figure 9 and figure 10 that are in excel sheet 
    User: "sgenzer"
    Altair Employee
    Accepted Answer
    right. I think the article says that it used Excel from that point onwards :neutral:


    User: "AnumBse16"
    New Altair Community Member
    OP
    But how :(
    User: "sgenzer"
    Altair Employee
    Accepted Answer
    Updated by sgenzer
    https://techcommunity.microsoft.com/t5/Excel/ct-p/Excel_Cat

    I haven't done a pivot in Excel since (most likely) before you were a twinkle in your mother's eye :wink:
    User: "AnumBse16"
    New Altair Community Member
    OP
    @sgenzer thanks!