🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Stop word removal with the Operator Toolbox "Extract Topics from Data (LDA)" operator

User: "Joris"
New Altair Community Member
Updated by Jocelyn
Hi there - this is my first time asking a question, and I'm new to RapidMiner.

I have an Excel file with a text field and around 740 rows of data. Each cell contains a few paragraphs of text. I ran the Operator Toolbox "Extract Topics from Data (LDA)" operator on it, and it works very well for topic discovery. However, it doesn't remove stopwords, so I end up with a large group of topics clustered around stopwords.

I've tried a few different ways of removing the stopwords while still being able to run the LDA operator on the result (it can take documents or a text field in a dataset), but I haven't managed to get it working.

Can anyone assist in removing the stopwords?

Below is my XML code.  Thanks a lot!

<?xml version="1.0" encoding="UTF-8"?><process version="9.3.001">
  <!-- RapidMiner process as posted by the asker: Read Excel, then Nominal to Text, then
       Extract Topics from Data (LDA). It contains no tokenization or stopword-filtering
       operator, so the LDA operator receives the unfiltered text of the PolicyIssues attribute. -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!-- Step 1: load sheet number 1 of the workbook; the first row supplies the attribute names. -->
      <operator activated="true" class="read_excel" compatibility="9.3.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="85">
        <parameter key="excel_file" value="C:\Users\vande\Google Drive (jorisv@clearoutcomes.net)\IPDET Course Text Analytics\Demonstrations - Practical exercises\Exercise 5 Topic Modelling\PovertyActionLab.xlsx"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="A1"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information"/>
        <parameter key="read_not_matching_values_as_missings" value="true"/>
        <parameter key="datamanagement" value="double_array"/>
        <parameter key="data_management" value="auto"/>
      </operator>
      <!-- Step 2: convert the single attribute "PolicyIssues" from nominal to text. -->
      <operator activated="true" class="nominal_to_text" compatibility="9.3.001" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="85">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="PolicyIssues"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!-- Step 3: run LDA on the PolicyIssues attribute: 6 topics, 1000 iterations,
           10 top words reported per topic. -->
      <operator activated="true" class="operator_toolbox:lda_exampleset" compatibility="2.1.000" expanded="true" height="124" name="Extract Topics from Data (LDA)" width="90" x="380" y="136">
        <parameter key="text_attribute" value="PolicyIssues"/>
        <parameter key="number_of_topics" value="6"/>
        <parameter key="use_alpha_heuristics" value="true"/>
        <parameter key="alpha_sum" value="0.1"/>
        <parameter key="use_beta_heuristics" value="true"/>
        <parameter key="beta" value="0.01"/>
        <parameter key="optimize_hyperparameters" value="true"/>
        <parameter key="optimize_interval_for_hyperparameters" value="10"/>
        <parameter key="top_words_per_topic" value="10"/>
        <parameter key="iterations" value="1000"/>
        <parameter key="reproducible" value="true"/>
        <parameter key="enable_logging" value="false"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <!-- Wiring: Read Excel feeds Nominal to Text, which feeds the LDA operator; the LDA
           "exa" and "top" output ports are delivered as process results 1 and 2. -->
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Extract Topics from Data (LDA)" to_port="exa"/>
      <connect from_op="Extract Topics from Data (LDA)" from_port="exa" to_port="result 1"/>
      <connect from_op="Extract Topics from Data (LDA)" from_port="top" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>


Sort by:
1 - 1 of 11
    User: "MartinLiebig"
    Altair Employee
    Accepted Answer
    Hi @Joris,
    can you maybe try this?
    <?xml version="1.0" encoding="UTF-8"?><process version="9.3.001">
      <!-- Suggested process: the text is converted to documents, tokenized and filtered for
           English stopwords inside a Loop Collection, converted back to an example set, and
           only then passed to Extract Topics from Data (LDA). -->
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="9.3.001" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="SYSTEM"/>
        <process expanded="true">
          <!-- Step 1: load sheet number 1 of the workbook; the first row supplies the attribute names. -->
          <operator activated="true" class="read_excel" compatibility="9.3.001" expanded="true" height="68" name="Read Excel" width="90" x="45" y="85">
            <parameter key="excel_file" value="C:\Users\vande\Google Drive (jorisv@clearoutcomes.net)\IPDET Course Text Analytics\Demonstrations - Practical exercises\Exercise 5 Topic Modelling\PovertyActionLab.xlsx"/>
            <parameter key="sheet_selection" value="sheet number"/>
            <parameter key="sheet_number" value="1"/>
            <parameter key="imported_cell_range" value="A1"/>
            <parameter key="encoding" value="SYSTEM"/>
            <parameter key="first_row_as_names" value="true"/>
            <list key="annotations"/>
            <parameter key="date_format" value=""/>
            <parameter key="time_zone" value="SYSTEM"/>
            <parameter key="locale" value="English (United States)"/>
            <parameter key="read_all_values_as_polynominal" value="false"/>
            <list key="data_set_meta_data_information"/>
            <parameter key="read_not_matching_values_as_missings" value="true"/>
            <parameter key="datamanagement" value="double_array"/>
            <parameter key="data_management" value="auto"/>
          </operator>
          <!-- Step 2: convert the single attribute "PolicyIssues" from nominal to text. -->
          <operator activated="true" class="nominal_to_text" compatibility="9.3.001" expanded="true" height="82" name="Nominal to Text" width="90" x="179" y="85">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="PolicyIssues"/>
            <parameter key="attributes" value=""/>
            <parameter key="use_except_expression" value="false"/>
            <parameter key="value_type" value="nominal"/>
            <parameter key="use_value_type_exception" value="false"/>
            <parameter key="except_value_type" value="file_path"/>
            <parameter key="block_type" value="single_value"/>
            <parameter key="use_block_type_exception" value="false"/>
            <parameter key="except_block_type" value="single_value"/>
            <parameter key="invert_selection" value="false"/>
            <parameter key="include_special_attributes" value="false"/>
          </operator>
          <!-- Step 3: turn the example set into a collection of documents. -->
          <operator activated="true" class="text:data_to_documents" compatibility="9.1.000-SNAPSHOT" expanded="true" height="68" name="Data to Documents" width="90" x="313" y="85">
            <parameter key="select_attributes_and_weights" value="false"/>
            <list key="specify_weights"/>
          </operator>
          <!-- Step 4: inside the loop, tokenize each document on non-letters and then
               filter English stopwords. -->
          <operator activated="true" class="loop_collection" compatibility="9.3.001" expanded="true" height="82" name="Loop Collection" width="90" x="447" y="85">
            <parameter key="set_iteration_macro" value="false"/>
            <parameter key="macro_name" value="iteration"/>
            <parameter key="macro_start_value" value="1"/>
            <parameter key="unfold" value="false"/>
            <process expanded="true">
              <operator activated="true" class="text:tokenize" compatibility="9.1.000-SNAPSHOT" expanded="true" height="68" name="Tokenize" width="90" x="313" y="34">
                <parameter key="mode" value="non letters"/>
                <parameter key="characters" value=".:"/>
                <parameter key="language" value="English"/>
                <parameter key="max_token_length" value="3"/>
              </operator>
              <operator activated="true" class="text:filter_stopwords_english" compatibility="9.1.000-SNAPSHOT" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="447" y="34"/>
              <connect from_port="single" to_op="Tokenize" to_port="document"/>
              <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
              <connect from_op="Filter Stopwords (English)" from_port="document" to_port="output 1"/>
              <portSpacing port="source_single" spacing="0"/>
              <portSpacing port="sink_output 1" spacing="0"/>
              <portSpacing port="sink_output 2" spacing="0"/>
            </process>
          </operator>
          <!-- Step 5: convert the processed documents back into an example set
               (use_processed_text is true, so the filtered text is what gets written). -->
          <operator activated="true" class="text:documents_to_data" compatibility="9.1.000-SNAPSHOT" expanded="true" height="82" name="Documents to Data" width="90" x="581" y="85">
            <parameter key="add_meta_information" value="true"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="use_processed_text" value="true"/>
          </operator>
          <!-- Step 6: run LDA on the filtered text: 6 topics, 1000 iterations, 10 top words per topic.
               NOTE(review): text_attribute still names "PolicyIssues"; confirm that the example set
               produced by Documents to Data exposes the filtered text under that attribute name. -->
          <operator activated="true" class="operator_toolbox:lda_exampleset" compatibility="2.3.000-SNAPSHOT" expanded="true" height="124" name="Extract Topics from Data (LDA)" width="90" x="715" y="85">
            <parameter key="text_attribute" value="PolicyIssues"/>
            <parameter key="number_of_topics" value="6"/>
            <parameter key="use_alpha_heuristics" value="true"/>
            <parameter key="alpha_sum" value="0.1"/>
            <parameter key="use_beta_heuristics" value="true"/>
            <parameter key="beta" value="0.01"/>
            <parameter key="optimize_hyperparameters" value="true"/>
            <parameter key="optimize_interval_for_hyperparameters" value="10"/>
            <parameter key="top_words_per_topic" value="10"/>
            <parameter key="iterations" value="1000"/>
            <parameter key="reproducible" value="true"/>
            <parameter key="enable_logging" value="false"/>
            <parameter key="use_local_random_seed" value="false"/>
            <parameter key="local_random_seed" value="1992"/>
          </operator>
          <!-- Wiring: Read Excel, Nominal to Text, Data to Documents, Loop Collection,
               Documents to Data and the LDA operator are chained in that order; the LDA
               "exa" and "top" output ports are delivered as process results 1 and 2. -->
          <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
          <connect from_op="Nominal to Text" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
          <connect from_op="Data to Documents" from_port="documents" to_op="Loop Collection" to_port="collection"/>
          <connect from_op="Loop Collection" from_port="output 1" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Extract Topics from Data (LDA)" to_port="exa"/>
          <connect from_op="Extract Topics from Data (LDA)" from_port="exa" to_port="result 1"/>
          <connect from_op="Extract Topics from Data (LDA)" from_port="top" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
        </process>
      </operator>
    </process>

    BR,

    Martin