Revert Tokenized words back to sentences

Chloronic
Chloronic New Altair Community Member
edited November 2024 in Community Q&A
Hello, I would like to ask: how do I turn my tokenized words back into a text attribute in my Excel file? I tokenized the text in order to correct many mistakes in it, using Stem (Dictionary) and several other operators inside Process Documents from Data. The problem is that I can't find any operator that exports the corrected tokens so that they replace the faulty sentences in my Excel file.
Is there a way to export the result back into Excel, but as full sentences rather than as separate tokens?

Here's my XML file:
<?xml version="1.0" encoding="UTF-8"?><process version="9.10.011">
  <!--
    Process definition (version 9.10.011).
    Pipeline, as wired by the connect elements below:
      Read Excel, then Nominal to Text, then Process Documents from Data, then result 1.
    Inside Process Documents from Data each document passes through:
      Tokenize, Transform Cases, Filter Stopwords (Dictionary),
      Filter Tokens (by Length), Stem (Dictionary), Tokenize (2).
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.10.011" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <!--
        Read Excel: loads sheet number 1 of the workbook, restricted to column D
        (imported_cell_range), using the first row as attribute names. The single
        imported column is declared as a polynominal attribute named "Text".
        The excel_file value is an absolute path on the author's machine, so the
        process cannot run elsewhere without editing it.
        NOTE(review): the end row 10485776 is larger than Excel's sheet limit of
        1,048,576 rows; this may be a typo for D1:D1048576. Confirm whether the
        reader tolerates an out-of-range end row.
      -->
      <operator activated="true" class="read_excel" compatibility="9.10.011" expanded="true" height="68" name="Read Excel" width="90" x="112" y="34">
        <parameter key="excel_file" value="C:\Users\wille\Documents\Dataset Pemilu 2024 (New)\2. Preprocessing\Pemilu 2024 Final Res 2.xlsx"/>
        <parameter key="sheet_selection" value="sheet number"/>
        <parameter key="sheet_number" value="1"/>
        <parameter key="imported_cell_range" value="D1:D10485776"/>
        <parameter key="encoding" value="SYSTEM"/>
        <parameter key="first_row_as_names" value="true"/>
        <list key="annotations"/>
        <parameter key="date_format" value=""/>
        <parameter key="time_zone" value="SYSTEM"/>
        <parameter key="locale" value="English (United States)"/>
        <parameter key="read_all_values_as_polynominal" value="false"/>
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Text.true.polynominal.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
      </operator>
      <!--
        Nominal to Text: applied to the single attribute "Text"
        (attribute_filter_type = single). Presumably needed so that the
        document operator downstream treats the column as text; confirm.
      -->
      <operator activated="true" class="nominal_to_text" compatibility="9.10.011" expanded="true" height="82" name="Nominal to Text" width="90" x="246" y="34">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Text"/>
        <parameter key="attributes" value=""/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <!--
        Process Documents from Data: configured to create a TF-IDF word vector
        with prune_method = none. The prune_below/prune_above values are present
        but presumably unused while pruning is disabled; confirm.
        NOTE(review): keep_text is false. Presumably this means the text is not
        carried into the output example set, which matters if the goal is to
        get sentences (rather than a word vector) back out; confirm against the
        operator documentation.
      -->
      <operator activated="true" class="text:process_document_from_data" compatibility="9.4.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="380" y="34">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="false"/>
        <parameter key="prune_method" value="none"/>
        <parameter key="prune_below_percent" value="3.0"/>
        <parameter key="prune_above_percent" value="30.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <!--
            Step 1: tokenize in "non letters" mode.
            The characters and max_token_length values presumably only apply to
            other tokenization modes; TODO confirm.
          -->
          <operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <!-- Step 2: convert all tokens to lower case. -->
          <operator activated="true" class="text:transform_cases" compatibility="9.4.000" expanded="true" height="68" name="Transform Cases" width="90" x="179" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <!--
            Step 3: stopword filter driven by a dictionary file at an absolute
            local path, with case_sensitive = false. The file must exist on the
            machine running the process.
          -->
          <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="9.4.000" expanded="true" height="82" name="Filter Stopwords (Dictionary)" width="90" x="313" y="34">
            <parameter key="file" value="C:/Users/wille/Documents/Dataset Pemilu 2024 (New)/Misc/Cleaning/stopwordbahasa.csv"/>
            <parameter key="case_sensitive" value="false"/>
            <parameter key="encoding" value="SYSTEM"/>
          </operator>
          <!-- Step 4: token length filter with min_chars = 3 and max_chars = 15. -->
          <operator activated="true" class="text:filter_by_length" compatibility="9.4.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="447" y="34">
            <parameter key="min_chars" value="3"/>
            <parameter key="max_chars" value="15"/>
          </operator>
          <!--
            Step 5: dictionary-based stemming using a rules file at an absolute
            local path. According to the accompanying post, this is the step
            used to correct misspelled words.
          -->
          <operator activated="true" class="text:stem_dictionary" compatibility="9.4.000" expanded="true" height="82" name="Stem (Dictionary)" width="90" x="581" y="34">
            <parameter key="file" value="C:/Users/wille/Documents/Dataset Pemilu 2024 (New)/2. Preprocessing/stemming.txt"/>
          </operator>
          <!--
            Step 6: second tokenization with the same settings as step 1.
            Presumably re-splits any multi-word replacements produced by the
            stemming dictionary; confirm that this is the intent.
          -->
          <operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="715" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <!-- Inner wiring: the operators above are chained in order and the last one feeds output port "document 1". -->
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
          <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_op="Stem (Dictionary)" to_port="document"/>
          <connect from_op="Stem (Dictionary)" from_port="document" to_op="Tokenize (2)" to_port="document"/>
          <connect from_op="Tokenize (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!--
        Outer wiring: process port "input 1" feeds the "file" port of Read Excel;
        the three operators are then chained. Only the "example set" output of
        Process Documents from Data is connected to a result port (result 1).
      -->
      <connect from_port="input 1" to_op="Read Excel" to_port="file"/>
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="source_input 2" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>

I have also attached my dataset "Pemilu 2024..." and my stemming .txt file.

Answers

  • ruhaila
    ruhaila New Altair Community Member
    Hi, I tried running it but there is a missing file named "stopwordbahasa.csv".
  • Chloronic
    Chloronic New Altair Community Member
    Sorry about that; I have already found the answer. Thanks for trying to help me out.

Welcome!

It looks like you're new here. Sign in or register to get started.

Welcome!

It looks like you're new here. Sign in or register to get started.