Applying Word2vec on a dataset of texts

AC24
AC24 New Altair Community Member
edited November 2024 in Community Q&A
Hi, I created ad SVM model for text classification and I want to share the process with you to have any advise to improve it. The purpose of this classificator is to classify a dataset of comments, reviews, or sentences in general, into positive and negative and the dataset I used for its training was made of 2400 tweets (1200 positive and 1200 negative). 

I also would ask you if there is a way to implement in this process a word2vec embedding, or how can I create an alternative process for this purpose. If I try to apply word2vec on the opetators loop and loop collection, this return some errors and I don't know how to give as input a dataset of sentences (my dataset has the attributs "text" and "sentiment") and not a whole text or a collaction of long texts. 

Here is the code of the process:

<?xml version="1.0" encoding="UTF-8"?><process version="9.10.011">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.10.011" expanded="true" name="Process">
    <parameter key="logverbosity" value="init"/>
    <parameter key="random_seed" value="2001"/>
    <parameter key="send_mail" value="never"/>
    <parameter key="notification_email" value=""/>
    <parameter key="process_duration_for_mail" value="30"/>
    <parameter key="encoding" value="SYSTEM"/>
    <process expanded="true">
      <operator activated="true" class="retrieve" compatibility="9.10.011" expanded="true" height="68" name="Retrieve clean_dataset2" width="90" x="45" y="136">
        <parameter key="repository_entry" value="clean_dataset2"/>
      </operator>
      <operator activated="true" class="operator_toolbox:extract_sentiment" compatibility="2.14.000" expanded="true" height="103" name="Extract Sentiment" width="90" x="179" y="136">
        <parameter key="model" value="vader"/>
        <parameter key="text_attribute" value="text"/>
        <parameter key="show_advanced_output" value="true"/>
        <parameter key="use_default_tokenization_regex" value="true"/>
        <parameter key="tokenization_regex" value=".*"/>
        <list key="additional_words"/>
      </operator>
      <operator activated="true" class="nominal_to_text" compatibility="9.10.011" expanded="true" height="82" name="Nominal to Text" width="90" x="313" y="136">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="text"/>
        <parameter key="attributes" value="text|sentiment"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="nominal"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="file_path"/>
        <parameter key="block_type" value="single_value"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="single_value"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="false"/>
      </operator>
      <operator activated="true" class="text:process_document_from_data" compatibility="9.4.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="447" y="136">
        <parameter key="create_word_vector" value="true"/>
        <parameter key="vector_creation" value="TF-IDF"/>
        <parameter key="add_meta_information" value="true"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="percentual"/>
        <parameter key="prune_below_percent" value="1.0"/>
        <parameter key="prune_above_percent" value="99.0"/>
        <parameter key="prune_below_rank" value="0.05"/>
        <parameter key="prune_above_rank" value="0.95"/>
        <parameter key="datamanagement" value="double_sparse_array"/>
        <parameter key="data_management" value="auto"/>
        <parameter key="select_attributes_and_weights" value="false"/>
        <list key="specify_weights"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="9.4.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="34">
            <parameter key="mode" value="non letters"/>
            <parameter key="characters" value=".:"/>
            <parameter key="language" value="English"/>
            <parameter key="max_token_length" value="3"/>
          </operator>
          <operator activated="true" class="text:transform_cases" compatibility="9.4.000" expanded="true" height="68" name="Transform Cases" width="90" x="313" y="34">
            <parameter key="transform_to" value="lower case"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="9.4.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="447" y="34"/>
          <operator activated="true" class="text:filter_by_length" compatibility="9.4.000" expanded="true" height="68" name="Filter Tokens (by Length)" width="90" x="581" y="34">
            <parameter key="min_chars" value="4"/>
            <parameter key="max_chars" value="999"/>
          </operator>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
          <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="9.10.011" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="136">
        <parameter key="attribute_filter_type" value="no_missing_values"/>
        <parameter key="attribute" value=""/>
        <parameter key="attributes" value="|sentiment|text"/>
        <parameter key="use_except_expression" value="false"/>
        <parameter key="value_type" value="attribute_value"/>
        <parameter key="use_value_type_exception" value="false"/>
        <parameter key="except_value_type" value="time"/>
        <parameter key="block_type" value="attribute_block"/>
        <parameter key="use_block_type_exception" value="false"/>
        <parameter key="except_block_type" value="value_matrix_row_start"/>
        <parameter key="invert_selection" value="false"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <operator activated="true" class="store" compatibility="9.10.011" expanded="true" height="68" name="Store wordlist" width="90" x="380" y="289">
        <parameter key="repository_entry" value="Wordlist"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="9.10.011" expanded="true" height="82" name="Set Role" width="90" x="715" y="136">
        <parameter key="attribute_name" value="sentiment"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles">
          <parameter key="Score" value="regular"/>
        </list>
      </operator>
      <operator activated="true" class="split_data" compatibility="9.10.011" expanded="true" height="103" name="Split Data" width="90" x="581" y="238">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.7"/>
          <parameter key="ratio" value="0.3"/>
        </enumeration>
        <parameter key="sampling_type" value="stratified sampling"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
      </operator>
      <operator activated="true" class="concurrency:cross_validation" compatibility="9.10.011" expanded="true" height="145" name="Cross Validation" width="90" x="715" y="238">
        <parameter key="split_on_batch_attribute" value="false"/>
        <parameter key="leave_one_out" value="false"/>
        <parameter key="number_of_folds" value="10"/>
        <parameter key="sampling_type" value="automatic"/>
        <parameter key="use_local_random_seed" value="false"/>
        <parameter key="local_random_seed" value="1992"/>
        <parameter key="enable_parallel_execution" value="true"/>
        <process expanded="true">
          <operator activated="true" class="support_vector_machine_libsvm" compatibility="9.10.011" expanded="true" height="82" name="SVM (2)" width="90" x="179" y="136">
            <parameter key="svm_type" value="C-SVC"/>
            <parameter key="kernel_type" value="linear"/>
            <parameter key="degree" value="3"/>
            <parameter key="gamma" value="0.05"/>
            <parameter key="coef0" value="0.0"/>
            <parameter key="C" value="0.0"/>
            <parameter key="nu" value="0.5"/>
            <parameter key="cache_size" value="80"/>
            <parameter key="epsilon" value="1.0"/>
            <parameter key="p" value="0.1"/>
            <list key="class_weights"/>
            <parameter key="shrinking" value="true"/>
            <parameter key="calculate_confidences" value="true"/>
            <parameter key="confidence_for_multiclass" value="false"/>
          </operator>
          <connect from_port="training set" to_op="SVM (2)" to_port="training set"/>
          <connect from_op="SVM (2)" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
          <portSpacing port="sink_through 1" spacing="0"/>
        </process>
        <process expanded="true">
          <operator activated="true" class="apply_model" compatibility="9.10.011" expanded="true" height="82" name="Apply Model" width="90" x="112" y="85">
            <list key="application_parameters"/>
            <parameter key="create_view" value="false"/>
          </operator>
          <operator activated="true" class="performance" compatibility="9.10.011" expanded="true" height="82" name="Performance" width="90" x="179" y="187">
            <parameter key="use_example_weights" value="true"/>
          </operator>
          <connect from_port="model" to_op="Apply Model" to_port="model"/>
          <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
          <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
          <connect from_op="Performance" from_port="performance" to_port="performance 1"/>
          <portSpacing port="source_model" spacing="0"/>
          <portSpacing port="source_test set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_test set results" spacing="0"/>
          <portSpacing port="sink_performance 1" spacing="0"/>
          <portSpacing port="sink_performance 2" spacing="0"/>
        </process>
      </operator>
      <operator activated="true" class="store" compatibility="9.10.011" expanded="true" height="68" name="Store model" width="90" x="849" y="238">
        <parameter key="repository_entry" value="My SVM model"/>
      </operator>
      <operator activated="true" class="apply_model" compatibility="9.10.011" expanded="true" height="82" name="Apply Model (2)" width="90" x="447" y="442">
        <list key="application_parameters"/>
        <parameter key="create_view" value="false"/>
      </operator>
      <operator activated="true" class="performance" compatibility="9.10.011" expanded="true" height="82" name="Performance (2)" width="90" x="648" y="442">
        <parameter key="use_example_weights" value="true"/>
      </operator>
      <connect from_op="Retrieve clean_dataset2" from_port="output" to_op="Extract Sentiment" to_port="exa"/>
      <connect from_op="Extract Sentiment" from_port="exa" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="Store wordlist" to_port="input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Cross Validation" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Cross Validation" from_port="model" to_op="Store model" to_port="input"/>
      <connect from_op="Store model" from_port="through" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_op="Performance (2)" to_port="labelled data"/>
      <connect from_op="Apply Model (2)" from_port="model" to_port="result 2"/>
      <connect from_op="Performance (2)" from_port="performance" to_port="result 1"/>
      <connect from_op="Performance (2)" from_port="example set" to_port="result 3"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
    </process>
  </operator>
</process>

I also share an image of the process.


Welcome!

It looks like you're new here. Sign in or register to get started.

Welcome!

It looks like you're new here. Sign in or register to get started.