🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

"FeatureExtraction from XML LibSVM Java"

User: "jorno"
New Altair Community Member
Updated by Jocelyn
Hi All,

First of all, I would like to thank the RapidMiner team for their great product!
Thanks a lot for the examples, the documentation and, of course, the wizards!
I would also like to thank Michael Wurst for the tutorial on his website (nemoz.org)!

----------------------------
I'm a student new to this area, and I have an assignment to classify URLs.
I have read a lot of documentation and searched the forums, but I still have two problems (RapidMiner version 4.2):

I created one XML file per URL, containing that URL's features, and sorted the files into two folders (one per class):
.\train\news\www.news1.de.xml
.\train\news\www.news2.de.xml
.\train\porn\www.porn1.de.xml
.\train\porn\www.porn2.de.xml

Each XML file looks like this:
<myXML>
      <title> my title </title>
      <keywords> my keywords </keywords>
      <numberOfPages> 6 </numberOfPages>
</myXML>
----------------------------
1. When I run the process file (below) in RapidMiner with LibSVM, it reports:
  "Message: This learning scheme does not have sufficient capabilities for the given data set: polynominal attributes not supported"
    I also tried the "06_ExtractionAndWordVecotor.xml" example, but it gave me the same error.

2. I tried to load the model from Java, but I cannot work out how to pass in the extracted features themselves instead of only the whole text
  (should I use TextInput instead of SingleTextInput?). The simple example works, but without the features.

I would really appreciate your help !
Thanks a lot for everything  !!
Jorno

---------------------------------------------
RAPID MINER CONFIGURATION FILE
---------------------------------------------
<?xml version="1.0" encoding="windows-1252"?>
<!--
  Training process. The Root chain contains four operators, in this order:
  Extractor (FeatureExtraction), TextInput, LibSVMLearner and ModelWriter.
-->
<process version="4.4">

  <operator name="Root" class="Process" expanded="yes">   
      <parameter key="logverbosity"  value="init"/>
      <parameter key="random_seed"  value="2001"/>
      <parameter key="encoding"  value="SYSTEM"/>
      <!--
        Reads the XML documents below .\train\news and .\train\porn; the list
        keys "news" and "porn" are the two class names. Three attributes are
        pulled out of each document with XPath: title, #numberOfPages, keywords.
        NOTE(review): title and keywords are free text, so they presumably end up
        as nominal (polynominal) attributes, which would match the reported
        "polynominal attributes not supported" error from the learner; confirm.
        NOTE(review): the leading '#' on numberOfPages presumably marks the
        attribute as numeric; confirm against the text plugin documentation.
      -->
      <operator name="Extractor" class="FeatureExtraction">
          <list key="texts">
            <parameter key="news"  value=".\train\news"/>
            <parameter key="porn"  value=".\train\porn"/>
          </list>
          <parameter key="default_content_type"  value=""/>
          <parameter key="default_content_encoding"  value="UTF-8"/>
          <parameter key="default_content_language"  value="english"/>
          <parameter key="use_content_attributes"  value="false"/>
          <parameter key="id_attribute_type"  value="long"/>
          <list key="attributes">
            <parameter key="title"  value="//*/title/text() "/>
            <parameter key="#numberOfPages"  value="//*/numberOfPages/text()"/>
            <parameter key="keywords"  value="//*/keywords/text()"/>
          </list>
          <list key="namespaces">
          </list>
      </operator>
      <!--
        Builds TFIDF word vectors from the same two directories and writes the
        resulting word list to .\train\training_words.txt. The inner operators
        tokenize, filter English stopwords, drop tokens shorter than 3
        characters and apply Porter stemming.
        NOTE(review): extend_exampleset=true presumably appends the word vector
        to the example set produced by Extractor instead of replacing it, so the
        extracted attributes would still reach the learner; confirm.
      -->
      <operator name="TextInput" class="TextInput" expanded="yes">
          <list key="texts">
            <parameter key="news"  value=".\train\news"/>
            <parameter key="porn"  value=".\train\porn"/>
          </list>
          <parameter key="default_content_type"  value=""/>
          <parameter key="default_content_encoding"  value="UTF-8"/>
          <parameter key="default_content_language"  value="english"/>
          <parameter key="prune_below"  value="-1"/>
          <parameter key="prune_above"  value="-1"/>
          <parameter key="vector_creation"  value="TFIDF"/>
          <parameter key="use_content_attributes"  value="false"/>
          <parameter key="use_given_word_list"  value="false"/>
          <parameter key="return_word_list"  value="true"/>
          <parameter key="output_word_list"  value=".\train\training_words.txt"/>
          <parameter key="id_attribute_type"  value="long"/>
          <list key="namespaces">
          </list>
          <parameter key="create_text_visualizer"  value="true"/>
          <parameter key="on_the_fly_pruning"  value="-1"/>
          <parameter key="extend_exampleset"  value="true"/>
          <operator name="StringTokenizer" class="StringTokenizer">
          </operator>
          <operator name="EnglishStopwordFilter" class="EnglishStopwordFilter">
          </operator>
          <operator name="TokenLengthFilter" class="TokenLengthFilter">
              <parameter key="min_chars"  value="3"/>
              <parameter key="max_chars"  value="2147483647"/>
          </operator>
          <operator name="PorterStemmer" class="PorterStemmer">
          </operator>
      </operator>
      <!--
        Trains a C-SVC support vector machine with a linear kernel.
        NOTE(review): calculate_confidences is false here, while the Java client
        further down prints per-class confidences; check whether those values
        are meaningful with this setting.
      -->
      <operator name="LibSVMLearner" class="LibSVMLearner">
          <parameter key="keep_example_set"  value="false"/>
          <parameter key="svm_type"  value="C-SVC"/>
          <parameter key="kernel_type"  value="linear"/>
          <parameter key="degree"  value="3"/>
          <parameter key="gamma"  value="0.0"/>
          <parameter key="coef0"  value="0.0"/>
          <parameter key="C"  value="0.0"/>
          <parameter key="nu"  value="0.5"/>
          <parameter key="cache_size"  value="80"/>
          <parameter key="epsilon"  value="0.0010"/>
          <parameter key="p"  value="0.1"/>
          <list key="class_weights">
          </list>
          <parameter key="shrinking"  value="true"/>
          <parameter key="calculate_confidences"  value="false"/>
          <parameter key="confidence_for_multiclass"  value="true"/>
      </operator>
      <!-- Stores the trained model in binary form, overwriting any existing file. -->
      <operator name="ModelWriter" class="ModelWriter">
          <parameter key="model_file"  value=".\train\training_model.mod"/>
          <parameter key="overwrite_existing_file"  value="true"/>
          <parameter key="output_type"  value="Binary"/>
      </operator>
  </operator>

</process>





-----------------------------------------------
JAVA CODE
-----------------------------------------------
import java.io.File;
import java.io.IOException;

import com.rapidminer.RapidMiner;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.tools.OperatorService;

public class RapidMinerTextClassifier
{

  private OperatorChain wvtoolOperator;
  private Operator modelApplier;
  private Model model;

  public RapidMinerTextClassifier(File modelFile, File wordListFile)
        throws IOException, OperatorCreationException, OperatorException
  {

      //System.setProperty(RapidMiner.PROPERTY_RAPIDMINER_HOME, "C:\\Program Files\\Rapid-I\\RapidMiner\\lib"); //  "rapidminer.home"
      //System.setProperty("rapidminer.home", "D:\\Applications\\RapidMiner-4.2");
      System.setProperty("rapidminer.home", "C:\\Program Files\\Rapid-I\\RapidMiner");
     
      String pluginDirString = new File("C:\\Program Files\\Rapid-I\\RapidMiner\\lib\\plugins").getAbsolutePath();
      System.setProperty(RapidMiner.PROPERTY_RAPIDMINER_INIT_PLUGINS_LOCATION, pluginDirString);

      RapidMiner.init(false, false, false, true);
     
      // Create the text input operator and set the path to the word list you stored using Rapid Miner
      // As there is only a single text, we use the SingleTextInput operator
      wvtoolOperator = (OperatorChain) OperatorService.createOperator("SingleTextInput"); // I need TextInput ?????????????
     
      wvtoolOperator.setParameter("input_word_list", wordListFile.getAbsolutePath());

      // Add additional processing steps.
      // Note the setup must be same as the one you used when creating the classification model
      wvtoolOperator.addOperator(OperatorService.createOperator("StringTokenizer"));
      wvtoolOperator.addOperator(OperatorService.createOperator("EnglishStopwordFilter"));
      wvtoolOperator.addOperator(OperatorService.createOperator("TokenLengthFilter"));
      wvtoolOperator.addOperator(OperatorService.createOperator("PorterStemmer"));

      // Create the model applier
      modelApplier = OperatorService.createOperator("ModelApplier");

      // Load the model into a field of the class
      Operator modelLoader = OperatorService.createOperator("ModelLoader");
      modelLoader.setParameter("model_file", modelFile.getAbsolutePath());
      IOContainer container = modelLoader.apply(new IOContainer());
      model = container.get(Model.class);

  }

  public String apply(String text) throws OperatorException
  {

      // Set the text
      wvtoolOperator.setParameter("text", text);     
      //wvtoolOperator.setParameter("title", text);
      //wvtoolOperator.setParameter("keywords", text);
      //wvtoolOperator.setParameter("numberOfPages", int);
     
     
      // Call the text input operator
      IOContainer container = wvtoolOperator.apply(new IOContainer(model));

      // Call the model applier (the model was added already before calling the text input)
      container = modelApplier.apply(container);

      // Obtain the example set from the io container. It contains only a single example with our text in it.
      ExampleSet eset = container.get(ExampleSet.class);
      Example e = eset.iterator().next();

      // Compare the predicted label with the positive label     
      System.out.println(eset.getAttributes().getPredictedLabel().getMapping() + " " + e.getConfidence("porn") + " " + e.getConfidence("news"));
      return eset.getAttributes().getPredictedLabel().getMapping().mapIndex( (int)e.getPredictedLabel() );

  }

  public static void main(String args[]) throws Exception
  {
     
      // Create a text classifier
      RapidMinerTextClassifier tr = new RapidMinerTextClassifier(
            new File(
                  "C:\\Main\\eclipse\\workspace\\octopus\\RapidMiner\\train\\training_model.mod"),
            new File(
                  "C:\\Main\\eclipse\\workspace\\octopus\\RapidMiner\\train\\training_words.txt"));

      // Call the classifier with texts
      System.out.println("Test1:" + tr.apply("povrai xflick resolution gif"));
      System.out.println("Test2:" + tr.apply("workstation intel switch"));
      System.out.println("Test3:" + tr.apply("sex porn sex povrai xflick resolution gif"));

  }

}

Find more posts tagged with