"FeatureExtraction from XML LibSVM Java"
Hi All,
First of all, I would like to thank the RapidMiner team for their great product!
Thanks a lot for the examples, the documentation and, of course, the wizards!
I would also like to thank Michael Wurst for the tutorial on his website (nemoz.org)!
----------------------------
I'm a student new to RapidMiner and I have an assignment to classify URLs.
I have read a lot of documentation and searched the forums, but I still have two problems (RapidMiner version 4.2).
I created one XML file per URL, containing that URL's features, and sorted the files into two folders (one per class):
.\train\news\www.news1.de.xml
.\train\news\www.news2.de.xml
.\train\porn\www.porn1.de.xml
.\train\porn\www.porn2.de.xml
each xml looks like:
<myXML>
<title> my title </title>
<keywords> my keywords </keywords>
<numberOfPages> 6 </numberOfPages>
</myXML>
----------------------------
1. When I run the process file (below) in RapidMiner with LibSVM, it fails with:
"Message: This learning scheme does not have sufficient capabilities for the given data set: polynominal attributes not supported"
I tried to use the "06_ExtractionAndWordVecotor.xml" example - but it gave me the same error.
2. I tried to load the model from Java, but I cannot work out how to pass the extracted features (title, keywords, numberOfPages) instead of a single whole text.
(Should I use TextInput instead of SingleTextInput?) The simple example works, but without the features.
I would really appreciate your help !
Thanks a lot for everything !!
Jorno
---------------------------------------------
RAPID MINER CONFIGURATION FILE
---------------------------------------------
<?xml version="1.0" encoding="windows-1252"?>
<!-- Training process. Operators run in the order listed: XPath feature extraction,
     TF-IDF word vector creation, LibSVM training, and writing the model to disk. -->
<process version="4.4">
<operator name="Root" class="Process" expanded="yes">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="encoding" value="SYSTEM"/>
<!-- Extractor: reads the two directories listed under "texts" (list keys "news" and
     "porn") and evaluates the XPath expressions listed under "attributes".
     NOTE(review): "title" and "keywords" are presumably extracted as nominal
     attributes, which would explain the learner's "polynominal attributes not
     supported" message; confirm, then convert or remove them before LibSVMLearner.
     NOTE(review): the leading '#' in "#numberOfPages" presumably marks a numeric
     attribute; confirm against the plugin documentation.
     NOTE(review): the "title" XPath value ends with a trailing space. -->
<operator name="Extractor" class="FeatureExtraction">
<list key="texts">
<parameter key="news" value=".\train\news"/>
<parameter key="porn" value=".\train\porn"/>
</list>
<parameter key="default_content_type" value=""/>
<parameter key="default_content_encoding" value="UTF-8"/>
<parameter key="default_content_language" value="english"/>
<parameter key="use_content_attributes" value="false"/>
<parameter key="id_attribute_type" value="long"/>
<list key="attributes">
<parameter key="title" value="//*/title/text() "/>
<parameter key="#numberOfPages" value="//*/numberOfPages/text()"/>
<parameter key="keywords" value="//*/keywords/text()"/>
</list>
<list key="namespaces">
</list>
</operator>
<!-- TextInput: reads the same two directories, builds TFIDF vectors and writes the
     word list to .\train\training_words.txt. The inner operators tokenize, filter
     English stop words, keep tokens of at least 3 characters and apply Porter stemming.
     NOTE(review): with extend_exampleset="true" the word vector is presumably appended
     to the example set produced by Extractor; confirm. -->
<operator name="TextInput" class="TextInput" expanded="yes">
<list key="texts">
<parameter key="news" value=".\train\news"/>
<parameter key="porn" value=".\train\porn"/>
</list>
<parameter key="default_content_type" value=""/>
<parameter key="default_content_encoding" value="UTF-8"/>
<parameter key="default_content_language" value="english"/>
<parameter key="prune_below" value="-1"/>
<parameter key="prune_above" value="-1"/>
<parameter key="vector_creation" value="TFIDF"/>
<parameter key="use_content_attributes" value="false"/>
<parameter key="use_given_word_list" value="false"/>
<parameter key="return_word_list" value="true"/>
<parameter key="output_word_list" value=".\train\training_words.txt"/>
<parameter key="id_attribute_type" value="long"/>
<list key="namespaces">
</list>
<parameter key="create_text_visualizer" value="true"/>
<parameter key="on_the_fly_pruning" value="-1"/>
<parameter key="extend_exampleset" value="true"/>
<operator name="StringTokenizer" class="StringTokenizer">
</operator>
<operator name="EnglishStopwordFilter" class="EnglishStopwordFilter">
</operator>
<operator name="TokenLengthFilter" class="TokenLengthFilter">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="2147483647"/>
</operator>
<operator name="PorterStemmer" class="PorterStemmer">
</operator>
</operator>
<!-- LibSVMLearner: C-SVC with a linear kernel.
     NOTE(review): calculate_confidences is "false" here, while the Java client below
     prints e.getConfidence(...); those values may not be meaningful. Confirm. -->
<operator name="LibSVMLearner" class="LibSVMLearner">
<parameter key="keep_example_set" value="false"/>
<parameter key="svm_type" value="C-SVC"/>
<parameter key="kernel_type" value="linear"/>
<parameter key="degree" value="3"/>
<parameter key="gamma" value="0.0"/>
<parameter key="coef0" value="0.0"/>
<parameter key="C" value="0.0"/>
<parameter key="nu" value="0.5"/>
<parameter key="cache_size" value="80"/>
<parameter key="epsilon" value="0.0010"/>
<parameter key="p" value="0.1"/>
<list key="class_weights">
</list>
<parameter key="shrinking" value="true"/>
<parameter key="calculate_confidences" value="false"/>
<parameter key="confidence_for_multiclass" value="true"/>
</operator>
<!-- ModelWriter: stores the trained model at .\train\training_model.mod (binary output),
     overwriting an existing file. -->
<operator name="ModelWriter" class="ModelWriter">
<parameter key="model_file" value=".\train\training_model.mod"/>
<parameter key="overwrite_existing_file" value="true"/>
<parameter key="output_type" value="Binary"/>
</operator>
</operator>
</process>
-----------------------------------------------
JAVA CODE
-----------------------------------------------
import java.io.File;
import java.io.IOException;
import com.rapidminer.RapidMiner;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.tools.OperatorService;
/**
 * Classifies single texts with a model that was trained and stored by a RapidMiner process.
 *
 * <p>The constructor initializes RapidMiner, builds a {@code SingleTextInput} operator chain
 * that uses a stored word list, and loads the stored model. {@link #apply(String)} then runs
 * one text through that chain and the model applier and returns the predicted label.
 *
 * <p>NOTE(review): only one plain text is passed in (parameter {@code "text"}); the XPath
 * features of the training process (title, keywords, numberOfPages) are not supplied here.
 */
public class RapidMinerTextClassifier
{
    /** System property that names the RapidMiner installation directory. */
    private static final String HOME_PROPERTY = "rapidminer.home";

    /** Installation directory used only when {@link #HOME_PROPERTY} is not already set. */
    private static final String DEFAULT_RAPIDMINER_HOME = "C:\\Program Files\\Rapid-I\\RapidMiner";

    private final OperatorChain wvtoolOperator;
    private final Operator modelApplier;
    private final Model model;

    /**
     * Initializes RapidMiner and prepares the text operator chain, model applier and model.
     *
     * @param modelFile    model file written by the training process
     * @param wordListFile word list written by the training process
     * @throws NullPointerException      if either file argument is {@code null}
     * @throws IOException               declared for callers; kept from the original signature
     * @throws OperatorCreationException if one of the operators cannot be created
     * @throws OperatorException         if loading the model fails
     */
    public RapidMinerTextClassifier(File modelFile, File wordListFile)
            throws IOException, OperatorCreationException, OperatorException
    {
        // Fail before the expensive RapidMiner initialization instead of afterwards.
        if (modelFile == null) {
            throw new NullPointerException("modelFile");
        }
        if (wordListFile == null) {
            throw new NullPointerException("wordListFile");
        }

        configureRapidMinerLocation();
        // Flags are kept exactly as in the original setup.
        RapidMiner.init(false, false, false, true);

        // Text input operator for a single text, using the word list stored during training.
        wvtoolOperator = (OperatorChain) OperatorService.createOperator("SingleTextInput");
        wvtoolOperator.setParameter("input_word_list", wordListFile.getAbsolutePath());

        // The processing steps must match the ones used when the model was trained.
        wvtoolOperator.addOperator(OperatorService.createOperator("StringTokenizer"));
        wvtoolOperator.addOperator(OperatorService.createOperator("EnglishStopwordFilter"));
        wvtoolOperator.addOperator(OperatorService.createOperator("TokenLengthFilter"));
        wvtoolOperator.addOperator(OperatorService.createOperator("PorterStemmer"));

        modelApplier = OperatorService.createOperator("ModelApplier");

        // Load the model once and keep it for all later apply() calls.
        Operator modelLoader = OperatorService.createOperator("ModelLoader");
        modelLoader.setParameter("model_file", modelFile.getAbsolutePath());
        IOContainer loaded = modelLoader.apply(new IOContainer());
        model = loaded.get(Model.class);
    }

    /**
     * Sets the RapidMiner home and plugin location system properties unless the caller
     * (for example via {@code -Drapidminer.home=...}) has already provided them.
     */
    private static void configureRapidMinerLocation()
    {
        String home = System.getProperty(HOME_PROPERTY);
        if (home == null) {
            home = DEFAULT_RAPIDMINER_HOME;
            System.setProperty(HOME_PROPERTY, home);
        }
        if (System.getProperty(RapidMiner.PROPERTY_RAPIDMINER_INIT_PLUGINS_LOCATION) == null) {
            // Derived from the effective home so both settings always point to one installation.
            File pluginDir = new File(new File(home, "lib"), "plugins");
            System.setProperty(RapidMiner.PROPERTY_RAPIDMINER_INIT_PLUGINS_LOCATION,
                    pluginDir.getAbsolutePath());
        }
    }

    /**
     * Classifies one text.
     *
     * @param text the text to classify
     * @return the predicted label, mapped from its index to its name
     * @throws OperatorException                if an operator fails
     * @throws java.util.NoSuchElementException if the operators produce no example
     */
    public String apply(String text) throws OperatorException
    {
        wvtoolOperator.setParameter("text", text);

        // The model travels through the text input operator so the applier finds it afterwards.
        IOContainer container = wvtoolOperator.apply(new IOContainer(model));
        container = modelApplier.apply(container);

        ExampleSet eset = container.get(ExampleSet.class);
        java.util.Iterator<Example> examples = eset.iterator();
        if (!examples.hasNext()) {
            throw new java.util.NoSuchElementException(
                    "Model application produced an empty example set");
        }
        Example e = examples.next();

        // Diagnostic output: label mapping and the confidences of both classes.
        System.out.println(eset.getAttributes().getPredictedLabel().getMapping() + " " + e.getConfidence("porn") + " " + e.getConfidence("news"));
        return eset.getAttributes().getPredictedLabel().getMapping().mapIndex((int) e.getPredictedLabel());
    }

    /**
     * Demo entry point: loads the stored model and word list and classifies three sample texts.
     */
    public static void main(String[] args) throws Exception
    {
        RapidMinerTextClassifier tr = new RapidMinerTextClassifier(
                new File(
                        "C:\\Main\\eclipse\\workspace\\octopus\\RapidMiner\\train\\training_model.mod"),
                new File(
                        "C:\\Main\\eclipse\\workspace\\octopus\\RapidMiner\\train\\training_words.txt"));

        System.out.println("Test1:" + tr.apply("povrai xflick resolution gif"));
        System.out.println("Test2:" + tr.apply("workstation intel switch"));
        System.out.println("Test3:" + tr.apply("sex porn sex povrai xflick resolution gif"));
    }
}
First of all, I would like to thank the RapidMiner team for their great product!
Thanks a lot for the examples, the documentation and, of course, the wizards!
I would also like to thank Michael Wurst for the tutorial on his website (nemoz.org)!
----------------------------
I'm a student new to RapidMiner and I have an assignment to classify URLs.
I have read a lot of documentation and searched the forums, but I still have two problems (RapidMiner version 4.2).
I created one XML file per URL, containing that URL's features, and sorted the files into two folders (one per class):
.\train\news\www.news1.de.xml
.\train\news\www.news2.de.xml
.\train\porn\www.porn1.de.xml
.\train\porn\www.porn2.de.xml
each xml looks like:
<myXML>
<title> my title </title>
<keywords> my keywords </keywords>
<numberOfPages> 6 </numberOfPages>
</myXML>
----------------------------
1. When I run the process file (below) in RapidMiner with LibSVM, it fails with:
"Message: This learning scheme does not have sufficient capabilities for the given data set: polynominal attributes not supported"
I tried to use the "06_ExtractionAndWordVecotor.xml" example - but it gave me the same error.
2. I tried to load the model from Java, but I cannot work out how to pass the extracted features (title, keywords, numberOfPages) instead of a single whole text.
(Should I use TextInput instead of SingleTextInput?) The simple example works, but without the features.
I would really appreciate your help !
Thanks a lot for everything !!
Jorno
---------------------------------------------
RAPID MINER CONFIGURATION FILE
---------------------------------------------
<?xml version="1.0" encoding="windows-1252"?>
<!-- Training process. Operators run in the order listed: XPath feature extraction,
     TF-IDF word vector creation, LibSVM training, and writing the model to disk. -->
<process version="4.4">
<operator name="Root" class="Process" expanded="yes">
<parameter key="logverbosity" value="init"/>
<parameter key="random_seed" value="2001"/>
<parameter key="encoding" value="SYSTEM"/>
<!-- Extractor: reads the two directories listed under "texts" (list keys "news" and
     "porn") and evaluates the XPath expressions listed under "attributes".
     NOTE(review): "title" and "keywords" are presumably extracted as nominal
     attributes, which would explain the learner's "polynominal attributes not
     supported" message; confirm, then convert or remove them before LibSVMLearner.
     NOTE(review): the leading '#' in "#numberOfPages" presumably marks a numeric
     attribute; confirm against the plugin documentation.
     NOTE(review): the "title" XPath value ends with a trailing space. -->
<operator name="Extractor" class="FeatureExtraction">
<list key="texts">
<parameter key="news" value=".\train\news"/>
<parameter key="porn" value=".\train\porn"/>
</list>
<parameter key="default_content_type" value=""/>
<parameter key="default_content_encoding" value="UTF-8"/>
<parameter key="default_content_language" value="english"/>
<parameter key="use_content_attributes" value="false"/>
<parameter key="id_attribute_type" value="long"/>
<list key="attributes">
<parameter key="title" value="//*/title/text() "/>
<parameter key="#numberOfPages" value="//*/numberOfPages/text()"/>
<parameter key="keywords" value="//*/keywords/text()"/>
</list>
<list key="namespaces">
</list>
</operator>
<!-- TextInput: reads the same two directories, builds TFIDF vectors and writes the
     word list to .\train\training_words.txt. The inner operators tokenize, filter
     English stop words, keep tokens of at least 3 characters and apply Porter stemming.
     NOTE(review): with extend_exampleset="true" the word vector is presumably appended
     to the example set produced by Extractor; confirm. -->
<operator name="TextInput" class="TextInput" expanded="yes">
<list key="texts">
<parameter key="news" value=".\train\news"/>
<parameter key="porn" value=".\train\porn"/>
</list>
<parameter key="default_content_type" value=""/>
<parameter key="default_content_encoding" value="UTF-8"/>
<parameter key="default_content_language" value="english"/>
<parameter key="prune_below" value="-1"/>
<parameter key="prune_above" value="-1"/>
<parameter key="vector_creation" value="TFIDF"/>
<parameter key="use_content_attributes" value="false"/>
<parameter key="use_given_word_list" value="false"/>
<parameter key="return_word_list" value="true"/>
<parameter key="output_word_list" value=".\train\training_words.txt"/>
<parameter key="id_attribute_type" value="long"/>
<list key="namespaces">
</list>
<parameter key="create_text_visualizer" value="true"/>
<parameter key="on_the_fly_pruning" value="-1"/>
<parameter key="extend_exampleset" value="true"/>
<operator name="StringTokenizer" class="StringTokenizer">
</operator>
<operator name="EnglishStopwordFilter" class="EnglishStopwordFilter">
</operator>
<operator name="TokenLengthFilter" class="TokenLengthFilter">
<parameter key="min_chars" value="3"/>
<parameter key="max_chars" value="2147483647"/>
</operator>
<operator name="PorterStemmer" class="PorterStemmer">
</operator>
</operator>
<!-- LibSVMLearner: C-SVC with a linear kernel.
     NOTE(review): calculate_confidences is "false" here, while the Java client below
     prints e.getConfidence(...); those values may not be meaningful. Confirm. -->
<operator name="LibSVMLearner" class="LibSVMLearner">
<parameter key="keep_example_set" value="false"/>
<parameter key="svm_type" value="C-SVC"/>
<parameter key="kernel_type" value="linear"/>
<parameter key="degree" value="3"/>
<parameter key="gamma" value="0.0"/>
<parameter key="coef0" value="0.0"/>
<parameter key="C" value="0.0"/>
<parameter key="nu" value="0.5"/>
<parameter key="cache_size" value="80"/>
<parameter key="epsilon" value="0.0010"/>
<parameter key="p" value="0.1"/>
<list key="class_weights">
</list>
<parameter key="shrinking" value="true"/>
<parameter key="calculate_confidences" value="false"/>
<parameter key="confidence_for_multiclass" value="true"/>
</operator>
<!-- ModelWriter: stores the trained model at .\train\training_model.mod (binary output),
     overwriting an existing file. -->
<operator name="ModelWriter" class="ModelWriter">
<parameter key="model_file" value=".\train\training_model.mod"/>
<parameter key="overwrite_existing_file" value="true"/>
<parameter key="output_type" value="Binary"/>
</operator>
</operator>
</process>
-----------------------------------------------
JAVA CODE
-----------------------------------------------
import java.io.File;
import java.io.IOException;
import com.rapidminer.RapidMiner;
import com.rapidminer.example.Example;
import com.rapidminer.example.ExampleSet;
import com.rapidminer.operator.IOContainer;
import com.rapidminer.operator.Model;
import com.rapidminer.operator.Operator;
import com.rapidminer.operator.OperatorChain;
import com.rapidminer.operator.OperatorCreationException;
import com.rapidminer.operator.OperatorException;
import com.rapidminer.tools.OperatorService;
/**
 * Classifies single texts with a model that was trained and stored by a RapidMiner process.
 *
 * <p>The constructor initializes RapidMiner, builds a {@code SingleTextInput} operator chain
 * that uses a stored word list, and loads the stored model. {@link #apply(String)} then runs
 * one text through that chain and the model applier and returns the predicted label.
 *
 * <p>NOTE(review): only one plain text is passed in (parameter {@code "text"}); the XPath
 * features of the training process (title, keywords, numberOfPages) are not supplied here.
 */
public class RapidMinerTextClassifier
{
    /** System property that names the RapidMiner installation directory. */
    private static final String HOME_PROPERTY = "rapidminer.home";

    /** Installation directory used only when {@link #HOME_PROPERTY} is not already set. */
    private static final String DEFAULT_RAPIDMINER_HOME = "C:\\Program Files\\Rapid-I\\RapidMiner";

    private final OperatorChain wvtoolOperator;
    private final Operator modelApplier;
    private final Model model;

    /**
     * Initializes RapidMiner and prepares the text operator chain, model applier and model.
     *
     * @param modelFile    model file written by the training process
     * @param wordListFile word list written by the training process
     * @throws NullPointerException      if either file argument is {@code null}
     * @throws IOException               declared for callers; kept from the original signature
     * @throws OperatorCreationException if one of the operators cannot be created
     * @throws OperatorException         if loading the model fails
     */
    public RapidMinerTextClassifier(File modelFile, File wordListFile)
            throws IOException, OperatorCreationException, OperatorException
    {
        // Fail before the expensive RapidMiner initialization instead of afterwards.
        if (modelFile == null) {
            throw new NullPointerException("modelFile");
        }
        if (wordListFile == null) {
            throw new NullPointerException("wordListFile");
        }

        configureRapidMinerLocation();
        // Flags are kept exactly as in the original setup.
        RapidMiner.init(false, false, false, true);

        // Text input operator for a single text, using the word list stored during training.
        wvtoolOperator = (OperatorChain) OperatorService.createOperator("SingleTextInput");
        wvtoolOperator.setParameter("input_word_list", wordListFile.getAbsolutePath());

        // The processing steps must match the ones used when the model was trained.
        wvtoolOperator.addOperator(OperatorService.createOperator("StringTokenizer"));
        wvtoolOperator.addOperator(OperatorService.createOperator("EnglishStopwordFilter"));
        wvtoolOperator.addOperator(OperatorService.createOperator("TokenLengthFilter"));
        wvtoolOperator.addOperator(OperatorService.createOperator("PorterStemmer"));

        modelApplier = OperatorService.createOperator("ModelApplier");

        // Load the model once and keep it for all later apply() calls.
        Operator modelLoader = OperatorService.createOperator("ModelLoader");
        modelLoader.setParameter("model_file", modelFile.getAbsolutePath());
        IOContainer loaded = modelLoader.apply(new IOContainer());
        model = loaded.get(Model.class);
    }

    /**
     * Sets the RapidMiner home and plugin location system properties unless the caller
     * (for example via {@code -Drapidminer.home=...}) has already provided them.
     */
    private static void configureRapidMinerLocation()
    {
        String home = System.getProperty(HOME_PROPERTY);
        if (home == null) {
            home = DEFAULT_RAPIDMINER_HOME;
            System.setProperty(HOME_PROPERTY, home);
        }
        if (System.getProperty(RapidMiner.PROPERTY_RAPIDMINER_INIT_PLUGINS_LOCATION) == null) {
            // Derived from the effective home so both settings always point to one installation.
            File pluginDir = new File(new File(home, "lib"), "plugins");
            System.setProperty(RapidMiner.PROPERTY_RAPIDMINER_INIT_PLUGINS_LOCATION,
                    pluginDir.getAbsolutePath());
        }
    }

    /**
     * Classifies one text.
     *
     * @param text the text to classify
     * @return the predicted label, mapped from its index to its name
     * @throws OperatorException                if an operator fails
     * @throws java.util.NoSuchElementException if the operators produce no example
     */
    public String apply(String text) throws OperatorException
    {
        wvtoolOperator.setParameter("text", text);

        // The model travels through the text input operator so the applier finds it afterwards.
        IOContainer container = wvtoolOperator.apply(new IOContainer(model));
        container = modelApplier.apply(container);

        ExampleSet eset = container.get(ExampleSet.class);
        java.util.Iterator<Example> examples = eset.iterator();
        if (!examples.hasNext()) {
            throw new java.util.NoSuchElementException(
                    "Model application produced an empty example set");
        }
        Example e = examples.next();

        // Diagnostic output: label mapping and the confidences of both classes.
        System.out.println(eset.getAttributes().getPredictedLabel().getMapping() + " " + e.getConfidence("porn") + " " + e.getConfidence("news"));
        return eset.getAttributes().getPredictedLabel().getMapping().mapIndex((int) e.getPredictedLabel());
    }

    /**
     * Demo entry point: loads the stored model and word list and classifies three sample texts.
     */
    public static void main(String[] args) throws Exception
    {
        RapidMinerTextClassifier tr = new RapidMinerTextClassifier(
                new File(
                        "C:\\Main\\eclipse\\workspace\\octopus\\RapidMiner\\train\\training_model.mod"),
                new File(
                        "C:\\Main\\eclipse\\workspace\\octopus\\RapidMiner\\train\\training_words.txt"));

        System.out.println("Test1:" + tr.apply("povrai xflick resolution gif"));
        System.out.println("Test2:" + tr.apply("workstation intel switch"));
        System.out.println("Test3:" + tr.apply("sex porn sex povrai xflick resolution gif"));
    }
}