TEXT MINING HELP!!!!!!!

kersor
New Altair Community Member
HEY GUYS,
I use RapidMiner for text mining. When I try to put Transform Cases or Generate n-Grams in the process, the process doesn't run. I use Naive Bayes inside the validation, and without Transform Cases and Generate n-Grams I get only 67% performance. I need to get to 75%. What must I do to make it run without problems? Here is the XML of the process. If I delete Transform Cases, it runs and gets 68%. Sorry for the bad English, I hope you understand. Please help!
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process (version 5.1.006): builds a word vector from two text directories, then runs an x_validation operator with a Naive Bayes learner on the resulting example set. -->
<process version="5.1.006">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
<!-- NOTE(review): parallelize_main_process is enabled here, and further parallelize_* parameters are enabled on the nested operators below. If the process fails during concurrent execution, re-running with these set to false is a reasonable first check (unconfirmed). -->
<parameter key="parallelize_main_process" value="true"/>
<process expanded="true" height="396" width="660">
<!-- Reads documents from the "negative" and "positive" directories listed below; presumably the list keys become the class labels (confirm against the operator documentation). -->
<operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="75">
<list key="text_directories">
<parameter key="negative" value="C:\Users\Alkis\Desktop\DATA_MINIMG\Negative"/>
<parameter key="positive" value="C:\Users\Alkis\Desktop\DATA_MINIMG\positive"/>
</list>
<!-- Vector values are binary term occurrences; prune_above_percent is 99.0 (presumably drops terms that occur in more than 99% of the documents; confirm). -->
<parameter key="vector_creation" value="Binary Term Occurrences"/>
<parameter key="prune_above_percent" value="99.0"/>
<parameter key="parallelize_vector_creation" value="true"/>
<!-- Per-document subprocess: Tokenize, Transform Cases, Generate n-Grams (Terms), Filter Stopwords (Dictionary), wired in that order by the connect elements below. -->
<process expanded="true" height="396" width="815">
<operator activated="true" class="text:tokenize" compatibility="5.1.001" expanded="true" height="60" name="Tokenize" width="90" x="112" y="120"/>
<operator activated="true" class="text:transform_cases" compatibility="5.1.001" expanded="true" height="60" name="Transform Cases" width="90" x="299" y="124"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="5.1.001" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="413" y="121"/>
<!-- NOTE(review): the stopword filter runs after n-gram generation; presumably n-gram tokens that contain a stopword are then not removed. Confirm whether the filter should come before the n-gram step. -->
<operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.1.001" expanded="true" height="60" name="Filter Stopwords (Dictionary)" width="90" x="581" y="120">
<parameter key="file" value="C:\Users\Alkis\Desktop\DATA_MINIMG\stopwords_greek\stopwords_greek.txt"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
<connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<!-- Validation operator (class x_validation) with two subprocesses: the first trains a model, the second applies it and measures performance. -->
<operator activated="true" class="x_validation" compatibility="5.1.006" expanded="true" height="112" name="Validation" width="90" x="246" y="75">
<!-- NOTE(review): the description text mentions a decision tree, but the training subprocess below contains a Naive Bayes operator. -->
<description>A cross-validation evaluating a decision tree model.</description>
<parameter key="parallelize_training" value="true"/>
<parameter key="parallelize_testing" value="true"/>
<!-- Training subprocess: the "training" port feeds Naive Bayes, whose model is passed to the "model" port. -->
<process expanded="true" height="654" width="466">
<operator activated="true" class="naive_bayes" compatibility="5.1.006" expanded="true" height="76" name="Naive Bayes" width="90" x="160" y="196"/>
<connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
<connect from_op="Naive Bayes" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<!-- Testing subprocess: Apply Model receives the model and the test set; its labelled output goes to Performance, whose result is passed to "averagable 1". -->
<process expanded="true" height="654" width="466">
<operator activated="true" class="apply_model" compatibility="5.1.006" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance" compatibility="5.1.006" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<!-- Top-level wiring: process input 1 feeds the word list input of Process Documents from Files; its example set goes to Validation; the Validation result is delivered as result 1 and the word list as result 2. -->
<connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
<connect from_op="Process Documents from Files" from_port="example set" to_op="Validation" to_port="training"/>
<connect from_op="Process Documents from Files" from_port="word list" to_port="result 2"/>
<connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
I use RapidMiner for text mining. When I try to put Transform Cases or Generate n-Grams in the process, the process doesn't run. I use Naive Bayes inside the validation, and without Transform Cases and Generate n-Grams I get only 67% performance. I need to get to 75%. What must I do to make it run without problems? Here is the XML of the process. If I delete Transform Cases, it runs and gets 68%. Sorry for the bad English, I hope you understand. Please help!
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process (version 5.1.006): builds a word vector from two text directories, then runs an x_validation operator with a Naive Bayes learner on the resulting example set. -->
<process version="5.1.006">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="5.1.006" expanded="true" name="Process">
<!-- NOTE(review): parallelize_main_process is enabled here, and further parallelize_* parameters are enabled on the nested operators below. If the process fails during concurrent execution, re-running with these set to false is a reasonable first check (unconfirmed). -->
<parameter key="parallelize_main_process" value="true"/>
<process expanded="true" height="396" width="660">
<!-- Reads documents from the "negative" and "positive" directories listed below; presumably the list keys become the class labels (confirm against the operator documentation). -->
<operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="45" y="75">
<list key="text_directories">
<parameter key="negative" value="C:\Users\Alkis\Desktop\DATA_MINIMG\Negative"/>
<parameter key="positive" value="C:\Users\Alkis\Desktop\DATA_MINIMG\positive"/>
</list>
<!-- Vector values are binary term occurrences; prune_above_percent is 99.0 (presumably drops terms that occur in more than 99% of the documents; confirm). -->
<parameter key="vector_creation" value="Binary Term Occurrences"/>
<parameter key="prune_above_percent" value="99.0"/>
<parameter key="parallelize_vector_creation" value="true"/>
<!-- Per-document subprocess: Tokenize, Transform Cases, Generate n-Grams (Terms), Filter Stopwords (Dictionary), wired in that order by the connect elements below. -->
<process expanded="true" height="396" width="815">
<operator activated="true" class="text:tokenize" compatibility="5.1.001" expanded="true" height="60" name="Tokenize" width="90" x="112" y="120"/>
<operator activated="true" class="text:transform_cases" compatibility="5.1.001" expanded="true" height="60" name="Transform Cases" width="90" x="299" y="124"/>
<operator activated="true" class="text:generate_n_grams_terms" compatibility="5.1.001" expanded="true" height="60" name="Generate n-Grams (Terms)" width="90" x="413" y="121"/>
<!-- NOTE(review): the stopword filter runs after n-gram generation; presumably n-gram tokens that contain a stopword are then not removed. Confirm whether the filter should come before the n-gram step. -->
<operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.1.001" expanded="true" height="60" name="Filter Stopwords (Dictionary)" width="90" x="581" y="120">
<parameter key="file" value="C:\Users\Alkis\Desktop\DATA_MINIMG\stopwords_greek\stopwords_greek.txt"/>
</operator>
<connect from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Generate n-Grams (Terms)" to_port="document"/>
<connect from_op="Generate n-Grams (Terms)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
<connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<!-- Validation operator (class x_validation) with two subprocesses: the first trains a model, the second applies it and measures performance. -->
<operator activated="true" class="x_validation" compatibility="5.1.006" expanded="true" height="112" name="Validation" width="90" x="246" y="75">
<!-- NOTE(review): the description text mentions a decision tree, but the training subprocess below contains a Naive Bayes operator. -->
<description>A cross-validation evaluating a decision tree model.</description>
<parameter key="parallelize_training" value="true"/>
<parameter key="parallelize_testing" value="true"/>
<!-- Training subprocess: the "training" port feeds Naive Bayes, whose model is passed to the "model" port. -->
<process expanded="true" height="654" width="466">
<operator activated="true" class="naive_bayes" compatibility="5.1.006" expanded="true" height="76" name="Naive Bayes" width="90" x="160" y="196"/>
<connect from_port="training" to_op="Naive Bayes" to_port="training set"/>
<connect from_op="Naive Bayes" from_port="model" to_port="model"/>
<portSpacing port="source_training" spacing="0"/>
<portSpacing port="sink_model" spacing="0"/>
<portSpacing port="sink_through 1" spacing="0"/>
</process>
<!-- Testing subprocess: Apply Model receives the model and the test set; its labelled output goes to Performance, whose result is passed to "averagable 1". -->
<process expanded="true" height="654" width="466">
<operator activated="true" class="apply_model" compatibility="5.1.006" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
<list key="application_parameters"/>
</operator>
<operator activated="true" class="performance" compatibility="5.1.006" expanded="true" height="76" name="Performance" width="90" x="179" y="30"/>
<connect from_port="model" to_op="Apply Model" to_port="model"/>
<connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
<connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
<connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
<portSpacing port="source_model" spacing="0"/>
<portSpacing port="source_test set" spacing="0"/>
<portSpacing port="source_through 1" spacing="0"/>
<portSpacing port="sink_averagable 1" spacing="0"/>
<portSpacing port="sink_averagable 2" spacing="0"/>
</process>
</operator>
<!-- Top-level wiring: process input 1 feeds the word list input of Process Documents from Files; its example set goes to Validation; the Validation result is delivered as result 1 and the word list as result 2. -->
<connect from_port="input 1" to_op="Process Documents from Files" to_port="word list"/>
<connect from_op="Process Documents from Files" from_port="example set" to_op="Validation" to_port="training"/>
<connect from_op="Process Documents from Files" from_port="word list" to_port="result 2"/>
<connect from_op="Validation" from_port="averagable 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
Tagged:
0
Answers
-
Hi,
yeeees, transform cases always helps, anytime.
So what the heck is the error? You will have to give me some hints about what goes wrong before I can help you fix it. Your process looks fine.
Greetings,
Sebastian0 -
The log file when I run the process is:
May 17, 2011 6:46:41 PM INFO: Process //NewLocalRepository/TEST_ME)POLLA starts
May 17, 2011 6:46:41 PM INFO: Executing process concurrently: Main Process
May 17, 2011 6:46:41 PM INFO: Executing process concurrently: Vector Creation
May 17, 2011 6:46:41 PM INFO: Executing process concurrently: Vector Creation
May 17, 2011 6:46:41 PM INFO: Executing process concurrently: Vector Creation
May 17, 2011 6:46:41 PM WARNING: Caught exception in concurrent execution of Transform Cases (Transform Cases): java.lang.NullPointerException
May 17, 2011 6:46:41 PM WARNING: Caught exception in concurrent execution of Process Documents from Files (Process Documents from Files): java.lang.NullPointerException
And I get the same with Generate n-Grams:
May 17, 2011 6:51:58 PM INFO: Executing process concurrently: Vector Creation
May 17, 2011 6:51:58 PM WARNING: Caught exception in concurrent execution of Generate n-Grams (Terms) (Generate n-Grams (Terms)): java.lang.NullPointerException
May 17, 2011 6:51:58 PM WARNING: Caught exception in concurrent execution of Process Documents from Files (Process Documents from Files): java.lang.NullPointerException
I'm new to RapidMiner and I don't know how to fix it. If you know any other operators (like Transform Cases) that could help me get to 75% or 72%, please let me know.
Thanks for the reply...!!!!0 -
Hi,
please send me the stack trace of the error. There seems to be a problem, but without the stack trace I don't know where.
Greetings,
Sebastian0 -
Sorry, but I don't understand what you mean by the stack trace of the error. Do you mean the log file? I checked the process with articles written in English and it ran really well, without problems, so maybe the problem is caused by my use of Greek articles. If you want, give me an email address so that I can send you the exact articles, and you can check them and see what the problem is.
thanks again Sebasntian0