Deep Learning Extension: NullPointerException in a modified Word2Vec classification process
Hello, and congratulations on the good job of integrating DeepLearning4j into RM.
I have tried to slightly adapt the ready-made sample process "Classification of IMDB reviews using Word2Vec" so that it classifies BBC RSS feeds instead.
I have this error:
- Exception: java.lang.NullPointerException
- Message: null
- Stack trace:
- com.rapidminer.example.Example.getNominalValue(Example.java:97)
- com.rapidminer.extension.deeplearning.tools.LabeledTextProvider.nextSentence(LabeledTextProvider.java:45)
- org.deeplearning4j.iterator.CnnSentenceDataSetIterator.preLoadTokens(CnnSentenceDataSetIterator.java:211)
- org.deeplearning4j.iterator.CnnSentenceDataSetIterator.hasNext(CnnSentenceDataSetIterator.java:201)
- com.rapidminer.extension.deeplearning.ioobjects.TensorIOObject.<init>(TensorIOObject.java:62)
- com.rapidminer.extension.deeplearning.operators.WordEmbeddingOperator.doWork(WordEmbeddingOperator.java:118)
- com.rapidminer.operator.Operator.execute(Operator.java:1025)
- com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:77)
- com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:812)
- com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:807)
- java.security.AccessController.doPrivileged(Native Method)
- com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:807)
- com.rapidminer.operator.OperatorChain.doWork(OperatorChain.java:428)
- com.rapidminer.operator.Operator.execute(Operator.java:1025)
- com.rapidminer.Process.execute(Process.java:1322)
- com.rapidminer.Process.run(Process.java:1297)
- com.rapidminer.Process.run(Process.java:1183)
- com.rapidminer.Process.run(Process.java:1136)
- com.rapidminer.Process.run(Process.java:1131)
- com.rapidminer.Process.run(Process.java:1121)
- com.rapidminer.gui.ProcessThread.run(ProcessThread.java:65)
My process XML looks like this:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: two BBC RSS feeds (business, technology) are read,
  each item is labelled with a constant "class" value, both sets are merged,
  split 80/20, converted to tensors with a word2vec model file and used to
  train and evaluate a convolutional network.
-->
<process version="9.0.002">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="9.0.002" expanded="true" name="Process" origin="GENERATED_SAMPLE">
    <process expanded="true">
      <!-- Word2vec model file; Multiply hands the same file to both word-embedding operators. -->
      <operator activated="true" class="open_file" compatibility="9.0.002" expanded="true" height="68" name="Open File" origin="GENERATED_SAMPLE" width="90" x="45" y="748">
        <parameter key="filename" value="C:\Users\mmara\Downloads\GoogleNews-vectors-negative300.bin.gz"/>
      </operator>
      <operator activated="true" class="multiply" compatibility="9.0.002" expanded="true" height="103" name="Multiply" origin="GENERATED_SAMPLE" width="90" x="179" y="646"/>

      <!-- Branch 1: business feed. Keep only "Content", add class = "business", mark it as label. -->
      <operator activated="true" class="web:read_rss" compatibility="7.3.000" expanded="true" height="68" name="Read RSS Feed" width="90" x="45" y="34">
        <parameter key="url" value="http://feeds.bbci.co.uk/news/business/rss.xml"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="9.0.002" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="136">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Content"/>
      </operator>
      <operator activated="true" class="generate_attributes" compatibility="9.0.002" expanded="true" height="82" name="Generate Attributes" width="90" x="179" y="34">
        <list key="function_descriptions">
          <!-- The expression is the quoted string literal "business"; the inner quotes
               are written as &quot; so that the attribute value stays well-formed. -->
          <parameter key="class" value="&quot;business&quot;"/>
        </list>
      </operator>
      <operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role" width="90" x="179" y="136">
        <parameter key="attribute_name" value="class"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>

      <!-- Branch 2: technology feed. Same steps as branch 1 with class = "technology". -->
      <operator activated="true" class="web:read_rss" compatibility="7.3.000" expanded="true" height="68" name="Read RSS Feed (2)" width="90" x="45" y="340">
        <parameter key="url" value="http://feeds.bbci.co.uk/news/technology/rss.xml"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="9.0.002" expanded="true" height="82" name="Select Attributes (2)" width="90" x="45" y="442">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Content"/>
      </operator>
      <operator activated="true" class="generate_attributes" compatibility="9.0.002" expanded="true" height="82" name="Generate Attributes (2)" width="90" x="246" y="340">
        <list key="function_descriptions">
          <!-- Quoted string literal "technology"; inner quotes escaped as &quot;. -->
          <parameter key="class" value="&quot;technology&quot;"/>
        </list>
      </operator>
      <operator activated="true" class="set_role" compatibility="9.0.002" expanded="true" height="82" name="Set Role (2)" width="90" x="179" y="493">
        <parameter key="attribute_name" value="class"/>
        <parameter key="target_role" value="label"/>
        <list key="set_additional_roles"/>
      </operator>

      <!-- Merge both branches, convert "Content" to nominal, then split 80/20 (shuffled). -->
      <operator activated="true" class="union" compatibility="9.0.002" expanded="true" height="82" name="Union" width="90" x="112" y="238"/>
      <!-- breakpoints="after" pauses execution once this operator has run. -->
      <operator activated="true" breakpoints="after" class="text_to_nominal" compatibility="9.0.002" expanded="true" height="82" name="Text to Nominal" width="90" x="246" y="238">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="Content"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <operator activated="true" class="split_data" compatibility="9.0.002" expanded="true" height="103" name="Split Data" origin="GENERATED_SAMPLE" width="90" x="380" y="289">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.8"/>
          <parameter key="ratio" value="0.2"/>
        </enumeration>
        <parameter key="sampling_type" value="shuffled sampling"/>
      </operator>

      <!-- Word embedding: partition 1 becomes the training tensor, partition 2 the test tensor.
           NOTE(review): the reported NullPointerException is raised below
           WordEmbeddingOperator.doWork, presumably this operator class. Verify that
           text_attribute and label_attribute both name attributes that exist, with
           non-missing values, in the incoming example set. TODO confirm. -->
      <operator activated="true" class="deeplearning:dl4j_word_embedding" compatibility="0.8.000" expanded="true" height="82" name="Text to Numbers using Word2Vec" origin="GENERATED_SAMPLE" width="90" x="380" y="85">
        <parameter key="text_attribute" value="Content"/>
        <parameter key="label_attribute" value="class"/>
        <parameter key="max._sentence_length" value="10"/>
        <description align="center" color="transparent" colored="false" width="126">Convert training sentences to numbers.</description>
      </operator>
      <operator activated="true" class="deeplearning:dl4j_word_embedding" compatibility="0.8.000" expanded="true" height="82" name="Text to Numbers using Word2Vec (2)" origin="GENERATED_SAMPLE" width="90" x="447" y="493">
        <parameter key="text_attribute" value="Content"/>
        <parameter key="label_attribute" value="class"/>
        <parameter key="max._sentence_length" value="10"/>
        <description align="center" color="transparent" colored="false" width="126">Convert testing sentences to numbers.</description>
      </operator>

      <!-- Network: input shape is given explicitly (infer_input_shape = false).
           height 10 equals max._sentence_length above; width 300 presumably is the
           word-vector dimension of the model file (TODO confirm). -->
      <operator activated="true" class="deeplearning:dl4j_tensor_sequential_neural_network" compatibility="0.8.000" expanded="true" height="103" name="Deep Learning on Tensors" origin="GENERATED_SAMPLE" width="90" x="514" y="85">
        <parameter key="use_miniBatch" value="true"/>
        <parameter key="updater" value="Nesterovs"/>
        <parameter key="learning_rate" value="0.1"/>
        <parameter key="infer_input_shape" value="false"/>
        <parameter key="network_type" value="Convolutional"/>
        <parameter key="height" value="10"/>
        <parameter key="width" value="300"/>
        <parameter key="depth" value="1"/>
        <process expanded="true">
          <!-- NOTE(review): the description below mentions a "3, 300" kernel while
               kernel_size is "2.2"; one of the two looks stale. TODO confirm. -->
          <operator activated="true" class="deeplearning:dl4j_convolutional_layer" compatibility="0.8.000" expanded="true" height="68" name="Add Convolutional Layer" origin="GENERATED_SAMPLE" width="90" x="179" y="34">
            <parameter key="kernel_size" value="2.2"/>
            <parameter key="stride_size" value="1.1"/>
            <parameter key="layer_name" value="conv"/>
            <description align="center" color="transparent" colored="false" width="126">3, 300 Kernel --&gt; 3 regular kernel size; 300 number of dimensions from Googles word2vec model</description>
          </operator>
          <operator activated="true" class="deeplearning:dl4j_global_pooling_layer" compatibility="0.8.000" expanded="true" height="68" name="Add Global Pooling Layer" origin="GENERATED_SAMPLE" width="90" x="380" y="34"/>
          <operator activated="true" class="deeplearning:dl4j_dense_layer" compatibility="0.8.000" expanded="true" height="68" name="Add Dense Layer" origin="GENERATED_SAMPLE" width="90" x="581" y="34">
            <parameter key="number_of_neurons" value="2"/>
            <parameter key="activation_function" value="Softmax"/>
            <description align="center" color="transparent" colored="false" width="126">2 classes --&gt; 2 neurons with softmax</description>
          </operator>
          <!-- Layer chain: convolution, then global pooling, then dense softmax output. -->
          <connect from_port="layerArchitecture" to_op="Add Convolutional Layer" to_port="layerArchitecture"/>
          <connect from_op="Add Convolutional Layer" from_port="layerArchitecture" to_op="Add Global Pooling Layer" to_port="layerArchitecture"/>
          <connect from_op="Add Global Pooling Layer" from_port="layerArchitecture" to_op="Add Dense Layer" to_port="layerArchitecture"/>
          <connect from_op="Add Dense Layer" from_port="layerArchitecture" to_port="layerArchitecture"/>
          <portSpacing port="source_layerArchitecture" spacing="0"/>
          <portSpacing port="sink_layerArchitecture" spacing="0"/>
        </process>
      </operator>

      <!-- Apply the trained model to the test tensor and measure binominal performance. -->
      <operator activated="true" class="deeplearning:dl4j_apply_tensor_model" compatibility="0.8.000" expanded="true" height="82" name="Apply Model on Tensor" origin="GENERATED_SAMPLE" width="90" x="648" y="187"/>
      <operator activated="true" class="performance_binominal_classification" compatibility="9.0.002" expanded="true" height="82" name="Performance" origin="GENERATED_SAMPLE" width="90" x="782" y="187"/>

      <!-- Wiring: model file -->
      <connect from_op="Open File" from_port="file" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Text to Numbers using Word2Vec (2)" to_port="file with word2vec model"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Text to Numbers using Word2Vec" to_port="file with word2vec model"/>
      <!-- Wiring: business branch -->
      <connect from_op="Read RSS Feed" from_port="output" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Union" to_port="example set 1"/>
      <!-- Wiring: technology branch -->
      <connect from_op="Read RSS Feed (2)" from_port="output" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Generate Attributes (2)" to_port="example set input"/>
      <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Set Role (2)" to_port="example set input"/>
      <connect from_op="Set Role (2)" from_port="example set output" to_op="Union" to_port="example set 2"/>
      <!-- Wiring: merge, split, embed, train, apply, evaluate -->
      <connect from_op="Union" from_port="union" to_op="Text to Nominal" to_port="example set input"/>
      <connect from_op="Text to Nominal" from_port="example set output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Text to Numbers using Word2Vec" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Text to Numbers using Word2Vec (2)" to_port="example set"/>
      <connect from_op="Text to Numbers using Word2Vec" from_port="tensor" to_op="Deep Learning on Tensors" to_port="training set"/>
      <connect from_op="Text to Numbers using Word2Vec (2)" from_port="tensor" to_op="Apply Model on Tensor" to_port="unlabelled tensor"/>
      <connect from_op="Deep Learning on Tensors" from_port="model" to_op="Apply Model on Tensor" to_port="model"/>
      <connect from_op="Apply Model on Tensor" from_port="labeled data" to_op="Performance" to_port="labelled data"/>
      <connect from_op="Performance" from_port="performance" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
I am using the 1.2 GB pre-trained Google News word2vec file (GoogleNews-vectors-negative300.bin.gz) as the word-vector model.
Regards
Manolis