Process failed abnormally in web mining
Hi, I have a web mining process in which some of the links are incorrect, so I use Handle Exception. Because Loop Examples is not working, the process is a bit messy:
<?xml version="1.0" encoding="UTF-8"?>
<!--
  RapidMiner process: fetch the web page behind every non-missing "link" value
  of the stored example set, extract the HTML text content, and join the result
  back onto the original rows before storing it.

  Main flow (see the connect elements at the end of the outer process):
    Retrieve joined, Generate ID, Multiply
      output 1: left input of Join
      output 2: Select Attributes, Filter Examples, Extract Macro (2), Loop,
                Append, Filter Examples (2), Process Documents from Data,
                Parse Numbers, Set Role, right input of Join
    Join, Select Attributes (2), Store, result 1
-->
<process version="8.2.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.2.000" expanded="true" name="Process">
    <process expanded="true">

      <!-- Load the source data and give every row an id. -->
      <operator activated="true" class="retrieve" compatibility="8.2.000" expanded="true" height="68" name="Retrieve joined" width="90" x="45" y="34">
        <parameter key="repository_entry" value="../data/sources/joined"/>
      </operator>
      <operator activated="true" class="generate_id" compatibility="8.2.000" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34"/>
      <operator activated="true" class="multiply" compatibility="8.2.000" expanded="true" height="103" name="Multiply" width="90" x="179" y="238"/>

      <!-- Branch 2: select the single attribute "link" and drop rows where it is missing. -->
      <operator activated="true" class="select_attributes" compatibility="8.2.000" expanded="true" height="82" name="Select Attributes" width="90" x="313" y="442">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="link"/>
      </operator>
      <operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples" width="90" x="447" y="442">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="link.is_not_missing."/>
        </list>
      </operator>

      <!-- Deactivated, and not referenced by any connect element of this process. -->
      <operator activated="false" class="split_data" compatibility="8.2.000" expanded="true" height="68" name="Split Data" width="90" x="581" y="442">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.2"/>
          <parameter key="ratio" value="0.8"/>
        </enumeration>
        <description align="center" color="transparent" colored="false" width="126">Downsampled!</description>
      </operator>

      <!--
        Defines the macro "number_examples", which the Loop below uses as its
        iteration count. macro_type is not set here, so the operator default
        applies (presumably the number of examples; confirm).
      -->
      <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (2)" width="90" x="715" y="442">
        <parameter key="macro" value="number_examples"/>
        <list key="additional_macros"/>
      </operator>

      <!-- Sequential loop (parallel execution disabled): one iteration per link row. -->
      <operator activated="true" class="concurrency:loop" compatibility="8.2.000" expanded="true" height="82" name="Loop" width="90" x="514" y="697">
        <parameter key="number_of_iterations" value="%{number_examples}"/>
        <parameter key="enable_parallel_execution" value="false"/>
        <process expanded="true">

          <!-- Keep only the row whose position equals the current iteration macro. -->
          <operator activated="true" class="filter_example_range" compatibility="8.2.000" expanded="true" height="82" name="Filter Example Range" width="90" x="112" y="34">
            <parameter key="first_example" value="%{iteration}"/>
            <parameter key="last_example" value="%{iteration}"/>
          </operator>

          <!-- Copy that row's "link" value into macro "link" and its "id" value into macro "id". -->
          <operator activated="true" class="extract_macro" compatibility="8.2.000" expanded="true" height="68" name="Extract Macro (3)" width="90" x="246" y="34">
            <parameter key="macro" value="link"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="attribute_name" value="link"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros">
              <parameter key="id" value="id"/>
            </list>
          </operator>

          <!--
            NOTE(review): no connect element in this loop body feeds an input
            port of Handle Exception, and the output of Extract Macro (3) is
            not connected onward. The second subprocess below only forwards
            "in 1" to "out 1", so it appears to have nothing to forward when it
            runs. Documents to Data may then receive no document for a failing
            link; possibly related to the reported failure in that operator.
            Confirm with a breakpoint after Handle Exception.
          -->
          <operator activated="true" class="handle_exception" compatibility="8.2.000" expanded="true" height="82" name="Handle Exception" width="90" x="380" y="34">
            <!-- First subprocess: fetch the page at the URL held in macro "link". -->
            <process expanded="true">
              <operator activated="true" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="112" y="34">
                <parameter key="url" value="%{link}"/>
                <list key="query_parameters"/>
                <list key="request_properties"/>
              </operator>
              <connect from_op="Get Page" from_port="output" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
            <!-- Second subprocess: plain pass-through (presumably the fallback used when the first one fails). -->
            <process expanded="true">
              <connect from_port="in 1" to_port="out 1"/>
              <portSpacing port="source_in 1" spacing="0"/>
              <portSpacing port="source_in 2" spacing="0"/>
              <portSpacing port="sink_out 1" spacing="0"/>
              <portSpacing port="sink_out 2" spacing="0"/>
            </process>
          </operator>

          <!-- Turn the fetched document into an example set and tag it with the id macro. -->
          <operator activated="true" class="text:documents_to_data" compatibility="8.1.000" expanded="true" height="82" name="Documents to Data" width="90" x="514" y="34">
            <parameter key="text_attribute" value="Text"/>
          </operator>
          <operator activated="true" class="generate_attributes" compatibility="8.2.000" expanded="true" height="82" name="Generate Attributes" width="90" x="648" y="34">
            <list key="function_descriptions">
              <parameter key="id" value="%{id}"/>
            </list>
          </operator>

          <connect from_port="input 1" to_op="Filter Example Range" to_port="example set input"/>
          <connect from_op="Filter Example Range" from_port="example set output" to_op="Extract Macro (3)" to_port="example set"/>
          <connect from_op="Handle Exception" from_port="out 1" to_op="Documents to Data" to_port="documents 1"/>
          <connect from_op="Documents to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
          <connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="source_input 2" spacing="0"/>
          <portSpacing port="sink_output 1" spacing="0"/>
          <portSpacing port="sink_output 2" spacing="0"/>
        </process>
      </operator>

      <!-- Merge the per-iteration results and keep only rows with Response-Code equal to 200. -->
      <operator activated="true" class="append" compatibility="8.2.000" expanded="true" height="82" name="Append" width="90" x="648" y="697"/>
      <operator activated="true" class="filter_examples" compatibility="8.2.000" expanded="true" height="103" name="Filter Examples (2)" width="90" x="782" y="697">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="Response-Code.eq.200"/>
        </list>
      </operator>

      <!-- Run Extract Content on the "Text" attribute; no word vector is created and the text is kept. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="8.1.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="916" y="697">
        <parameter key="create_word_vector" value="false"/>
        <parameter key="keep_text" value="true"/>
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="Text" value="1.0"/>
        </list>
        <process expanded="true">
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content (2)" width="90" x="313" y="34"/>
          <connect from_port="document" to_op="Extract Content (2)" to_port="document"/>
          <connect from_op="Extract Content (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Parse the "id" attribute as a number and give it the id role before it enters the join. -->
      <operator activated="true" class="parse_numbers" compatibility="8.2.000" expanded="true" height="82" name="Parse Numbers" width="90" x="1050" y="697">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="id"/>
      </operator>
      <operator activated="true" class="set_role" compatibility="8.2.000" expanded="true" height="82" name="Set Role" width="90" x="1184" y="697">
        <parameter key="attribute_name" value="id"/>
        <parameter key="target_role" value="id"/>
        <list key="set_additional_roles"/>
      </operator>

      <!-- Left join: original rows on the left, scraped text on the right; then trim columns and store. -->
      <operator activated="true" class="concurrency:join" compatibility="8.2.000" expanded="true" height="82" name="Join" width="90" x="916" y="85">
        <parameter key="join_type" value="left"/>
        <list key="key_attributes"/>
      </operator>
      <operator activated="true" class="select_attributes" compatibility="8.2.000" expanded="true" height="82" name="Select Attributes (2)" width="90" x="1050" y="85">
        <parameter key="attribute_filter_type" value="subset"/>
        <parameter key="attributes" value="timestamp|text|message|link|excerpt"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>
      <operator activated="true" class="store" compatibility="8.2.000" expanded="true" height="68" name="Store" width="90" x="1184" y="85">
        <parameter key="repository_entry" value="../data/sources/joined with text from links"/>
      </operator>

      <!-- Wiring of the outer process. -->
      <connect from_op="Retrieve joined" from_port="output" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Join" to_port="left"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Extract Macro (2)" to_port="example set"/>
      <connect from_op="Extract Macro (2)" from_port="example set" to_op="Loop" to_port="input 1"/>
      <connect from_op="Loop" from_port="output 1" to_op="Append" to_port="example set 1"/>
      <connect from_op="Append" from_port="merged set" to_op="Filter Examples (2)" to_port="example set input"/>
      <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Parse Numbers" to_port="example set input"/>
      <connect from_op="Parse Numbers" from_port="example set output" to_op="Set Role" to_port="example set input"/>
      <connect from_op="Set Role" from_port="example set output" to_op="Join" to_port="right"/>
      <connect from_op="Join" from_port="join" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Store" to_port="input"/>
      <connect from_op="Store" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
When I run it, I get an abnormal error:
Process failed abnormally
Ooops. Seems like you have found a bug. Please report it in our community at https://community.rapidminer.com. Reason: Could not create meta attributes
com.rapidminer.operator.OperatorException: Could not create meta attributes
at com.rapidminer.operator.text.io.Document2ExampleSet.doWork(Document2ExampleSet.java:101)
at com.rapidminer.operator.Operator.execute(Operator.java:1025)
at com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:77)
at com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:812)
at com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:807)
at java.security.AccessController.doPrivileged(Native Method)
at com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:807)
at com.rapidminer.extension.concurrency.operator.process_control.loops.AbstractLoopOperator.doIteration(AbstractLoopOperator.java:408)
at com.rapidminer.extension.concurrency.operator.process_control.loops.AbstractLoopOperator.performSynchronizedLoop(AbstractLoopOperator.java:381)
at com.rapidminer.extension.concurrency.operator.process_control.loops.AbstractLoopOperator.doWork(AbstractLoopOperator.java:457)
at com.rapidminer.operator.Operator.execute(Operator.java:1025)
at com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:77)
at com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:812)
at com.rapidminer.operator.ExecutionUnit$2.run(ExecutionUnit.java:807)
at java.security.AccessController.doPrivileged(Native Method)
at com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:807)
at com.rapidminer.operator.OperatorChain.doWork(OperatorChain.java:428)
at com.rapidminer.operator.Operator.execute(Operator.java:1025)
at com.rapidminer.Process.execute(Process.java:1315)
at com.rapidminer.Process.run(Process.java:1290)
at com.rapidminer.Process.run(Process.java:1181)
at com.rapidminer.Process.run(Process.java:1134)
at com.rapidminer.Process.run(Process.java:1129)
at com.rapidminer.Process.run(Process.java:1119)
at com.rapidminer.execution.jobcontainer.execution.SimpleExecutor.executeProcess(SimpleExecutor.java:84)
at com.rapidminer.execution.jobcontainer.EngineRunner.onApplicationEvent(EngineRunner.java:77)
at com.rapidminer.execution.jobcontainer.EngineRunner.onApplicationEvent(EngineRunner.java:31)
at org.springframework.context.event.SimpleApplicationEventMulticaster.invokeListener(SimpleApplicationEventMulticaster.java:167)
at org.springframework.context.event.SimpleApplicationEventMulticaster.multicastEvent(SimpleApplicationEventMulticaster.java:139)
at org.springframework.context.support.AbstractApplicationContext.publishEvent(AbstractApplicationContext.java:393)
at org.springframework.context.support.AbstractApplicationContext.publishEvent(AbstractApplicationContext.java:347)
at org.springframework.boot.context.event.EventPublishingRunListener.finished(EventPublishingRunListener.java:101)
at org.springframework.boot.SpringApplicationRunListeners.callFinishedListener(SpringApplicationRunListeners.java:79)
at org.springframework.boot.SpringApplicationRunListeners.finished(SpringApplicationRunListeners.java:72)
at org.springframework.boot.SpringApplication.run(SpringApplication.java:305)
at com.rapidminer.execution.jobcontainer.Application.main(Application.java:44)
Any idea what it could be? Since 8.0, the loops have been behaving a bit unpredictably.
Answers
-
Hallo SGolbert,
I tried to reproduce your XML in my own RM 8.2.000, but because I don't have your data, it was quite difficult to recreate the same situation. A suggestion: could you use breakpoints in your process? I also searched the pure Java literature,
but I didn't find any relevant information.
Maerkli
0 -
Hi Maerkli,
thank you for looking into the problem. As it turns out, I replaced the process with another one (I actually moved this step to the web scraping process, which is made with Scrapy).
If I run into the error again, I will try to reproduce it with simple data.
Regards,
Sebastian
1