Hi,
I have to analyze 2,000 webpages. My test process runs fine when it only crawls 5 or up to 99 pages, but as soon as I configure the crawler to fetch a larger number of pages (100, 200, 1,000 or 2,000), I get the error message "Duplicate attribute name: id".
This is my process:
<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="82" name="Crawler" width="90" x="45" y="289">
<process expanded="true">
<operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="112" y="34">
<parameter key="url" value="http://www.spiegel.de"/>
<list key="crawling_rules">
<parameter key="store_with_matching_url" value=".+www.spiegel.+"/>
<parameter key="follow_link_with_matching_url" value=".+spiegel.+|.+de.+"/>
</list>
<parameter key="max_crawl_depth" value="10"/>
<parameter key="retrieve_as_html" value="true"/>
<parameter key="add_content_as_attribute" value="true"/>
<parameter key="max_pages" value="5"/>
<parameter key="max_page_size" value="500"/>
<parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"/>
</operator>
<operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="246" y="34">
<parameter key="link_attribute" value="Link"/>
<parameter key="page_attribute" value="link"/>
<parameter key="random_user_agent" value="true"/>
</operator>
<connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_port="out 1"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="45" y="187">
<parameter key="keep_text" value="true"/>
<parameter key="data_management" value="memory-optimized"/>
<list key="specify_weights">
<parameter key="link" value="1.0"/>
</list>
<process expanded="true">
<operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
<parameter key="minimum_text_block_length" value="2"/>
</operator>
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Token" width="90" x="179" y="34">
<parameter key="mode" value="linguistic tokens"/>
<parameter key="language" value="German"/>
</operator>
<operator activated="true" class="text:filter_tokens_by_content" compatibility="7.5.000" expanded="true" height="68" name="Filter Tokens a-zA-Z" width="90" x="313" y="34">
<parameter key="condition" value="matches"/>
<parameter key="regular_expression" value="[a-zA-Z]+"/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34"/>
<connect from_port="document" to_op="Extract Content" to_port="document"/>
<connect from_op="Extract Content" from_port="document" to_op="Tokenize Token" to_port="document"/>
<connect from_op="Tokenize Token" from_port="document" to_op="Filter Tokens a-zA-Z" to_port="document"/>
<connect from_op="Filter Tokens a-zA-Z" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="124" name="Process Doc2Data" width="90" x="45" y="34"/>
<operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="82" name="Filter tf-idf" width="90" x="179" y="85">
<process expanded="true">
<operator activated="true" class="transpose" compatibility="8.0.001" expanded="true" height="82" name="Ingress Transpose" width="90" x="45" y="34"/>
<operator activated="true" class="filter_example_range" compatibility="8.0.001" expanded="true" height="82" name="Filter Example Range" width="90" x="179" y="34">
<parameter key="first_example" value="1"/>
<parameter key="last_example" value="15"/>
<parameter key="invert_filter" value="true"/>
</operator>
<operator activated="true" class="filter_examples" compatibility="8.0.001" expanded="true" height="103" name="Filter Examples" width="90" x="313" y="34">
<parameter key="invert_filter" value="true"/>
<list key="filters_list">
<parameter key="filters_entry_key" value="id.equals.text"/>
</list>
</operator>
<operator activated="true" class="transpose" compatibility="8.0.001" expanded="true" height="82" name="tf-idf Transpose" width="90" x="447" y="34"/>
<connect from_port="in 1" to_op="Ingress Transpose" to_port="example set input"/>
<connect from_op="Ingress Transpose" from_port="example set output" to_op="Filter Example Range" to_port="example set input"/>
<connect from_op="Filter Example Range" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="tf-idf Transpose" to_port="example set input"/>
<connect from_op="tf-idf Transpose" from_port="example set output" to_port="out 1"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="source_in 2" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="124" name="Splitting" width="90" x="179" y="187">
<process expanded="true">
<operator activated="true" class="select_attributes" compatibility="8.0.001" expanded="true" height="82" name="Select Attributes" width="90" x="45" y="34">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="text"/>
<parameter key="include_special_attributes" value="true"/>
</operator>
<operator activated="true" class="generate_id" compatibility="8.0.001" expanded="true" height="82" name="Generate ID" width="90" x="45" y="136"/>
<operator activated="true" class="rename" compatibility="8.0.001" expanded="true" height="82" name="Rename ID" width="90" x="45" y="238">
<parameter key="old_name" value="id"/>
<parameter key="new_name" value="Document"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="set_role" compatibility="8.0.001" expanded="true" height="82" name="Set Role" width="90" x="45" y="340">
<parameter key="attribute_name" value="text"/>
<parameter key="target_role" value="label"/>
<list key="set_additional_roles"/>
</operator>
<operator activated="true" class="rename" compatibility="8.0.001" expanded="true" height="82" name="Rename" width="90" x="179" y="34">
<parameter key="old_name" value="text"/>
<parameter key="new_name" value="word"/>
<list key="rename_additional_attributes"/>
</operator>
<operator activated="true" class="split" compatibility="8.0.001" expanded="true" height="82" name="Split" width="90" x="179" y="136">
<parameter key="attribute_filter_type" value="subset"/>
<parameter key="attributes" value="word"/>
<parameter key="include_special_attributes" value="true"/>
<parameter key="split_pattern" value="\s+"/>
</operator>
<operator activated="true" class="transpose" compatibility="8.0.001" expanded="true" height="82" name="Splitting Output" width="90" x="313" y="34"/>
<connect from_port="in 1" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Rename ID" to_port="example set input"/>
<connect from_op="Rename ID" from_port="example set output" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Rename" to_port="example set input"/>
<connect from_op="Rename" from_port="example set output" to_op="Split" to_port="example set input"/>
<connect from_op="Split" from_port="example set output" to_op="Splitting Output" to_port="example set input"/>
<connect from_op="Split" from_port="original" to_port="out 3"/>
<connect from_op="Splitting Output" from_port="example set output" to_port="out 1"/>
<connect from_op="Splitting Output" from_port="original" to_port="out 2"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="source_in 2" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
<portSpacing port="sink_out 3" spacing="0"/>
<portSpacing port="sink_out 4" spacing="0"/>
</process>
</operator>
<operator activated="true" class="subprocess" compatibility="8.0.001" expanded="true" height="187" name="Association" width="90" x="313" y="238">
<process expanded="true">
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Generate Bigrams" width="90" x="45" y="34">
<parameter key="script" value="# rm_main is a mandatory function, # the number of arguments has to be the number of input ports (can be none) rm_main = function(data) { 	library(dplyr) 	library(tidytext) 	spon_bigrams <- data %>% 	 unnest_tokens(bigram, word, token = "ngrams", n = 2) 	print(spon_bigrams) return(list(spon_bigrams)) } "/>
</operator>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Seperate Bigrams" width="90" x="45" y="136">
<parameter key="script" value="# rm_main is a mandatory function, # the number of arguments has to be the number of input ports (can be none) rm_main = function(data) { 	library(dplyr) 	library(tidytext) 	library(tidyr) 	devided_bigrams <-data %>% 	 separate(bigram, c("word1", "word2"), sep = " ") 	 print(devided_bigrams) 	#counted_bigrams <- devided_bigrams %>% 	# count(word1, word2, sort = TRUE) # print(counted_bigrams) return(list(devided_bigrams)) } "/>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="145" name="Seperated Bigrams" width="90" x="45" y="238"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Count Bigrams per Page" width="90" x="179" y="34">
<parameter key="script" value="rm_main = function(data) { 	library(dplyr) 	library(tidytext) 	library(tidyr) 	count_bigrams_per_page <- data %>% 	 count(Document, word1, word2, sort = TRUE) 	print(count_bigrams_per_page) 	counted_bigrams_per_page <- data.frame(count_bigrams_per_page) return(counted_bigrams_per_page) } "/>
</operator>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Count all Bigrams" width="90" x="179" y="187">
<parameter key="script" value="rm_main = function(data) { 	library(dplyr) 	library(tidytext) 	library(tidyr) 	count_bigrams <- data %>% 	 count(word1, word2, sort = TRUE) 	print(count_bigrams) 	counted_bigrams <- data.frame(count_bigrams) return(counted_bigrams) } "/>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Counted Bigrams per Page" width="90" x="313" y="34"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Filter Word 2" width="90" x="246" y="289">
<parameter key="script" value="rm_main = function(data) { 	library(dplyr) 	library(tidytext) 	library(tidyr) 	bigrams_filtered <- data %>% 	 #count(word1, word2, sort = TRUE) 	 filter(word2 == "spiegel") %>% 	 count(word1, word2, sort =TRUE) 	 print(bigrams_filtered) 	bigrams_filtered <- data.frame(bigrams_filtered) return(bigrams_filtered) } "/>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply" width="90" x="246" y="391"/>
<operator activated="false" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="68" name="Visual (2)" width="90" x="514" y="391">
<parameter key="script" value="# not finished rm_main = function(data) { 	library(dplyr) 	library(tidytext) 	library(tidyr) library(igraph) bigram_graph <- data %>% #filter(n >= 6) %>% graph_from_data_frame print(bigram_graph) 	# bigram_graph <- data.frame(bigram_graph) 	library(ggraph) 	set.seed(2017) 	graph <- ggraph(bigram_graph, layout = "fr") + 	 geom_edge_link() + 	 geom_node_point() + 	 geom_node_text(aes(label = name), vjust = 1, hjust =1) 	setwd("/home/knecht") 	#write.csv(graph, '/home/knecht/filtergraphtest001') 	png(filename="filtergraphtest001") 	plot(graph) 	dev.off() 	 	 return(list(ggraph)) } "/>
</operator>
<operator activated="false" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="68" name="Visual" width="90" x="447" y="85">
<parameter key="script" value="# not finished rm_main = function(data) { 	library(dplyr) 	library(tidytext) 	library(tidyr) library(igraph) bigram_graph <- data %>% filter(n >= 6) %>% graph_from_data_frame print(bigram_graph) 	# bigram_graph <- data.frame(bigram_graph) 	library(ggraph) 	set.seed(2017) 	graph <- ggraph(bigram_graph, layout = "fr") + 	 geom_edge_link() + 	 geom_node_point() + 	 geom_node_text(aes(label = name), vjust = 1, hjust =1) 	setwd("/home/knecht") 	#write.csv(graph, '/home/knecht/graphtest001') 	png(filename="imagetest002.png") 	plot(graph) 	dev.off() 	 	 return(list(ggraph)) } "/>
</operator>
<connect from_port="in 1" to_op="Generate Bigrams" to_port="input 1"/>
<connect from_op="Generate Bigrams" from_port="output 1" to_op="Seperate Bigrams" to_port="input 1"/>
<connect from_op="Seperate Bigrams" from_port="output 1" to_op="Seperated Bigrams" to_port="input"/>
<connect from_op="Seperated Bigrams" from_port="output 1" to_port="out 1"/>
<connect from_op="Seperated Bigrams" from_port="output 2" to_op="Count Bigrams per Page" to_port="input 1"/>
<connect from_op="Seperated Bigrams" from_port="output 3" to_op="Count all Bigrams" to_port="input 1"/>
<connect from_op="Seperated Bigrams" from_port="output 4" to_op="Filter Word 2" to_port="input 1"/>
<connect from_op="Count Bigrams per Page" from_port="output 1" to_op="Counted Bigrams per Page" to_port="input"/>
<connect from_op="Count all Bigrams" from_port="output 1" to_port="out 3"/>
<connect from_op="Counted Bigrams per Page" from_port="output 1" to_port="out 2"/>
<connect from_op="Counted Bigrams per Page" from_port="output 2" to_port="out 4"/>
<connect from_op="Filter Word 2" from_port="output 1" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_port="out 5"/>
<connect from_op="Multiply" from_port="output 2" to_port="out 6"/>
<portSpacing port="source_in 1" spacing="0"/>
<portSpacing port="source_in 2" spacing="0"/>
<portSpacing port="sink_out 1" spacing="0"/>
<portSpacing port="sink_out 2" spacing="0"/>
<portSpacing port="sink_out 3" spacing="0"/>
<portSpacing port="sink_out 4" spacing="0"/>
<portSpacing port="sink_out 5" spacing="0"/>
<portSpacing port="sink_out 6" spacing="0"/>
<portSpacing port="sink_out 7" spacing="0"/>
</process>
</operator>
<connect from_op="Crawler" from_port="out 1" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Process Doc2Data" to_port="input"/>
<connect from_op="Process Doc2Data" from_port="output 1" to_port="result 1"/>
<connect from_op="Process Doc2Data" from_port="output 2" to_op="Filter tf-idf" to_port="in 1"/>
<connect from_op="Process Doc2Data" from_port="output 3" to_op="Splitting" to_port="in 1"/>
<connect from_op="Filter tf-idf" from_port="out 1" to_port="result 2"/>
<connect from_op="Splitting" from_port="out 1" to_port="result 3"/>
<connect from_op="Splitting" from_port="out 2" to_port="result 4"/>
<connect from_op="Splitting" from_port="out 3" to_op="Association" to_port="in 1"/>
<connect from_op="Association" from_port="out 1" to_port="result 5"/>
<connect from_op="Association" from_port="out 2" to_port="result 6"/>
<connect from_op="Association" from_port="out 3" to_port="result 7"/>
<connect from_op="Association" from_port="out 4" to_port="result 8"/>
<connect from_op="Association" from_port="out 5" to_port="result 9"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
<portSpacing port="sink_result 5" spacing="0"/>
<portSpacing port="sink_result 6" spacing="0"/>
<portSpacing port="sink_result 7" spacing="0"/>
<portSpacing port="sink_result 8" spacing="0"/>
<portSpacing port="sink_result 9" spacing="0"/>
<portSpacing port="sink_result 10" spacing="0"/>
</process>
</operator>
</process>
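Because the R scripts embedded in the Association subprocess are hard to read inside the XML parameter values, here are the first two of them (the "Generate Bigrams" and "Seperate Bigrams" operators) again, reformatted for readability; the logic is unchanged:

# Script of the "Generate Bigrams" operator
# rm_main is a mandatory function; the number of arguments has to
# match the number of input ports (can be none)
rm_main = function(data) {
    library(dplyr)
    library(tidytext)
    # build bigrams from the "word" column of the incoming example set
    spon_bigrams <- data %>%
        unnest_tokens(bigram, word, token = "ngrams", n = 2)
    print(spon_bigrams)
    return(list(spon_bigrams))
}

# Script of the "Seperate Bigrams" operator
rm_main = function(data) {
    library(dplyr)
    library(tidytext)
    library(tidyr)
    # split each bigram into its two words
    devided_bigrams <- data %>%
        separate(bigram, c("word1", "word2"), sep = " ")
    print(devided_bigrams)
    return(list(devided_bigrams))
}

The remaining scripts ("Count Bigrams per Page", "Count all Bigrams", "Filter Word 2") follow the same pattern and can be read directly from the XML above.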
And this is the error message I get:
Exception: java.lang.IllegalArgumentException
Message: Duplicate attribute name: id
Stack trace:
com.rapidminer.example.SimpleAttributes.register(SimpleAttributes.java:124)
com.rapidminer.example.SimpleAttributes.add(SimpleAttributes.java:203)
com.rapidminer.example.set.SimpleExampleSet.<init>(SimpleExampleSet.java:121)
com.rapidminer.example.utils.ExampleSetBuilder.build(ExampleSetBuilder.java:246)
com.rapidminer.operator.preprocessing.ExampleSetTranspose.apply(ExampleSetTranspose.java:165)
com.rapidminer.operator.AbstractExampleSetProcessing.doWork(AbstractExampleSetProcessing.java:117)
com.rapidminer.operator.Operator.execute(Operator.java:1004)
com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:77)
com.rapidminer.operator.ExecutionUnit$3.run(ExecutionUnit.java:812)
com.rapidminer.operator.ExecutionUnit$3.run(ExecutionUnit.java:807)
java.security.AccessController.doPrivileged(Native Method)
com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:807)
com.rapidminer.operator.OperatorChain.doWork(OperatorChain.java:428)
com.rapidminer.operator.SimpleOperatorChain.doWork(SimpleOperatorChain.java:99)
com.rapidminer.operator.Operator.execute(Operator.java:1004)
com.rapidminer.operator.execution.SimpleUnitExecutor.execute(SimpleUnitExecutor.java:77)
com.rapidminer.operator.ExecutionUnit$3.run(ExecutionUnit.java:812)
com.rapidminer.operator.ExecutionUnit$3.run(ExecutionUnit.java:807)
java.security.AccessController.doPrivileged(Native Method)
com.rapidminer.operator.ExecutionUnit.execute(ExecutionUnit.java:807)
com.rapidminer.operator.OperatorChain.doWork(OperatorChain.java:428)
com.rapidminer.operator.Operator.execute(Operator.java:1004)
com.rapidminer.Process.execute(Process.java:1310)
com.rapidminer.Process.run(Process.java:1285)
com.rapidminer.Process.run(Process.java:1176)
com.rapidminer.Process.run(Process.java:1129)
com.rapidminer.Process.run(Process.java:1124)
com.rapidminer.Process.run(Process.java:1114)
com.rapidminer.gui.ProcessThread.run(ProcessThread.java:65)
Maybe someone here can help me.
Regards,
Tobias