Hi,
I 've some problems to integrate my r-script in RapidMiner. My script is running in RStudio with the data from RapidMiner process, which I have wrote in an res-file.
<?xml version="1.0" encoding="UTF-8"?><process version="7.6.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
<parameter key="logfile" value="/home/knecht/Master2017/Rapp/Logfile.log"/>
<parameter key="resultfile" value="/home/knecht/Master2017/Rapp/resultfile.res"/>
<process expanded="true">
<operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
<parameter key="url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/main/main.html"/>
<list key="crawling_rules">
<parameter key="store_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/.*"/>
<parameter key="follow_link_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml.*"/>
</list>
<parameter key="max_crawl_depth" value="10"/>
<parameter key="retrieve_as_html" value="true"/>
<parameter key="add_content_as_attribute" value="true"/>
<parameter key="write_pages_to_disk" value="true"/>
<parameter key="output_dir" value="/home/knecht/Crawler"/>
<parameter key="max_pages" value="1000"/>
<parameter key="max_page_size" value="500"/>
<parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"/>
<parameter key="ignore_robot_exclusion" value="true"/>
</operator>
<operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="45" y="136">
<parameter key="link_attribute" value="Link"/>
<parameter key="page_attribute" value="link"/>
<parameter key="random_user_agent" value="true"/>
</operator>
<operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="45" y="238">
<parameter key="keep_text" value="true"/>
<list key="specify_weights">
<parameter key="link" value="1.0"/>
</list>
<process expanded="true">
<operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
<parameter key="minimum_text_block_length" value="2"/>
</operator>
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Token" width="90" x="45" y="136">
<parameter key="mode" value="linguistic tokens"/>
<parameter key="language" value="German"/>
</operator>
<operator activated="true" class="text:filter_stopwords_german" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="45" y="238"/>
<operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize" width="90" x="179" y="85">
<parameter key="mode" value="regular expression"/>
<parameter key="expression" value="[-!"#$%&'()*+,./:;<=>?@\[\\\]_`{|}~]([a-z]+)[-!"#$%&'()*+,./:;<=>?@\[\\\]_`{|}~] ^[0-9]+[-!"#$%&'()*+,./:;<=>?@\[\\\]_`{|}~]^[0-9] "/>
</operator>
<operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34"/>
<connect from_port="document" to_op="Extract Content" to_port="document"/>
<connect from_op="Extract Content" from_port="document" to_op="Tokenize Token" to_port="document"/>
<connect from_op="Tokenize Token" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
<connect from_op="Filter Stopwords (German)" from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="45" y="340">
<parameter key="select_attributes_and_weights" value="true"/>
<list key="specify_weights">
<parameter key="text" value="1.0"/>
</list>
</operator>
<operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Data to Document" width="90" x="179" y="136"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Execute R" width="90" x="380" y="187">
<parameter key="script" value="# rm_main is a mandatory function, # the number of arguments has to be the number of input ports (can be none) rm_main = function(data) { library(readr) library(dplyr) library(tidytext) library(tm) library(tidyr) library(stringr) library(widyr) library(ggraph) library(igraph) set.seed(2017) #KorpusMitZahlen <- read_file("/home/knecht/Master2017/Korpus/17-12-03-Rapp-Korpus.res") KorpusOhneZahlen <- removeNumbers(data) Korpus_DF <- data_frame(text=KorpusOhneZahlen) GesamtTermAnzahl <- Korpus_DF %>% unnest_tokens(word, text) GesamtTermAnzahl write.csv(GesamtTermAnzahl, '/home/knecht/Master2017/Wortlisten/17-12-10-Rapp-GesamtTermAnzahl-Liste') TermHaeufigkeit <- Korpus_DF %>% unnest_tokens(word, text) %>% count (word, sort=TRUE)%>% ungroup() TermHaeufigkeit write.csv(TermHaeufigkeit, '/home/knecht/Master2017/Wortlisten/17-12-10-Rapp-TermHaeufigkeit-Liste') #WortRang <- TermHaeufigkeit %>% # group_by(text) %>% # mutate(rank = row_number(), # 'term frequenz' = /GesamtTermAnzahl) #WortRang NGramKorpus <-Korpus_DF %>% unnest_tokens(ngram, text, token = "ngrams", n = 2) NGramKorpus %>% count(ngram) NGramKorpusTeilen <- NGramKorpus %>% separate(ngram, c("word1", "word2")) NGramZaehlen <- NGramKorpusTeilen %>% count(word1, word2, sort=TRUE) NGramZaehlen #write.csv(NGramZaehlen, '/home/knecht/Master2017/N-Gramme-Listen/17-12-06-Spon-NGram-Liste') write.csv(NGramZaehlen, '/home/knecht/Master2017/N-Gramme-Listen/17-12-10-Rapp-NGram-Liste') NGramZaehlen%>% filter(n>= 20) %>% filter(n<= 750) %>% graph_from_data_frame() %>% ggraph(layout = "igraph", algorithm= 'fr') + geom_edge_link(aes(alpha = n, width = n)) + geom_node_point(size = 2, color = "lightblue") + geom_node_text(aes(label = name), repel = TRUE) + theme_void() } "/>
</operator>
<operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Korpus" width="90" x="380" y="34">
<parameter key="result_file" value="/home/knecht/Master2017/Korpus/17-12-11-Rapp-Korpus.res"/>
</operator>
<connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Data to Documents" to_port="example set"/>
<connect from_op="Data to Documents" from_port="documents" to_op="Data to Document" to_port="input"/>
<connect from_op="Data to Document" from_port="output 1" to_op="Write Korpus" to_port="input 1"/>
<connect from_op="Data to Document" from_port="output 2" to_op="Execute R" to_port="input 1"/>
<connect from_op="Execute R" from_port="output 1" to_port="result 2"/>
<connect from_op="Write Korpus" from_port="input 1" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
R-Script (RStudio):
library(readr)
library(dplyr)
library(tidytext)
library(tm)
library(tidyr)
library(stringr)
library(widyr)
library(ggraph)
library(igraph)
set.seed(2017)
KorpusMitZahlen <- read_file("/home/knecht/Korpus/17-12-03-Rapp-Korpus.res")
KorpusOhneZahlen <- removeNumbers(KorpusMitZahlen)
Korpus_DF <- data_frame(text=KorpusOhneZahlen)
GesamtTermAnzahl <- Korpus_DF %>%
unnest_tokens(word, text)
GesamtTermAnzahl
write.csv(GesamtTermAnzahl, '/home/knecht/Wortlisten/17-12-10-Rapp-GesamtTermAnzahl-Liste')
TermHaeufigkeit <- Korpus_DF %>%
unnest_tokens(word, text) %>%
count (word, sort=TRUE)%>%
ungroup()
TermHaeufigkeit
write.csv(TermHaeufigkeit, '/home/knecht/Wortlisten/17-12-10-Rapp-TermHaeufigkeit-Liste')
NGramKorpus <-Korpus_DF %>%
unnest_tokens(ngram, text, token = "ngrams", n = 2)
NGramKorpus %>%
count(ngram)
NGramKorpusTeilen <- NGramKorpus %>%
separate(ngram, c("word1", "word2"))
NGramZaehlen <- NGramKorpusTeilen %>%
count(word1, word2, sort=TRUE)
NGramZaehlen
#write.csv(NGramZaehlen, '/home/knecht/N-Gramme-Listen/17-12-06-Spon-NGram-Liste')
write.csv(NGramZaehlen, '/home/knecht/N-Gramme-Listen/17-12-10-Rapp-NGram-Liste')
NGramZaehlen%>%
filter(n>= 20) %>%
filter(n<= 750) %>%
graph_from_data_frame() %>%
ggraph(layout = "igraph", algorithm= 'fr') +
geom_edge_link(aes(alpha = n, width = n)) +
geom_node_point(size = 2, color = "lightblue") +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
But when I paste my script the Execute-R operator, it won't work with the same input data. I've got the error massage wrong data at port. I thought the Execute-R runs only the r-script.
# rm_main is a mandatory function,
# the number of arguments has to be the number of input ports (can be none)
rm_main = function(data)
{
library(readr)
library(dplyr)
library(tidytext)
library(tm)
library(tidyr)
library(stringr)
library(widyr)
library(ggraph)
library(igraph)
set.seed(2017)
#KorpusMitZahlen <- read_file("/home/knecht/Korpus/17-12-03-Rapp-Korpus.res")
KorpusOhneZahlen <- removeNumbers(data)
Korpus_DF <- data_frame(text=KorpusOhneZahlen)
GesamtTermAnzahl <- Korpus_DF %>%
unnest_tokens(word, text)
GesamtTermAnzahl
write.csv(GesamtTermAnzahl, '/home/knecht/Wortlisten/17-12-10-Rapp-GesamtTermAnzahl-Liste')
TermHaeufigkeit <- Korpus_DF %>%
unnest_tokens(word, text) %>%
count (word, sort=TRUE)%>%
ungroup()
TermHaeufigkeit
write.csv(TermHaeufigkeit, '/home/knecht/Wortlisten/17-12-10-Rapp-TermHaeufigkeit-Liste')
#WortRang <- TermHaeufigkeit %>%
# group_by(text) %>%
# mutate(rank = row_number(),
# 'term frequenz' = /GesamtTermAnzahl)
#WortRang
NGramKorpus <-Korpus_DF %>%
unnest_tokens(ngram, text, token = "ngrams", n = 2)
NGramKorpus %>%
count(ngram)
NGramKorpusTeilen <- NGramKorpus %>%
separate(ngram, c("word1", "word2"))
NGramZaehlen <- NGramKorpusTeilen %>%
count(word1, word2, sort=TRUE)
NGramZaehlen
#write.csv(NGramZaehlen, '/home/knecht/N-Gramme-Listen/17-12-06-Spon-NGram-Liste')
write.csv(NGramZaehlen, '/home/knecht/N-Gramme-Listen/17-12-10-Rapp-NGram-Liste')
NGramZaehlen%>%
filter(n>= 20) %>%
filter(n<= 750) %>%
graph_from_data_frame() %>%
ggraph(layout = "igraph", algorithm= 'fr') +
geom_edge_link(aes(alpha = n, width = n)) +
geom_node_point(size = 2, color = "lightblue") +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
}
Maybe someone has an idea.
regards
Tobias