🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

loop a script over a large list of examples

TobiasNehrigUser: "TobiasNehrig"
New Altair Community Member
Updated by Jocelyn

Hi Experts,

I’ve a example set with 1 attribute and 1975 examples, each is the content of a web page).

The input looks like:

18-01-04-liste mit 1975 Spon Texten.png

 

Over each example I’d like to execute an R Script to split the words, create a bi-gram graph list and store this in a list for later to analysis them.

I thought, I could use the Loop Value Operator to run the scripts over each example, but the Operator would loop over all 1975 examples for 1975 times.

If I use the Loop Example Operator it also runs over all examples but in this case the process terminates at the begin of the second loop with the error message: PM INFO: [1] "Failed to execute the script."; PM INFO: [1] "Evaluation error: argument `...` should be a character vector (or an object coercible to)."

 

This is my process:

<?xml version="1.0" encoding="UTF-8"?><process version="8.0.001">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.0.001" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="retrieve" compatibility="8.0.001" expanded="true" height="68" name="Retrieve 18-01-04-list of 4650 crawled pages" width="90" x="45" y="34">
<parameter key="repository_entry" value="//Local Repository/data/18-01-04-list of 4650 crawled pages"/>
</operator>
<operator activated="true" class="generate_id" compatibility="8.0.001" expanded="true" height="82" name="Generate ID" width="90" x="179" y="34">
<parameter key="create_nominal_ids" value="true"/>
</operator>
<operator activated="true" class="concurrency:loop_values" compatibility="8.0.001" expanded="true" height="124" name="Loop Values" width="90" x="313" y="34">
<parameter key="attribute" value="text"/>
<parameter key="enable_parallel_execution" value="false"/>
<process expanded="true">
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Split Text in Words" width="90" x="45" y="34">
<parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;&#10;rm_main = function(data)&#10;{&#10; if(is.data.frame(data)){&#10;&#9;spon_words &lt;- data %&gt;%&#10;&#9; unnest_tokens(bigram, text, token = &quot;ngrams&quot;, n = 2)&#10;&#9; }&#10;&#9;print(spon_words)&#10;&#10; return(list(spon_words)) &#10;}&#10;"/>
</operator>
<operator activated="true" class="free_memory" compatibility="8.0.001" expanded="true" height="82" name="Free Memory" width="90" x="45" y="136"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Seperat" width="90" x="45" y="238">
<parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(tidyr)&#10;library(tokenizers)&#10;&#10;rm_main = function(data)&#10;{&#10;devided_bigrams &lt;-data %&gt;%&#10;&#9;separate(bigram, c(&quot;word1&quot;, &quot;word2&quot;), sep = &quot; &quot;)&#10;&#9;print(devided_bigrams)&#10; return(list(devided_bigrams))&#10;}&#10;"/>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply (2)" width="90" x="179" y="34"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Count all Bigrams" width="90" x="179" y="187">
<parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10;&#10;&#9;count_bigrams &lt;- data %&gt;%&#10;&#9; count(word1, word2, sort = TRUE)&#10;&#9;print(count_bigrams)&#10;&#10;&#9;counted_bigrams &lt;- data.frame(count_bigrams)&#10; &#10; return(counted_bigrams)&#10;}&#10;"/>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply" width="90" x="313" y="85"/>
<operator activated="true" class="free_memory" compatibility="8.0.001" expanded="true" height="82" name="Free Memory (2)" width="90" x="313" y="187"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="draw graph" width="90" x="447" y="187">
<parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10; library(igraph)&#10;&#10; bigram_graph &lt;- data %&gt;%&#10; filter(n &gt;= 10) %&gt;%&#10; graph_from_data_frame&#10; print(bigram_graph)&#10; &#9;# bigram_graph &lt;- data.frame(bigram_graph)&#10;&#10; &#9;library(ggraph)&#10; &#9;set.seed(2017)&#10;&#10; &#9;graph &lt;- ggraph(bigram_graph, layout = &quot;fr&quot;) +&#10; &#9; geom_edge_link() +&#10; &#9; geom_node_point() +&#10; &#9; geom_node_text(aes(label = name), vjust = 1, hjust =1)&#10;&#10; &#9;setwd(&quot;/home/knecht&quot;)&#10;&#9;#graph.write(graph, &quot;/home/knecht/graph01.txt&quot;,, &quot;edgelist&quot;)&#10; &#9;#ggsave(filename = &quot;foo300.png&quot;, width = 5, height = 4, dpi = 300, units = &quot;in&quot;, device='png')&#10; &#9; &#9;&#10; return(list(graph))&#10;}&#10;"/>
</operator>
<operator activated="true" class="free_memory" compatibility="8.0.001" expanded="true" height="82" name="Free Memory (3)" width="90" x="581" y="187"/>
<connect from_port="input 1" to_op="Split Text in Words" to_port="input 1"/>
<connect from_op="Split Text in Words" from_port="output 1" to_op="Free Memory" to_port="through 1"/>
<connect from_op="Free Memory" from_port="through 1" to_op="Seperat" to_port="input 1"/>
<connect from_op="Seperat" from_port="output 1" to_op="Multiply (2)" to_port="input"/>
<connect from_op="Multiply (2)" from_port="output 1" to_port="output 1"/>
<connect from_op="Multiply (2)" from_port="output 2" to_op="Count all Bigrams" to_port="input 1"/>
<connect from_op="Count all Bigrams" from_port="output 1" to_op="Multiply" to_port="input"/>
<connect from_op="Multiply" from_port="output 1" to_port="output 2"/>
<connect from_op="Multiply" from_port="output 2" to_op="Free Memory (2)" to_port="through 1"/>
<connect from_op="Free Memory (2)" from_port="through 1" to_op="draw graph" to_port="input 1"/>
<connect from_op="draw graph" from_port="output 1" to_op="Free Memory (3)" to_port="through 1"/>
<connect from_op="Free Memory (3)" from_port="through 1" to_port="output 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
<portSpacing port="sink_output 4" spacing="0"/>
</process>
</operator>
<operator activated="false" class="loop_examples" compatibility="8.0.001" expanded="true" height="124" name="Loop Examples" width="90" x="313" y="187">
<process expanded="true">
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Split Text in Words (2)" width="90" x="45" y="34">
<parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;&#10;rm_main = function(data)&#10;{&#10; if(is.data.frame(data)){&#10;&#9;spon_words &lt;- data %&gt;%&#10;&#9; unnest_tokens(bigram, text, token = &quot;ngrams&quot;, n = 2)&#10;&#9; }&#10;&#9;print(spon_words)&#10;&#10; return(list(spon_words)) &#10;}&#10;"/>
</operator>
<operator activated="true" class="free_memory" compatibility="8.0.001" expanded="true" height="82" name="Free Memory (4)" width="90" x="45" y="136"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Seperat (2)" width="90" x="45" y="238">
<parameter key="script" value="library(dplyr)&#10;library(tidytext)&#10;library(tidyr)&#10;library(tokenizers)&#10;&#10;rm_main = function(data)&#10;{&#10;devided_bigrams &lt;-data %&gt;%&#10;&#9;separate(bigram, c(&quot;word1&quot;, &quot;word2&quot;), sep = &quot; &quot;)&#10;&#9;print(devided_bigrams)&#10; return(list(devided_bigrams))&#10;}&#10;"/>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply (3)" width="90" x="179" y="34"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="Count all Bigrams (2)" width="90" x="179" y="187">
<parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10;&#10;&#9;count_bigrams &lt;- data %&gt;%&#10;&#9; count(word1, word2, sort = TRUE)&#10;&#9;print(count_bigrams)&#10;&#10;&#9;counted_bigrams &lt;- data.frame(count_bigrams)&#10; &#10; return(counted_bigrams)&#10;}&#10;"/>
</operator>
<operator activated="true" class="multiply" compatibility="8.0.001" expanded="true" height="103" name="Multiply (4)" width="90" x="313" y="85"/>
<operator activated="true" class="free_memory" compatibility="8.0.001" expanded="true" height="82" name="Free Memory (5)" width="90" x="313" y="187"/>
<operator activated="true" class="r_scripting:execute_r" compatibility="7.2.000" expanded="true" height="82" name="draw graph (2)" width="90" x="447" y="187">
<parameter key="script" value="rm_main = function(data)&#10;{&#10;&#9;library(dplyr)&#10;&#9;library(tidytext)&#10;&#9;library(tidyr)&#10; library(igraph)&#10;&#10; bigram_graph &lt;- data %&gt;%&#10; filter(n &gt;= 10) %&gt;%&#10; graph_from_data_frame&#10; print(bigram_graph)&#10; &#9;# bigram_graph &lt;- data.frame(bigram_graph)&#10;&#10; &#9;library(ggraph)&#10; &#9;set.seed(2017)&#10;&#10; &#9;graph &lt;- ggraph(bigram_graph, layout = &quot;fr&quot;) +&#10; &#9; geom_edge_link() +&#10; &#9; geom_node_point() +&#10; &#9; geom_node_text(aes(label = name), vjust = 1, hjust =1)&#10;&#10; &#9;setwd(&quot;/home/knecht&quot;)&#10;&#9;#graph.write(graph, &quot;/home/knecht/graph01.txt&quot;,, &quot;edgelist&quot;)&#10; &#9;#ggsave(filename = &quot;foo300.png&quot;, width = 5, height = 4, dpi = 300, units = &quot;in&quot;, device='png')&#10; &#9; &#9;&#10; return(list(graph))&#10;}&#10;"/>
</operator>
<operator activated="true" class="free_memory" compatibility="8.0.001" expanded="true" height="82" name="Free Memory (6)" width="90" x="581" y="187"/>
<connect from_port="example set" to_op="Split Text in Words (2)" to_port="input 1"/>
<connect from_op="Split Text in Words (2)" from_port="output 1" to_op="Free Memory (4)" to_port="through 1"/>
<connect from_op="Free Memory (4)" from_port="through 1" to_op="Seperat (2)" to_port="input 1"/>
<connect from_op="Seperat (2)" from_port="output 1" to_op="Multiply (3)" to_port="input"/>
<connect from_op="Multiply (3)" from_port="output 1" to_port="example set"/>
<connect from_op="Multiply (3)" from_port="output 2" to_op="Count all Bigrams (2)" to_port="input 1"/>
<connect from_op="Count all Bigrams (2)" from_port="output 1" to_op="Multiply (4)" to_port="input"/>
<connect from_op="Multiply (4)" from_port="output 1" to_port="output 1"/>
<connect from_op="Multiply (4)" from_port="output 2" to_op="Free Memory (5)" to_port="through 1"/>
<connect from_op="Free Memory (5)" from_port="through 1" to_op="draw graph (2)" to_port="input 1"/>
<connect from_op="draw graph (2)" from_port="output 1" to_op="Free Memory (6)" to_port="through 1"/>
<connect from_op="Free Memory (6)" from_port="through 1" to_port="output 2"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
<portSpacing port="sink_output 3" spacing="0"/>
</process>
</operator>
<connect from_op="Retrieve 18-01-04-list of 4650 crawled pages" from_port="output" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Loop Values" to_port="input 1"/>
<connect from_op="Loop Values" from_port="output 1" to_port="result 1"/>
<connect from_op="Loop Values" from_port="output 2" to_port="result 2"/>
<connect from_op="Loop Values" from_port="output 3" to_port="result 3"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
<portSpacing port="sink_result 4" spacing="0"/>
</process>
</operator>
</process>

Has maybe someone an idea how I can solve it?

 

regards

Tobias

Find more posts tagged with

Sort by:
1 - 1 of 11