Create a co-occurrence graph

TobiasNehrig
TobiasNehrig New Altair Community Member
edited November 2024 in Community Q&A

Hello Experts,

 

I have to create a co-occurrence graph with RapidMiner. I have already crawled a given web page and processed the files to create the text corpus, but now I have no idea how to create the co-occurrence graph.

 

My question is:

Is there a way to create a co-occurrence graph directly in RapidMiner from a text corpus, or is there a better way using an R script, and if so, which one?

 

This is my code so far:

 

 

<?xml version="1.0" encoding="UTF-8"?>
<process version="7.6.001">
  <!--
    RapidMiner process: crawls the pages below the configured start URL,
    runs the page content through a German tokenization chain and writes
    the intermediate results to text result files.
  -->
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="7.6.001" expanded="true" name="Process">
    <parameter key="logfile" value="/home/knecht/Master2017/Rapp/Logfile.log"/>
    <parameter key="resultfile" value="/home/knecht/Master2017/Rapp/resultfile.res"/>
    <process expanded="true">
      <!-- Crawl from main.html; the rules below restrict which URLs are stored and which links are followed. -->
      <operator activated="true" class="web:crawl_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Crawl Web" width="90" x="45" y="34">
        <parameter key="url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/main/main.html"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml/.*"/>
          <parameter key="follow_link_with_matching_url" value="http://www.fask.uni-mainz.de/user/rapp/papers/disshtml.*"/>
        </list>
        <parameter key="max_crawl_depth" value="10"/>
        <parameter key="retrieve_as_html" value="true"/>
        <parameter key="add_content_as_attribute" value="true"/>
        <parameter key="write_pages_to_disk" value="true"/>
        <parameter key="output_dir" value="/home/knecht/Crawler"/>
        <parameter key="max_pages" value="1000"/>
        <parameter key="max_page_size" value="500"/>
        <parameter key="user_agent" value="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:55.0) Gecko/20100101 Firefox/55.0"/>
        <parameter key="ignore_robot_exclusion" value="true"/>
      </operator>
      <!-- Retrieve the pages listed in the "Link" attribute; the page attribute is named "link". -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="45" y="136">
        <parameter key="link_attribute" value="Link"/>
        <parameter key="page_attribute" value="link"/>
        <parameter key="random_user_agent" value="true"/>
      </operator>
      <!-- Text processing of the example set; the nested process defines the per-document chain. -->
      <operator activated="true" class="text:process_document_from_data" compatibility="7.5.000" expanded="true" height="82" name="Process Documents from Data" width="90" x="45" y="289">
        <parameter key="keep_text" value="true"/>
        <parameter key="prune_method" value="percentual"/>
        <list key="specify_weights">
          <parameter key="link" value="1.0"/>
        </list>
        <process expanded="true">
          <!-- Chain order (see the connect elements below): extract content, three tokenize steps, German stopword filter, case transformation. -->
          <operator activated="true" class="web:extract_html_text_content" compatibility="7.3.000" expanded="true" height="68" name="Extract Content" width="90" x="45" y="34">
            <parameter key="minimum_text_block_length" value="2"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize non letters" width="90" x="45" y="136"/>
          <!-- NOTE(review): "Sätze" is German for "sentences", yet this operator uses the same mode ("linguistic tokens") as the operator that follows it. Confirm whether sentence tokenization was intended. -->
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize Sätze" width="90" x="45" y="238">
            <parameter key="mode" value="linguistic tokens"/>
            <parameter key="language" value="German"/>
          </operator>
          <operator activated="true" class="text:tokenize" compatibility="7.5.000" expanded="true" height="68" name="Tokenize linguistic token" width="90" x="45" y="340">
            <parameter key="mode" value="linguistic tokens"/>
            <parameter key="language" value="German"/>
          </operator>
          <operator activated="true" class="text:filter_stopwords_german" compatibility="7.5.000" expanded="true" height="68" name="Filter Stopwords (German)" width="90" x="45" y="437"/>
          <operator activated="true" class="text:transform_cases" compatibility="7.5.000" expanded="true" height="68" name="Transform Cases" width="90" x="246" y="34"/>
          <connect from_port="document" to_op="Extract Content" to_port="document"/>
          <connect from_op="Extract Content" from_port="document" to_op="Tokenize non letters" to_port="document"/>
          <connect from_op="Tokenize non letters" from_port="document" to_op="Tokenize Sätze" to_port="document"/>
          <connect from_op="Tokenize Sätze" from_port="document" to_op="Tokenize linguistic token" to_port="document"/>
          <connect from_op="Tokenize linguistic token" from_port="document" to_op="Filter Stopwords (German)" to_port="document"/>
          <connect from_op="Filter Stopwords (German)" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Word list branch: convert the word list to data and write both outputs to result files. -->
      <operator activated="true" class="text:wordlist_to_data" compatibility="7.5.000" expanded="true" height="82" name="WordList to Data" width="90" x="179" y="289"/>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Wordlist exa" width="90" x="581" y="442">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Wordlist_exa.res"/>
      </operator>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Wordlist" width="90" x="581" y="340">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Wordlist.res"/>
      </operator>
      <!-- Example set branch: one copy is converted to documents, the other is written out directly. -->
      <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Multiply" width="90" x="179" y="136"/>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Process Documents" width="90" x="581" y="238">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Proccess_Documents.res"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="7.5.000" expanded="true" height="68" name="Data to Documents" width="90" x="179" y="34">
        <parameter key="select_attributes_and_weights" value="true"/>
        <list key="specify_weights">
          <parameter key="text" value="1.0"/>
        </list>
      </operator>
      <!-- NOTE(review): despite its name, "Data to Document" is a second multiply operator (class="multiply"). -->
      <operator activated="true" class="multiply" compatibility="7.6.001" expanded="true" height="103" name="Data to Document" width="90" x="313" y="34"/>
      <!-- NOTE(review): text_attribute and label_attribute are both set to "text"; confirm this is intended. -->
      <operator activated="true" class="text:documents_to_data" compatibility="7.5.000" expanded="true" height="82" name="Documents to Data" width="90" x="447" y="136">
        <parameter key="text_attribute" value="text"/>
        <parameter key="label_attribute" value="text"/>
        <parameter key="data_management" value="memory-optimized"/>
      </operator>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Documents to Data" width="90" x="581" y="136">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Document_to_Data.res"/>
      </operator>
      <operator activated="true" class="write_as_text" compatibility="7.6.001" expanded="true" height="82" name="Write Korpus" width="90" x="581" y="34">
        <parameter key="result_file" value="/home/knecht/Korpus/Rapp_Corpus.res"/>
      </operator>
      <!-- Top-level wiring; each write operator is also connected onward to one of the result ports 1 to 5. -->
      <connect from_op="Crawl Web" from_port="example set" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Multiply" to_port="input"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_op="WordList to Data" to_port="word list"/>
      <connect from_op="WordList to Data" from_port="word list" to_op="Write Wordlist" to_port="input 1"/>
      <connect from_op="WordList to Data" from_port="example set" to_op="Write Wordlist exa" to_port="input 1"/>
      <connect from_op="Write Wordlist exa" from_port="input 1" to_port="result 5"/>
      <connect from_op="Write Wordlist" from_port="input 1" to_port="result 4"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Write Process Documents" to_port="input 1"/>
      <connect from_op="Write Process Documents" from_port="input 1" to_port="result 3"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Data to Document" to_port="input"/>
      <connect from_op="Data to Document" from_port="output 1" to_op="Write Korpus" to_port="input 1"/>
      <connect from_op="Data to Document" from_port="output 2" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_op="Write Documents to Data" to_port="input 1"/>
      <connect from_op="Write Documents to Data" from_port="input 1" to_port="result 2"/>
      <connect from_op="Write Korpus" from_port="input 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
      <portSpacing port="sink_result 6" spacing="0"/>
    </process>
  </operator>
</process>

 

 

Thanks for the help!

 

Tobias

 

 

Tagged:

Welcome!

It looks like you're new here. Sign in or register to get started.

Best Answer

Answers

Welcome!

It looks like you're new here. Sign in or register to get started.

Welcome!

It looks like you're new here. Sign in or register to get started.