Hello everyone,
I'm new to RapidMiner, and I created a process in which I save articles from a website as .txt files (I use the "Crawl Web" operator).
After that, I use the "Process Documents from Files" operator to read the files.
Inside that operator, I use the "Extract Information" operator (with an XPath query).
I get the comments successfully, and I would like to ask whether it is possible to write only the comments to a document (for example, a .txt file)?
Sorry for my English!
MY CODE:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: reads previously crawled pages from a directory, runs an
  XPath query on each document, and writes the resulting example set to a
  text file.
-->
<process version="5.1.004">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.004" expanded="true" name="Process">
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true" height="614" width="886">

      <!--
        Crawl Web is currently deactivated (activated="false") and has no
        connections; the rest of the process works on the files already
        present in its output_dir.
        The url value is kept on a single line: a line break inside an
        attribute value is turned into a space by the XML parser.
        NOTE(review): in ".*.gr.*" the dot before "gr" is unescaped and so
        matches any character; presumably "\.gr" was intended. Confirm before
        changing.
      -->
      <operator activated="false" class="web:crawl_web" compatibility="5.1.000" expanded="true" height="60" name="Crawl Web" width="90" x="246" y="345">
        <parameter key="url" value="http://nba.sport24.gr/category/nba_news/?locale=el_gr"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value=".*article.*"/>
          <parameter key="follow_link_with_matching_url" value=".*article.*|.*.gr.*"/>
        </list>
        <parameter key="output_dir" value="C:\Users\elenious\Desktop\diplomatiki\newresults\temp"/>
        <parameter key="max_pages" value="2"/>
        <parameter key="max_depth" value="1"/>
        <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100 Safari/534.30 "/>
      </operator>

      <!--
        Reads the documents from the same directory that Crawl Web writes to
        (listed under the key "comments") and passes each one through the
        inner process below.
      -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true" height="76" name="Process Documents from Files" width="90" x="179" y="75">
        <list key="text_directories">
          <parameter key="comments" value="C:\Users\elenious\Desktop\diplomatiki\newresults\temp"/>
        </list>
        <parameter key="encoding" value="UTF-8"/>
        <process expanded="true" height="502" width="979">

          <!--
            Single XPath query, stored under the key "com": the text nodes of
            every h:p that is a direct child of an h:div whose class is 'body'.
            NOTE(review): the "namespaces" list is empty, so the "h" prefix is
            not declared in this file; presumably the operator binds it to
            XHTML itself. Verify against the operator documentation.
            NOTE(review): the key "string_machting_queries" is spelled as in
            the original file; presumably this is the name the operator
            expects, so do not correct it without checking.
          -->
          <operator activated="true" class="text:extract_information" compatibility="5.1.001" expanded="true" height="60" name="Extract Information" width="90" x="246" y="165">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="com" value="//h:div[@class='body']/h:p/text()"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!--
        Writes whatever arrives on "input 1" (here the example set produced by
        Process Documents from Files) to result_file, then forwards it to the
        process result port.
      -->
      <operator activated="true" class="write_as_text" compatibility="5.1.004" expanded="true" height="76" name="Write as Text" width="90" x="479" y="110">
        <parameter key="result_file" value="C:\Users\elenious\Desktop\diplomatiki\newresults\crawl\new.txt"/>
        <parameter key="encoding" value="UTF-8"/>
      </operator>

      <connect from_op="Process Documents from Files" from_port="example set" to_op="Write as Text" to_port="input 1"/>
      <connect from_op="Write as Text" from_port="input 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
???