"A question about process documents from files"
platanas20
New Altair Community Member
Hello to everyone,
I'm new to RapidMiner and I created a process in which I save txt files containing articles from a website (I use the "Crawl Web" operator).
After that I use the "Process Documents from Files" operator to read the files.
Inside that operator I use the "Extract Information" operator (XPath).
I get the comments successfully, and I would like to ask whether it is possible to write only the comments to a document (for example a .txt file)?
I'm sorry for my English !!!
MY CODE:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process (version 5.1.004): reads the files in a directory, runs an
     XPath query on each document and writes the resulting example set as text. -->
<process version="5.1.004">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.004" expanded="true" name="Process">
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true" height="614" width="886">
      <!-- Crawl Web is switched off (activated="false") and has no connections in
           this process. Its output_dir is the same directory that Process
           Documents from Files reads further down.
           NOTE(review): in ".*.gr.*" the dot before "gr" is unescaped, so it
           matches any character; confirm whether "\." was intended. -->
      <operator activated="false" class="web:crawl_web" compatibility="5.1.000" expanded="true"
                height="60" name="Crawl Web" width="90" x="246" y="345">
        <parameter key="url" value="http://nba.sport24.gr/category/nba_news/?locale=el_gr"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value=".*article.*"/>
          <parameter key="follow_link_with_matching_url" value=".*article.*|.*.gr.*"/>
        </list>
        <parameter key="output_dir" value="C:\Users\elenious\Desktop\diplomatiki\newresults\temp"/>
        <parameter key="max_pages" value="2"/>
        <parameter key="max_depth" value="1"/>
        <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100 Safari/534.30 "/>
      </operator>
      <!-- Reads the directory registered under the key "comments" and passes each
           document through the nested sub-process. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true"
                height="76" name="Process Documents from Files" width="90" x="179" y="75">
        <list key="text_directories">
          <parameter key="comments" value="C:\Users\elenious\Desktop\diplomatiki\newresults\temp"/>
        </list>
        <parameter key="encoding" value="UTF-8"/>
        <process expanded="true" height="502" width="979">
          <!-- Only the XPath query list has an entry; its result is stored under
               the key "com".
               NOTE(review): the query uses the "h:" prefix while the namespaces
               list is empty; presumably the operator binds that prefix itself.
               Confirm against the Text Processing extension documentation. -->
          <operator activated="true" class="text:extract_information" compatibility="5.1.001" expanded="true"
                    height="60" name="Extract Information" width="90" x="246" y="165">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="com" value="//h:div[@class='body']/h:p/text()"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Receives the example set of Process Documents from Files, writes it to
           result_file and hands it on to the first result port of the process. -->
      <operator activated="true" class="write_as_text" compatibility="5.1.004" expanded="true"
                height="76" name="Write as Text" width="90" x="479" y="110">
        <parameter key="result_file" value="C:\Users\elenious\Desktop\diplomatiki\newresults\crawl\new.txt"/>
        <parameter key="encoding" value="UTF-8"/>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Write as Text" to_port="input 1"/>
      <connect from_op="Write as Text" from_port="input 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
???
I'm new to RapidMiner and I created a process in which I save txt files containing articles from a website (I use the "Crawl Web" operator).
After that I use the "Process Documents from Files" operator to read the files.
Inside that operator I use the "Extract Information" operator (XPath).
I get the comments successfully, and I would like to ask whether it is possible to write only the comments to a document (for example a .txt file)?
I'm sorry for my English !!!
MY CODE:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process (version 5.1.004): reads the files in a directory, runs an
     XPath query on each document and writes the resulting example set as text. -->
<process version="5.1.004">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.004" expanded="true" name="Process">
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true" height="614" width="886">
      <!-- Crawl Web is switched off (activated="false") and has no connections in
           this process. Its output_dir is the same directory that Process
           Documents from Files reads further down.
           NOTE(review): in ".*.gr.*" the dot before "gr" is unescaped, so it
           matches any character; confirm whether "\." was intended. -->
      <operator activated="false" class="web:crawl_web" compatibility="5.1.000" expanded="true"
                height="60" name="Crawl Web" width="90" x="246" y="345">
        <parameter key="url" value="http://nba.sport24.gr/category/nba_news/?locale=el_gr"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value=".*article.*"/>
          <parameter key="follow_link_with_matching_url" value=".*article.*|.*.gr.*"/>
        </list>
        <parameter key="output_dir" value="C:\Users\elenious\Desktop\diplomatiki\newresults\temp"/>
        <parameter key="max_pages" value="2"/>
        <parameter key="max_depth" value="1"/>
        <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100 Safari/534.30 "/>
      </operator>
      <!-- Reads the directory registered under the key "comments" and passes each
           document through the nested sub-process. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true"
                height="76" name="Process Documents from Files" width="90" x="179" y="75">
        <list key="text_directories">
          <parameter key="comments" value="C:\Users\elenious\Desktop\diplomatiki\newresults\temp"/>
        </list>
        <parameter key="encoding" value="UTF-8"/>
        <process expanded="true" height="502" width="979">
          <!-- Only the XPath query list has an entry; its result is stored under
               the key "com".
               NOTE(review): the query uses the "h:" prefix while the namespaces
               list is empty; presumably the operator binds that prefix itself.
               Confirm against the Text Processing extension documentation. -->
          <operator activated="true" class="text:extract_information" compatibility="5.1.001" expanded="true"
                    height="60" name="Extract Information" width="90" x="246" y="165">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="com" value="//h:div[@class='body']/h:p/text()"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Receives the example set of Process Documents from Files, writes it to
           result_file and hands it on to the first result port of the process. -->
      <operator activated="true" class="write_as_text" compatibility="5.1.004" expanded="true"
                height="76" name="Write as Text" width="90" x="479" y="110">
        <parameter key="result_file" value="C:\Users\elenious\Desktop\diplomatiki\newresults\crawl\new.txt"/>
        <parameter key="encoding" value="UTF-8"/>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Write as Text" to_port="input 1"/>
      <connect from_op="Write as Text" from_port="input 1" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
???
Tagged:
0
Answers
-
Hello platanas.
Since I don't have your files, let me check that I understand:
Your problem is that you only want to save the extracted comments named "com" and you don't know how to do that, right?
You can use the "Select Attributes" operator to get the comments that you extracted with your XPath query. Just enter the name of the attribute in which your texts are stored. In your process you named it "com".
With this operator the result will contain only the extracted comments, which you can easily save to a database or a text file using a write operator.
Hope this helps.
Greetings
Miguel
0 -
Hello Miguel,
Thank you very much. This is exactly what I want to do. I use the "Select Attributes" operator and choose the attribute 'com'. After that I use the "Write Excel" operator, but the resulting Excel file contains all the attributes.
Greetings
platanas0 -
Hi,
please post your process XML so we can see what's wrong. Select Attributes followed by Write Excel works fine for me here.
Regards,
Marco0 -
Here is my XML code:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process (version 5.1.004): reads the files in a directory, runs an
     XPath query on each document, selects the attribute "com" and writes the
     example set to an Excel file. -->
<process version="5.1.004">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.004" expanded="true" name="Process">
    <parameter key="encoding" value="UTF-8"/>
    <process expanded="true" height="614" width="886">
      <!-- Crawl Web is switched off (activated="false") and has no connections in
           this process. Its output_dir is the same directory that Process
           Documents from Files reads further down.
           NOTE(review): in ".*.gr.*" the dot before "gr" is unescaped, so it
           matches any character; confirm whether "\." was intended. -->
      <operator activated="false" class="web:crawl_web" compatibility="5.1.000" expanded="true"
                height="60" name="Crawl Web" width="90" x="246" y="345">
        <parameter key="url" value="http://nba.sport24.gr/category/nba_news/?locale=el_gr"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value=".*article.*"/>
          <parameter key="follow_link_with_matching_url" value=".*article.*|.*.gr.*"/>
        </list>
        <parameter key="output_dir" value="C:\Users\elenious\Desktop\diplomatiki\newresults\temp"/>
        <parameter key="max_pages" value="2"/>
        <parameter key="max_depth" value="1"/>
        <parameter key="user_agent" value="Mozilla/5.0 (Windows NT 6.1) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.100 Safari/534.30 "/>
      </operator>
      <!-- Reads the directory registered under the key "comments" and passes each
           document through the nested sub-process. -->
      <operator activated="true" class="text:process_document_from_file" compatibility="5.1.001" expanded="true"
                height="76" name="Process Documents from Files" width="90" x="112" y="75">
        <list key="text_directories">
          <parameter key="comments" value="C:\Users\elenious\Desktop\diplomatiki\newresults\temp"/>
        </list>
        <parameter key="encoding" value="UTF-8"/>
        <process expanded="true" height="502" width="979">
          <!-- Only the XPath query list has an entry; its result is stored under
               the key "com".
               NOTE(review): the query uses the "h:" prefix while the namespaces
               list is empty; presumably the operator binds that prefix itself.
               Confirm against the Text Processing extension documentation. -->
          <operator activated="true" class="text:extract_information" compatibility="5.1.001" expanded="true"
                    height="60" name="Extract Information" width="90" x="246" y="165">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="com" value="//h:div[@class='body']/h:p/text()"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Filters the example set down to the single attribute "com".
           NOTE(review): include_special_attributes is not set here, so special
           attributes (for example document metadata) are presumably not subject
           to this filter and still reach Write Excel; confirm against the
           Select Attributes documentation. -->
      <operator activated="true" class="select_attributes" compatibility="5.1.004" expanded="true"
                height="76" name="Select Attributes (2)" width="90" x="313" y="165">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="com"/>
      </operator>
      <!-- Writes the filtered example set to excel_file and hands it on to the
           first result port of the process. -->
      <operator activated="true" class="write_excel" compatibility="5.1.004" expanded="true"
                height="60" name="Write Excel" width="90" x="581" y="255">
        <parameter key="excel_file" value="C:\Users\elenious\Desktop\diplomatiki\newresults\crawl\test.xls"/>
        <parameter key="encoding" value="UTF-8"/>
      </operator>
      <connect from_op="Process Documents from Files" from_port="example set" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Write Excel" to_port="input"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
0 -
Hi platanas,
try checking the "include special attributes" option of the "Select Attributes" operator. Since you let some metadata be appended by "Process Documents from Files", special attributes exist, and these are not filtered out by default.
Best regards
Matthias
P.S. Please consider using the CODE-Tags when posting longer parts of code to improve readability and keep postings shorter.0 -
Yes now it works fine!!!
Thank you very much0