"Extracting text from a website using an

New Altair Community Member
Updated by Jocelyn
Hello. I'm a newbie with RapidMiner and I want to analyse the text of certain parts of web pages. From news pages I want to extract the title, the main text and the date. The text and title must be cleaned of HTML and all other tags, and the date must be kept in a date data format. Is this possible? I tried the "Get Page" and "Extract Information" operators, but the latter keeps the whole text and stores the parts I need as attributes, so I can't apply the HTML processing operator to those attributes.
So, I'm stuck with this (just a random example using the BBC News site):
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process: "Get Page" feeds its output into "Extract Information",
  which runs three regular-expression queries (Title, Story, Date) on the
  fetched document.

  The regular expressions match literal HTML markup. Inside XML attribute
  values the characters < and " must be written as &lt; and &quot;
  (and > is written as &gt; for symmetry); otherwise the file is not
  well-formed and cannot be parsed.
-->
<process version="5.1.004">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.004" expanded="true" name="Process">
    <process expanded="true" height="460" width="681">

      <!-- Step 1: retrieve the web page given by the "url" parameter. -->
      <operator activated="true" class="web:get_webpage" compatibility="5.1.000" expanded="true"
                height="60" name="Get Page" width="90" x="63" y="145">
        <parameter key="url" value="http://www.bbc.co.uk/news/uk-12778022"/>
        <parameter key="random_user_agent" value="true"/>
        <list key="query_parameters"/>
      </operator>

      <!--
        Step 2: apply the regular-expression queries to the document.
        Each query has exactly one capture group, (.*?), which is the part
        of interest. NOTE(review): the captured text presumably still
        contains any nested HTML tags; confirm and strip them in a later
        step if clean text is required.
      -->
      <operator activated="true" class="text:extract_information" compatibility="5.1.001" expanded="true"
                height="60" name="Extract Information" width="90" x="209" y="147">
        <parameter key="query_type" value="Regular Expression"/>
        <!-- Key spelling below is kept verbatim; it is read by the tool. -->
        <list key="string_machting_queries"/>
        <list key="regular_expression_queries">
          <!-- Headline: content of the h1 element with class "story-header". -->
          <parameter key="Title" value="&lt;h1 class=&quot;story-header&quot;&gt;(.*?)&lt;/h1&gt;"/>
          <!-- Body: from the introduction paragraph up to the closing div followed by the "story-body" marker comment. -->
          <parameter key="Story" value="&lt;p class=&quot;introduction&quot; id=&quot;story_continues_1&quot;&gt;(.*?)&lt;/div&gt;&lt;!-- / story-body --&gt;"/>
          <!-- Date: content of the span element with class "date". -->
          <parameter key="Date" value="&lt;span class=&quot;date&quot;&gt;(.*?)&lt;/span&gt;"/>
        </list>
        <list key="regular_region_queries"/>
        <list key="xpath_queries"/>
        <list key="namespaces"/>
        <list key="index_queries"/>
      </operator>

      <!-- Wiring: page output goes to the document input of the extractor. -->
      <connect from_op="Get Page" from_port="output" to_op="Extract Information" to_port="document"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process>