Webpage crawling: numerics and tokenize
Hey,
I have a webpage crawling problem: I would like to read prices from a table like this one:
https://www.epexspot.com/de/marktdaten/intradaycontinuous/intraday-table/2018-10-24/DE
In order to get the content of the webpage I built the following process. Please have a look here:
<?xml version="1.0" encoding="UTF-8"?>
<process version="9.0.003">
  <!-- Step 1: the "get page" operator (web:get_webpage) requests the EPEX SPOT
       intraday table page for 2018-10-24 / DE using the GET method, with a random
       user agent, redirects followed and all cookies accepted. -->
  <operator activated="true"
            class="web:get_webpage"
            compatibility="7.3.000"
            expanded="true"
            height="68"
            name="get page"
            width="90"
            x="45"
            y="34">
    <parameter key="url" value="https://www.epexspot.com/de/marktdaten/intradaycontinuous/intraday-table/2018-10-24/DE"/>
    <parameter key="random_user_agent" value="true"/>
    <!-- Both timeouts are set to the same value; the unit is not stated in this snippet. -->
    <parameter key="connection_timeout" value="50000"/>
    <parameter key="read_timeout" value="50000"/>
    <parameter key="follow_redirects" value="true"/>
    <parameter key="accept_cookies" value="all"/>
    <parameter key="cookie_scope" value="global"/>
    <parameter key="request_method" value="GET"/>
    <!-- No extra query parameters or request properties are configured. -->
    <list key="query_parameters"/>
    <list key="request_properties"/>
    <!-- The encoding is not overridden; "SYSTEM" is the configured encoding value. -->
    <parameter key="override_encoding" value="false"/>
    <parameter key="encoding" value="SYSTEM"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<process version="9.0.003">
  <!-- Step 2: the "Documents to Data (2)" operator (text:documents_to_data) is
       configured to use "Inhalt" as the text attribute and to add meta information. -->
  <operator activated="true"
            class="text:documents_to_data"
            compatibility="8.1.000"
            expanded="true"
            height="82"
            name="Documents to Data (2)"
            width="90"
            x="179"
            y="34">
    <parameter key="text_attribute" value="Inhalt"/>
    <parameter key="add_meta_information" value="true"/>
    <!-- NOTE(review): two similarly named keys are present ("datamanagement" and
         "data_management"); both are kept exactly as they appear in the original. -->
    <parameter key="datamanagement" value="double_sparse_array"/>
    <parameter key="data_management" value="auto"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<process version="9.0.003">
  <!-- Step 3: the "Replace" operator is applied to the single nominal attribute
       "Inhalt". The pattern in replace_what is a character class covering ASCII
       letters, the space, the euro sign, German umlauts and sharp s, and the
       punctuation characters ( ) / ? & @ = " ' and the greater-or-equal sign.
       No replace_by parameter is set; presumably matches are replaced by an empty
       string (TODO confirm against the Replace operator documentation). -->
  <operator activated="true"
            class="replace"
            compatibility="9.0.003"
            expanded="true"
            height="82"
            name="Replace"
            width="90"
            x="380"
            y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Inhalt"/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="nominal"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="file_path"/>
    <parameter key="block_type" value="single_value"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="single_value"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    <!-- The ampersand is written as &amp; because a bare ampersand is not allowed in
         an XML attribute value; after parsing, the pattern still contains a literal
         ampersand, so the matched character set is unchanged. -->
    <parameter key="replace_what" value="[a-zA-Z €äöü()ÄÖÜß/?&amp;@=&quot;'≥]"/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<process version="9.0.003">
  <!-- Step 4: a second Replace operator, "Replace (3)", again applied to the single
       nominal attribute "Inhalt", this time with a fixed pattern in replace_what. -->
  <operator activated="true"
            class="replace"
            compatibility="9.0.003"
            expanded="true"
            height="82"
            name="Replace (3)"
            width="90"
            x="581"
            y="34">
    <parameter key="attribute_filter_type" value="single"/>
    <parameter key="attribute" value="Inhalt"/>
    <parameter key="attributes" value=""/>
    <parameter key="use_except_expression" value="false"/>
    <parameter key="value_type" value="nominal"/>
    <parameter key="use_value_type_exception" value="false"/>
    <parameter key="except_value_type" value="file_path"/>
    <parameter key="block_type" value="single_value"/>
    <parameter key="use_block_type_exception" value="false"/>
    <parameter key="except_block_type" value="single_value"/>
    <parameter key="invert_selection" value="false"/>
    <parameter key="include_special_attributes" value="false"/>
    <!-- NOTE(review): if replace_what is interpreted as a regular expression, each
         unescaped "." matches any character rather than a literal dot. Confirm
         whether that is intended for this pattern. -->
    <parameter key="replace_what" value="132012,.@.2018.."/>
  </operator>
</process>
<?xml version="1.0" encoding="UTF-8"?>
<process version="9.0.003">
  <!-- Step 5: the "Store" operator is configured with the repository entry
       "../Data/Extract" (a relative repository path). -->
  <operator activated="true"
            class="store"
            compatibility="9.0.003"
            expanded="true"
            height="68"
            name="Store"
            width="90"
            x="849"
            y="34">
    <parameter key="repository_entry" value="../Data/Extract"/>
  </operator>
</process>
Now I have the content of the webpage in an attribute called "Inhalt", but it's all in one cell
and I don't know how to split it into the format I wish to achieve.
After the header in the content attribute "Inhalt" it looks roughly like this:
'07-08
<>
<>25,01<>
<>50,00<>
<>36,00<>
<>44,20<>
<>44,20<>
<>44,72<>
<>43,39<>
<>4.058,0<>
<>3.553,8<>'
So I would like to transform this text into columns: time (07-08), price1 (25,01), price2 (50,00), price3 (36,00) ...
As it is not plain text, none of the text operators offered by RapidMiner seem to help me cut the content into accessible pieces as I wish.
So could you tell me which operator is best for finding numbers in such HTML-like code? I think Tokenize is getting me in the right direction, but I don't know how to get the data into several columns then...
I would be grateful for any hints or help.
Greets,
Markus