"How to extract text from multiple webpages"
ArnoG
New Altair Community Member
I would like to extract text from blogs. I created an Excel sheet with URLs of blog and news pages. In RapidMiner I use the "Get Pages" operator to extract the page content. At the moment I extract the content of the entire page, but I just want to extract the text of the blog.
Does anybody know how to extract only the text?
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process: reads links from an Excel sheet, passes them to "Get Pages",
     converts the resulting rows to documents and builds a term-occurrence word vector. -->
<process version="6.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<process expanded="true">
<!-- Step 1: Read Excel. Imports cell range A1:A2 with first_row_as_names disabled;
     row 0 is annotated as "Name" and column 0 is declared as the attribute "Link" (type file_path). -->
<operator activated="true" class="read_excel" compatibility="6.0.002" expanded="true" height="60" name="Read Excel" width="90" x="45" y="75">
<!-- NOTE(review): absolute local path; it will need adjusting to run this process on another machine. -->
<parameter key="excel_file" value="C:\Improve Your Business\Qing\Sales\Conquaestor\Links KLM.xlsx"/>
<parameter key="sheet_number" value="2"/>
<parameter key="imported_cell_range" value="A1:A2"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="Link.true.file_path.attribute"/>
</list>
</operator>
<!-- Step 2: Get Pages (web:retrieve_webpages). Uses the "Link" attribute as the URL source;
     redirects are not followed. -->
<operator activated="true" class="web:retrieve_webpages" compatibility="5.3.001" expanded="true" height="60" name="Get Pages" width="90" x="179" y="75">
<parameter key="link_attribute" value="Link"/>
<parameter key="follow_redirects" value="false"/>
</operator>
<!-- Step 3: Generate ID. No parameters are set here, so the operator runs with its defaults. -->
<operator activated="true" class="generate_id" compatibility="6.0.002" expanded="true" height="76" name="Generate ID" width="90" x="313" y="30"/>
<!-- Step 4: Data to Documents. The specify_weights list is left empty. -->
<operator activated="true" class="text:data_to_documents" compatibility="5.3.002" expanded="true" height="60" name="Data to Documents" width="90" x="514" y="75">
<list key="specify_weights"/>
</operator>
<!-- Step 5: Process Documents. Creates the vector using "Term Occurrences" with keep_text enabled
     (presumably this retains the text next to the vector; confirm in the operator documentation).
     The nested process below is the per-document chain. -->
<operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="179" y="210">
<parameter key="vector_creation" value="Term Occurrences"/>
<parameter key="keep_text" value="true"/>
<process expanded="true">
<!-- Inner operators, in wiring order: Extract Content, Tokenize, Transform Cases,
     Filter Stopwords (English), Filter Tokens (by Length). -->
<operator activated="true" class="web:extract_html_text_content" compatibility="5.3.001" expanded="true" height="60" name="Extract Content" width="90" x="45" y="30"/>
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="179" y="75"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="313" y="75"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="447" y="75"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="715" y="75">
<!-- min_chars is set to 3 for the length filter. -->
<parameter key="min_chars" value="3"/>
</operator>
<!-- Inner wiring: the incoming document goes through the five operators above in sequence
     and the last one feeds the subprocess output port "document 1". -->
<connect from_port="document" to_op="Extract Content" to_port="document"/>
<connect from_op="Extract Content" from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<!-- Outer wiring: Read Excel, Get Pages, Generate ID, Data to Documents, Process Documents.
     The example set of Process Documents goes to "result 1" and its word list to "result 2". -->
<connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
<connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
<connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
<connect from_op="Process Documents" from_port="word list" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
Does anybody know how to extract only the text?
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process: reads links from an Excel sheet, passes them to "Get Pages",
     converts the resulting rows to documents and builds a term-occurrence word vector. -->
<process version="6.0.002">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="6.0.002" expanded="true" name="Process">
<process expanded="true">
<!-- Step 1: Read Excel. Imports cell range A1:A2 with first_row_as_names disabled;
     row 0 is annotated as "Name" and column 0 is declared as the attribute "Link" (type file_path). -->
<operator activated="true" class="read_excel" compatibility="6.0.002" expanded="true" height="60" name="Read Excel" width="90" x="45" y="75">
<!-- NOTE(review): absolute local path; it will need adjusting to run this process on another machine. -->
<parameter key="excel_file" value="C:\Improve Your Business\Qing\Sales\Conquaestor\Links KLM.xlsx"/>
<parameter key="sheet_number" value="2"/>
<parameter key="imported_cell_range" value="A1:A2"/>
<parameter key="first_row_as_names" value="false"/>
<list key="annotations">
<parameter key="0" value="Name"/>
</list>
<list key="data_set_meta_data_information">
<parameter key="0" value="Link.true.file_path.attribute"/>
</list>
</operator>
<!-- Step 2: Get Pages (web:retrieve_webpages). Uses the "Link" attribute as the URL source;
     redirects are not followed. -->
<operator activated="true" class="web:retrieve_webpages" compatibility="5.3.001" expanded="true" height="60" name="Get Pages" width="90" x="179" y="75">
<parameter key="link_attribute" value="Link"/>
<parameter key="follow_redirects" value="false"/>
</operator>
<!-- Step 3: Generate ID. No parameters are set here, so the operator runs with its defaults. -->
<operator activated="true" class="generate_id" compatibility="6.0.002" expanded="true" height="76" name="Generate ID" width="90" x="313" y="30"/>
<!-- Step 4: Data to Documents. The specify_weights list is left empty. -->
<operator activated="true" class="text:data_to_documents" compatibility="5.3.002" expanded="true" height="60" name="Data to Documents" width="90" x="514" y="75">
<list key="specify_weights"/>
</operator>
<!-- Step 5: Process Documents. Creates the vector using "Term Occurrences" with keep_text enabled
     (presumably this retains the text next to the vector; confirm in the operator documentation).
     The nested process below is the per-document chain. -->
<operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="179" y="210">
<parameter key="vector_creation" value="Term Occurrences"/>
<parameter key="keep_text" value="true"/>
<process expanded="true">
<!-- Inner operators, in wiring order: Extract Content, Tokenize, Transform Cases,
     Filter Stopwords (English), Filter Tokens (by Length). -->
<operator activated="true" class="web:extract_html_text_content" compatibility="5.3.001" expanded="true" height="60" name="Extract Content" width="90" x="45" y="30"/>
<operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="179" y="75"/>
<operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="313" y="75"/>
<operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="447" y="75"/>
<operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="715" y="75">
<!-- min_chars is set to 3 for the length filter. -->
<parameter key="min_chars" value="3"/>
</operator>
<!-- Inner wiring: the incoming document goes through the five operators above in sequence
     and the last one feeds the subprocess output port "document 1". -->
<connect from_port="document" to_op="Extract Content" to_port="document"/>
<connect from_op="Extract Content" from_port="document" to_op="Tokenize" to_port="document"/>
<connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
<connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
<connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
<connect from_op="Filter Tokens (by Length)" from_port="document" to_port="document 1"/>
<portSpacing port="source_document" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<!-- Outer wiring: Read Excel, Get Pages, Generate ID, Data to Documents, Process Documents.
     The example set of Process Documents goes to "result 1" and its word list to "result 2". -->
<connect from_op="Read Excel" from_port="output" to_op="Get Pages" to_port="Example Set"/>
<connect from_op="Get Pages" from_port="Example Set" to_op="Generate ID" to_port="example set input"/>
<connect from_op="Generate ID" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
<connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
<connect from_op="Process Documents" from_port="example set" to_port="result 1"/>
<connect from_op="Process Documents" from_port="word list" to_port="result 2"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
<portSpacing port="sink_result 3" spacing="0"/>
</process>
</operator>
</process>
Tagged:
0
Answers
-
Hi,
with Extract Information you can use XPaths to extract parts of the sites after you have crawled them. Does that help?
Best regards,
Marius0 -
Hi Marius,
That helps a little. I have two problems:
1: I would like to scrape Google blog search, for instance for the keywords Lufthansa & Customer Service. That gives me hundreds of URLs to blog/news sites. Many of these sites need a different XPath to select the text. Is there a general XPath to select the text from a webpage/blog site?
2: When I use the operator "Get Pages", RapidMiner stores the content in a new attribute. How can I extract the text content of that attribute and use it as input to create a word vector?
I hope you can give me some tips.
0 -
1. Unfortunately there is no general XPath. If it's enough to get rid of the HTML tags, the Extract Content operator will do.
2. First of all I would store the result of Get Pages into the repository with the Store operator so that you don't need to crawl the web again, especially during the testing phase. Afterwards, you have to use Nominal to Text on the attribute containing the text, and then pass the example set into Process Documents from Data. For usage of that operator please have a look at this blog series: http://vancouverdata.blogspot.de/2010/11/text-analytics-with-rapidminer-loading.html
Best regards,
Marius0