Creating wordlists from PDFs via a URL
I would like to create a wordlist (for applying a machine learning model that was specified before) with a PDF as the source. This usually works using the Process Documents operator, but I need to access the PDF via a URL. I thought about using the Web Mining extension for this.
The Get Pages operator does not work, it seems to only accept HTML as an input. The output from this operator is just a random string of "strange" characters so there seems to be a problem with the data format, i.e., with PDF.
The Process Documents from Web operator does not work at all. No wordlist can be created.
The Get Page operator also does not work because I cannot convert the PDF to a document. There seems to be no operator to do that. A PDF to Document operator would be great because then I could just use the Process Documents operator to create my wordlist.
Is there an operator that converts PDFs to documents? Is there an operator that creates a wordlist from a PDF that is accessed via a URL? Is there any other way to create wordlists from PDFs that are accessed via a URL?
You can find the code below. Thank you.
<?xml version="1.0" encoding="UTF-8"?>
<process version="8.1.000">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="8.1.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Attempt 1 (deactivated): crawl the PDF URL with Process Documents from Web.
           FIX: the start "url" previously used host db.com while the crawling rule
           used www.db.com, so the rule could never match the crawled start page and
           nothing was ever stored. Both now use the same host (www.db.com). -->
      <operator activated="false" class="web:process_web_modern" compatibility="7.3.000" expanded="true" height="68" name="Process Documents from Web" width="90" x="179" y="238">
        <parameter key="url" value="https://www.db.com/ir/en/download/Deutsche_Bank_Annual_Report_2016.pdf"/>
        <list key="crawling_rules">
          <parameter key="store_with_matching_url" value="https://www.db.com/ir/en/download/Deutsche_Bank_Annual_Report_2016.pdf"/>
        </list>
        <parameter key="user_agent" value="useragent"/>
        <parameter key="ignore_robot_exclusion" value="true"/>
        <!-- Inner document pipeline: tokenize, drop English stopwords, Porter-stem, lowercase. -->
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize" width="90" x="45" y="34"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (English)" width="90" x="179" y="34"/>
          <operator activated="true" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (Porter)" width="90" x="313" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize" to_port="document"/>
          <connect from_op="Tokenize" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
          <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Stem (Porter)" to_port="document"/>
          <connect from_op="Stem (Porter)" from_port="document" to_op="Transform Cases" to_port="document"/>
          <connect from_op="Transform Cases" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Attempt 1 continued (deactivated): convert crawl results back to documents
           and build a wordlist from them. -->
      <operator activated="false" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents (2)" width="90" x="313" y="238">
        <list key="specify_weights"/>
      </operator>
      <operator activated="false" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents (3)" width="90" x="447" y="238">
        <parameter key="keep_text" value="true"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (4)" width="90" x="45" y="34"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (3)" width="90" x="179" y="34"/>
          <operator activated="true" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (3)" width="90" x="313" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (3)" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize (4)" to_port="document"/>
          <connect from_op="Tokenize (4)" from_port="document" to_op="Filter Stopwords (3)" to_port="document"/>
          <connect from_op="Filter Stopwords (3)" from_port="document" to_op="Stem (3)" to_port="document"/>
          <connect from_op="Stem (3)" from_port="document" to_op="Transform Cases (3)" to_port="document"/>
          <connect from_op="Transform Cases (3)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Attempt 2 (deactivated): fetch the PDF as a single page with Get Page, then
           store/tokenize it. Note: Get Page returns the raw HTTP response body; a binary
           PDF is not decoded to text by this operator. -->
      <operator activated="false" class="web:get_webpage" compatibility="7.3.000" expanded="true" height="68" name="Get Page" width="90" x="45" y="391">
        <parameter key="url" value="https://www.db.com/ir/en/download/Deutsche_Bank_Annual_Report_2016.pdf"/>
        <list key="query_parameters"/>
        <list key="request_properties"/>
      </operator>
      <operator activated="false" class="store" compatibility="8.1.000" expanded="true" height="68" name="Store" width="90" x="179" y="391">
        <parameter key="repository_entry" value="../data/current_PDF/current_PDF"/>
      </operator>
      <operator activated="false" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (2)" width="90" x="313" y="391"/>
      <operator activated="false" class="generate_tfidf" compatibility="8.1.000" expanded="true" height="82" name="Generate TFIDF" width="90" x="447" y="391"/>
      <operator activated="false" class="text:process_documents" compatibility="8.1.000" expanded="true" height="82" name="Process Documents" width="90" x="179" y="493">
        <process expanded="true">
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
        </process>
      </operator>
      <!-- Attempt 3 (active): build an example set holding the PDF URL, retrieve it with
           Get Pages, convert rows to documents, and build the wordlist.
           FIX: the URLs here now match the www.db.com host used elsewhere in the process. -->
      <operator activated="true" class="operator_toolbox:create_exampleset_from_doc" compatibility="0.9.000" expanded="true" height="68" name="Create ExampleSet" width="90" x="45" y="34">
        <parameter key="Input Csv" value="https://www.db.com/ir/en/download/Deutsche_Bank_Annual_Report_2016.pdf https://www.db.com/ir/en/download/Deutsche_Bank_Annual_Report_2016.pdf"/>
      </operator>
      <operator activated="true" class="web:retrieve_webpages" compatibility="7.3.000" expanded="true" height="68" name="Get Pages" width="90" x="179" y="34">
        <parameter key="link_attribute" value="https://www.db.com/ir/en/download/Deutsche_Bank_Annual_Report_2016.pdf"/>
        <parameter key="page_attribute" value="PDF"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="8.1.000" expanded="true" height="68" name="Data to Documents" width="90" x="313" y="34">
        <list key="specify_weights"/>
      </operator>
      <operator activated="true" class="text:process_documents" compatibility="8.1.000" expanded="true" height="103" name="Process Documents (2)" width="90" x="447" y="34">
        <parameter key="keep_text" value="true"/>
        <process expanded="true">
          <operator activated="true" class="text:tokenize" compatibility="8.1.000" expanded="true" height="68" name="Tokenize (3)" width="90" x="45" y="34"/>
          <operator activated="true" class="text:filter_stopwords_english" compatibility="8.1.000" expanded="true" height="68" name="Filter Stopwords (2)" width="90" x="179" y="34"/>
          <operator activated="true" class="text:stem_porter" compatibility="8.1.000" expanded="true" height="68" name="Stem (2)" width="90" x="313" y="34"/>
          <operator activated="true" class="text:transform_cases" compatibility="8.1.000" expanded="true" height="68" name="Transform Cases (2)" width="90" x="447" y="34"/>
          <connect from_port="document" to_op="Tokenize (3)" to_port="document"/>
          <connect from_op="Tokenize (3)" from_port="document" to_op="Filter Stopwords (2)" to_port="document"/>
          <connect from_op="Filter Stopwords (2)" from_port="document" to_op="Stem (2)" to_port="document"/>
          <connect from_op="Stem (2)" from_port="document" to_op="Transform Cases (2)" to_port="document"/>
          <connect from_op="Transform Cases (2)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <connect from_op="Process Documents from Web" from_port="example set" to_op="Data to Documents (2)" to_port="example set"/>
      <connect from_op="Data to Documents (2)" from_port="documents" to_op="Process Documents (3)" to_port="documents 1"/>
      <connect from_op="Tokenize (2)" from_port="document" to_op="Generate TFIDF" to_port="example set input"/>
      <connect from_op="Create ExampleSet" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents (2)" to_port="documents 1"/>
      <connect from_op="Process Documents (2)" from_port="example set" to_port="result 1"/>
      <connect from_op="Process Documents (2)" from_port="word list" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>