🎉Community Raffle - Win $25

An exclusive raffle opportunity for active members like you! Complete your profile, answer questions and get your first accepted badge to enter the raffle.
Join and Win

Read Document Error & Skipping Over Errors

User: "carl"
New Altair Community Member
Updated by Jocelyn

I get the following error from the Read Document operator (inside Loop Examples after Read Excel with the input URLs).  It stops after successfully reading several hundred records.  I have a log that tells me where the process stops, but do not see anything obviously wrong with the input URL.

 

Any thoughts on the possible cause?  And is there a way to skip past any troublesome input URLs rather than stopping the process with no output?

Error.jpg

<?xml version="1.0" encoding="UTF-8"?><process version="7.3.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="7.3.000" expanded="true" name="Process">
<process expanded="true">
<!-- Read Excel: loads the spreadsheet named in excel_file (the list of input URLs).
     Column 0 is declared as "SvceDef Link" in the meta data entry; the dotted suffix is
     presumably selected.type.role (TODO confirm in the import wizard). -->
<operator activated="true"
          class="read_excel"
          compatibility="7.3.000"
          expanded="true"
          height="68"
          name="Read Excel"
          width="90"
          x="45"
          y="34">
  <parameter key="excel_file" value="/Users/carl/Documents/SD PDFs.xlsx"/>
  <parameter key="imported_cell_range" value="A"/>
  <!-- The first row is not used as attribute names; row 0 is annotated as "Name" instead. -->
  <parameter key="first_row_as_names" value="false"/>
  <list key="annotations">
    <parameter key="0" value="Name"/>
  </list>
  <parameter key="locale" value="English (United Kingdom)"/>
  <list key="data_set_meta_data_information">
    <parameter key="0" value="SvceDef Link.true.file_path.attribute"/>
  </list>
</operator>
<operator activated="true" class="loop_examples" compatibility="7.3.000" expanded="true" height="103" name="Loop Examples" width="90" x="179" y="34">
<process expanded="true">
<!-- Extract Macro: copies the "SvceDef Link" data value of the example at index %{example}
     into the macro GetURL, which Open File and Generate Attributes reference as %{GetURL}. -->
<operator activated="true"
          class="extract_macro"
          compatibility="7.3.000"
          expanded="true"
          height="68"
          name="Extract Macro"
          width="90"
          x="45"
          y="136">
  <parameter key="macro" value="GetURL"/>
  <parameter key="macro_type" value="data_value"/>
  <parameter key="attribute_name" value="SvceDef Link"/>
  <parameter key="example_index" value="%{example}"/>
  <list key="additional_macros"/>
</operator>
<!-- Log: writes one column named "Log" to the file below, sourced from the apply count of
     the Extract Macro operator. The post uses this log to see where the process stops. -->
<operator activated="true"
          class="log"
          compatibility="7.3.000"
          expanded="true"
          height="82"
          name="Log"
          width="90"
          x="179"
          y="136">
  <parameter key="filename" value="/Users/carl/Documents/Log.log"/>
  <list key="log">
    <parameter key="Log" value="operator.Extract Macro.value.applycount"/>
  </list>
</operator>
<!-- Open File: opens the resource at the URL held in the GetURL macro and hands it on as a
     file object (consumed by Read Document). -->
<operator activated="true"
          class="open_file"
          compatibility="7.3.000"
          expanded="true"
          height="68"
          name="Open File"
          width="90"
          x="246"
          y="34">
  <parameter key="resource_type" value="URL"/>
  <parameter key="url" value="%{GetURL}"/>
</operator>
<!-- Read Document: parses the incoming file as a document. The content type is fixed to
     "pdf", so a URL that does not return a PDF is a plausible cause of the reported error
     (NOTE(review): confirm against the failing URL from the log). -->
<operator activated="true"
          class="text:read_document"
          compatibility="7.3.000"
          expanded="true"
          height="68"
          name="Read Document"
          width="90"
          x="380"
          y="34">
  <parameter key="content_type" value="pdf"/>
  <parameter key="encoding" value="UTF-8"/>
</operator>
<!-- Extract Information: runs two regular expression queries against each document and
     stores the results under the names "Ark" and "Mainframe" (the attributes that
     Select Attributes keeps later in the process).
     Fixes relative to the original paste:
       * "Mainframe" was "Mm]ainframe": the opening "[" was missing, so the pattern asked for
         the literal text "Mm]ainframe" and could not match "Mainframe" or "mainframe".
       * "Ark" used "Cent[re|er]s?": square brackets form a character class (a single r, e
         or |), not an alternation, so only one character after "Cent" was matched. A
         non-capturing group (?:re|er) is used so that no extra capture group is added. -->
<operator activated="true" class="text:extract_information" compatibility="7.3.000" expanded="true" height="68" name="Extract Information" width="90" x="514" y="34">
<parameter key="query_type" value="Regular Expression"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries">
<parameter key="Ark" value="[Cc]rown [Hh]osting|Ark Data Cent(?:re|er)s?|Cody Park|[sS]kyscape|Spring Park"/>
<parameter key="Mainframe" value="[Mm]ainframe"/>
</list>
<list key="regular_region_queries"/>
<list key="xpath_queries"/>
<list key="namespaces"/>
<list key="index_queries"/>
<list key="jsonpath_queries"/>
</operator>
<!-- Documents to Data: converts the incoming document(s) to an example set; the text
     attribute is named "OriginalText". -->
<operator activated="true"
          class="text:documents_to_data"
          compatibility="7.3.000"
          expanded="true"
          height="82"
          name="Documents to Data"
          width="90"
          x="648"
          y="34">
  <parameter key="text_attribute" value="OriginalText"/>
</operator>
<!-- Generate Attributes: adds an attribute "URL" whose expression is the GetURL macro, so
     each output row records which input URL it came from. -->
<operator activated="true"
          class="generate_attributes"
          compatibility="7.3.000"
          expanded="true"
          height="82"
          name="Generate Attributes"
          width="90"
          x="782"
          y="34">
  <list key="function_descriptions">
    <parameter key="URL" value="%{GetURL}"/>
  </list>
</operator>
<!-- Wiring inside Loop Examples.
     Chain 1: loop input, then Extract Macro, then Log (Log's output is not connected onward).
     Chain 2: Open File, Read Document, Extract Information, Documents to Data,
              Generate Attributes, then loop output 1.
     Open File has no incoming connection; it only depends on the GetURL macro. -->
<connect from_port="example set" to_op="Extract Macro" to_port="example set"/>
<connect from_op="Extract Macro" from_port="example set" to_op="Log" to_port="through 1"/>
<connect from_op="Open File" from_port="file" to_op="Read Document" to_port="file"/>
<connect from_op="Read Document" from_port="output" to_op="Extract Information" to_port="document"/>
<connect from_op="Extract Information" from_port="document" to_op="Documents to Data" to_port="documents 1"/>
<connect from_op="Documents to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
<connect from_op="Generate Attributes" from_port="example set output" to_port="output 1"/>
<portSpacing port="source_example set" spacing="0"/>
<portSpacing port="sink_example set" spacing="0"/>
<portSpacing port="sink_output 1" spacing="0"/>
<portSpacing port="sink_output 2" spacing="0"/>
</process>
</operator>
<!-- Append: merges the example sets delivered by Loop Examples into one set. -->
<operator activated="true"
          class="append"
          compatibility="7.3.000"
          expanded="true"
          height="82"
          name="Append"
          width="90"
          x="313"
          y="34"/>
<!-- Select Attributes: filter type "subset" with the attribute list URL|Ark|Mainframe.
     The single "attribute" parameter (Text) is presumably unused in subset mode
     (TODO confirm). -->
<operator activated="true"
          class="select_attributes"
          compatibility="7.3.000"
          expanded="true"
          height="82"
          name="Select Attributes"
          width="90"
          x="447"
          y="34">
  <parameter key="attribute_filter_type" value="subset"/>
  <parameter key="attribute" value="Text"/>
  <parameter key="attributes" value="URL|Ark|Mainframe"/>
</operator>
<!-- Filter Examples: two filter entries, "Ark is not missing" and "Mainframe is not
     missing". filters_logic_and is false, so the entries are presumably combined with OR,
     keeping rows where at least one pattern was found (TODO confirm). -->
<operator activated="true"
          class="filter_examples"
          compatibility="7.3.000"
          expanded="true"
          height="103"
          name="Filter Examples"
          width="90"
          x="581"
          y="34">
  <list key="filters_list">
    <parameter key="filters_entry_key" value="Ark.is_not_missing."/>
    <parameter key="filters_entry_key" value="Mainframe.is_not_missing."/>
  </list>
  <parameter key="filters_logic_and" value="false"/>
</operator>
<!-- Reorder Attributes: puts the columns in the order URL, Mainframe, Ark. -->
<operator activated="true"
          class="order_attributes"
          compatibility="7.3.000"
          expanded="true"
          height="82"
          name="Reorder Attributes"
          width="90"
          x="715"
          y="34">
  <parameter key="attribute_ordering" value="URL|Mainframe|Ark"/>
</operator>
<!-- Write Excel: saves the filtered, reordered matches to the spreadsheet below. -->
<operator activated="true"
          class="write_excel"
          compatibility="7.3.000"
          expanded="true"
          height="82"
          name="Write Excel"
          width="90"
          x="849"
          y="34">
  <parameter key="excel_file" value="/Users/carl/Documents/Service Definition Matches.xlsx"/>
</operator>
<!-- Top-level wiring: Read Excel, Loop Examples, Append, Select Attributes, Filter Examples,
     Reorder Attributes, Write Excel, then result 1.
     The pasted process ended after the last portSpacing line; the three closing tags at the
     bottom were missing and are restored so the document is well-formed XML. -->
<connect from_port="input 1" to_op="Read Excel" to_port="file"/>
<connect from_op="Read Excel" from_port="output" to_op="Loop Examples" to_port="example set"/>
<connect from_op="Loop Examples" from_port="output 1" to_op="Append" to_port="example set 1"/>
<connect from_op="Append" from_port="merged set" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
<connect from_op="Filter Examples" from_port="example set output" to_op="Reorder Attributes" to_port="example set input"/>
<connect from_op="Reorder Attributes" from_port="example set output" to_op="Write Excel" to_port="input"/>
<connect from_op="Write Excel" from_port="through" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="source_input 2" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>

Find more posts tagged with

Sort by:
1 - 1 of 11
    User: "sgenzer"
    Altair Employee
    Accepted Answer

    hi...ok I've looked at your process.  Some thoughts..

    - Are all the URLs that you're going through pointing to PDF files?  Your Read Document operator is only looking for pdfs.

    - I tend not to use the Open File operator to get a web page.  I prefer to use the "Get Page" operator in the Web Mining extension.  There's a lot more functionality there.

    - That yellow text warning is what you want.  It's telling you that Handle Exception is skipping over the operator "Read Document" when it cannot do it.  If it were me, I would put both the Open File and the Read Document in the "Try" section.

    - That red text warning is telling you that whatever succeeds in the Handle Exception and is being passed onto Extract Information is not always a document, and hence it gives you an error (Extract Information requires a document).

     

    SO if it were me, I would try the following:

    - Place ALL the operators inside the Loop Examples inside the Handle Exception.  This way it skips over any problems it has along the way, and only passes complete successes to the output.

    - Rebuild the URL grab using Get Page rather than Open File.

     

    Scott