"xml names cannot begin with character"

szabo
szabo New Altair Community Member
edited November 5 in Community Q&A
My process stops in certain cases and I cannot find out what the reason is. Sometimes the message is: Process failed: The name "µ
g" is not legal for JDOM/XML attributes: XML names cannot begin with the character "µ". Do you know the reason?

my code is:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process definition (version 5.1.003).
  Data flow, as wired by the connect elements at the end of the main process:
  "Read CSV" feeds "Get Pages", which feeds "Data to Documents", which feeds
  "Multiply". The two outputs of "Multiply" go to "Process Documents" and
  "Process Documents (2)"; the example set of each is written to its own
  Excel file.
-->
<process version="5.1.003">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.1.003" expanded="true" name="Process">
    <process expanded="true" height="426" width="1036">
      <!-- Reads the input table from C:\input.csv. -->
      <operator activated="true" class="read_csv" compatibility="5.1.003" expanded="true" height="60" name="Read CSV" width="90" x="45" y="120">
        <parameter key="csv_file" value="C:\input.csv"/>
        <list key="annotations"/>
        <list key="data_set_meta_data_information"/>
      </operator>
      <!--
        Web page retrieval step. link_attribute is "link" and page_attribute is "oldal";
        presumably the URL is read from the former and the fetched page stored in the
        latter (TODO confirm against the operator documentation).
        Redirects are not followed (follow_redirects is false).
      -->
      <operator activated="true" class="web:retrieve_webpages" compatibility="5.1.000" expanded="true" height="60" name="Get Pages" width="90" x="179" y="120">
        <parameter key="link_attribute" value="link"/>
        <parameter key="page_attribute" value="oldal"/>
        <parameter key="random_user_agent" value="true"/>
        <parameter key="follow_redirects" value="false"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="5.1.001" expanded="true" height="60" name="Data to Documents" width="90" x="313" y="120">
        <list key="specify_weights"/>
      </operator>
      <!-- Fans the documents out to the two Process Documents branches below. -->
      <operator activated="true" class="multiply" compatibility="5.1.003" expanded="true" height="94" name="Multiply" width="90" x="447" y="165"/>
      <!--
        Branch fed by "Multiply" output 2: cuts each document with an XPath query
        ("termeklap") and extracts three values ("kategoria", "kategoria2",
        "kategoria3") from the first three h:a elements of each segment.
        Word vector creation is switched off.
      -->
      <operator activated="true" class="text:process_documents" compatibility="5.1.001" expanded="true" height="94" name="Process Documents (2)" width="90" x="648" y="30">
        <parameter key="create_word_vector" value="false"/>
        <process expanded="true" height="428" width="634">
          <operator activated="true" class="text:cut_document" compatibility="5.1.001" expanded="true" height="60" name="Cut Document (3)" width="90" x="313" y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="termeklap" value="h:html/h:body/h:table[2]/h:tr/h:td[2]/h:table[1]/h:tr/h:td[1]/h:div[1]"/>
            </list>
            <!-- NOTE(review): the XPath queries use the h: prefix but this namespaces list is empty; presumably the prefix is bound by an operator default. Confirm. -->
            <list key="namespaces"/>
            <list key="index_queries"/>
            <process expanded="true" height="446" width="652">
              <operator activated="true" class="text:extract_information" compatibility="5.1.001" expanded="true" height="60" name="Extract Information (3)" width="90" x="281" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="kategoria" value="//h:a[1]/text()"/>
                  <parameter key="kategoria2" value="//h:a[2]/text()"/>
                  <parameter key="kategoria3" value="//h:a[3]/text()"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="segment" to_op="Extract Information (3)" to_port="document"/>
              <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="document" to_op="Cut Document (3)" to_port="document"/>
          <connect from_op="Cut Document (3)" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Receives the example set of "Process Documents (2)" and writes it to C:\termek_kategoriak.xls. -->
      <operator activated="true" class="write_excel" compatibility="5.1.003" expanded="true" height="60" name="Write Excel" width="90" x="849" y="75">
        <parameter key="excel_file" value="C:\termek_kategoriak.xls"/>
      </operator>
      <!--
        Branch fed by "Multiply" output 1: cuts each document down to h:html/h:head
        ("termeklap") and extracts the text of the h:title element as "termeknev".
        Word vector creation is switched off.
      -->
      <operator activated="true" class="text:process_documents" compatibility="5.1.001" expanded="true" height="94" name="Process Documents" width="90" x="648" y="210">
        <parameter key="create_word_vector" value="false"/>
        <process expanded="true" height="446" width="652">
          <operator activated="true" class="text:cut_document" compatibility="5.1.001" expanded="true" height="60" name="Cut Document (2)" width="90" x="246" y="75">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="termeklap" value="h:html/h:head"/>
            </list>
            <list key="namespaces"/>
            <list key="index_queries"/>
            <process expanded="true" height="446" width="652">
              <operator activated="true" class="text:extract_information" compatibility="5.1.001" expanded="true" height="60" name="Extract Information (2)" width="90" x="281" y="30">
                <parameter key="query_type" value="XPath"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries"/>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="termeknev" value="//h:title/text()"/>
                </list>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>
              <connect from_port="segment" to_op="Extract Information (2)" to_port="document"/>
              <connect from_op="Extract Information (2)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="document" to_op="Cut Document (2)" to_port="document"/>
          <connect from_op="Cut Document (2)" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>
      <!-- Receives the example set of "Process Documents" and writes it to C:\output.xls. -->
      <operator activated="true" class="write_excel" compatibility="5.1.003" expanded="true" height="60" name="Write Excel (2)" width="90" x="849" y="300">
        <parameter key="excel_file" value="C:\output.xls"/>
      </operator>
      <!-- Wiring of the main process: a linear chain up to "Multiply", then the two parallel branches and the result ports. -->
      <connect from_op="Read CSV" from_port="output" to_op="Get Pages" to_port="Example Set"/>
      <connect from_op="Get Pages" from_port="Example Set" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_op="Multiply" to_port="input"/>
      <connect from_op="Multiply" from_port="output 1" to_op="Process Documents" to_port="documents 1"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Process Documents (2)" to_port="documents 1"/>
      <connect from_op="Process Documents (2)" from_port="example set" to_op="Write Excel" to_port="input"/>
      <connect from_op="Process Documents (2)" from_port="word list" to_port="result 1"/>
      <connect from_op="Write Excel" from_port="through" to_port="result 2"/>
      <connect from_op="Process Documents" from_port="example set" to_op="Write Excel (2)" to_port="input"/>
      <connect from_op="Process Documents" from_port="word list" to_port="result 3"/>
      <connect from_op="Write Excel (2)" from_port="through" to_port="result 4"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
      <portSpacing port="sink_result 4" spacing="0"/>
      <portSpacing port="sink_result 5" spacing="0"/>
    </process>
  </operator>
</process>
Tagged:

Answers

  • land
    land New Altair Community Member
    Hi,
    it seems to me there might be some encoding problems in your web page processing. Sometimes web pages do not actually use the encoding they declare, so problems can occur when trying to parse them as XML files (as the XPath expressions need to do).

    Please try to check that...

    Greetings,
      Sebastian