Hello all,
I want to cluster a set of XML documents based on content that I extract from them with XPath queries. Once I have extracted the content from the XML files, I want to cluster the documents with a given clustering algorithm (e.g. k-means).
This is how I am currently extracting the textual content:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (extraction only).
  Loops over the files in the configured "wsdls" directory, cuts every file
  into segments with an XPath query and passes each segment through a
  regular-expression extraction step.
-->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <process expanded="true" height="688" width="1007">

      <!-- Outer loop: one pass of the nested process per file in the directory. -->
      <operator activated="true"
                class="text:process_document_from_file"
                expanded="true"
                height="76"
                name="Process Documents from Files (2)"
                width="90"
                x="179"
                y="120">
        <list key="text_directories">
          <parameter key="wsdls" value="/home/simon/work/workspace/MasterThesis/wsdls"/>
        </list>
        <!-- Markup is kept because the nested XPath query needs the XML structure. -->
        <parameter key="extract_text_only" value="false"/>
        <process expanded="true" height="688" width="1007">

          <!--
            Segmentation step. The XPath below selects the name attribute of
            every wsdl:operation element; the "wsdl" prefix is bound in the
            namespaces list. The results leave through the "documents" port.
          -->
          <operator activated="true"
                    class="text:cut_document"
                    expanded="true"
                    height="60"
                    name="Cut Document (3)"
                    width="90"
                    x="458"
                    y="30">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="operation" value="//wsdl:operation/@name"/>
            </list>
            <list key="namespaces">
              <parameter key="wsdl" value="http://schemas.xmlsoap.org/wsdl/"/>
            </list>
            <parameter key="ignore_CDATA" value="false"/>
            <parameter key="assume_html" value="false"/>
            <list key="index_queries"/>
            <process expanded="true" height="688" width="1007">

              <!--
                Per-segment step: the regular expression (.*) is stored under
                the key "extraction".
                NOTE(review): query_type is "Regular Expression", so the
                xpath_queries and namespaces lists on this operator are
                presumably not evaluated; confirm before removing them.
              -->
              <operator activated="true"
                        class="text:extract_information"
                        expanded="true"
                        height="60"
                        name="Extract Information (3)"
                        width="90"
                        x="458"
                        y="30">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries">
                  <parameter key="extraction" value="(.*)"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="operation" value="//wsdl:operation/@name"/>
                </list>
                <list key="namespaces">
                  <parameter key="wsdl" value="http://schemas.xmlsoap.org/wsdl/"/>
                </list>
                <parameter key="ignore_CDATA" value="false"/>
                <parameter key="assume_html" value="false"/>
                <list key="index_queries"/>
              </operator>

              <!-- Wiring inside Cut Document: segment in, processed document out. -->
              <connect from_port="segment" to_op="Extract Information (3)" to_port="document"/>
              <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>

          <!-- Wiring inside the file loop: file document in, all segments out. -->
          <connect from_port="document" to_op="Cut Document (3)" to_port="document"/>
          <connect from_op="Cut Document (3)" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- No connection to a result port is defined at this level. -->
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
    </process>
  </operator>
</process>
With that, I get several examples per file, one for each piece of extracted content (there are several tags matching my XPath query). If I now feed the whole example set to, for instance, a k-means algorithm, the algorithm clusters the individual pieces of extracted content (the single results from the XPath query) and not the XML documents with regard to their extracted content.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process (extraction followed by clustering).
  The active file loop cuts every file into XPath segments, runs a
  regular-expression extraction on each segment and hands the resulting
  example set to a k-means operator. A second, deactivated file loop with a
  slightly different configuration is kept alongside it.
-->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <process expanded="true" height="688" width="1007">

      <!--
        Deactivated variant (activated="false" on this operator and on both
        nested operators). It restricts the files to *.wsdl, declares the
        content type as xml and queries the operation names as well as the
        documentation text. It is not wired to anything at this level.
      -->
      <operator activated="false"
                class="text:process_document_from_file"
                expanded="true"
                height="76"
                name="Process Documents from Files"
                width="90"
                x="715"
                y="750">
        <list key="text_directories">
          <parameter key="wsdls" value="/home/simon/work/workspace/MasterThesis/wsdls"/>
        </list>
        <parameter key="file_pattern" value="*.wsdl"/>
        <parameter key="extract_text_only" value="false"/>
        <parameter key="use_file_extension_as_type" value="false"/>
        <parameter key="content_type" value="xml"/>
        <parameter key="create_word_vector" value="false"/>
        <parameter key="keep_text" value="true"/>
        <process expanded="true">

          <!-- Two XPath queries: operation name attributes and documentation text nodes. -->
          <operator activated="false"
                    class="text:cut_document"
                    expanded="true"
                    height="60"
                    name="Cut Document"
                    width="90"
                    x="246"
                    y="75">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="operationName" value="//wsdl:operation/@name"/>
              <parameter key="documentation" value="//wsdl:documentation/text()"/>
            </list>
            <list key="namespaces">
              <parameter key="wsdl" value="http://schemas.xmlsoap.org/wsdl/"/>
            </list>
            <parameter key="ignore_CDATA" value="false"/>
            <parameter key="assume_html" value="false"/>
            <list key="index_queries"/>
            <process expanded="true">

              <!-- Regular expression (.*) stored under the key "all". -->
              <operator activated="false"
                        class="text:extract_information"
                        expanded="true"
                        height="60"
                        name="Extract Information"
                        width="90"
                        x="514"
                        y="120">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries">
                  <parameter key="all" value="(.*)"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries"/>
                <list key="namespaces"/>
                <list key="index_queries"/>
              </operator>

              <connect from_port="segment" to_op="Extract Information" to_port="document"/>
              <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>

          <connect from_port="document" to_op="Cut Document" to_port="document"/>
          <connect from_op="Cut Document" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!--
        Active file loop; its "example set" output feeds the clustering
        operator further down.
        NOTE(review): every segment produced by Cut Document appears to end
        up as its own example, so the clustering below would group segments
        rather than files; confirm against the resulting example set.
      -->
      <operator activated="true"
                class="text:process_document_from_file"
                expanded="true"
                height="76"
                name="Process Documents from Files (2)"
                width="90"
                x="45"
                y="30">
        <list key="text_directories">
          <parameter key="wsdls" value="/home/simon/work/workspace/MasterThesis/wsdls"/>
        </list>
        <parameter key="extract_text_only" value="false"/>
        <process expanded="true">

          <!-- Segmentation by XPath: the name attribute of every wsdl:operation element. -->
          <operator activated="true"
                    class="text:cut_document"
                    expanded="true"
                    height="60"
                    name="Cut Document (3)"
                    width="90"
                    x="380"
                    y="75">
            <parameter key="query_type" value="XPath"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries"/>
            <list key="regular_region_queries"/>
            <list key="xpath_queries">
              <parameter key="operation" value="//wsdl:operation/@name"/>
            </list>
            <list key="namespaces">
              <parameter key="wsdl" value="http://schemas.xmlsoap.org/wsdl/"/>
            </list>
            <parameter key="ignore_CDATA" value="false"/>
            <parameter key="assume_html" value="false"/>
            <list key="index_queries"/>
            <process expanded="true">

              <!--
                Regular expression (.*) stored under the key "extraction".
                NOTE(review): query_type is "Regular Expression", so the
                xpath_queries and namespaces lists on this operator are
                presumably not evaluated; confirm before removing them.
              -->
              <operator activated="true"
                        class="text:extract_information"
                        expanded="true"
                        height="60"
                        name="Extract Information (3)"
                        width="90"
                        x="179"
                        y="30">
                <parameter key="query_type" value="Regular Expression"/>
                <list key="string_machting_queries"/>
                <list key="regular_expression_queries">
                  <parameter key="extraction" value="(.*)"/>
                </list>
                <list key="regular_region_queries"/>
                <list key="xpath_queries">
                  <parameter key="operation" value="//wsdl:operation/@name"/>
                </list>
                <list key="namespaces">
                  <parameter key="wsdl" value="http://schemas.xmlsoap.org/wsdl/"/>
                </list>
                <parameter key="ignore_CDATA" value="false"/>
                <parameter key="assume_html" value="false"/>
                <list key="index_queries"/>
              </operator>

              <connect from_port="segment" to_op="Extract Information (3)" to_port="document"/>
              <connect from_op="Extract Information (3)" from_port="document" to_port="document 1"/>
              <portSpacing port="source_segment" spacing="0"/>
              <portSpacing port="sink_document 1" spacing="0"/>
              <portSpacing port="sink_document 2" spacing="0"/>
            </process>
          </operator>

          <connect from_port="document" to_op="Cut Document (3)" to_port="document"/>
          <connect from_op="Cut Document (3)" from_port="documents" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- k-means with no explicit parameters set; only the cluster model is routed to a result port. -->
      <operator activated="true"
                class="k_means"
                expanded="true"
                height="76"
                name="Clustering"
                width="90"
                x="581"
                y="120"/>

      <connect from_op="Process Documents from Files (2)" from_port="example set" to_op="Clustering" to_port="example set"/>
      <connect from_op="Clustering" from_port="cluster model" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Now my question is: what do I have to do so that I can cluster the individual documents with regard to the content extracted from them? I think there must be a way, as metadata about the file name is included.
I hope I have described my problem comprehensibly.
regards
simon knoll