"[SOLVED] How to filter documents by their length?"
New Altair Community Member
Hi forum,
I wonder if somebody knows how to filter documents by their length instead of their content. I need to remove short documents from my training examples. But I could only find "Filter Document by Contents" which I think cannot be used for this case.
Thanks all
I wonder if somebody knows how to filter documents by their length instead of their content. I need to remove short documents from my training examples. But I could only find "Filter Document by Contents" which I think cannot be used for this case.
Thanks all
See the answer below for a better way of doing it. However, it is also possible to do it this way, which allows for more complex filtering if needed:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Demo process: filter documents by the length of their text.

  Two independent branches run the same chain of operators:
    Create Document, Documents to Data, Generate Attributes,
    Filter Examples, Data to Documents.
  Generate Attributes adds an attribute named "length" computed as length(text);
  Filter Examples then applies the condition length.ge.50 (presumably
  "length greater than or equal to 50"; confirm against the operator docs).
  The first branch uses a long text, the second uses the text "Too short".
-->
<process version="6.4.000-SNAPSHOT">
  <operator activated="true" class="process" compatibility="6.4.000-SNAPSHOT" expanded="true" name="Process">
    <process expanded="true">
      <!-- Branch 1: a document whose text is long enough to pass the filter. -->
      <operator activated="true" class="text:create_document" compatibility="6.1.000" expanded="true" height="60" name="Create Document" width="90" x="45" y="30">
        <!-- Double quotes inside an attribute value must be written as &quot;. -->
        <parameter key="text" value="This is a document. Its length is sufficient to pass the following &quot;Filter Examples&quot; operator."/>
      </operator>
      <operator activated="true" class="text:documents_to_data" compatibility="6.1.000" expanded="true" height="76" name="Documents to Data" width="90" x="179" y="30">
        <parameter key="text_attribute" value="text"/>
      </operator>
      <operator activated="true" class="generate_attributes" compatibility="6.4.000-SNAPSHOT" expanded="true" height="76" name="Generate Attributes" width="90" x="313" y="30">
        <list key="function_descriptions">
          <parameter key="length" value="length(text)"/>
        </list>
      </operator>
      <operator activated="true" class="filter_examples" compatibility="6.4.000-SNAPSHOT" expanded="true" height="94" name="Filter Examples" width="90" x="447" y="30">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="length.ge.50"/>
        </list>
        <parameter key="filters_check_metadata" value="false"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="6.1.000" expanded="true" height="60" name="Data to Documents" width="90" x="581" y="30">
        <list key="specify_weights"/>
      </operator>
      <!-- Branch 2: same chain, but the text "Too short" is only 9 characters long. -->
      <operator activated="true" class="text:create_document" compatibility="6.1.000" expanded="true" height="60" name="Create Document (2)" width="90" x="45" y="165">
        <parameter key="text" value="Too short"/>
      </operator>
      <operator activated="true" class="text:documents_to_data" compatibility="6.1.000" expanded="true" height="76" name="Documents to Data (2)" width="90" x="179" y="165">
        <parameter key="text_attribute" value="text"/>
      </operator>
      <operator activated="true" class="generate_attributes" compatibility="6.4.000-SNAPSHOT" expanded="true" height="76" name="Generate Attributes (2)" width="90" x="313" y="165">
        <list key="function_descriptions">
          <parameter key="length" value="length(text)"/>
        </list>
      </operator>
      <operator activated="true" class="filter_examples" compatibility="6.4.000-SNAPSHOT" expanded="true" height="94" name="Filter Examples (2)" width="90" x="447" y="165">
        <list key="filters_list">
          <parameter key="filters_entry_key" value="length.ge.50"/>
        </list>
        <parameter key="filters_check_metadata" value="false"/>
      </operator>
      <operator activated="true" class="text:data_to_documents" compatibility="6.1.000" expanded="true" height="60" name="Data to Documents (2)" width="90" x="581" y="165">
        <list key="specify_weights"/>
      </operator>
      <!-- Wiring for branch 1; its output goes to process port "result 1". -->
      <connect from_op="Create Document" from_port="output" to_op="Documents to Data" to_port="documents 1"/>
      <connect from_op="Documents to Data" from_port="example set" to_op="Generate Attributes" to_port="example set input"/>
      <connect from_op="Generate Attributes" from_port="example set output" to_op="Filter Examples" to_port="example set input"/>
      <connect from_op="Filter Examples" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
      <connect from_op="Data to Documents" from_port="documents" to_port="result 1"/>
      <!-- Wiring for branch 2; its output goes to process port "result 2". -->
      <connect from_op="Create Document (2)" from_port="output" to_op="Documents to Data (2)" to_port="documents 1"/>
      <connect from_op="Documents to Data (2)" from_port="example set" to_op="Generate Attributes (2)" to_port="example set input"/>
      <connect from_op="Generate Attributes (2)" from_port="example set output" to_op="Filter Examples (2)" to_port="example set input"/>
      <connect from_op="Filter Examples (2)" from_port="example set output" to_op="Data to Documents (2)" to_port="example set"/>
      <connect from_op="Data to Documents (2)" from_port="documents" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Marco
There is always "Filter Tokens by Length". This will work on documents because these are just tokens.
Here's an example:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Demo process: filter by length using "Filter Tokens (by Length)".

  Three sample documents (a long sentence, the word "short", and a
  medium-length phrase) are fed into Combine Documents. The combined output
  is passed to Filter Tokens (by Length) with min_chars set to 10, and the
  result is delivered to process port "result 1".
-->
<process version="6.2.000">
  <operator activated="true" class="process" compatibility="6.2.000" expanded="true" name="Process">
    <process expanded="true">
      <!-- Sample input documents of different lengths. -->
      <operator activated="true" class="text:create_document" compatibility="6.1.000" expanded="true" height="60" name="Create Document" width="90" x="112" y="75">
        <parameter key="text" value="This is a long sentence that is more than 25 characters in length"/>
      </operator>
      <operator activated="true" class="text:create_document" compatibility="6.1.000" expanded="true" height="60" name="Create Document (2)" width="90" x="112" y="210">
        <parameter key="text" value="short"/>
      </operator>
      <operator activated="true" class="text:create_document" compatibility="6.1.000" expanded="true" height="60" name="Create Document (3)" width="90" x="112" y="345">
        <parameter key="text" value="this is medium length"/>
      </operator>
      <operator activated="true" class="text:combine_documents" compatibility="6.1.000" expanded="true" height="112" name="Combine Documents" width="90" x="313" y="75"/>
      <!-- Length filter: min_chars is the lower bound on length. -->
      <operator activated="true" class="text:filter_by_length" compatibility="6.1.000" expanded="true" height="60" name="Filter Tokens (by Length)" width="90" x="447" y="75">
        <parameter key="min_chars" value="10"/>
      </operator>
      <connect from_op="Create Document" from_port="output" to_op="Combine Documents" to_port="documents 1"/>
      <connect from_op="Create Document (2)" from_port="output" to_op="Combine Documents" to_port="documents 2"/>
      <connect from_op="Create Document (3)" from_port="output" to_op="Combine Documents" to_port="documents 3"/>
      <connect from_op="Combine Documents" from_port="document" to_op="Filter Tokens (by Length)" to_port="document"/>
      <connect from_op="Filter Tokens (by Length)" from_port="document" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Andrew
Thanks a lot, Marco and Andrew. I found both of the solutions pretty nice and marked the question as SOLVED.