"Generate N Grams error [Solved]"

New Altair Community Member
Updated by Jocelyn
Hi,
I am using the Generate n-Grams (Characters) operator to extract all the numbers of length 14.
However, it also treats a space as a character, so it does not give the correct output.
Can anyone help me with this?
Regards
Gunjan
I am using the Generate n-Grams (Characters) operator to extract all the numbers of length 14.
However, it also treats a space as a character, so it does not give the correct output.
Can anyone help me with this?
Regards
Gunjan
Sort by:
1 - 9 of
91
I have an alphanumeric number inside the comment text, and I want to extract it. Could you please suggest what needs to be done?
Below is the process
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Process from the original question: reads a cell range from an Excel sheet, converts the
  nominal column to text, and generates character n-grams of length 14 inside
  Process Documents from Data.
-->
<process version="5.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="5.2.001"
            expanded="true"
            name="Process">
    <process expanded="true" height="386" width="748">

      <!-- Step 1: import cells A2:A14 of the workbook; the first row is not used as names. -->
      <operator activated="true"
                class="read_excel"
                compatibility="5.2.001"
                expanded="true"
                height="60"
                name="Read Excel"
                width="90"
                x="45"
                y="120">
        <parameter key="excel_file" value="C:\Users\guagg\Desktop\bookk.xls"/>
        <parameter key="imported_cell_range" value="A2:A14"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information"/>
      </operator>

      <!-- Step 2: turn the imported nominal attribute into a text attribute. -->
      <operator activated="true"
                class="nominal_to_text"
                compatibility="5.2.001"
                expanded="true"
                height="76"
                name="Nominal to Text"
                width="90"
                x="165"
                y="202"/>

      <!--
        Step 3: run the inner text process on every example.
        Pruning is set to "absolute" with bounds 2 and 9999.
        NOTE(review): presumably this discards terms seen in fewer than 2 documents, which
        would also drop a number that occurs only once. Confirm against the operator docs.
      -->
      <operator activated="true"
                class="text:process_document_from_data"
                compatibility="5.2.002"
                expanded="true"
                height="76"
                name="Process Documents from Data"
                width="90"
                x="380"
                y="210">
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="9999"/>
        <list key="specify_weights"/>
        <process expanded="true" height="414" width="762">
          <!--
            The document goes straight into the n-gram operator with no other operator in
            front of it. The thread reports that spaces are counted as characters here.
          -->
          <operator activated="true"
                    class="text:generate_n_grams_characters"
                    compatibility="5.2.002"
                    expanded="true"
                    height="60"
                    name="Generate n-Grams (Characters)"
                    width="90"
                    x="313"
                    y="30">
            <parameter key="length" value="14"/>
          </operator>
          <connect from_port="document" to_op="Generate n-Grams (Characters)" to_port="document"/>
          <connect from_op="Generate n-Grams (Characters)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Wiring: Read Excel, then Nominal to Text, then Process Documents from Data. -->
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <!-- Results: the word list is delivered on result 1, the example set on result 2. -->
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 2"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Below is the process
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Excel import followed by character n-gram generation: cells A2:A14 are read, the nominal
  column is converted to text, and each document is split into character n-grams of length 14.
-->
<process version="5.2.001">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="5.2.001"
            expanded="true"
            name="Process">
    <process expanded="true" height="386" width="748">

      <!-- Source data: range A2:A14 of the workbook, first row not treated as names. -->
      <operator activated="true"
                class="read_excel"
                compatibility="5.2.001"
                expanded="true"
                height="60"
                name="Read Excel"
                width="90"
                x="45"
                y="120">
        <parameter key="excel_file" value="C:\Users\guagg\Desktop\bookk.xls"/>
        <parameter key="imported_cell_range" value="A2:A14"/>
        <parameter key="first_row_as_names" value="false"/>
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <list key="data_set_meta_data_information"/>
      </operator>

      <!-- Type conversion: nominal attribute to text attribute. -->
      <operator activated="true"
                class="nominal_to_text"
                compatibility="5.2.001"
                expanded="true"
                height="76"
                name="Nominal to Text"
                width="90"
                x="165"
                y="202"/>

      <!--
        Text processing with absolute pruning (lower bound 2, upper bound 9999).
        NOTE(review): the lower bound presumably removes terms that occur in only one
        document. Verify against the operator documentation.
      -->
      <operator activated="true"
                class="text:process_document_from_data"
                compatibility="5.2.002"
                expanded="true"
                height="76"
                name="Process Documents from Data"
                width="90"
                x="380"
                y="210">
        <parameter key="prune_method" value="absolute"/>
        <parameter key="prune_below_absolute" value="2"/>
        <parameter key="prune_above_absolute" value="9999"/>
        <list key="specify_weights"/>
        <process expanded="true" height="414" width="762">
          <!-- Only operator in the inner process: character n-grams with length 14. -->
          <operator activated="true"
                    class="text:generate_n_grams_characters"
                    compatibility="5.2.002"
                    expanded="true"
                    height="60"
                    name="Generate n-Grams (Characters)"
                    width="90"
                    x="313"
                    y="30">
            <parameter key="length" value="14"/>
          </operator>
          <connect from_port="document" to_op="Generate n-Grams (Characters)" to_port="document"/>
          <connect from_op="Generate n-Grams (Characters)" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Data flow between the three top-level operators. -->
      <connect from_op="Read Excel" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <!-- Outputs: example set on result 2, word list on result 1. -->
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 2"/>
      <connect from_op="Process Documents from Data" from_port="word list" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
      <portSpacing port="sink_result 3" spacing="0"/>
    </process>
  </operator>
</process>
Do you want something like this? This process extracts a 14-character alphanumeric string consisting of uppercase letters and numbers from a document and stores it as the "serial" attribute.
Best, Marius
Best, Marius
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Example process: creates sample data with a "comment" attribute, converts it to text, and
  uses Extract Information with a regular expression to capture a 14-character run of
  uppercase letters and digits into an attribute named "serial".
-->
<process version="5.2.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
    <process expanded="true" height="658" width="727">

      <!--
        Sample input. The double quotes around the text belong to the value itself, so they
        are written as &quot; here; a literal double quote inside a double-quoted attribute
        would end the attribute early and make the file not well-formed.
      -->
      <operator activated="true" class="generate_data_user_specification" compatibility="5.2.006" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="112" y="30">
        <list key="attribute_values">
          <parameter key="comment" value="&quot;example text with AFG346GJA57AW1 serial number&quot;"/>
        </list>
        <list key="set_additional_roles"/>
      </operator>

      <!-- Convert the nominal attribute to text before it enters Process Documents from Data. -->
      <operator activated="true" class="nominal_to_text" compatibility="5.2.006" expanded="true" height="76" name="Nominal to Text" width="90" x="246" y="30"/>

      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="447" y="30">
        <list key="specify_weights"/>
        <process expanded="true" height="676" width="727">
          <!--
            Regular-expression query "serial": \s([A-Z0-9]{14})\s
            The pattern demands a whitespace character on both sides of the 14-character run,
            so a serial at the very beginning or very end of the text is not matched.
            The key "string_machting_queries" is reproduced exactly as the application wrote it.
          -->
          <operator activated="true" class="text:extract_information" compatibility="5.2.002" expanded="true" height="60" name="Extract Information" width="90" x="246" y="30">
            <parameter key="query_type" value="Regular Expression"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries">
              <parameter key="serial" value="\s([A-Z0-9]{14})\s"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Data flow: sample data, then Nominal to Text, then Process Documents, then result 1. -->
      <connect from_op="Generate Data by User Specification" from_port="output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
You can assign IDs to the examples; then, after Process Documents, keep only the attribute of interest (serial) and join it back to the original data set. Finally, remove the ID again and use Write Excel to write the data back into an Excel file.
Best, Marius
Best, Marius
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  Example process: appends two sample examples, gives each an ID, extracts a 14-character
  serial with a regular expression, joins the "serial" attribute back onto the original data,
  and finally removes the ID attribute again.
-->
<process version="5.2.006">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true" class="process" compatibility="5.2.006" expanded="true" name="Process">
    <process expanded="true" height="658" width="681">

      <!--
        Two sample inputs. The double quotes around each text belong to the value itself, so
        they are written as &quot;; a literal double quote inside a double-quoted attribute
        would end the attribute early and make the file not well-formed.
      -->
      <operator activated="true" class="generate_data_user_specification" compatibility="5.2.006" expanded="true" height="60" name="Generate Data by User Specification" width="90" x="45" y="75">
        <list key="attribute_values">
          <parameter key="comment" value="&quot;example text with AFG346GJA57AW1 serial number&quot;"/>
        </list>
        <list key="set_additional_roles"/>
      </operator>
      <operator activated="true" class="generate_data_user_specification" compatibility="5.2.006" expanded="true" height="60" name="Generate Data by User Specification (2)" width="90" x="45" y="165">
        <list key="attribute_values">
          <parameter key="comment" value="&quot;bla bla AFGASDFGA57AW1 hrznfrz&quot;"/>
        </list>
        <list key="set_additional_roles"/>
      </operator>

      <!-- Merge both samples, add an ID per example, and convert the nominal attribute to text. -->
      <operator activated="true" class="append" compatibility="5.2.006" expanded="true" height="94" name="Append" width="90" x="179" y="75"/>
      <operator activated="true" class="generate_id" compatibility="5.2.006" expanded="true" height="76" name="Generate ID" width="90" x="313" y="75"/>
      <operator activated="true" class="nominal_to_text" compatibility="5.2.006" expanded="true" height="76" name="Nominal to Text" width="90" x="447" y="75"/>

      <!-- Duplicate the data: output 1 feeds the left side of Join, output 2 feeds the text processing. -->
      <operator activated="true" class="multiply" compatibility="5.2.006" expanded="true" height="94" name="Multiply" width="90" x="45" y="345"/>

      <operator activated="true" class="text:process_document_from_data" compatibility="5.2.002" expanded="true" height="76" name="Process Documents from Data" width="90" x="179" y="435">
        <list key="specify_weights"/>
        <process expanded="true" height="676" width="727">
          <!--
            Regular-expression query "serial": \s([A-Z0-9]{14})\s
            The pattern demands a whitespace character on both sides of the 14-character run,
            so a serial at the very beginning or very end of the text is not matched.
            The key "string_machting_queries" is reproduced exactly as the application wrote it.
          -->
          <operator activated="true" class="text:extract_information" compatibility="5.2.002" expanded="true" height="60" name="Extract Information" width="90" x="179" y="30">
            <parameter key="query_type" value="Regular Expression"/>
            <list key="string_machting_queries"/>
            <list key="regular_expression_queries">
              <parameter key="serial" value="\s([A-Z0-9]{14})\s"/>
            </list>
            <list key="regular_region_queries"/>
            <list key="xpath_queries"/>
            <list key="namespaces"/>
            <list key="index_queries"/>
          </operator>
          <connect from_port="document" to_op="Extract Information" to_port="document"/>
          <connect from_op="Extract Information" from_port="document" to_port="document 1"/>
          <portSpacing port="source_document" spacing="0"/>
          <portSpacing port="sink_document 1" spacing="0"/>
          <portSpacing port="sink_document 2" spacing="0"/>
        </process>
      </operator>

      <!-- Keep only the extracted "serial" attribute from the text-processing result. -->
      <operator activated="true" class="select_attributes" compatibility="5.2.006" expanded="true" height="76" name="Select Attributes" width="90" x="313" y="435">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="serial"/>
      </operator>

      <!--
        Join the original data (left) with the "serial" attribute (right).
        No key attributes are listed.
        NOTE(review): presumably the join then matches on the generated ID. Confirm against
        the Join operator documentation.
      -->
      <operator activated="true" class="join" compatibility="5.2.006" expanded="true" height="76" name="Join" width="90" x="447" y="345">
        <list key="key_attributes"/>
      </operator>

      <!-- Inverted single-attribute selection that also covers special attributes: drops "id". -->
      <operator activated="true" class="select_attributes" compatibility="5.2.006" expanded="true" height="76" name="Select Attributes (2)" width="90" x="581" y="345">
        <parameter key="attribute_filter_type" value="single"/>
        <parameter key="attribute" value="id"/>
        <parameter key="invert_selection" value="true"/>
        <parameter key="include_special_attributes" value="true"/>
      </operator>

      <!-- Data preparation chain. -->
      <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
      <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
      <connect from_op="Append" from_port="merged set" to_op="Generate ID" to_port="example set input"/>
      <connect from_op="Generate ID" from_port="example set output" to_op="Nominal to Text" to_port="example set input"/>
      <connect from_op="Nominal to Text" from_port="example set output" to_op="Multiply" to_port="input"/>
      <!-- Branch: original data to Join (left), copy through text processing to Join (right). -->
      <connect from_op="Multiply" from_port="output 1" to_op="Join" to_port="left"/>
      <connect from_op="Multiply" from_port="output 2" to_op="Process Documents from Data" to_port="example set"/>
      <connect from_op="Process Documents from Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
      <connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="right"/>
      <!-- Final clean-up and delivery on result 1. -->
      <connect from_op="Join" from_port="join" to_op="Select Attributes (2)" to_port="example set input"/>
      <connect from_op="Select Attributes (2)" from_port="example set output" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="306"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
Best, Marius