CSV with multiline field won't load up properly
Find more posts tagged with
Sort by:
1 - 1 of
11
Hi @avorona!
Right now the Read CSV Operator does not support multiline values. You can however do something similar to what I described in this thread. Here is an example process that works with your samples.csv. (You'll need the text extension, which can be found on the marketplace.)
<?xml version="1.0" encoding="UTF-8"?><process version="9.0.003">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="9.0.003" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="open_file" compatibility="9.0.003" expanded="true" height="68" name="Open File" width="90" x="45" y="34">
<parameter key="filename" value="sample.csv"/>
</operator>
<operator activated="true" class="text:read_document" compatibility="8.2.000-SNAPSHOT" expanded="true" height="68" name="Read Document" width="90" x="179" y="34">
<parameter key="extract_text_only" value="false"/>
</operator>
<operator activated="true" class="text:cut_document" compatibility="8.2.000-SNAPSHOT" expanded="true" height="68" name="Cut Document" width="90" x="313" y="34">
<parameter key="query_type" value="Regular Expression"/>
<list key="string_machting_queries"/>
<list key="regular_expression_queries">
<parameter key="text" value="((?:[^"]+?|"(.|\n)*?"|)*?)\n"/>
</list>
<list key="regular_region_queries"/>
<list key="xpath_queries"/>
<list key="namespaces"/>
<list key="index_queries"/>
<list key="jsonpath_queries"/>
<process expanded="true">
<operator activated="true" class="text:remove_document_parts" compatibility="8.2.000-SNAPSHOT" expanded="true" height="68" name="Remove Document Parts" width="90" x="45" y="34">
<parameter key="deletion_regex" value="""/>
</operator>
<connect from_port="segment" to_op="Remove Document Parts" to_port="document"/>
<connect from_op="Remove Document Parts" from_port="document" to_port="document 1"/>
<portSpacing port="source_segment" spacing="0"/>
<portSpacing port="sink_document 1" spacing="0"/>
<portSpacing port="sink_document 2" spacing="0"/>
</process>
</operator>
<operator activated="true" class="text:documents_to_data" compatibility="8.2.000-SNAPSHOT" expanded="true" height="82" name="Documents to Data" width="90" x="447" y="34">
<parameter key="text_attribute" value="text"/>
</operator>
<operator activated="true" class="select_attributes" compatibility="9.0.003" expanded="true" height="82" name="Select Attributes" width="90" x="581" y="34">
<parameter key="attribute_filter_type" value="single"/>
<parameter key="attribute" value="text"/>
</operator>
<operator activated="true" class="split" compatibility="9.0.003" expanded="true" height="82" name="Split" width="90" x="715" y="34"/>
<operator activated="true" class="rename_by_example_values" compatibility="9.0.003" expanded="true" height="82" name="Rename by Example Values" width="90" x="849" y="34"/>
<operator activated="true" class="guess_types" compatibility="9.0.003" expanded="true" height="82" name="Guess Types" width="90" x="983" y="34">
<parameter key="include_special_attributes" value="true"/>
</operator>
<connect from_op="Open File" from_port="file" to_op="Read Document" to_port="file"/>
<connect from_op="Read Document" from_port="output" to_op="Cut Document" to_port="document"/>
<connect from_op="Cut Document" from_port="documents" to_op="Documents to Data" to_port="documents 1"/>
<connect from_op="Documents to Data" from_port="example set" to_op="Select Attributes" to_port="example set input"/>
<connect from_op="Select Attributes" from_port="example set output" to_op="Split" to_port="example set input"/>
<connect from_op="Split" from_port="example set output" to_op="Rename by Example Values" to_port="example set input"/>
<connect from_op="Rename by Example Values" from_port="example set output" to_op="Guess Types" to_port="example set input"/>
<connect from_op="Guess Types" from_port="example set output" to_port="result 1"/>
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
I used a regular expression to cut the the plain text into rows, convert them to an example set and then split the text by ",". This has limitations, i.e. it only works if multiline values are in quotes, and only if there are no commata in any value.
I hope this helps!
Cheers,
Jan
Hi @avorona!
Right now the Read CSV Operator does not support multiline values. You can however do something similar to what I described in this thread. Here is an example process that works with your samples.csv. (You'll need the text extension, which can be found on the marketplace.)
I used a regular expression to cut the the plain text into rows, convert them to an example set and then split the text by ",". This has limitations, i.e. it only works if multiline values are in quotes, and only if there are no commata in any value.
I hope this helps!
Cheers,
Jan