Extract drug names

Answers
-
Hi @t_klok,
this is one of those problems where I started with "hey, that's easy" and it turned out to be a 15-operator process. Maybe there is another way to do this? @sgenzer might find one.
Anyway, my solution is attached.
You might want to link up with @SvenVanPoucke . He is a physician and our medical expert in the community.
Best,
Martin
<?xml version="1.0" encoding="UTF-8"?><process version="8.1.000">
<context>
<input/>
<output/>
<macros/>
</context>
<operator activated="true" class="process" compatibility="8.1.000" expanded="true" name="Process">
<process expanded="true">
<operator activated="true" class="subprocess" compatibility="8.1.000" expanded="true" height="82" name="Subprocess (2)" width="90" x="45" y="34">
  <!--
    Dummy input texts: three generated example sets, each defining one
    attribute named "text", are appended and delivered on port "out 1".
    Each value is wrapped in escaped double quotes (&quot;). A raw double
    quote inside an attribute value ends the attribute early and makes the
    document not well-formed, so the inner quotes must stay escaped.
    NOTE(review): the quoted form looks like a string-literal expression for
    the generator operator; confirm against the operator documentation.
  -->
  <process expanded="true">
    <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification (4)" width="90" x="45" y="34">
      <list key="attribute_values">
        <parameter key="text" value="&quot;This is a drug which includes mydrug&quot;"/>
      </list>
      <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification (5)" width="90" x="45" y="136">
      <list key="attribute_values">
        <parameter key="text" value="&quot;just a text&quot;"/>
      </list>
      <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification (6)" width="90" x="45" y="238">
      <list key="attribute_values">
        <parameter key="text" value="&quot;thirddrug in another text twice: thirddrug&quot;"/>
      </list>
      <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="append" compatibility="8.1.000" expanded="true" height="124" name="Append (2)" width="90" x="179" y="85"/>
    <!-- Feed the three generated sets into Append (2), then expose the merged set. -->
    <connect from_op="Generate Data by User Specification (4)" from_port="output" to_op="Append (2)" to_port="example set 1"/>
    <connect from_op="Generate Data by User Specification (5)" from_port="output" to_op="Append (2)" to_port="example set 2"/>
    <connect from_op="Generate Data by User Specification (6)" from_port="output" to_op="Append (2)" to_port="example set 3"/>
    <connect from_op="Append (2)" from_port="merged set" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
  </process>
  <description align="center" color="transparent" colored="false" width="126">Dummy data for drug texts you can replace this with read excel</description>
</operator>
<operator name="Nominal to Text"
          class="nominal_to_text"
          activated="true"
          compatibility="8.1.000"
          expanded="true"
          x="179" y="34"
          width="90" height="82">
  <!-- Type conversion step placed between the dummy text data and Process Documents from Data. -->
  <description align="center" color="transparent" colored="false" width="126">Att needs to be text to work with Process Documents</description>
</operator>
<operator name="Process Documents from Data"
          class="text:process_document_from_data"
          activated="true"
          compatibility="8.1.000"
          expanded="true"
          x="313" y="34"
          width="90" height="82">
  <!-- Word vectors are built as term occurrences. -->
  <parameter key="vector_creation" value="Term Occurrences"/>
  <list key="specify_weights"/>
  <process expanded="true">
    <!-- The only connected inner step: tokenize each document. -->
    <operator name="Tokenize"
              class="text:tokenize"
              activated="true"
              compatibility="8.1.000"
              expanded="true"
              x="45" y="34"
              width="90" height="68"/>
    <!-- Deactivated and not wired to any port; kept as shipped. -->
    <operator name="Filter Tokens Using ExampleSet"
              class="operator_toolbox:FilterTokensUsingExampleSet"
              activated="false"
              compatibility="0.11.000-SNAPSHOT"
              expanded="true"
              x="380" y="238"
              width="90" height="82">
      <parameter key="attribute" value="drugname"/>
      <description align="center" color="transparent" colored="false" width="126">only use specifed drug names</description>
    </operator>
    <connect from_port="document" to_op="Tokenize" to_port="document"/>
    <connect from_op="Tokenize" from_port="document" to_port="document 1"/>
    <portSpacing port="source_document" spacing="0"/>
    <portSpacing port="sink_document 1" spacing="0"/>
    <portSpacing port="sink_document 2" spacing="0"/>
  </process>
  <description align="center" color="transparent" colored="false" width="126">Generate bag of words</description>
</operator>
<operator activated="true" class="subprocess" compatibility="8.1.000" expanded="true" height="82" name="Subprocess" width="90" x="447" y="136">
  <!--
    Dummy reference list: three generated example sets, each defining one
    attribute named "drugname", are appended and delivered on port "out 1".
    Each value is wrapped in escaped double quotes (&quot;). A raw double
    quote inside an attribute value ends the attribute early and makes the
    document not well-formed, so the inner quotes must stay escaped.
    NOTE(review): the quoted form looks like a string-literal expression for
    the generator operator; confirm against the operator documentation.
  -->
  <process expanded="true">
    <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification" width="90" x="45" y="34">
      <list key="attribute_values">
        <parameter key="drugname" value="&quot;mydrug&quot;"/>
      </list>
      <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification (2)" width="90" x="45" y="136">
      <list key="attribute_values">
        <parameter key="drugname" value="&quot;anotherdrug&quot;"/>
      </list>
      <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="generate_data_user_specification" compatibility="8.1.000" expanded="true" height="68" name="Generate Data by User Specification (3)" width="90" x="45" y="238">
      <list key="attribute_values">
        <parameter key="drugname" value="&quot;thirddrug&quot;"/>
      </list>
      <list key="set_additional_roles"/>
    </operator>
    <operator activated="true" class="append" compatibility="8.1.000" expanded="true" height="124" name="Append" width="90" x="179" y="85"/>
    <!-- Feed the three generated sets into Append, then expose the merged set. -->
    <connect from_op="Generate Data by User Specification" from_port="output" to_op="Append" to_port="example set 1"/>
    <connect from_op="Generate Data by User Specification (2)" from_port="output" to_op="Append" to_port="example set 2"/>
    <connect from_op="Generate Data by User Specification (3)" from_port="output" to_op="Append" to_port="example set 3"/>
    <connect from_op="Append" from_port="merged set" to_port="out 1"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
  </process>
  <description align="center" color="transparent" colored="false" width="126">Dummy data for drug names. You can replace this with read excel</description>
</operator>
<operator name="Set Role"
          class="set_role"
          activated="true"
          compatibility="8.1.000"
          expanded="true"
          x="581" y="136"
          width="90" height="82">
  <!-- Marks the "drugname" attribute with the "id" role ahead of the transpose step. -->
  <parameter key="attribute_name" value="drugname"/>
  <parameter key="target_role" value="id"/>
  <list key="set_additional_roles"/>
  <description align="center" color="transparent" colored="false" width="126">id will become header in transpose</description>
</operator>
<!-- Transpose the drug-name list, then convert the result into attribute weights. -->
<operator name="Transpose"
          class="transpose"
          activated="true"
          compatibility="8.1.000"
          expanded="true"
          x="715" y="136"
          width="90" height="82"/>
<operator name="Data to Weights"
          class="data_to_weights"
          activated="true"
          compatibility="8.1.000"
          expanded="true"
          x="849" y="136"
          width="90" height="82"/>
<operator name="Select by Weights"
          class="select_by_weights"
          activated="true"
          compatibility="8.1.000"
          expanded="true"
          x="1031" y="34"
          width="90" height="103">
  <!-- No parameters are set here, so the operator runs with its defaults. -->
  <description align="center" color="transparent" colored="false" width="126">Only let attributes through which were present in the lower exa</description>
</operator>
<operator name="Aggregate"
          class="aggregate"
          activated="true"
          compatibility="8.1.000"
          expanded="true"
          x="1184" y="34"
          width="90" height="82">
  <!-- Default aggregation is enabled with function "sum"; no per-attribute aggregations are listed. -->
  <parameter key="use_default_aggregation" value="true"/>
  <parameter key="default_aggregation_function" value="sum"/>
  <list key="aggregation_attributes"></list>
</operator>
<!-- Text branch: dummy texts, then Nominal to Text, then Process Documents from Data, into Select by Weights (example set input). -->
<connect from_op="Subprocess (2)" from_port="out 1" to_op="Nominal to Text" to_port="example set input"/>
<connect from_op="Nominal to Text" from_port="example set output" to_op="Process Documents from Data" to_port="example set"/>
<connect from_op="Process Documents from Data" from_port="example set" to_op="Select by Weights" to_port="example set input"/>
<!-- Drug-name branch: dummy names, then Set Role, Transpose, Data to Weights, into Select by Weights (weights). -->
<connect from_op="Subprocess" from_port="out 1" to_op="Set Role" to_port="example set input"/>
<connect from_op="Set Role" from_port="example set output" to_op="Transpose" to_port="example set input"/>
<connect from_op="Transpose" from_port="example set output" to_op="Data to Weights" to_port="example set"/>
<connect from_op="Data to Weights" from_port="weights" to_op="Select by Weights" to_port="weights"/>
<!-- Both branches meet in Select by Weights; its output is aggregated and delivered as result 1. -->
<connect from_op="Select by Weights" from_port="example set output" to_op="Aggregate" to_port="example set input"/>
<connect from_op="Aggregate" from_port="example set output" to_port="result 1"/>
<!-- Layout-only port spacing entries. -->
<portSpacing port="source_input 1" spacing="0"/>
<portSpacing port="sink_result 1" spacing="0"/>
<portSpacing port="sink_result 2" spacing="0"/>
</process>
</operator>
</process>
0 -
Hi Martin,
Rapid(miner) answers..
Thx I think I understand.
But I would like to filter out drugnames using a list which contains the drugnames.
I do not want to enter all the reference drug names by hand...
0 -
Hi,
sure you can just read in the Excel file instead of generating them by hand. That was just to generate some dummy data.
Best,
Martin
0 -
Hi, each country provides a list of official drug names. Additionally, SNOMED can help you find drug names in a text.
@sgenzer wrote: Hi @t_klok - I'd want to see the data before really weighing in, but just from what you describe I would use the Text Processing extension, tokenize, and then Filter Tokens (Dictionary) with the drug names. It's very similar to what @mschmitz built with his XML.
Scott
2 -
Hi,
Please take a look at the technology Microsoft is testing: https://www.youtube.com/watch?v=c6exHAzNwy4#action=share
Cheers Sven
1 -
Thank you all.
I have a (large) list of drug names and I want to see whether the free-text fields in an Excel file contain any of these names.
So I query an Excel file with free-text cells, and the reference is a file with all the drug names.
I do not want to enter all these drugnames one by one in rapidminer.
0