
Problem: Mandarin text mining - HanMiner

YoGVA
New Altair Community Member
Updated by Jocelyn
Hi everyone,

I am a newbie here, but here is my situation.
I need to conduct a qualitative content analysis of a large number of Chinese reports. However, RapidMiner needs an extension to handle Chinese characters - I found one called HanMiner posted by another member.

I followed the instructions and installed the extension from GitHub, but it does not show up in RapidMiner ...

Any ideas on how to solve this issue? Or another way to text-mine Chinese documents?

Any help would be much appreciated!
Yoyo


    jwpfau
    Altair Employee
    Accepted Answer
    Updated by jwpfau
    Hi,

    The third-party HanMiner extension has no option to define the encoding of the imported file. As a workaround, you could read the file with the Text Processing extension's Read Document operator (which lets you set UTF-8 encoding), store the text in a macro, and pass that macro to HanMiner's Read Document operator:

    <?xml version="1.0" encoding="UTF-8"?><process version="10.1.002">
      <context>
        <input/>
        <output/>
        <macros/>
      </context>
      <operator activated="true" class="process" compatibility="10.1.002" expanded="true" name="Process">
        <parameter key="logverbosity" value="init"/>
        <parameter key="random_seed" value="2001"/>
        <parameter key="send_mail" value="never"/>
        <parameter key="notification_email" value=""/>
        <parameter key="process_duration_for_mail" value="30"/>
        <parameter key="encoding" value="UTF-8"/>
        <process expanded="true">
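          <!-- Open File fetches the UTF-8 text file; Multiply hands the same file reference to the two branches below -->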
          <operator activated="true" class="open_file" compatibility="10.1.002" expanded="true" height="68" name="Open File" width="90" x="112" y="34">
            <parameter key="resource_type" value="URL"/>
            <parameter key="filename" value=""/>
            <parameter key="url" value="https://us.v-cdn.net/6030995/uploads/editor/sf/nq6mm23abhpa.txt"/>
          </operator>
          <operator activated="true" class="multiply" compatibility="10.1.002" expanded="true" height="103" name="Multiply" width="90" x="246" y="85"/>
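          <!-- Branch 1: Read Document from the Text Processing extension reads the file with an explicit UTF-8 encoding -->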
          <operator activated="true" class="text:read_document" compatibility="10.0.000" expanded="true" height="68" name="Read Document (2)" width="90" x="380" y="34">
            <parameter key="extract_text_only" value="true"/>
            <parameter key="use_file_extension_as_type" value="true"/>
            <parameter key="content_type" value="txt"/>
            <parameter key="encoding" value="UTF-8"/>
          </operator>
          <operator activated="true" class="text:documents_to_data" compatibility="10.0.000" expanded="true" height="82" name="Documents to Data (2)" width="90" x="514" y="34">
            <parameter key="text_attribute" value="text"/>
            <parameter key="add_meta_information" value="false"/>
            <parameter key="datamanagement" value="double_sparse_array"/>
            <parameter key="data_management" value="auto"/>
            <parameter key="use_processed_text" value="false"/>
          </operator>
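          <!-- Extract Macro stores the value of the 'text' attribute of the first example in the macro %{text} -->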
          <operator activated="true" class="extract_macro" compatibility="10.1.002" expanded="true" height="68" name="Extract Macro" width="90" x="648" y="34">
            <parameter key="macro" value="text"/>
            <parameter key="macro_type" value="data_value"/>
            <parameter key="statistics" value="average"/>
            <parameter key="attribute_name" value="text"/>
            <parameter key="example_index" value="1"/>
            <list key="additional_macros"/>
          </operator>
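          <!-- Branch 2: HanMiner's Read Document takes the macro value via its 'text' parameter instead of importing the file itself -->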
          <operator activated="true" class="hanminer:read_document" compatibility="1.0.003" expanded="true" height="68" name="Read Document" width="90" x="782" y="136">
            <parameter key="encoding" value="UTF-8"/>
            <parameter key="import_from_file" value="false"/>
            <parameter key="text" value="%{text}"/>
            <parameter key="file" value="C:/Users/Rui/Downloads/archive (6)-chinese/chinese-dataset-subset.txt"/>
          </operator>
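          <!-- Tokenize (HanMiner) segments the Chinese text into tokens -->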
          <operator activated="true" class="hanminer:tokenize" compatibility="1.0.003" expanded="true" height="68" name="Tokenize" width="90" x="916" y="136">
            <parameter key="high_speed_mode" value="false"/>
          </operator>
          <connect from_op="Open File" from_port="file" to_op="Multiply" to_port="input"/>
          <connect from_op="Multiply" from_port="output 1" to_op="Read Document (2)" to_port="file"/>
          <connect from_op="Multiply" from_port="output 2" to_op="Read Document" to_port="file"/>
          <connect from_op="Read Document (2)" from_port="output" to_op="Documents to Data (2)" to_port="documents 1"/>
          <connect from_op="Documents to Data (2)" from_port="example set" to_op="Extract Macro" to_port="example set"/>
          <connect from_op="Read Document" from_port="output" to_op="Tokenize" to_port="document set"/>
          <connect from_op="Tokenize" from_port="document set" to_port="result 1"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
        </process>
      </operator>
    </process>
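
    To apply this to your own reports, point the Open File operator at your file instead of the sample URL used here (for example by switching its resource type from URL to file).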
    

    Greetings,
    Jonas