Dear all,
I'm having a problem with a Process I've developed for twitter follower analysis.
The process roughly does:
- reads Twitter follower data (e.g. Twitter ID, description, location, #followers, etc.) from a repository or from a file.
- filters out examples with a missing text description
At this point one subprocess branches out and does this:
Input: "text description", ID -> Create Vectors TF-IDF -> Cluster them into 5 groups -> Output: ID, cluster
The output of this subprocess is joined with an inner join to the original dataset,
and this is where my issue starts.
After the join, only ~2k IDs out of the ~12k original ones match?!
I created some breakpoints and compared the IDs along the way: the IDs coming out of the subprocess are different from the original ones (although the number of records/examples is the same)! The differences are pretty random, with no evident pattern.
These IDs are integers from the original dataset and are not consecutive.
Any idea?..
Here is the XML from the subprocess:
<!--
  Subprocess "Cluster": turns the "description" text of each example into a
  word vector, clusters the vectors with k-means (k = 5) and joins the cluster
  label back onto the selected input attributes.
  Outputs: "out 1" = cluster model, "out 2" = joined example set.
-->
<operator activated="true" class="subprocess" compatibility="5.3.015" expanded="true" height="94" name="Cluster" width="90" x="179" y="30">
  <process expanded="true">
    <!-- Keep only the id and description attributes (special attributes included). -->
    <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes (2)" width="90" x="45" y="120">
      <parameter key="attribute_filter_type" value="subset"/>
      <parameter key="attribute" value="description"/>
      <parameter key="attributes" value="|id|description"/>
      <parameter key="include_special_attributes" value="true"/>
    </operator>
    <!-- Convert each example into a document; "description" is weighted 1.0. -->
    <operator activated="true" class="text:data_to_documents" compatibility="5.3.002" expanded="true" height="60" name="Data to Documents" width="90" x="179" y="30">
      <list key="specify_weights">
        <parameter key="description" value="1.0"/>
      </list>
    </operator>
    <!-- Build the word-vector example set, pruning terms by document percentage. -->
    <operator activated="true" class="text:process_documents" compatibility="5.3.002" expanded="true" height="94" name="Process Documents" width="90" x="313" y="30">
      <parameter key="prune_method" value="percentual"/>
      <parameter key="prune_below_percent" value="0.2"/>
      <parameter key="prune_above_percent" value="99.99"/>
      <!--
        FIX: was "float_array". That data management stores every value,
        including the id carried through as document meta data, as a 32-bit
        float. Integers above 2^24 (16,777,216) are not exactly representable
        in single precision, so large Twitter IDs were rounded and no longer
        matched the original IDs in the Join. "double_array" keeps integers
        exact up to 2^53.
      -->
      <parameter key="datamanagement" value="double_array"/>
      <process expanded="true">
        <!-- Token pipeline: tokenize, change case, drop English stopwords,
             drop tokens shorter than 3 characters, drop custom stopwords, stem. -->
        <operator activated="true" class="text:tokenize" compatibility="5.3.002" expanded="true" height="60" name="Tokenize" width="90" x="45" y="30"/>
        <operator activated="true" class="text:transform_cases" compatibility="5.3.002" expanded="true" height="60" name="Transform Cases" width="90" x="112" y="120"/>
        <operator activated="true" class="text:filter_stopwords_english" compatibility="5.3.002" expanded="true" height="60" name="Filter Stopwords (English)" width="90" x="179" y="210"/>
        <operator activated="true" class="text:filter_by_length" compatibility="5.3.002" expanded="true" height="60" name="Filter Tokens (6)" width="90" x="246" y="300">
          <parameter key="min_chars" value="3"/>
        </operator>
        <!-- NOTE(review): absolute, machine-specific path; the process will not
             run elsewhere unless this file exists at the same location. -->
        <operator activated="true" class="text:filter_stopwords_dictionary" compatibility="5.3.002" expanded="true" height="76" name="Filter Stopwords (Dictionary)" width="90" x="380" y="165">
          <parameter key="file" value="C:\Users\MenghinI\Documents\Twitter\stopwords.txt"/>
        </operator>
        <operator activated="true" class="text:stem_snowball" compatibility="5.3.002" expanded="true" height="60" name="Stem (Snowball)" width="90" x="514" y="75"/>
        <connect from_port="document" to_op="Tokenize" to_port="document"/>
        <connect from_op="Tokenize" from_port="document" to_op="Transform Cases" to_port="document"/>
        <connect from_op="Transform Cases" from_port="document" to_op="Filter Stopwords (English)" to_port="document"/>
        <connect from_op="Filter Stopwords (English)" from_port="document" to_op="Filter Tokens (6)" to_port="document"/>
        <connect from_op="Filter Tokens (6)" from_port="document" to_op="Filter Stopwords (Dictionary)" to_port="document"/>
        <connect from_op="Filter Stopwords (Dictionary)" from_port="document" to_op="Stem (Snowball)" to_port="document"/>
        <connect from_op="Stem (Snowball)" from_port="document" to_port="document 1"/>
        <portSpacing port="source_document" spacing="0"/>
        <portSpacing port="sink_document 1" spacing="0"/>
        <portSpacing port="sink_document 2" spacing="0"/>
      </process>
    </operator>
    <!-- k-means with k = 5.
         NOTE(review): a single run with a single optimization step is set here;
         confirm this is intentional (speed) rather than a leftover test setting. -->
    <operator activated="true" class="fast_k_means" compatibility="5.3.015" expanded="true" height="76" name="Clustering" width="90" x="447" y="30">
      <parameter key="k" value="5"/>
      <parameter key="max_runs" value="1"/>
      <parameter key="max_optimization_steps" value="1"/>
    </operator>
    <!-- Reduce the clustered set to the "cluster" attribute.
         Presumably the id survives because special attributes are not subject
         to this filter (include_special_attributes is not enabled here); confirm. -->
    <operator activated="true" class="select_attributes" compatibility="5.3.015" expanded="true" height="76" name="Select Attributes" width="90" x="447" y="120">
      <parameter key="attribute_filter_type" value="single"/>
      <parameter key="attribute" value="cluster"/>
    </operator>
    <!-- Join cluster labels (left) with the original input (right).
         No key attributes are listed, so the join presumably falls back to the
         id attribute (operator default); this is why the ids must stay exact. -->
    <operator activated="true" class="join" compatibility="5.3.015" expanded="true" height="76" name="Join" width="90" x="447" y="210">
      <parameter key="remove_double_attributes" value="false"/>
      <list key="key_attributes"/>
    </operator>
    <connect from_port="in 1" to_op="Select Attributes (2)" to_port="example set input"/>
    <connect from_op="Select Attributes (2)" from_port="example set output" to_op="Data to Documents" to_port="example set"/>
    <connect from_op="Select Attributes (2)" from_port="original" to_op="Join" to_port="right"/>
    <connect from_op="Data to Documents" from_port="documents" to_op="Process Documents" to_port="documents 1"/>
    <connect from_op="Process Documents" from_port="example set" to_op="Clustering" to_port="example set"/>
    <connect from_op="Clustering" from_port="cluster model" to_port="out 1"/>
    <connect from_op="Clustering" from_port="clustered set" to_op="Select Attributes" to_port="example set input"/>
    <connect from_op="Select Attributes" from_port="example set output" to_op="Join" to_port="left"/>
    <connect from_op="Join" from_port="join" to_port="out 2"/>
    <portSpacing port="source_in 1" spacing="0"/>
    <portSpacing port="source_in 2" spacing="0"/>
    <portSpacing port="sink_out 1" spacing="0"/>
    <portSpacing port="sink_out 2" spacing="0"/>
    <portSpacing port="sink_out 3" spacing="0"/>
  </process>
</operator>
Any help would be greatly appreciated

Igor