ReadCSV not making progress.
Thanks for the suggestion of increasing RAM to 4gb, RapidMiner's no longer hanging trying to read a ~600mb CSV file.
The problem now is that it's not making any progress after 4.5 hours. Activity Monitor shows it's no longer making context switches or Mach calls. The GUI is responding, but that's about it. Only the Pause and Stop buttons are lit; the Go arrow is still greyed out. The bottom of the screen shows [1] Process 4:30:xx Read CSV 4:30:xx with the time ticking over.
Something's got to be wrong. 600 MB is just not that big, and my Java data cleaner produces it in less than a minute with negligible RAM.
Most of all, how can I get insight into what's going on? The GUI just isn't that illuminating.
Program's slightly changed from last time:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition.
  Operator chain (see the connect elements near the end):
    Read CSV, then Sample, then Nominal to Numerical, then
    Classification by Regression (whose inner learner is an SVM).
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="5.2.008"
            expanded="true"
            name="Process">
    <parameter key="resultfile" value="/Users/brad/Dropbox-Overflow/ASADataExpo2009/decisions"/>
    <parameter key="parallelize_main_process" value="true"/>
    <process expanded="true" height="161" width="631">

      <!-- Step 1: load 2008.csv; 29 columns (keys 0 to 28) are declared below. -->
      <operator activated="true"
                class="read_csv"
                compatibility="5.2.008"
                expanded="true"
                height="60"
                name="Read CSV"
                width="90"
                x="45"
                y="30">
        <parameter key="csv_file" value="/Users/brad/Dropbox-Overflow/ASADataExpo2009/2008.csv"/>
        <parameter key="column_separators" value=","/>
        <parameter key="trim_lines" value="true"/>
        <parameter key="use_quotes" value="false"/>
        <parameter key="skip_comments" value="true"/>
        <parameter key="first_row_as_names" value="false"/>
        <!-- Row 0 is annotated as "Name" while first_row_as_names is false. -->
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="encoding" value="MacRoman"/>
        <!--
          One entry per column, keyed by column index. Each value has four
          dot-separated fields; they appear to be
          name / boolean flag / value type / role.
          NOTE(review): the boolean presumably marks whether the column is
          selected for import; confirm against the Read CSV documentation.
        -->
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Year.false.integer.attribute"/>
          <parameter key="1" value="Month.true.integer.attribute"/>
          <parameter key="2" value="DayofMonth.true.integer.attribute"/>
          <parameter key="3" value="DayOfWeek.true.integer.attribute"/>
          <parameter key="4" value="DepTime.true.integer.attribute"/>
          <parameter key="5" value="CRSDepTime.true.integer.attribute"/>
          <parameter key="6" value="ArrTime.true.integer.attribute"/>
          <parameter key="7" value="CRSArrTime.true.integer.attribute"/>
          <!-- NOTE(review): typed binominal; confirm this column really holds only two distinct values. -->
          <parameter key="8" value="UniqueCarrier.true.binominal.attribute"/>
          <parameter key="9" value="FlightNum.true.integer.attribute"/>
          <parameter key="10" value="TailNum.false.polynominal.attribute"/>
          <parameter key="11" value="ActualElapsedTime.true.integer.attribute"/>
          <parameter key="12" value="CRSElapsedTime.true.integer.attribute"/>
          <parameter key="13" value="AirTime.true.integer.attribute"/>
          <!-- The only column with the label role. NOTE(review): typed polynominal while the other delay columns are integer; confirm this is intended. -->
          <parameter key="14" value="ArrDelay.true.polynominal.label"/>
          <parameter key="15" value="DepDelay.true.integer.attribute"/>
          <parameter key="16" value="Origin.true.polynominal.attribute"/>
          <parameter key="17" value="Dest.true.polynominal.attribute"/>
          <parameter key="18" value="Distance.true.integer.attribute"/>
          <parameter key="19" value="TaxiIn.false.integer.attribute"/>
          <parameter key="20" value="TaxiOut.false.integer.attribute"/>
          <parameter key="21" value="Cancelled.true.integer.attribute"/>
          <parameter key="22" value="CancellationCode.false.attribute_value.attribute"/>
          <parameter key="23" value="Diverted.false.integer.attribute"/>
          <parameter key="24" value="CarrierDelay.false.integer.attribute"/>
          <parameter key="25" value="WeatherDelay.false.integer.attribute"/>
          <parameter key="26" value="NASDelay.false.integer.attribute"/>
          <parameter key="27" value="SecurityDelay.false.integer.attribute"/>
          <parameter key="28" value="LateAircraftDelay.false.integer.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
      </operator>

      <!-- Step 2: sample with sample_size 1000. It consumes the output of Read CSV, so it sits downstream of the read. -->
      <operator activated="true"
                class="sample"
                compatibility="5.2.008"
                expanded="true"
                height="76"
                name="Sample"
                width="90"
                x="179"
                y="30">
        <parameter key="sample_size" value="1000"/>
        <list key="sample_size_per_class"/>
        <list key="sample_ratio_per_class"/>
        <list key="sample_probability_per_class"/>
      </operator>

      <!-- Step 3: Nominal to Numerical, with no comparison groups configured. -->
      <operator activated="true"
                class="nominal_to_numerical"
                compatibility="5.2.008"
                expanded="true"
                height="94"
                name="Nominal to Numerical"
                width="90"
                x="313"
                y="30">
        <list key="comparison_groups"/>
      </operator>

      <!-- Step 4: Classification by Regression wrapping an SVM with no explicit parameters. -->
      <operator activated="true"
                class="classification_by_regression"
                compatibility="5.2.008"
                expanded="true"
                height="76"
                name="Classification by Regression"
                width="90"
                x="447"
                y="30">
        <process expanded="true">
          <operator activated="true"
                    class="support_vector_machine"
                    compatibility="5.2.008"
                    expanded="true"
                    height="112"
                    name="SVM"
                    width="90"
                    x="514"
                    y="30"/>
          <connect from_port="training set" to_op="SVM" to_port="training set"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
        </process>
      </operator>

      <!-- Wiring of the main process; the resulting model is delivered to port "result 1". -->
      <connect from_op="Read CSV" from_port="output" to_op="Sample" to_port="example set input"/>
      <connect from_op="Sample" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Classification by Regression" to_port="training set"/>
      <connect from_op="Classification by Regression" from_port="model" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>
The problem now is that it's not making any progress after 4.5 hours. Activity Monitor shows it's no longer making context switches or Mach calls. The GUI is responding, but that's about it. Only the Pause and Stop buttons are lit; the Go arrow is still greyed out. The bottom of the screen shows [1] Process 4:30:xx Read CSV 4:30:xx with the time ticking over.
Something's got to be wrong. 600 MB is just not that big, and my Java data cleaner produces it in less than a minute with negligible RAM.
Most of all, how can I get insight into what's going on? The GUI just isn't that illuminating.
Program's slightly changed from last time:
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!--
  RapidMiner process definition.
  Operator chain (see the connect elements near the end):
    Read CSV, then Sample, then Nominal to Numerical, then
    Classification by Regression (whose inner learner is an SVM).
-->
<process version="5.2.008">
  <context>
    <input/>
    <output/>
    <macros/>
  </context>
  <operator activated="true"
            class="process"
            compatibility="5.2.008"
            expanded="true"
            name="Process">
    <parameter key="resultfile" value="/Users/brad/Dropbox-Overflow/ASADataExpo2009/decisions"/>
    <parameter key="parallelize_main_process" value="true"/>
    <process expanded="true" height="161" width="631">

      <!-- Step 1: load 2008.csv; 29 columns (keys 0 to 28) are declared below. -->
      <operator activated="true"
                class="read_csv"
                compatibility="5.2.008"
                expanded="true"
                height="60"
                name="Read CSV"
                width="90"
                x="45"
                y="30">
        <parameter key="csv_file" value="/Users/brad/Dropbox-Overflow/ASADataExpo2009/2008.csv"/>
        <parameter key="column_separators" value=","/>
        <parameter key="trim_lines" value="true"/>
        <parameter key="use_quotes" value="false"/>
        <parameter key="skip_comments" value="true"/>
        <parameter key="first_row_as_names" value="false"/>
        <!-- Row 0 is annotated as "Name" while first_row_as_names is false. -->
        <list key="annotations">
          <parameter key="0" value="Name"/>
        </list>
        <parameter key="encoding" value="MacRoman"/>
        <!--
          One entry per column, keyed by column index. Each value has four
          dot-separated fields; they appear to be
          name / boolean flag / value type / role.
          NOTE(review): the boolean presumably marks whether the column is
          selected for import; confirm against the Read CSV documentation.
        -->
        <list key="data_set_meta_data_information">
          <parameter key="0" value="Year.false.integer.attribute"/>
          <parameter key="1" value="Month.true.integer.attribute"/>
          <parameter key="2" value="DayofMonth.true.integer.attribute"/>
          <parameter key="3" value="DayOfWeek.true.integer.attribute"/>
          <parameter key="4" value="DepTime.true.integer.attribute"/>
          <parameter key="5" value="CRSDepTime.true.integer.attribute"/>
          <parameter key="6" value="ArrTime.true.integer.attribute"/>
          <parameter key="7" value="CRSArrTime.true.integer.attribute"/>
          <!-- NOTE(review): typed binominal; confirm this column really holds only two distinct values. -->
          <parameter key="8" value="UniqueCarrier.true.binominal.attribute"/>
          <parameter key="9" value="FlightNum.true.integer.attribute"/>
          <parameter key="10" value="TailNum.false.polynominal.attribute"/>
          <parameter key="11" value="ActualElapsedTime.true.integer.attribute"/>
          <parameter key="12" value="CRSElapsedTime.true.integer.attribute"/>
          <parameter key="13" value="AirTime.true.integer.attribute"/>
          <!-- The only column with the label role. NOTE(review): typed polynominal while the other delay columns are integer; confirm this is intended. -->
          <parameter key="14" value="ArrDelay.true.polynominal.label"/>
          <parameter key="15" value="DepDelay.true.integer.attribute"/>
          <parameter key="16" value="Origin.true.polynominal.attribute"/>
          <parameter key="17" value="Dest.true.polynominal.attribute"/>
          <parameter key="18" value="Distance.true.integer.attribute"/>
          <parameter key="19" value="TaxiIn.false.integer.attribute"/>
          <parameter key="20" value="TaxiOut.false.integer.attribute"/>
          <parameter key="21" value="Cancelled.true.integer.attribute"/>
          <parameter key="22" value="CancellationCode.false.attribute_value.attribute"/>
          <parameter key="23" value="Diverted.false.integer.attribute"/>
          <parameter key="24" value="CarrierDelay.false.integer.attribute"/>
          <parameter key="25" value="WeatherDelay.false.integer.attribute"/>
          <parameter key="26" value="NASDelay.false.integer.attribute"/>
          <parameter key="27" value="SecurityDelay.false.integer.attribute"/>
          <parameter key="28" value="LateAircraftDelay.false.integer.attribute"/>
        </list>
        <parameter key="read_not_matching_values_as_missings" value="false"/>
      </operator>

      <!-- Step 2: sample with sample_size 1000. It consumes the output of Read CSV, so it sits downstream of the read. -->
      <operator activated="true"
                class="sample"
                compatibility="5.2.008"
                expanded="true"
                height="76"
                name="Sample"
                width="90"
                x="179"
                y="30">
        <parameter key="sample_size" value="1000"/>
        <list key="sample_size_per_class"/>
        <list key="sample_ratio_per_class"/>
        <list key="sample_probability_per_class"/>
      </operator>

      <!-- Step 3: Nominal to Numerical, with no comparison groups configured. -->
      <operator activated="true"
                class="nominal_to_numerical"
                compatibility="5.2.008"
                expanded="true"
                height="94"
                name="Nominal to Numerical"
                width="90"
                x="313"
                y="30">
        <list key="comparison_groups"/>
      </operator>

      <!-- Step 4: Classification by Regression wrapping an SVM with no explicit parameters. -->
      <operator activated="true"
                class="classification_by_regression"
                compatibility="5.2.008"
                expanded="true"
                height="76"
                name="Classification by Regression"
                width="90"
                x="447"
                y="30">
        <process expanded="true">
          <operator activated="true"
                    class="support_vector_machine"
                    compatibility="5.2.008"
                    expanded="true"
                    height="112"
                    name="SVM"
                    width="90"
                    x="514"
                    y="30"/>
          <connect from_port="training set" to_op="SVM" to_port="training set"/>
          <connect from_op="SVM" from_port="model" to_port="model"/>
          <portSpacing port="source_training set" spacing="0"/>
          <portSpacing port="sink_model" spacing="0"/>
        </process>
      </operator>

      <!-- Wiring of the main process; the resulting model is delivered to port "result 1". -->
      <connect from_op="Read CSV" from_port="output" to_op="Sample" to_port="example set input"/>
      <connect from_op="Sample" from_port="example set output" to_op="Nominal to Numerical" to_port="example set input"/>
      <connect from_op="Nominal to Numerical" from_port="example set output" to_op="Classification by Regression" to_port="training set"/>
      <connect from_op="Classification by Regression" from_port="model" to_port="result 1"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="0"/>
      <portSpacing port="sink_result 2" spacing="0"/>
    </process>
  </operator>
</process>