Odd results

c1borg
c1borg New Altair Community Member
edited November 5 in Community Q&A
Hi

We have a process which is a time series prediction of daily data.
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<!-- RapidMiner process as originally posted. It reads an Excel sheet, splits the rows 0.9 / 0.1
     with Split Data, runs an evolutionary attribute-weight optimisation (neural net inside a split
     validation) on partition 1, and applies a model reloaded from disk to the weight-selected
     partition 2. -->
<process version="5.0">
  <context>
    <input>
      <location/>
    </input>
    <output>
      <location/>
      <location/>
      <location/>
    </output>
    <macros/>
  </context>
  <operator activated="true" class="process" expanded="true" name="Process">
    <process expanded="true" height="325" width="1173">
      <!-- Input: sheet number 3 of the GBPUSD.xls workbook. -->
      <operator activated="true" class="read_excel" expanded="true" height="60" name="Reference Data" width="90" x="45" y="30">
        <parameter key="excel_file" value="C:\Documents and Settings\Nev\My Documents\My Dropbox\Project Files SVN\C1borgs Space\Files\GBPUSD.xls"/>
        <parameter key="sheet_number" value="3"/>
      </operator>
      <!-- Gives the attribute named Date the role "id". -->
      <operator activated="true" class="set_role" expanded="true" height="76" name="ID" width="90" x="179" y="30">
        <parameter key="name" value="Date"/>
        <parameter key="target_role" value="id"/>
      </operator>
      <!-- Two partitions with ratios 0.9 and 0.1, using linear sampling. Per the connections below,
           partition 1 feeds the Label operator and partition 2 feeds the Prediction operator. -->
      <operator activated="true" class="split_data" expanded="true" height="94" name="Split Data" width="90" x="313" y="30">
        <enumeration key="partitions">
          <parameter key="ratio" value="0.9"/>
          <parameter key="ratio" value="0.1"/>
        </enumeration>
        <parameter key="sampling_type" value="linear sampling"/>
      </operator>
      <!-- Partition 2 branch: the attribute named Label is given the role "prediction". -->
      <operator activated="true" class="set_role" expanded="true" height="76" name="Prediction" width="90" x="447" y="210">
        <parameter key="name" value="Label"/>
        <parameter key="target_role" value="prediction"/>
      </operator>
      <!-- Partition 1 branch: the attribute named Label is given the role "label". -->
      <operator activated="true" class="set_role" expanded="true" height="76" name="Label" width="90" x="447" y="30">
        <parameter key="name" value="Label"/>
        <parameter key="target_role" value="label"/>
      </operator>
      <!-- Evolutionary weight optimisation on partition 1. Its inner process is a split validation
           whose "averagable 1" output is returned to the optimiser as the performance. -->
      <operator activated="true" class="optimize_weights_evolutionary" expanded="true" height="94" name="Optimize Weights (Evolutionary)" width="90" x="581" y="30">
        <parameter key="population_size" value="10"/>
        <parameter key="use_early_stopping" value="true"/>
        <parameter key="selection_scheme" value="roulette wheel"/>
        <parameter key="p_crossover" value="0.2"/>
        <parameter key="crossover_type" value="shuffle"/>
        <process expanded="true" height="287" width="527">
          <operator activated="true" class="split_validation" expanded="true" height="112" name="Validation" width="90" x="45" y="30">
            <!-- First subprocess of Validation: train the Neural Net on the "training" port. -->
            <process expanded="true" height="305" width="238">
              <operator activated="true" class="neural_net" expanded="true" height="76" name="Neural Net" width="90" x="112" y="30">
                <list key="hidden_layers"/>
                <parameter key="learning_rate" value="0.5"/>
                <parameter key="decay" value="true"/>
              </operator>
              <connect from_port="training" to_op="Neural Net" to_port="training set"/>
              <connect from_op="Neural Net" from_port="model" to_port="model"/>
              <portSpacing port="source_training" spacing="0"/>
              <portSpacing port="sink_model" spacing="0"/>
              <portSpacing port="sink_through 1" spacing="0"/>
            </process>
            <!-- Second subprocess of Validation: apply the model to the "test set" port, write the
                 model to disk, and compute the performance. -->
            <process expanded="true" height="305" width="419">
              <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                <list key="application_parameters"/>
              </operator>
              <!-- NOTE(review): this operator is nested inside the validation, which is nested inside
                   the optimiser. As discussed in the thread, model.mod is therefore rewritten on
                   every pass and ends up holding whichever model was written last, which is not
                   necessarily the one with the best weights. Confirm before relying on the file. -->
              <operator activated="true" class="write_model" expanded="true" height="60" name="Write Model" width="90" x="246" y="165">
                <parameter key="model_file" value="C:\Projects\RM5\Forex\data\models\model.mod"/>
              </operator>
              <operator activated="true" class="performance" expanded="true" height="76" name="Performance" width="90" x="246" y="30"/>
              <connect from_port="model" to_op="Apply Model" to_port="model"/>
              <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
              <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
              <connect from_op="Apply Model" from_port="model" to_op="Write Model" to_port="input"/>
              <connect from_op="Performance" from_port="performance" to_port="averagable 1"/>
              <portSpacing port="source_model" spacing="0"/>
              <portSpacing port="source_test set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_averagable 1" spacing="0"/>
              <portSpacing port="sink_averagable 2" spacing="0"/>
            </process>
          </operator>
          <connect from_port="example set" to_op="Validation" to_port="training"/>
          <connect from_op="Validation" from_port="averagable 1" to_port="performance"/>
          <portSpacing port="source_example set" spacing="0"/>
          <portSpacing port="source_through 1" spacing="0"/>
          <portSpacing port="sink_performance" spacing="0"/>
        </process>
      </operator>
      <!-- Configured with weight_relation "greater" and weight 0.5. Per the connections below it
           receives partition 2 (via Prediction) and the weights output of the optimiser. -->
      <operator activated="true" class="select_by_weights" expanded="true" height="94" name="Select by Weights" width="90" x="715" y="210">
        <parameter key="weight_relation" value="greater"/>
        <parameter key="weight" value="0.5"/>
        <parameter key="use_absolute_weights" value="false"/>
      </operator>
      <!-- Reads the same file that Write Model writes inside the validation. -->
      <operator activated="true" class="read_model" expanded="true" height="60" name="Read Model" width="90" x="715" y="120">
        <parameter key="model_file" value="C:\Projects\RM5\Forex\data\models\model.mod"/>
      </operator>
      <!-- Applies the reloaded model to the output of Select by Weights.
           NOTE(review): per the answer in the thread, the stored model was built before attribute
           selection while this input has had attributes filtered away; that mismatch is the
           suggested cause of the all-BUY / all-SELL predictions. -->
      <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply Model (2)" width="90" x="849" y="165">
        <list key="application_parameters"/>
      </operator>
      <connect from_op="Reference Data" from_port="output" to_op="ID" to_port="example set input"/>
      <connect from_op="ID" from_port="example set output" to_op="Split Data" to_port="example set"/>
      <connect from_op="Split Data" from_port="partition 1" to_op="Label" to_port="example set input"/>
      <connect from_op="Split Data" from_port="partition 2" to_op="Prediction" to_port="example set input"/>
      <connect from_op="Prediction" from_port="example set output" to_op="Select by Weights" to_port="example set input"/>
      <connect from_op="Label" from_port="example set output" to_op="Optimize Weights (Evolutionary)" to_port="example set in"/>
      <connect from_op="Optimize Weights (Evolutionary)" from_port="weights" to_op="Select by Weights" to_port="weights"/>
      <connect from_op="Optimize Weights (Evolutionary)" from_port="performance" to_port="result 1"/>
      <connect from_op="Select by Weights" from_port="example set output" to_op="Apply Model (2)" to_port="unlabelled data"/>
      <connect from_op="Read Model" from_port="output" to_op="Apply Model (2)" to_port="model"/>
      <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 2"/>
      <portSpacing port="source_input 1" spacing="0"/>
      <portSpacing port="sink_result 1" spacing="36"/>
      <portSpacing port="sink_result 2" spacing="72"/>
      <portSpacing port="sink_result 3" spacing="18"/>
    </process>
  </operator>
</process>
The data is split into two subsets by the "Split Data" operator.
90% (about 225 bars) will be the part we use to learn, 10% (about 25) are unseen bars.
Split validation operator will further split the 90% into 70% (about 157 bars) and 30% (about 68 bars).

The results are acceptable, but there seems to be a major error: the predictions for the 25 unseen bars are either all BUY or all SELL, whereas the testing data has mixed BUY and SELL signals.

Any guidance you can give as to why this is happening would be appreciated
Tagged:

Answers

  • haddock
    haddock New Altair Community Member
    Hi there c1borg,

    The model is built on the full set of attributes (during the weight optimisation), but applied to a slimmed-down attribute set on the 25 test examples. If the model refers to attributes which have been filtered away, wacky behaviour should be expected  ;D 

    As an aside the model is actually only trained on 70% of the training set, being the product of the last pass through the validation.

    So weight up the training set and make a model from that, and all should be .....

    Hope so .

  • c1borg
    c1borg New Altair Community Member
    Many thanks Haddock  ;D

    It seems so obvious where we have been going wrong, now that you have pointed us in the right direction. I will have a play around with the experiment, and if you don't mind I may pick your brains again if I get stuck.

    Thanks again.
    C1borg
  • haddock
    haddock New Altair Community Member
    Nice one C1Borg, good luck.
  • Nick_Coldhand
    Nick_Coldhand New Altair Community Member
    I'm c1borg's mate. Thanks Haddock for pointing us in the right direction, but I don't understand what you mean (most probably because I'm a complete noob).

    When I run the process above, I see that "Optimize Weights" runs only once, while "Split Validation" (and all the operators inside it, including "Write Model") runs several times. I believe this is because RapidMiner is trying different sets of weights as part of the genetic optimization.

    If this is true, the latest time the model is written, should be related to the latest set of weights, which is what we want.

    With the operator "select by weights" I meant to recall this set and apply the model to a set with the same weights as in the learning (training) process.
  • haddock
    haddock New Altair Community Member
    Hi Nick,

    I hear where you are coming from, it seems intuitive that the last pass through the validation should have the best performance. But that would be wrong because for each combo of weights there is a validation, which in turn is made up of several passes through the data, and all that is being caught is the model made on the very last pass.

    So the last pass could be on sub-optimal weights (because the weight optimiser can run on past the optimum) and on an unfortunate 70% subset of the training data. To illustrate the point I've added a log to the following code, and put both example sets through.
    .
    <?xml version="1.0" encoding="UTF-8" standalone="no"?>
    <!-- Illustrative variant of the process: the Excel input and the role-setting operators are
         deactivated, generated data is used instead, and a Log operator is added inside the
         validation to record each pass together with its performance. -->
    <process version="5.0">
      <context>
        <input>
          <location/>
        </input>
        <output>
          <location/>
          <location/>
          <location/>
          <location/>
        </output>
        <macros/>
      </context>
      <operator activated="true" class="process" expanded="true" name="Process">
        <process expanded="true" height="561" width="915">
          <!-- Deactivated in this variant (activated="false"); not connected below. -->
          <operator activated="false" class="read_excel" expanded="true" height="60" name="Reference Data" width="90" x="45" y="30">
            <parameter key="excel_file" value="C:\Documents and Settings\Nev\My Documents\My Dropbox\Project Files SVN\C1borgs Space\Files\GBPUSD.xls"/>
            <parameter key="sheet_number" value="3"/>
          </operator>
          <!-- Deactivated in this variant (activated="false"); not connected below. -->
          <operator activated="false" class="set_role" expanded="true" height="76" name="ID" width="90" x="179" y="30">
            <parameter key="name" value="Date"/>
            <parameter key="target_role" value="id"/>
          </operator>
          <!-- Reads the same file that Write Model writes inside the validation. -->
          <operator activated="true" class="read_model" expanded="true" height="60" name="Read Model" width="90" x="715" y="120">
            <parameter key="model_file" value="C:\Projects\RM5\Forex\data\models\model.mod"/>
          </operator>
          <!-- Replacement input: Generate Data (default parameters) feeds Discretize, which is
               configured for the single attribute "label" with special attributes included. -->
          <operator activated="true" class="generate_data" expanded="true" height="60" name="Generate Data" width="90" x="1" y="104"/>
          <operator activated="true" class="discretize_by_bins" expanded="true" height="94" name="Discretize" width="90" x="112" y="165">
            <parameter key="attribute_filter_type" value="single"/>
            <parameter key="attribute" value="label"/>
            <parameter key="include_special_attributes" value="true"/>
          </operator>
          <!-- Two partitions with ratios 0.9 and 0.1, using linear sampling. Per the connections
               below, partition 1 feeds the optimiser and partition 2 feeds Select by Weights. -->
          <operator activated="true" class="split_data" expanded="true" height="94" name="Split Data" width="90" x="313" y="120">
            <enumeration key="partitions">
              <parameter key="ratio" value="0.9"/>
              <parameter key="ratio" value="0.1"/>
            </enumeration>
            <parameter key="sampling_type" value="linear sampling"/>
          </operator>
          <!-- Deactivated in this variant (activated="false"); not connected below. -->
          <operator activated="false" class="set_role" expanded="true" height="76" name="Prediction" width="90" x="447" y="210">
            <parameter key="name" value="Label"/>
            <parameter key="target_role" value="prediction"/>
          </operator>
          <!-- Deactivated in this variant (activated="false"); not connected below. -->
          <operator activated="false" class="set_role" expanded="true" height="76" name="Label" width="90" x="313" y="30">
            <parameter key="name" value="Label"/>
            <parameter key="target_role" value="label"/>
          </operator>
          <!-- Evolutionary weight optimisation on partition 1. Its inner process is a split
               validation whose "averagable 1" output is returned as the performance. -->
          <operator activated="true" class="optimize_weights_evolutionary" expanded="true" height="94" name="Optimize Weights (Evolutionary)" width="90" x="581" y="30">
            <parameter key="population_size" value="10"/>
            <parameter key="use_early_stopping" value="true"/>
            <parameter key="selection_scheme" value="roulette wheel"/>
            <parameter key="p_crossover" value="0.2"/>
            <parameter key="crossover_type" value="shuffle"/>
            <process expanded="true" height="561" width="915">
              <operator activated="true" class="split_validation" expanded="true" height="112" name="Validation" width="90" x="45" y="30">
                <!-- First subprocess of Validation: train the Neural Net on the "training" port. -->
                <process expanded="true" height="561" width="432">
                  <operator activated="true" class="neural_net" expanded="true" height="76" name="Neural Net" width="90" x="112" y="30">
                    <list key="hidden_layers"/>
                    <parameter key="learning_rate" value="0.5"/>
                    <parameter key="decay" value="true"/>
                  </operator>
                  <connect from_port="training" to_op="Neural Net" to_port="training set"/>
                  <connect from_op="Neural Net" from_port="model" to_port="model"/>
                  <portSpacing port="source_training" spacing="0"/>
                  <portSpacing port="sink_model" spacing="0"/>
                  <portSpacing port="sink_through 1" spacing="0"/>
                </process>
                <!-- Second subprocess of Validation: apply the model, write it to disk, compute the
                     performance, and log it before passing it on to "averagable 1". -->
                <process expanded="true" height="561" width="432">
                  <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply Model" width="90" x="45" y="30">
                    <list key="application_parameters"/>
                  </operator>
                  <!-- NOTE(review): still nested inside the validation and the optimiser, so, as the
                       thread explains, the file holds whichever model was written on the last pass. -->
                  <operator activated="true" class="write_model" expanded="true" height="60" name="Write Model" width="90" x="45" y="210">
                    <parameter key="model_file" value="C:\Projects\RM5\Forex\data\models\model.mod"/>
                  </operator>
                  <operator activated="true" class="performance" expanded="true" height="76" name="Performance" width="90" x="179" y="120"/>
                  <!-- Added for illustration: logs the applycount value of Apply Model under the key
                       "pass" and the performance value of Performance under the key "performance". -->
                  <operator activated="true" class="log" expanded="true" height="76" name="Log" width="90" x="313" y="30">
                    <list key="log">
                      <parameter key="pass" value="operator.Apply Model.value.applycount"/>
                      <parameter key="performance" value="operator.Performance.value.performance"/>
                    </list>
                  </operator>
                  <connect from_port="model" to_op="Apply Model" to_port="model"/>
                  <connect from_port="test set" to_op="Apply Model" to_port="unlabelled data"/>
                  <connect from_op="Apply Model" from_port="labelled data" to_op="Performance" to_port="labelled data"/>
                  <connect from_op="Apply Model" from_port="model" to_op="Write Model" to_port="input"/>
                  <connect from_op="Performance" from_port="performance" to_op="Log" to_port="through 1"/>
                  <connect from_op="Log" from_port="through 1" to_port="averagable 1"/>
                  <portSpacing port="source_model" spacing="0"/>
                  <portSpacing port="source_test set" spacing="0"/>
                  <portSpacing port="source_through 1" spacing="0"/>
                  <portSpacing port="sink_averagable 1" spacing="0"/>
                  <portSpacing port="sink_averagable 2" spacing="0"/>
                </process>
              </operator>
              <connect from_port="example set" to_op="Validation" to_port="training"/>
              <connect from_op="Validation" from_port="averagable 1" to_port="performance"/>
              <portSpacing port="source_example set" spacing="0"/>
              <portSpacing port="source_through 1" spacing="0"/>
              <portSpacing port="sink_performance" spacing="0"/>
            </process>
          </operator>
          <!-- Configured with weight_relation "greater" and weight 0.5. Per the connections below it
               receives partition 2 directly from Split Data and the optimiser's weights output. -->
          <operator activated="true" class="select_by_weights" expanded="true" height="94" name="Select by Weights" width="90" x="715" y="210">
            <parameter key="weight_relation" value="greater"/>
            <parameter key="weight" value="0.5"/>
            <parameter key="use_absolute_weights" value="false"/>
          </operator>
          <!-- Applies the model from Read Model to the output of Select by Weights. -->
          <operator activated="true" class="apply_model" expanded="true" height="76" name="Apply Model (2)" width="90" x="849" y="165">
            <list key="application_parameters"/>
          </operator>
          <connect from_op="Read Model" from_port="output" to_op="Apply Model (2)" to_port="model"/>
          <connect from_op="Generate Data" from_port="output" to_op="Discretize" to_port="example set input"/>
          <connect from_op="Discretize" from_port="example set output" to_op="Split Data" to_port="example set"/>
          <connect from_op="Split Data" from_port="partition 1" to_op="Optimize Weights (Evolutionary)" to_port="example set in"/>
          <connect from_op="Split Data" from_port="partition 2" to_op="Select by Weights" to_port="example set input"/>
          <!-- The optimiser's example set output is delivered as result 3 in this variant. -->
          <connect from_op="Optimize Weights (Evolutionary)" from_port="example set out" to_port="result 3"/>
          <connect from_op="Optimize Weights (Evolutionary)" from_port="weights" to_op="Select by Weights" to_port="weights"/>
          <connect from_op="Optimize Weights (Evolutionary)" from_port="performance" to_port="result 1"/>
          <connect from_op="Select by Weights" from_port="example set output" to_op="Apply Model (2)" to_port="unlabelled data"/>
          <connect from_op="Apply Model (2)" from_port="labelled data" to_port="result 2"/>
          <portSpacing port="source_input 1" spacing="0"/>
          <portSpacing port="sink_result 1" spacing="0"/>
          <portSpacing port="sink_result 2" spacing="0"/>
          <portSpacing port="sink_result 3" spacing="0"/>
          <portSpacing port="sink_result 4" spacing="0"/>
        </process>
      </operator>
    </process>
  • Nick_Coldhand
    Nick_Coldhand New Altair Community Member
    I understand now. Maybe there are further attempts to find better weights, but not necessarily the latest one is the best.

    What I can't figure out yet is how I can write a model using the output of "Optimize Weights (Evolutionary)".
    First step is to weigh the training set, like you suggested. Using "Select by weights" should do.
    But then? Should I run again "Split Validation" with nested learner on the new set, or what?
    Sorry for being demanding, it's just that we have been stuck on this for days.
  • haddock
    haddock New Altair Community Member

    Try to separate the issues. The weighting issue thins down the attributes, then there is the modeling, then there is the testing.