00001
00008 #include "M5Tree.hh"
00009
00010
00011
00012
00013 #define WANT_MATH // include.h will get math fns
00014
00015 #include "../newmat/newmatap.h"
00016 #ifdef use_namespace
00017 using namespace NEWMAT;
00018 #endif
00019
00020
00021
00022 M5Tree::M5Tree(int id, int trainMode, int trainFreq, int m,
00023 float featPct, bool simple, bool allowAllFeats,
00024 float min_sdr, Random rng):
00025 id(id), mode(trainMode), freq(trainFreq), M(m),
00026 featPct(featPct), SIMPLE(simple), ALLOW_ALL_FEATS(allowAllFeats),
00027 MIN_SDR(min_sdr), rng(rng)
00028 {
00029
00030 nnodes = 0;
00031 nOutput = 0;
00032 nExperiences = 0;
00033 hadError = false;
00034 totalnodes = 0;
00035 maxnodes = N_M5_NODES;
00036
00037
00038
00039 SPLIT_MARGIN = 0.0;
00040
00041 LMDEBUG = false;
00042 DTDEBUG = false;
00043 SPLITDEBUG = false;
00044 STOCH_DEBUG = false;
00045 INCDEBUG = false;
00046 NODEDEBUG = false;
00047 COPYDEBUG = false;
00048 nfeat = 4;
00049
00050 cout << "Created m5 decision tree " << id;
00051 if (SIMPLE) cout << " simple regression";
00052 else cout << " multivariate regression";
00053 if (ALLOW_ALL_FEATS) cout << " (all feats)";
00054 else cout << " (subtree feats)";
00055 if (DTDEBUG){
00056 cout << " mode: " << mode << " freq: " << freq << endl;
00057 }
00058 cout << " MIN_SDR: " << MIN_SDR << endl;
00059
00060 initNodes();
00061 initTree();
00062
00063
00064 }
00065
00066 M5Tree::M5Tree(const M5Tree& m5):
00067 id(m5.id), mode(m5.mode), freq(m5.freq), M(m5.M),
00068 featPct(m5.featPct), SIMPLE(m5.SIMPLE), ALLOW_ALL_FEATS(m5.ALLOW_ALL_FEATS),
00069 MIN_SDR(m5.MIN_SDR), rng(m5.rng)
00070 {
00071 COPYDEBUG = m5.COPYDEBUG;
00072 if (COPYDEBUG) cout << "m5 copy " << id << endl;
00073 nnodes = 0;
00074 nOutput = m5.nOutput;
00075 nExperiences = m5.nExperiences;
00076 hadError = m5.hadError;
00077 totalnodes = 0;
00078 maxnodes = m5.maxnodes;
00079 SPLIT_MARGIN = m5.SPLIT_MARGIN;
00080 LMDEBUG = m5.LMDEBUG;
00081 DTDEBUG = m5.DTDEBUG;
00082 SPLITDEBUG = m5.SPLITDEBUG;
00083 STOCH_DEBUG = m5.STOCH_DEBUG;
00084 INCDEBUG = m5.INCDEBUG;
00085 NODEDEBUG = m5.NODEDEBUG;
00086 nfeat = m5.nfeat;
00087
00088 if (COPYDEBUG) cout << " M5 copy nodes, experiences, root, etc" << endl;
00089
00090 for (int i = 0; i < N_M5_EXP; i++){
00091 allExp[i] = m5.allExp[i];
00092 }
00093 if (COPYDEBUG) cout << " M5 copied exp array" << endl;
00094
00095
00096 experiences.resize(m5.experiences.size());
00097 for (unsigned i = 0; i < m5.experiences.size(); i++){
00098 experiences[i] = &(allExp[i]);
00099 }
00100 if (COPYDEBUG) cout << " M5 set experience pointers" << endl;
00101
00102
00103 initNodes();
00104
00105 if (COPYDEBUG) cout << " M5 copy tree " << endl;
00106 root = allocateNode();
00107 lastNode = root;
00108 copyTree(root, m5.root);
00109 if (COPYDEBUG) cout << " M5 tree copy done" << endl;
00110
00111 if (COPYDEBUG) {
00112 cout << endl << "New tree: " << endl;
00113 printTree(root, 0);
00114 cout << endl;
00115 cout << " m5 copy done" << endl;
00116 }
00117
00118 }
00119
00120 void M5Tree::copyTree(tree_node* newNode, tree_node* origNode){
00121
00122 int nodeId = newNode->id;
00123
00124 if (COPYDEBUG) {
00125 cout << " Copy node " << newNode->id << " from node " << origNode->id << endl;
00126 cout << " NewAddy " << newNode << ", old: " << origNode << endl;
00127 }
00128
00129
00130 *newNode = *origNode;
00131 newNode->id = nodeId;
00132
00133
00134 if (origNode->l != NULL && !newNode->leaf){
00135 newNode->l = allocateNode();
00136 if (COPYDEBUG) cout << " Copy left node " << newNode->l->id << " from " << origNode->l->id << endl;
00137 copyTree(newNode->l, origNode->l);
00138 } else {
00139 newNode->l = NULL;
00140 }
00141
00142 if (origNode->r != NULL && !newNode->leaf){
00143 newNode->r = allocateNode();
00144 if (COPYDEBUG) cout << " Copy right node " << newNode->r->id << " from " << origNode->r->id << endl;
00145 copyTree(newNode->r, origNode->r);
00146 } else {
00147 newNode->r = NULL;
00148 }
00149 }
00150
00151 M5Tree* M5Tree::getCopy(){
00152 M5Tree* copy = new M5Tree(*this);
00153 return copy;
00154 }
00155
00156 M5Tree::~M5Tree() {
00157 deleteTree(root);
00158 for (unsigned i = N_M5_EXP; i < experiences.size(); i++){
00159 delete experiences[i];
00160 }
00161 experiences.clear();
00162 }
00163
00164
00165 bool M5Tree::trainInstance(classPair &instance){
00166
00167 if (DTDEBUG) cout << id << " trainInstance" << endl;
00168
00169
00170
00171 nfeat = instance.in.size();
00172
00173 bool modelChanged = false;
00174
00175
00176
00177
00178 tree_experience *e;
00179 if (nExperiences < N_M5_EXP){
00180
00181 e = &(allExp[nExperiences]);
00182
00183 } else {
00184
00185 e = new tree_experience;
00186 }
00187
00188
00189 e->input = instance.in;
00190 e->output = instance.out;
00191 experiences.push_back(e);
00192 nExperiences++;
00193
00194 if (nExperiences == 1000000){
00195 cout << "Reached limit of # experiences allowed." << endl;
00196 return false;
00197 }
00198
00199 if (nExperiences != (int)experiences.size())
00200 cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00201
00202
00203
00204
00205
00206
00207
00208
00209 if ( DTDEBUG) {
00210 cout << "Original input: ";
00211 for (unsigned i = 0; i < instance.in.size(); i++){
00212 cout << instance.in[i] << ", ";
00213 }
00214 cout << endl << " Original output: " << instance.out << endl;
00215 cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00216 cout << "Address: " << e << " Input : ";
00217 for (unsigned i = 0; i < e->input.size(); i++){
00218 cout << e->input[i] << ", ";
00219 }
00220 cout << endl << " Now have " << nExperiences << " experiences." << endl;
00221 }
00222
00223
00224
00225
00226 if (mode == BUILD_EVERY || nExperiences <= 1){
00227 rebuildTree();
00228 modelChanged = true;
00229 }
00230
00231
00232 else if (mode == BUILD_ON_ERROR){
00233
00234
00235
00236 std::map<float, float> answer;
00237 testInstance(e->input, &answer);
00238 float val = answer.begin()->first;
00239 float error = fabs(val - e->output);
00240
00241 if (error > 0.0){
00242 rebuildTree();
00243 modelChanged = true;
00244 }
00245 }
00246
00247
00248 else if (mode == BUILD_EVERY_N){
00249
00250 if (!modelChanged && (nExperiences % freq) == 0){
00251 rebuildTree();
00252 modelChanged = true;
00253 }
00254 }
00255
00256 if (modelChanged){
00257 if (DTDEBUG) cout << "DT " << id << " tree re-built." << endl;
00258
00259 if (DTDEBUG){
00260 cout << endl << "DT: " << id << endl;
00261 printTree(root, 0);
00262 cout << "Done printing tree" << endl;
00263 }
00264 }
00265
00266 return modelChanged;
00267
00268 }
00269
00270
00271
00272 bool M5Tree::trainInstances(std::vector<classPair> &instances){
00273 if (DTDEBUG) cout << id << " DT trainInstances: "
00274 << instances.size()
00275 << " nExp: " << nExperiences << endl;
00276 nfeat = instances[0].in.size();
00277
00278
00279
00280 bool modelChanged = false;
00281
00282 bool doBuild = false;
00283
00284
00285 for (unsigned a = 0; a < instances.size(); a++){
00286 classPair instance = instances[a];
00287
00288
00289
00290
00291 tree_experience *e;
00292 if (nExperiences < N_M5_EXP){
00293
00294 e = &(allExp[nExperiences]);
00295
00296 } else {
00297
00298 e = new tree_experience;
00299 }
00300
00301
00302 e->input = instance.in;
00303 e->output = instance.out;
00304 experiences.push_back(e);
00305 nExperiences++;
00306
00307 if (nExperiences == 1000000){
00308 cout << "Reached limit of # experiences allowed." << endl;
00309 return false;
00310 }
00311
00312 if (nExperiences != (int)experiences.size())
00313 cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00314
00315 if (DTDEBUG) {
00316 cout << "Original input: ";
00317 for (unsigned i = 0; i < instance.in.size(); i++){
00318 cout << instance.in[i] << ", ";
00319 }
00320 cout << endl << " Original output: " << instance.out << endl;
00321 cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00322 cout << "Address: " << e << " Input : ";
00323 for (unsigned i = 0; i < e->input.size(); i++){
00324 cout << e->input[i] << ", ";
00325 }
00326 cout << endl << " Now have " << nExperiences << " experiences." << endl;
00327 }
00328
00329
00330
00331
00332 if (doBuild) continue;
00333
00334
00335 if (mode == BUILD_EVERY || nExperiences <= 1){
00336 doBuild = true;
00337 }
00338
00339
00340 else if (mode == BUILD_ON_ERROR){
00341
00342
00343
00344 std::map<float, float> answer;
00345 testInstance(e->input, &answer);
00346 float val = answer.begin()->first;
00347 float error = fabs(val - e->output);
00348
00349 if (error > 0.0){
00350 doBuild = true;
00351 }
00352
00353 }
00354
00355
00356 else if (mode == BUILD_EVERY_N){
00357
00358 if (!modelChanged && (nExperiences % freq) == 0){
00359 doBuild = true;
00360 }
00361 }
00362
00363 }
00364
00365 if (DTDEBUG) cout << "Added " << instances.size() << " new instances. doBuild = " << doBuild << endl;
00366
00367 if (doBuild){
00368 rebuildTree();
00369 modelChanged = true;
00370 }
00371
00372 if (modelChanged){
00373 if (DTDEBUG) cout << "DT " << id << " tree re-built." << endl;
00374
00375 if (DTDEBUG){
00376 cout << endl << "DT: " << id << endl;
00377 printTree(root, 0);
00378 cout << "Done printing tree" << endl;
00379 }
00380 }
00381
00382 return modelChanged;
00383
00384 }
00385
00386
00387 void M5Tree::rebuildTree(){
00388
00389
00390 buildTree(root, experiences, false);
00391
00392 }
00393
00394
00395
00396 void M5Tree::testInstance(const std::vector<float> &input, std::map<float, float>* retval){
00397 if (DTDEBUG) cout << "testInstance" << endl;
00398
00399 retval->clear();
00400
00401
00402 if (experiences.size() == 0){
00403 (*retval)[0.0] = 1.0;
00404 return;
00405 }
00406
00407
00408 tree_node* leaf = traverseTree(root, input);
00409 lastNode = leaf;
00410
00411
00412 leafPrediction(leaf, input, retval);
00413
00414 }
00415
00416 float M5Tree::getConf(const std::vector<float> &input){
00417 if (DTDEBUG) cout << "numVisits" << endl;
00418
00419
00420 if (experiences.size() == 0){
00421 return 0;
00422 }
00423
00424 if (lastNode == NULL){
00425 return 0;
00426 }
00427
00428
00429
00430
00431
00432 float conf = (float)lastNode->nInstances / (float)(2.0*M);
00433 if (conf > 1.0)
00434 return 1.0;
00435 else
00436 return conf;
00437
00438 }
00439
00440
00441
00442
00443
00444
00445 void M5Tree::initTree(){
00446 if (DTDEBUG) cout << "initTree()" << endl;
00447 root = allocateNode();
00448
00449 if (DTDEBUG) cout << " root id = " << root->id << endl;
00450
00451
00452 for (int i = 0; i < id; i++){
00453 rng.uniform(0, 1);
00454 }
00455
00456 }
00457
00458
00459
00460
00461 void M5Tree::initTreeNode(tree_node* node){
00462 if (DTDEBUG) cout << "initTreeNode()";
00463
00464 node->id = nnodes++;
00465 if (DTDEBUG) cout << " id = " << node->id << endl;
00466
00467 totalnodes++;
00468 if (totalnodes > maxnodes){
00469 maxnodes = totalnodes;
00470 if (DTDEBUG) cout << id << " M5 MAX nodes: " << maxnodes << endl;
00471 }
00472
00473
00474 node->dim = -1;
00475 node->val = -1;
00476
00477
00478 node->nInstances = 0;
00479 node->constant = 0;
00480
00481
00482
00483
00484
00485 node->l = NULL;
00486 node->r = NULL;
00487
00488 node->leaf = true;
00489
00490 }
00491
00492 void M5Tree::deleteTree(tree_node* node){
00493 if (DTDEBUG) cout << "deleteTree, node=" << node->id << endl;
00494
00495 if (node==NULL)
00496 return;
00497
00498 totalnodes--;
00499
00500 node->nInstances = 0;
00501 node->coefficients.clear();
00502
00503
00504
00505 if (!node->leaf){
00506
00507 if (node->l != NULL){
00508 deleteTree(node->l);
00509 deallocateNode(node->l);
00510 node->l = NULL;
00511 }
00512
00513
00514 if (node->r != NULL){
00515 deleteTree(node->r);
00516 deallocateNode(node->r);
00517 node->r = NULL;
00518 }
00519 }
00520
00521 node->leaf = true;
00522 node->dim = -1;
00523 node->val = -1;
00524 node->constant = 0;
00525 }
00526
00527
00528 M5Tree::tree_node* M5Tree::getCorrectChild(tree_node* node,
00529 const std::vector<float> &input){
00530
00531 if (DTDEBUG) cout << "getCorrectChild, node=" << node->id << endl;
00532
00533 if (passTest(node->dim, node->val, input))
00534 return node->l;
00535 else
00536 return node->r;
00537
00538 }
00539
00540 M5Tree::tree_node* M5Tree::traverseTree(tree_node* node,
00541 const std::vector<float> &input){
00542
00543 if (DTDEBUG) cout << "traverseTree, node=" << node->id << endl;
00544
00545 while (!node->leaf){
00546 node = getCorrectChild(node, input);
00547 }
00548
00549 return node;
00550 }
00551
00552
00553 bool M5Tree::passTest(int dim, float val, const std::vector<float> &input){
00554 if (DTDEBUG) cout << "passTest, dim=" << dim << ",val=" << val
00555 << ",input["<<dim<<"]=" << input[dim] <<endl;
00556
00557 if (input[dim] > val)
00558 return false;
00559 else
00560 return true;
00561
00562 }
00563
00564
00565 void M5Tree::buildTree(tree_node *node,
00566 const std::vector<tree_experience*> &instances,
00567 bool changed){
00568 if(DTDEBUG) cout << "buildTree, node=" << node->id
00569 << ",nInstances:" << instances.size()
00570 << ",chg:" << changed << endl;
00571
00572 if (instances.size() == 0){
00573 cout << "Error: buildTree called on tree " << id << " node " << node->id << " with no instances." << endl;
00574 exit(-1);
00575 }
00576
00577
00578
00579
00580
00581
00582 node->nInstances = instances.size();
00583
00584 bool allSame = true;
00585 float val0 = instances[0]->output;
00586 for (unsigned i = 1; i < instances.size(); i++){
00587 if (instances[i]->output != val0){
00588 allSame = false;
00589 break;
00590 }
00591 }
00592
00593
00594 if (allSame){
00595 makeLeaf(node);
00596 node->constant = instances[0]->output;
00597 if (DTDEBUG){
00598 cout << "Tree " << id << " node " << node->id
00599 << " All " << node->nInstances
00600 << " classified with output "
00601 << instances[0]->output << endl;
00602 }
00603 return;
00604 }
00605
00606
00607 if (node->leaf && node->coefficients.size() > 0) {
00608
00609 float errorSum = 0;
00610 for (unsigned i = 0; i < instances.size(); i++){
00611
00612 tree_node* leaf = traverseTree(node, instances[i]->input);
00613 std::map<float, float> retval;
00614 leafPrediction(leaf, instances[i]->input, &retval);
00615 float prediction = retval.begin()->first;
00616 float absError = fabs(prediction - instances[i]->output);
00617 errorSum += absError;
00618
00619
00620
00621 }
00622 float avgError = errorSum / (float)instances.size();
00623 if (avgError < 0.001){
00624
00625 return;
00626 }
00627 }
00628
00629
00630 if (SPLITDEBUG) cout << endl << "Creating new decision node" << endl;
00631
00632 node->leaf = false;
00633
00634
00635 float bestSDR = -1.0;
00636 int bestDim = -1;
00637 float bestVal = -1;
00638 std::vector<tree_experience*> bestLeft;
00639 std::vector<tree_experience*> bestRight;
00640
00641 testPossibleSplits(instances, &bestSDR, &bestDim, &bestVal, &bestLeft, &bestRight);
00642
00643 implementSplit(node, instances, bestSDR, bestDim, bestVal, bestLeft, bestRight, changed);
00644
00645
00646 pruneTree(node, instances);
00647
00648 }
00649
00650
00651 void M5Tree::makeLeaf(tree_node* node){
00652
00653 removeChildren(node);
00654
00655
00656
00657 if (node->coefficients.size() != (unsigned)nfeat){
00658 node->coefficients.resize(nfeat, 0);
00659 }
00660 for (unsigned i = 0; i < node->coefficients.size(); i++){
00661 node->coefficients[i] = 0;
00662 }
00663
00664 }
00665
00666
00667 void M5Tree::removeChildren(tree_node* node){
00668
00669 if (node->l != NULL){
00670 deleteTree(node->l);
00671 deallocateNode(node->l);
00672 node->l = NULL;
00673 }
00674
00675 if (node->r != NULL){
00676 deleteTree(node->r);
00677 deallocateNode(node->r);
00678 node->r = NULL;
00679 }
00680
00681 node->leaf = true;
00682 }
00683
00684
00685 void M5Tree::pruneTree(tree_node *node,
00686 const std::vector<tree_experience*> &instances){
00687 if (LMDEBUG || DTDEBUG){
00688 printTree(root, 0);
00689 cout << "pruneTree, node=" << node->id
00690 << ",nInstances:" << instances.size() << endl;
00691 }
00692
00693
00694
00695
00696
00697 float subtreeErrorSum = 0;
00698 for (unsigned i = 0; i < instances.size(); i++){
00699
00700
00701 tree_node* leaf = traverseTree(node, instances[i]->input);
00702 std::map<float, float> retval;
00703 leafPrediction(leaf, instances[i]->input, &retval);
00704 float prediction = retval.begin()->first;
00705 float absError = fabs(prediction - instances[i]->output);
00706 subtreeErrorSum += absError;
00707 if (LMDEBUG || DTDEBUG){
00708 cout << "instance " << i << " subtree predicted: " << prediction
00709 << " actual: " << instances[i]->output
00710 << " error: " << absError << endl;
00711 }
00712 }
00713 if (instances.size() < 3){
00714 if (LMDEBUG || DTDEBUG) cout << "instances size <= 2!!!" << endl;
00715 return;
00716 }
00717
00718 float avgTreeError = subtreeErrorSum / (float)instances.size();
00719
00720
00721 if (false && avgTreeError <= 0.0001){
00722 if (LMDEBUG || DTDEBUG)
00723 cout << "Sub-tree is perfect (" << avgTreeError
00724 << "), do not attempt LM" << endl;
00725 return;
00726 }
00727
00728
00729 std::vector<bool> treeFeatsUsed(instances[0]->input.size(), false);
00730 getFeatsUsed(node, &treeFeatsUsed);
00731 int nTreeFeatsUsed = 0;
00732 for (unsigned i = 0; i < treeFeatsUsed.size(); i++){
00733 if (treeFeatsUsed[i])
00734 nTreeFeatsUsed++;
00735 }
00736
00737
00738
00739 std::vector<bool> featsUsed(instances[0]->input.size(), true);
00740 int nFeatsUsed = featsUsed.size();
00741
00742
00743 if (!ALLOW_ALL_FEATS){
00744 featsUsed = treeFeatsUsed;
00745 nFeatsUsed = nTreeFeatsUsed;
00746 }
00747
00748
00749 if (nFeatsUsed == 0){
00750 if (LMDEBUG || DTDEBUG) cout << "No features for LM" << endl;
00751 return;
00752 }
00753
00754
00755 float treeErrorEst = avgTreeError;
00756 float denom = (instances.size() - nTreeFeatsUsed);
00757 if (denom < 1){
00758 denom = 0.5;
00759 if (LMDEBUG) {
00760 cout << "denom of tree error factor is " << denom
00761 << " with nInst " << instances.size()
00762 << " nfeats: " << nTreeFeatsUsed << endl;
00763 }
00764 }
00765 treeErrorEst *= (instances.size() + nTreeFeatsUsed) / denom;
00766
00767
00768 float lmErrorSum = 0;
00769 int nlmFeats = 0;
00770 if (SIMPLE)
00771 nlmFeats = fitSimpleLinearModel(node, instances, featsUsed, nFeatsUsed, &lmErrorSum);
00772 else
00773 nlmFeats = fitLinearModel(node, instances, featsUsed, nFeatsUsed, &lmErrorSum);
00774
00775 float avgLMError = lmErrorSum / (float)instances.size();
00776
00777 float lmErrorEst = avgLMError;
00778 float denom2 = (instances.size() - nlmFeats);
00779 if (denom2 < 1){
00780 denom2 = 0.5;
00781 if (LMDEBUG) {
00782 cout << "denom2 of lm error factor is " << denom2
00783 << " with nInst " << instances.size()
00784 << " nfeats: " << nlmFeats << endl;
00785 }
00786 }
00787 lmErrorEst *= (instances.size() + nlmFeats) / denom2;
00788
00789
00790 if (LMDEBUG || DTDEBUG) {
00791 cout << "Sub-Tree Error: " << treeErrorEst << ", lm Error: "
00792 << lmErrorEst << endl;
00793 }
00794
00795 if (lmErrorEst < (treeErrorEst + 0.1*MIN_SDR)){
00796
00797 if (LMDEBUG || DTDEBUG)
00798 cout << node->id << " replace tree with linear model" << endl;
00799 removeChildren(node);
00800 } else {
00801
00802
00803 }
00804
00805 }
00806
00807
00808 int M5Tree::fitLinearModel(tree_node *node,
00809 const std::vector<tree_experience*> &instances,
00810 std::vector<bool> featureMask,
00811 int nFeats, float* resSum){
00812
00813 if(DTDEBUG || LMDEBUG) cout << "fitLinearModel, node=" << node->id
00814 << ",nInstances:" << instances.size() << endl;
00815
00816
00817 if (node->coefficients.size() != instances[0]->input.size()){
00818 node->coefficients.resize(instances[0]->input.size(), 0);
00819 }
00820
00821 node->constant = 0.0;
00822 bool doRegression = true;
00823 int ntimes = 0;
00824 int nlmFeats = 10;
00825 (*resSum) = 1000000;
00826
00827 while (doRegression){
00828
00829 ntimes++;
00830
00831 int nObs = (int)instances.size();
00832
00833
00834
00835
00836 if (nObs == 0 || nFeats == 0)
00837 break;
00838
00839 Matrix X(nObs, nFeats);
00840 ColumnVector Y(nObs);
00841
00842 std::vector<int> featIndices(nFeats);
00843
00844 std::vector<bool> constants(nFeats,true);
00845 bool foundProblem = false;
00846
00847
00848 for (int i = 0; i < nObs; i++){
00849 tree_experience *e = instances[i];
00850 if (LMDEBUG) cout << "Obs: " << i;
00851
00852
00853 int featIndex = 1;
00854 for (unsigned j = 0; j < featureMask.size(); j++){
00855 node->coefficients[j] = 0;
00856 if (!featureMask[j])
00857 continue;
00858
00859 if (constants[j] && e->input[j] != instances[0]->input[j]){
00860 constants[j] = false;
00861 }
00862
00863
00864
00865
00866
00867
00868
00869
00870
00871
00872
00873
00874 if (i == nObs-1 && constants[j]){
00875
00876 foundProblem = true;
00877 featureMask[j] = false;
00878 nFeats--;
00879 if (nFeats > 0)
00880 continue;
00881 else
00882 break;
00883 }
00884
00885 featIndices[featIndex-1] = j;
00886
00887 X(i+1,featIndex) = e->input[j];
00888 if (LMDEBUG){
00889 cout << " Feat " << featIndex << " index " << j
00890 << " val " << X(i+1,featIndex) << ",";
00891 }
00892 featIndex++;
00893 }
00894
00895 Y(i+1) = e->output;
00896 if (LMDEBUG) cout << " out: " << e->output << endl;
00897 }
00898
00899 if (foundProblem || nFeats == 0)
00900 continue;
00901
00902
00903 ColumnVector Ones(nObs); Ones = 1.0;
00904
00905
00906 RowVector Mrow = Ones.t() * X / nObs;
00907
00908
00909 Matrix XC(nObs,nFeats);
00910 XC = X - Ones * Mrow;
00911
00912
00913 ColumnVector YC(nObs);
00914 Real mval = Sum(Y) / nObs;
00915 YC = Y - Ones * mval;
00916
00917 Try {
00918
00919
00920
00921 SymmetricMatrix SSQ;
00922 SSQ << XC.t() * XC;
00923
00925
00926 LowerTriangularMatrix L = Cholesky(SSQ);
00927
00928
00929 ColumnVector A = L.t().i() * (L.i() * (XC.t() * YC));
00931
00933
00934
00935
00936
00937
00939
00940
00941
00942
00943 Real a = mval - (Mrow * A).AsScalar();
00944
00945
00946
00947 ColumnVector Fitted = X * A + a;
00948 ColumnVector Residual = Y - Fitted;
00949
00950
00951
00952
00953 (*resSum) = 0;
00954 for (int i = 0; i < nObs; i++){
00955 if (DTDEBUG || LMDEBUG){
00956 cout << "instance " << i << " linear model predicted: " << Fitted(i+1)
00957 << " actual: " << instances[i]->output
00958 << " error: " << Residual(i+1) << endl;
00959 }
00960 (*resSum) += fabs(Residual(i+1));
00961 }
00962
00963
00964
00965 nlmFeats = 0;
00966 for (int i = 0; i < nFeats; i++){
00967 if (DTDEBUG || LMDEBUG) cout << "Coeff " << i << " on feat: " << featIndices[i] << " is " << A(i+1) << endl;
00968 node->coefficients[featIndices[i]] = A(i+1);
00969 if (A(i+1) != 0)
00970 nlmFeats++;
00971 }
00972 if (DTDEBUG || LMDEBUG) cout << "constant is " << a << endl;
00973 node->constant = a;
00974
00975
00976 }
00977
00978 CatchAll {
00979
00980
00981
00982
00983
00984
00985 if (ntimes > 1 || nFeats < 2){
00986
00987 doRegression = false;
00988 break;
00989 }
00990 for (unsigned j = 0; j < featureMask.size(); j++){
00991 if (featureMask[j]){
00992
00993 featureMask[j] = false;
00994 nFeats--;
00995 break;
00996 }
00997 }
00998 continue;
00999 }
01000
01001
01002 doRegression = false;
01003
01004 }
01005
01006
01007 return nlmFeats;
01008
01009 }
01010
01011 int M5Tree::fitSimpleLinearModel(tree_node *node,
01012 const std::vector<tree_experience*> &instances,
01013 std::vector<bool> featureMask,
01014 int nFeats, float* resSum){
01015 if(DTDEBUG || LMDEBUG) cout << "fitSimpleLinearModel, node=" << node->id
01016 << ",nInstances:" << instances.size() << endl;
01017
01018
01019 if (node->coefficients.size() != instances[0]->input.size()){
01020 node->coefficients.resize(instances[0]->input.size(), 0);
01021 }
01022
01023
01024
01025 int bestFeat = -1;
01026 float bestCoeff = -1;
01027 float bestError = 1000000;
01028 float bestConstant = -1;
01029
01030 std::vector<float> xsum(instances[0]->input.size(), 0);
01031 std::vector<float> xysum(instances[0]->input.size(), 0);
01032 std::vector<float> x2sum(instances[0]->input.size(), 0);
01033 float ysum = 0;
01034
01035 int nObs = (int)instances.size();
01036 for (int i = 0; i < nObs; i++){
01037 tree_experience *e = instances[i];
01038 if (LMDEBUG) cout << "Obs: " << i;
01039
01040
01041 for (unsigned j = 0; j < instances[0]->input.size(); j++){
01042 if (!featureMask[j]) continue;
01043 if (LMDEBUG) cout << ", F" << j << ": " << e->input[j];
01044 xsum[j] += e->input[j];
01045 xysum[j] += (e->input[j] * e->output);
01046 x2sum[j] += (e->input[j] * e->input[j]);
01047 }
01048 ysum += e->output;
01049 if (LMDEBUG) cout << ", out: " << e->output << endl;
01050 }
01051
01052
01053 for (unsigned j = 0; j < instances[0]->input.size(); j++){
01054 if (!featureMask[j]) continue;
01055
01056
01057
01058
01059
01060 float coeff = (xysum[j] - xsum[j]*ysum/nObs)/(x2sum[j]-(xsum[j]*xsum[j])/nObs);
01061 float constant = (ysum/nObs) - coeff*(xsum[j]/nObs);
01062
01063 if (LMDEBUG) {
01064 cout << "Feat " << j << " coeff: " << coeff << ", constant: "
01065 << constant << " mask: " << featureMask[j] << endl;
01066 }
01067
01068
01069 float errorSum = 0;
01070 for (int i = 0; i < nObs; i++){
01071 tree_experience *e = instances[i];
01072 float pred = constant + coeff * e->input[j];
01073 float error = fabs(pred - e->output);
01074 if (LMDEBUG) cout << "Instance " << i << " error: " << error << endl;
01075 errorSum += error;
01076 }
01077 if (LMDEBUG) cout << "eSum: " << errorSum << endl;
01078
01079
01080 if (errorSum < bestError){
01081 bestError = errorSum;
01082 bestFeat = j;
01083 bestConstant = constant;
01084 bestCoeff = coeff;
01085 }
01086 }
01087
01088 if (LMDEBUG){
01089 cout << "SLM feat: " << bestFeat << " coeff: "
01090 << bestCoeff << " constant: "
01091 << bestConstant << " avgE: " << (bestError/nObs) << endl;
01092 }
01093
01094 if (bestFeat < 0 || bestFeat > (int)instances[0]->input.size()){
01095 node->constant = 0.0;
01096 for (unsigned i = 0; i < node->coefficients.size(); i++){
01097 node->coefficients[i] = 0.0;
01098 }
01099 (*resSum) = 1000000;
01100 return 10;
01101 }
01102
01103
01104 node->constant = bestConstant;
01105 for (unsigned i = 0; i < node->coefficients.size(); i++){
01106 if (i == (unsigned)bestFeat){
01107 node->coefficients[i] = bestCoeff;
01108 if (LMDEBUG) cout << "Set coefficient on feat " << i << " to " << bestCoeff << endl;
01109 }
01110 else {
01111 node->coefficients[i] = 0.0;
01112 }
01113 }
01114
01115 (*resSum) = bestError;
01116
01117
01118 return 1;
01119
01120 }
01121
01122
01123
01124
01125 void M5Tree::implementSplit(tree_node* node,
01126 const std::vector<tree_experience*> &instances,
01127 float bestSDR, int bestDim,
01128 float bestVal,
01129 const std::vector<tree_experience*> &bestLeft,
01130 const std::vector<tree_experience*> &bestRight,
01131 bool changed){
01132 if (DTDEBUG) cout << "implementSplit node=" << node->id
01133 << ",sdr=" << bestSDR
01134 << ",dim=" << bestDim
01135 << ",val=" << bestVal
01136 << ",chg=" << changed << endl;
01137
01138
01139
01140 if (bestSDR < MIN_SDR){
01141 makeLeaf(node);
01142 float valSum = 0;
01143 for (unsigned i = 0; i < instances.size(); i++){
01144 valSum += instances[i]->output;
01145 }
01146 float avg = valSum / (float)(instances.size());
01147 node->constant = avg;
01148 if (SPLITDEBUG || STOCH_DEBUG){
01149 cout << "DT " << id << " Node " << node->id << " Poor sdr "
01150 << node->nInstances
01151 << " instances classified at leaf " << node->id
01152 << " with multiple outputs " << endl;
01153 }
01154 return;
01155 }
01156
01157
01158
01159 if (!changed && node->dim == bestDim && node->val == bestVal
01160 && !node->leaf && node->l != NULL && node->r != NULL){
01161
01162 if (DTDEBUG || SPLITDEBUG) cout << "Same split as before" << endl;
01163
01164
01165 if (bestLeft.size() > (unsigned)node->l->nInstances){
01166
01167 if (DTDEBUG) cout << "Rebuild left side of tree" << endl;
01168 buildTree(node->l, bestLeft, changed);
01169 }
01170
01171 if (bestRight.size() > (unsigned)node->r->nInstances){
01172
01173 if (DTDEBUG) cout << "Rebuild right side of tree" << endl;
01174 buildTree(node->r, bestRight, changed);
01175 }
01176 return;
01177 }
01178
01179
01180
01181
01182 node->leaf = false;
01183 node->dim = bestDim;
01184 node->val = bestVal;
01185
01186 if (SPLITDEBUG) cout << "Best split was cut with val " << node->val
01187 << " on dim " << node->dim
01188 << " with sdr: " << bestSDR << endl;
01189
01190 if (DTDEBUG) cout << "Left has " << bestLeft.size()
01191 << ", right has " << bestRight.size() << endl;
01192
01193
01194 if (bestLeft.size() == 0 || bestRight.size() == 0){
01195 cout << "ERROR: DT " << id << " node " << node->id << " has 0 instances: left: " << bestLeft.size()
01196 << " right: " << bestRight.size() << endl;
01197 cout << "Split was cut with val " << node->val
01198 << " on dim " << node->dim
01199 << " with sdr: " << bestSDR << endl;
01200 exit(-1);
01201 }
01202
01203
01204
01205 if (node->l == NULL){
01206 if (DTDEBUG) cout << "Init new left tree nodes " << endl;
01207 node->l = allocateNode();
01208 }
01209 if (node->r == NULL){
01210 if (DTDEBUG) cout << "Init new right tree nodes " << endl;
01211 node->r = allocateNode();
01212 }
01213
01214
01215 if (DTDEBUG) cout << "Building left tree for node " << node->id << endl;
01216 buildTree(node->l, bestLeft, true);
01217 if (DTDEBUG) cout << "Building right tree for node " << node->id << endl;
01218 buildTree(node->r, bestRight, true);
01219
01220 }
01221
01222
01223 void M5Tree::getFeatsUsed(tree_node* node, std::vector<bool> *featsUsed){
01224
01225
01226 if (node->leaf){
01227
01228 for (unsigned i = 0; i < node->coefficients.size(); i++){
01229 if (node->coefficients[i] != 0){
01230 if (LMDEBUG || DTDEBUG) cout << "Leaf node, used coeff " << i << endl;
01231 (*featsUsed)[i] = true;
01232 }
01233 }
01234 return;
01235 }
01236
01237
01238
01239 (*featsUsed)[node->dim] = true;
01240 if (LMDEBUG || DTDEBUG) cout << "Split node, used feat " << node->dim << endl;
01241
01242 getFeatsUsed(node->l, featsUsed);
01243 getFeatsUsed(node->r, featsUsed);
01244
01245 return;
01246 }
01247
01248
01249 void M5Tree::testPossibleSplits(const std::vector<tree_experience*> &instances,
01250 float *bestSDR, int *bestDim,
01251 float *bestVal,
01252 std::vector<tree_experience*> *bestLeft,
01253 std::vector<tree_experience*> *bestRight) {
01254 if (DTDEBUG) cout << "testPossibleSplits" << endl;
01255
01256
01257
01258 float sd = calcSDforSet(instances);
01259
01260
01261 int nties = 0;
01262
01263
01264 for (unsigned idim = 0; idim < instances[0]->input.size(); idim++){
01265
01266
01267 float minVal, maxVal;
01268 std::set<float> uniques = getUniques(idim, instances, minVal, maxVal);
01269
01270 for (std::set<float>::iterator j = uniques.begin(); j != uniques.end(); j++){
01271
01272
01273 if ((*j) == maxVal)
01274 continue;
01275
01276
01277
01278 if (rng.uniform() < featPct)
01279 continue;
01280
01281 std::vector<tree_experience*> left;
01282 std::vector<tree_experience*> right;
01283
01284
01285 float splitval = (*j);
01286 float sdr = calcSDR(idim, splitval, instances, sd, left, right);
01287
01288 if (SPLITDEBUG) cout << " CUT split val " << splitval
01289 << " on dim: " << idim << " had sdr "
01290 << sdr << endl;
01291
01292
01293 compareSplits(sdr, idim, splitval, left, right, &nties,
01294 bestSDR, bestDim, bestVal, bestLeft, bestRight);
01295
01296
01297 }
01298 }
01299 }
01300
01301
01302
01303 void M5Tree::compareSplits(float sdr, int dim, float val,
01304 const std::vector<tree_experience*> &left,
01305 const std::vector<tree_experience*> &right,
01306 int *nties, float *bestSDR, int *bestDim,
01307 float *bestVal,
01308 std::vector<tree_experience*> *bestLeft,
01309 std::vector<tree_experience*> *bestRight){
01310 if (DTDEBUG) cout << "compareSplits sdr=" << sdr << ",dim=" << dim
01311 << ",val=" << val <<endl;
01312
01313
01314 bool newBest = false;
01315
01316
01317 if (fabs(*bestSDR - sdr) < SPLIT_MARGIN){
01318
01319
01320 (*nties)++;
01321 float randomval = rng.uniform(0,1);
01322 float newsplitprob = (1.0 / (float)*nties);
01323
01324 if (randomval < newsplitprob){
01325 newBest = true;
01326 if (SPLITDEBUG) cout << " Tie on split. DT: " << id << " rand: " << randomval
01327 << " splitProb: " << newsplitprob << ", selecting new split " << endl;
01328 }
01329 else
01330 if (SPLITDEBUG) cout << " Tie on split. DT: " << id << " rand: " << randomval
01331 << " splitProb: " << newsplitprob << ", staying with old split " << endl;
01332 }
01333
01334
01335 else if (sdr > *bestSDR){
01336 newBest = true;
01337 *nties = 1;
01338 }
01339
01340
01341
01342 if (newBest){
01343 *bestSDR = sdr;
01344 *bestDim = dim;
01345 *bestVal = val;
01346 *bestLeft = left;
01347 *bestRight = right;
01348 if (SPLITDEBUG){
01349 cout << " New best sdr: " << *bestSDR
01350 << " with val " << *bestVal
01351 << " on dim " << *bestDim << endl;
01352 }
01353 }
01354 }
01355
01356 float M5Tree::calcSDR(int dim, float val,
01357 const std::vector<tree_experience*> &instances,
01358 float sd,
01359 std::vector<tree_experience*> &left,
01360 std::vector<tree_experience*> &right){
01361 if (DTDEBUG) cout << "calcSDR, dim=" << dim
01362 << " val=" << val
01363 << " sd=" << sd
01364 << " nInstances= " << instances.size() << endl;
01365
01366 left.clear();
01367 right.clear();
01368
01369
01370 for (unsigned i = 0; i < instances.size(); i++){
01371 if (DTDEBUG) cout << "calcSDR - Classify instance " << i
01372 << " on new split " << endl;
01373
01374 if (passTest(dim, val, instances[i]->input)){
01375 left.push_back(instances[i]);
01376 }
01377 else{
01378 right.push_back(instances[i]);
01379 }
01380 }
01381
01382 if (DTDEBUG) cout << "Left has " << left.size()
01383 << ", right has " << right.size() << endl;
01384
01385
01386 float sdLeft = calcSDforSet(left);
01387 float sdRight = calcSDforSet(right);
01388
01389 float leftRatio = (float)left.size() / (float)instances.size();
01390 float rightRatio = (float)right.size() / (float)instances.size();
01391
01392 float sdr = sd - (leftRatio * sdLeft + rightRatio * sdRight);
01393
01394 if (DTDEBUG){
01395 cout << "LeftSD: " << sdLeft
01396 << " RightSD: " << sdRight
01397 << " SD: " << sd
01398 << " SDR: " << sdr
01399 << endl;
01400 }
01401
01402 return sdr;
01403
01404 }
01405
01406 float M5Tree::calcSDforSet(const std::vector<tree_experience*> &instances){
01407 if (DTDEBUG) cout << "calcSDforSet" << endl;
01408
01409 int n = instances.size();
01410
01411 if (n == 0)
01412 return 0;
01413
01414 double sum = 0;
01415 double sumSqr = 0;
01416
01417
01418 for (unsigned i = 0; i < instances.size(); i++){
01419 float val = instances[i]->output;
01420 sum += val;
01421 sumSqr += (val * val);
01422 }
01423
01424 double mean = sum / (double)n;
01425 double variance = (sumSqr - sum*mean)/(double)n;
01426 float sd = sqrt(variance);
01427
01428 return sd;
01429
01430 }
01431
01432 std::set<float> M5Tree::getUniques(int dim, const std::vector<tree_experience*> &instances, float& minVal, float& maxVal){
01433 if (DTDEBUG) cout << "getUniques,dim = " << dim;
01434
01435 std::set<float> uniques;
01436
01437 for (int i = 0; i < (int)instances.size(); i++){
01438 if (i == 0 || instances[i]->input[dim] < minVal)
01439 minVal = instances[i]->input[dim];
01440 if (i == 0 || instances[i]->input[dim] > maxVal)
01441 maxVal = instances[i]->input[dim];
01442
01443 uniques.insert(instances[i]->input[dim]);
01444 }
01445
01446
01447 if (uniques.size() > 100){
01448 float rangeInc = (maxVal - minVal) / 100.0;
01449 uniques.clear();
01450 for (float i = minVal; i < maxVal; i+= rangeInc){
01451 uniques.insert(i);
01452 }
01453 }
01454 uniques.insert(maxVal);
01455
01456 if (DTDEBUG) cout << " #: " << uniques.size() << endl;
01457 return uniques;
01458 }
01459
01460
01461 float* M5Tree::sortOnDim(int dim, const std::vector<tree_experience*> &instances){
01462 if (DTDEBUG) cout << "sortOnDim,dim = " << dim << endl;
01463
01464 float* values = new float[instances.size()];
01465
01466 for (int i = 0; i < (int)instances.size(); i++){
01467
01468
01469 float val = instances[i]->input[dim];
01470
01471
01472
01473 for (int j = 0; j <= i; j++){
01474
01475
01476
01477 if (j==i){
01478 values[j] = val;
01479
01480 }
01481
01482
01483 else if (val < values[j]){
01484
01485
01486
01487 for (int k = i; k > j; k--){
01488
01489 values[k] = values[k-1];
01490 }
01491
01492
01493
01494 values[j] = val;
01495
01496
01497 break;
01498 }
01499
01500 }
01501 }
01502
01503 if (DTDEBUG){
01504 cout << "Sorted array: " << values[0];
01505 for (int i = 1; i < (int)instances.size(); i++){
01506 cout << ", " << values[i];
01507 }
01508 cout << endl;
01509 }
01510
01511 return values;
01512
01513 }
01514
01515
01516 void M5Tree::printTree(tree_node *t, int level){
01517
01518 for (int i = 0; i < level; i++){
01519 cout << ".";
01520 }
01521
01522 cout << "Node " << t->id << " nInstances: " << t->nInstances ;
01523
01524
01525 if (t->leaf){
01526 cout << " Constant: " << t->constant << " coeff: ";
01527 for (unsigned j = 0; j < t->coefficients.size(); j++){
01528 if (!SIMPLE)
01529 cout << t->coefficients[j] << ", ";
01530 if (SIMPLE && t->coefficients[j] != 0)
01531 cout << " on feat " << j << ": " << t->coefficients[j];
01532 }
01533 cout << endl;
01534 return;
01535 }
01536
01537
01538
01539 cout << " Type: CUT";
01540 cout << " Dim: " << t->dim << " Val: " << t->val;
01541 cout << " Left: " << t->l->id << " Right: " << t->r->id << endl;
01542
01543
01544 if (t->dim != -1 && !t->leaf){
01545 printTree(t->l, level+1);
01546 printTree(t->r, level+1);
01547 }
01548
01549 }
01550
01551
01552
01553 void M5Tree::leafPrediction(tree_node* leaf, const std::vector<float> &input, std::map<float, float>* retval){
01554 if (DTDEBUG)
01555 cout << "Calculating output for leaf " << leaf->id << endl;
01556
01557 float prediction = leaf->constant;
01558 if (DTDEBUG) cout << "leaf constant: " << leaf->constant << endl;
01559
01560
01561 for (unsigned i = 0; i < input.size(); i++){
01562 prediction += leaf->coefficients[i] * input[i];
01563 if (DTDEBUG) {
01564 cout << "feat " << i << " coeff: " << leaf->coefficients[i]
01565 << " on input " << input[i] << endl;
01566 }
01567 }
01568
01569 if (DTDEBUG) cout << " prediction: " << prediction << endl;
01570
01571 (*retval)[prediction] = 1.0;
01572 }
01573
01574
01575 void M5Tree::initNodes(){
01576
01577 for (int i = 0; i < N_M5_NODES; i++){
01578 initTreeNode(&(allNodes[i]));
01579 freeNodes.push_back(i);
01580 if (NODEDEBUG)
01581 cout << "init node " << i << " with id " << allNodes[i].id
01582 << ", now " << freeNodes.size() << " free nodes." << endl;
01583 }
01584
01585 }
01586
01587 M5Tree::tree_node* M5Tree::allocateNode(){
01588 if (freeNodes.empty()){
01589 tree_node* newNode = new tree_node;
01590 initTreeNode(newNode);
01591 if (NODEDEBUG)
01592 cout << "PROBLEM: No more pre-allocated nodes!!!" << endl
01593 << "return new node " << newNode->id
01594 << ", now " << freeNodes.size() << " free nodes." << endl;
01595 return newNode;
01596 }
01597
01598 int i = freeNodes.back();
01599 freeNodes.pop_back();
01600 if (NODEDEBUG)
01601 cout << "allocate node " << i << " with id " << allNodes[i].id
01602 << ", now " << freeNodes.size() << " free nodes." << endl;
01603 return &(allNodes[i]);
01604 }
01605
01606 void M5Tree::deallocateNode(tree_node* node){
01607 if (node->id >= N_M5_NODES){
01608 if (NODEDEBUG)
01609 cout << "dealloc extra node id " << node->id
01610 << ", now " << freeNodes.size() << " free nodes." << endl;
01611 delete node;
01612 return;
01613 }
01614
01615 freeNodes.push_back(node->id);
01616 if (NODEDEBUG)
01617 cout << "dealloc node " << node->id
01618 << ", now " << freeNodes.size() << " free nodes." << endl;
01619 }