00001 #include "LinearSplitsTree.hh"
00002
00003
00004
00005
00006
00007 #define WANT_MATH // include.h will get math fns
00008
00009 #include "../newmat/newmatap.h"
00010 #ifdef use_namespace
00011 using namespace NEWMAT;
00012 #endif
00013
00014
00015
00016
00017
00018 LinearSplitsTree::LinearSplitsTree(int id, int trainMode, int trainFreq, int m,
00019 float featPct, bool simple, float min_er,
00020 Random rng):
00021 id(id), mode(trainMode),
00022 freq(trainFreq), M(m),
00023 featPct(featPct), SIMPLE(simple),
00024 MIN_ER(min_er), rng(rng)
00025 {
00026
00027 nnodes = 0;
00028 nOutput = 0;
00029 nExperiences = 0;
00030 hadError = false;
00031 maxnodes = N_LS_NODES;
00032 totalnodes = 0;
00033
00034
00035 SPLIT_MARGIN = 0.0;
00036
00037 LMDEBUG = false;
00038 DTDEBUG = false;
00039 SPLITDEBUG = false;
00040 STOCH_DEBUG = false;
00041 INCDEBUG = false;
00042 NODEDEBUG = false;
00043 COPYDEBUG = false;
00044
00045 cout << "Created linear splits decision tree " << id;
00046 if (SIMPLE) cout << " simple regression";
00047 else cout << " multivariate regrssion";
00048 if (DTDEBUG){
00049 cout << " mode: " << mode << " freq: " << freq << endl;
00050 }
00051 cout << " MIN_ER: " << MIN_ER << endl;
00052
00053
00054 initNodes();
00055 initTree();
00056
00057 }
00058
00059 LinearSplitsTree::LinearSplitsTree(const LinearSplitsTree& ls):
00060 id(ls.id), mode(ls.mode),
00061 freq(ls.freq), M(ls.M),
00062 featPct(ls.featPct), SIMPLE(ls.SIMPLE),
00063 MIN_ER(ls.MIN_ER), rng(ls.rng)
00064 {
00065 COPYDEBUG = ls.COPYDEBUG;
00066 if (COPYDEBUG) cout << "LS copy " << id << endl;
00067 nnodes = 0;
00068 nOutput = ls.nOutput;
00069 nExperiences = ls.nExperiences;
00070 hadError = ls.hadError;
00071 totalnodes = 0;
00072 maxnodes = ls.maxnodes;
00073 SPLIT_MARGIN = ls.SPLIT_MARGIN;
00074 LMDEBUG = ls.LMDEBUG;
00075 DTDEBUG = ls.DTDEBUG;
00076 SPLITDEBUG = ls.SPLITDEBUG;
00077 STOCH_DEBUG = ls.STOCH_DEBUG;
00078 INCDEBUG = ls.INCDEBUG;
00079 NODEDEBUG = ls.NODEDEBUG;
00080
00081 if (COPYDEBUG) cout << " LS copy nodes, experiences, root, etc" << endl;
00082
00083 for (int i = 0; i < N_LST_EXP; i++){
00084 allExp[i] = ls.allExp[i];
00085 }
00086 if (COPYDEBUG) cout << " LS copied exp array" << endl;
00087
00088
00089 experiences.resize(ls.experiences.size());
00090 for (unsigned i = 0; i < ls.experiences.size(); i++){
00091 experiences[i] = &(allExp[i]);
00092 }
00093 if (COPYDEBUG) cout << " LS set experience pointers" << endl;
00094
00095
00096 initNodes();
00097
00098 if (COPYDEBUG) cout << " LS copy tree " << endl;
00099 root = allocateNode();
00100 lastNode = root;
00101 copyTree(root, ls.root);
00102 if (COPYDEBUG) cout << " LS tree copy done" << endl;
00103
00104 if (COPYDEBUG) {
00105 cout << endl << "New tree: " << endl;
00106 printTree(root, 0);
00107 cout << endl;
00108 cout << " LS copy done" << endl;
00109 }
00110
00111 }
00112
00113 void LinearSplitsTree::copyTree(tree_node* newNode, tree_node* origNode){
00114
00115 int nodeId = newNode->id;
00116
00117 if (COPYDEBUG) {
00118 cout << " Copy node " << newNode->id << " from node " << origNode->id << endl;
00119 cout << " NewAddy " << newNode << ", old: " << origNode << endl;
00120 }
00121
00122
00123 *newNode = *origNode;
00124 newNode->id = nodeId;
00125
00126
00127 if (origNode->l != NULL && !newNode->leaf){
00128 newNode->l = allocateNode();
00129 if (COPYDEBUG) cout << " Copy left node " << newNode->l->id << " from " << origNode->l->id << endl;
00130 copyTree(newNode->l, origNode->l);
00131 } else {
00132 newNode->l = NULL;
00133 }
00134
00135 if (origNode->r != NULL && !newNode->leaf){
00136 newNode->r = allocateNode();
00137 if (COPYDEBUG) cout << " Copy right node " << newNode->r->id << " from " << origNode->r->id << endl;
00138 copyTree(newNode->r, origNode->r);
00139 } else {
00140 newNode->r = NULL;
00141 }
00142 }
00143
00144 LinearSplitsTree* LinearSplitsTree::getCopy(){
00145 LinearSplitsTree* copy = new LinearSplitsTree(*this);
00146 return copy;
00147 }
00148
00149 LinearSplitsTree::~LinearSplitsTree() {
00150 deleteTree(root);
00151 for (unsigned i = N_LST_EXP; i < experiences.size(); i++){
00152 delete experiences[i];
00153 }
00154 experiences.clear();
00155 }
00156
00157
00158 bool LinearSplitsTree::trainInstance(classPair &instance){
00159
00160 if (DTDEBUG) cout << id << " trainInstance" << endl;
00161
00162 bool modelChanged = false;
00163
00164
00165
00166
00167 tree_experience *e;
00168 if (nExperiences < N_LST_EXP){
00169
00170 e = &(allExp[nExperiences]);
00171
00172 } else {
00173
00174 e = new tree_experience;
00175 }
00176
00177
00178 e->input = instance.in;
00179 e->output = instance.out;
00180 experiences.push_back(e);
00181 nExperiences++;
00182
00183 if (nExperiences == 1000000){
00184 cout << "Reached limit of # experiences allowed." << endl;
00185 return false;
00186 }
00187
00188 if (nExperiences != (int)experiences.size())
00189 cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00190
00191
00192
00193
00194
00195
00196
00197
00198 if ( DTDEBUG) {
00199 cout << "Original input: ";
00200 for (unsigned i = 0; i < instance.in.size(); i++){
00201 cout << instance.in[i] << ", ";
00202 }
00203 cout << endl << " Original output: " << instance.out << endl;
00204 cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00205 cout << "Address: " << e << " Input : ";
00206 for (unsigned i = 0; i < e->input.size(); i++){
00207 cout << e->input[i] << ", ";
00208 }
00209 cout << endl << " Now have " << nExperiences << " experiences." << endl;
00210 }
00211
00212
00213 if (mode == BUILD_EVERY || nExperiences <= 1){
00214 rebuildTree();
00215 modelChanged = true;
00216 }
00217
00218
00219 else if (mode == BUILD_ON_ERROR){
00220
00221
00222
00223 std::map<float, float> answer;
00224 testInstance(e->input, &answer);
00225 float val = answer.begin()->first;
00226 float error = fabs(val - e->output);
00227
00228 if (error > 0.0){
00229 rebuildTree();
00230 modelChanged = true;
00231 }
00232 }
00233
00234
00235 else if (mode == BUILD_EVERY_N){
00236
00237 if (!modelChanged && (nExperiences % freq) == 0){
00238 rebuildTree();
00239 modelChanged = true;
00240 }
00241 }
00242
00243 if (modelChanged){
00244 if (DTDEBUG || SPLITDEBUG) cout << "DT " << id << " tree re-built." << endl;
00245
00246 if (DTDEBUG || SPLITDEBUG){
00247 cout << endl << "DT: " << id << endl;
00248 printTree(root, 0);
00249 cout << "Done printing tree" << endl;
00250 }
00251 }
00252
00253
00254
00255
00256
00257
00258
00259
00260
00261
00262
00263
00264
00265
00266
00267
00268
00269
00270
00271
00272
00273
00274 return modelChanged;
00275
00276 }
00277
00278
00279
00280 bool LinearSplitsTree::trainInstances(std::vector<classPair> &instances){
00281 if (DTDEBUG) cout << "DT trainInstances: " << instances.size() << endl;
00282
00283 bool modelChanged = false;
00284
00285 bool doBuild = false;
00286
00287
00288 for (unsigned a = 0; a < instances.size(); a++){
00289 classPair instance = instances[a];
00290
00291
00292
00293
00294 tree_experience *e;
00295 if (nExperiences < N_LST_EXP){
00296
00297 e = &(allExp[nExperiences]);
00298
00299 } else {
00300
00301 e = new tree_experience;
00302 }
00303
00304
00305 e->input = instance.in;
00306 e->output = instance.out;
00307 experiences.push_back(e);
00308 nExperiences++;
00309
00310 if (nExperiences == 1000000){
00311 cout << "Reached limit of # experiences allowed." << endl;
00312 return false;
00313 }
00314
00315 if (nExperiences != (int)experiences.size())
00316 cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00317
00318 if (DTDEBUG) {
00319 cout << "Original input: ";
00320 for (unsigned i = 0; i < instance.in.size(); i++){
00321 cout << instance.in[i] << ", ";
00322 }
00323 cout << endl << " Original output: " << instance.out << endl;
00324 cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00325 cout << "Address: " << e << " Input : ";
00326 for (unsigned i = 0; i < e->input.size(); i++){
00327 cout << e->input[i] << ", ";
00328 }
00329 cout << endl << " Now have " << nExperiences << " experiences." << endl;
00330 }
00331
00332
00333
00334
00335
00336
00337
00338
00339
00340
00341
00342
00343
00344
00345
00346
00347
00348
00349
00350
00351
00352
00353
00354
00355
00356 if (doBuild) continue;
00357
00358
00359 if (mode == BUILD_EVERY || nExperiences <= 1){
00360 doBuild = true;
00361 }
00362
00363
00364 else if (mode == BUILD_ON_ERROR){
00365
00366
00367
00368 std::map<float, float> answer;
00369 testInstance(e->input, &answer);
00370 float val = answer.begin()->first;
00371 float error = fabs(val - e->output);
00372
00373 if (error > 0.0){
00374 doBuild = true;
00375 }
00376
00377 }
00378
00379
00380 else if (mode == BUILD_EVERY_N){
00381
00382 if (!modelChanged && (nExperiences % freq) == 0){
00383 doBuild = true;
00384 }
00385 }
00386
00387 }
00388
00389 if (DTDEBUG) cout << "Added " << instances.size() << " new instances. doBuild = " << doBuild << endl;
00390
00391 if (doBuild){
00392 rebuildTree();
00393 modelChanged = true;
00394 }
00395
00396 if (modelChanged){
00397 if (DTDEBUG || SPLITDEBUG) cout << "DT " << id << " tree re-built." << endl;
00398
00399 if (DTDEBUG || SPLITDEBUG){
00400 cout << endl << "DT: " << id << endl;
00401 printTree(root, 0);
00402 cout << "Done printing tree" << endl;
00403 }
00404 }
00405
00406 return modelChanged;
00407
00408 }
00409
00410
00411 void LinearSplitsTree::rebuildTree(){
00412
00413
00414
00415
00416 root->avgError = calcAvgErrorforSet(experiences);
00417
00418 buildTree(root, experiences, false);
00419
00420 }
00421
00422
00423
00424 void LinearSplitsTree::testInstance(const std::vector<float> &input, std::map<float, float>* retval){
00425 if (DTDEBUG) cout << "testInstance on tree " << id << endl;
00426
00427 retval->clear();
00428
00429
00430 if (experiences.size() == 0){
00431 (*retval)[0.0] = 1.0;
00432 return;
00433 }
00434
00435
00436 tree_node* leaf = traverseTree(root, input);
00437 lastNode = leaf;
00438
00439
00440 leafPrediction(leaf, input, retval);
00441
00442 }
00443
00444 float LinearSplitsTree::getConf(const std::vector<float> &input){
00445 if (DTDEBUG) cout << "numVisits" << endl;
00446
00447
00448 if (experiences.size() == 0){
00449 return 0;
00450 }
00451
00452 if (lastNode == NULL)
00453 return 0;
00454
00455
00456
00457
00458
00459 float conf = (float)lastNode->nInstances / (float)(2.0*M);
00460 if (conf > 1.0)
00461 return 1.0;
00462 else
00463 return conf;
00464
00465 }
00466
00467
00468
00469
00470
00471
00472
00473 void LinearSplitsTree::initTree(){
00474 if (DTDEBUG) cout << "initTree()" << endl;
00475 root = allocateNode();
00476
00477 if (DTDEBUG) cout << " root id = " << root->id << endl;
00478
00479
00480 for (int i = 0; i < id; i++){
00481 rng.uniform(0, 1);
00482 }
00483
00484 }
00485
00486
00487
00488
00489 void LinearSplitsTree::initTreeNode(tree_node* node){
00490 if (DTDEBUG) cout << "initTreeNode()";
00491
00492 node->id = nnodes++;
00493 if (DTDEBUG) cout << " id = " << node->id << endl;
00494
00495 totalnodes++;
00496 if (totalnodes > maxnodes){
00497 maxnodes = totalnodes;
00498 if (DTDEBUG) cout << id << " LS MAX nodes: " << maxnodes << endl;
00499 }
00500
00501
00502 node->dim = -1;
00503 node->val = -1;
00504
00505
00506 node->nInstances = 0;
00507
00508
00509 node->constant = 0.0;
00510
00511
00512
00513
00514
00515 node->l = NULL;
00516 node->r = NULL;
00517
00518 node->leaf = true;
00519 node->avgError = 10000;
00520
00521 }
00522
00524 void LinearSplitsTree::deleteTree(tree_node* node){
00525 if (DTDEBUG) cout << "deleteTree, node=" << node->id << endl;
00526
00527 if (node==NULL)
00528 return;
00529
00530 totalnodes--;
00531
00532 node->nInstances = 0;
00533 node->coefficients.clear();
00534
00535
00536
00537 if (!node->leaf){
00538
00539 if (node->l != NULL){
00540 deleteTree(node->l);
00541 deallocateNode(node->l);
00542 node->l = NULL;
00543 }
00544
00545
00546 if (node->r != NULL){
00547 deleteTree(node->r);
00548 deallocateNode(node->r);
00549 node->r = NULL;
00550 }
00551 }
00552
00553 node->dim = -1;
00554 node->leaf = true;
00555 node->avgError = 10000;
00556 node->val = -1;
00557 node->constant = 0;
00558 }
00559
00560
00562 LinearSplitsTree::tree_node* LinearSplitsTree::getCorrectChild(tree_node* node,
00563 const std::vector<float> &input){
00564
00565 if (DTDEBUG) cout << "getCorrectChild, node=" << node->id << endl;
00566
00567 if (passTest(node->dim, node->val, input))
00568 return node->l;
00569 else
00570 return node->r;
00571
00572 }
00573
00575 LinearSplitsTree::tree_node* LinearSplitsTree::traverseTree(tree_node* node,
00576 const std::vector<float> &input){
00577
00578 if (DTDEBUG) cout << "traverseTree, node=" << node->id << endl;
00579
00580 while (!node->leaf){
00581 node = getCorrectChild(node, input);
00582 }
00583
00584 return node;
00585 }
00586
00587
00589 bool LinearSplitsTree::passTest(int dim, float val, const std::vector<float> &input){
00590 if (DTDEBUG) cout << "passTest, dim=" << dim << ",val=" << val
00591 << ",input["<<dim<<"]=" << input[dim] <<endl;
00592
00593
00594 if (input[dim] > val)
00595 return false;
00596 else
00597 return true;
00598
00599 }
00600
00601
00603 void LinearSplitsTree::buildTree(tree_node *node,
00604 const std::vector<tree_experience*> &instances,
00605 bool changed){
00606 if(DTDEBUG) cout << "buildTree, node=" << node->id
00607 << ",nInstances:" << instances.size()
00608 << ",chg:" << changed << endl;
00609
00610 if (instances.size() == 0){
00611 cout << "Error: buildTree called on tree " << id << " node " << node->id << " with no instances." << endl;
00612 exit(-1);
00613 }
00614
00615
00616
00617
00618
00619
00620 node->nInstances = instances.size();
00621
00622
00623 bool allSame = true;
00624 float val0 = instances[0]->output;
00625 for (unsigned i = 1; i < instances.size(); i++){
00626 if (instances[i]->output != val0){
00627 allSame = false;
00628 break;
00629 }
00630 }
00631
00632
00633 if (allSame){
00634 makeLeaf(node, instances);
00635 if (DTDEBUG){
00636 cout << "Tree " << id << " node " << node->id
00637 << " All " << node->nInstances
00638 << " classified with output "
00639 << instances[0]->output << ", " << node->constant << endl;
00640 }
00641 return;
00642 }
00643
00644
00645 if (node != root && node->avgError < MIN_ER){
00646 makeLeaf(node, instances);
00647 if (node->avgError < MIN_ER){
00648 if (DTDEBUG) {
00649 cout << "Tree " << id << " node " << node->id
00650 << " has low error " << node->avgError << " keeping as leaf" << endl;
00651 }
00652 return;
00653 }
00654 }
00655
00656
00657 if (SPLITDEBUG) cout << endl << "Creating new decision node " << id << "-" << node->id << endl;
00658
00659 node->leaf = false;
00660
00661
00662 float bestER = -1.0;
00663 int bestDim = -1;
00664 float bestVal = -1;
00665 std::vector<tree_experience*> bestLeft;
00666 std::vector<tree_experience*> bestRight;
00667 float leftError = 10000;
00668 float rightError = 10000;
00669
00670 testPossibleSplits(node->avgError, instances, &bestER, &bestDim, &bestVal, &bestLeft, &bestRight, &leftError, &rightError);
00671
00672 implementSplit(node, instances, bestER, bestDim, bestVal, bestLeft, bestRight, changed, leftError, rightError);
00673
00674 }
00675
00676
00677 void LinearSplitsTree::makeLeaf(tree_node* node, const std::vector<tree_experience*> &instances){
00678
00679
00680 if (node->l != NULL){
00681 deleteTree(node->l);
00682 deallocateNode(node->l);
00683 node->l = NULL;
00684 }
00685
00686 if (node->r != NULL){
00687 deleteTree(node->r);
00688 deallocateNode(node->r);
00689 node->r = NULL;
00690 }
00691
00692 node->leaf = true;
00693
00694
00695 if (SIMPLE)
00696 node->avgError = fitSimpleLinearModel(instances, &(node->constant), &(node->coefficients));
00697 else
00698 node->avgError = fitMultiLinearModel(instances, &(node->constant), &(node->coefficients));
00699
00700 if (DTDEBUG || LMDEBUG){
00701 cout << "make leaf, fit linear model with constant " << node->constant
00702 << " error: " << node->avgError << endl;
00703 }
00704
00705 }
00706
00707
00708 float LinearSplitsTree::fitMultiLinearModel(const std::vector<tree_experience*> &instances,
00709 float *bestConstant,
00710 std::vector<float> *bestCoefficients){
00711 if(DTDEBUG || LMDEBUG) cout << "fitMultiLinearModel"
00712 << ",nInstances:" << instances.size() << endl;
00713
00714
00715 if (bestCoefficients->size() != instances[0]->input.size()){
00716 bestCoefficients->resize(instances[0]->input.size(), 0);
00717 }
00718
00719
00720 if (instances.size() == 1){
00721 *bestConstant = instances[0]->output;
00722 for (unsigned i = 0; i < bestCoefficients->size(); i++){
00723 (*bestCoefficients)[i] = 0.0;
00724 }
00725 if (LMDEBUG || SPLITDEBUG){
00726 cout << " Singleton constant: "
00727 << *bestConstant << " avgE: " << 0 << endl;
00728 }
00729 return 0;
00730 }
00731
00732
00733
00734 float bestError = 100000;
00735 float avgError = 100000;
00736 bool doRegression = true;
00737 int ntimes = 0;
00738
00739 std::vector<bool> featureMask(bestCoefficients->size(), true);
00740
00741 while (doRegression){
00742
00743 ntimes++;
00744
00745 int nObs = (int)instances.size();
00746
00747 int nFeats = 0;
00748 for (unsigned i = 0; i < featureMask.size(); i++){
00749 if (featureMask[i]) nFeats++;
00750 }
00751 if (nFeats < 1)
00752 break;
00753
00754
00755 if (nObs == 0 || nFeats == 0)
00756 break;
00757
00758 Matrix X(nObs, nFeats);
00759 ColumnVector Y(nObs);
00760
00761 std::vector<int> featIndices(nFeats);
00762
00763 std::vector<bool> constants(nFeats,true);
00764 bool foundProblem = false;
00765
00766
00767 for (int i = 0; i < nObs; i++){
00768 tree_experience *e = instances[i];
00769 if (LMDEBUG) cout << "Obs: " << i;
00770
00771
00772 int featIndex = 1;
00773 for (unsigned j = 0; j < featureMask.size(); j++){
00774 (*bestCoefficients)[j] = 0;
00775 if (!featureMask[j])
00776 continue;
00777
00778 if (constants[j] && e->input[j] != instances[0]->input[j]){
00779 constants[j] = false;
00780 }
00781
00782 if (i == nObs-1 && constants[j]){
00783
00784 foundProblem = true;
00785 featureMask[j] = false;
00786 nFeats--;
00787 if (nFeats > 0)
00788 continue;
00789 else
00790 break;
00791 }
00792
00793 featIndices[featIndex-1] = j;
00794
00795 X(i+1,featIndex) = e->input[j];
00796 if (LMDEBUG){
00797 cout << " Feat " << featIndex << " index " << j
00798 << " val " << X(i+1,featIndex) << ",";
00799 }
00800 featIndex++;
00801 }
00802
00803 Y(i+1) = e->output;
00804 if (LMDEBUG) cout << " out: " << e->output << endl;
00805 }
00806
00807 if (foundProblem)
00808 continue;
00809
00810
00811 ColumnVector Ones(nObs); Ones = 1.0;
00812
00813
00814 RowVector Mrow = Ones.t() * X / nObs;
00815
00816
00817 Matrix XC(nObs,nFeats);
00818 XC = X - Ones * Mrow;
00819
00820
00821 ColumnVector YC(nObs);
00822 Real mval = Sum(Y) / nObs;
00823 YC = Y - Ones * mval;
00824
00825
00826
00827 SymmetricMatrix SSQ;
00828 SSQ << XC.t() * XC;
00829
00830 Try {
00831
00833
00834 LowerTriangularMatrix L = Cholesky(SSQ);
00835
00836
00837 ColumnVector A = L.t().i() * (L.i() * (XC.t() * YC));
00839
00841
00842
00843
00844
00845
00846
00848
00849
00850
00851
00852 Real a = mval - (Mrow * A).AsScalar();
00853
00854
00855
00856 ColumnVector Fitted = X * A + a;
00857 ColumnVector Residual = Y - Fitted;
00858
00859
00860
00861
00862 bestError = 0;
00863 for (int i = 0; i < nObs; i++){
00864 if (DTDEBUG || LMDEBUG){
00865 cout << "instance " << i << " linear model predicted: " << Fitted(i+1)
00866 << " actual: " << instances[i]->output
00867 << " error: " << Residual(i+1) << endl;
00868 }
00869 bestError += fabs(Residual(i+1));
00870 }
00871 avgError = bestError / (float)nObs;
00872
00873
00874 for (int i = 0; i < nFeats; i++){
00875 if (DTDEBUG || LMDEBUG) cout << "Coeff " << i << " on feat: " << featIndices[i] << " is " << A(i+1) << endl;
00876 (*bestCoefficients)[featIndices[i]] = A(i+1);
00877 }
00878 if (DTDEBUG || LMDEBUG) cout << "constant is " << a << endl;
00879 *bestConstant = a;
00880
00881 }
00882
00883 CatchAll {
00884
00885
00886
00887
00888
00889
00890
00891
00892
00893
00894
00895
00896
00897
00898
00899
00900
00901
00902
00903
00904
00905
00906
00907
00908
00909
00910
00911
00912
00913 if (ntimes > 2 || nFeats < 2){
00914
00915 doRegression = false;
00916 break;
00917 }
00918 for (unsigned j = 0; j < featureMask.size(); j++){
00919 if (featureMask[j]){
00920
00921 featureMask[j] = false;
00922 nFeats--;
00923 break;
00924 }
00925 }
00926 continue;
00927 }
00928
00929
00930 doRegression = false;
00931
00932 }
00933
00934
00935 return avgError;
00936
00937 }
00938
00939
00940 float LinearSplitsTree::fitSimpleLinearModel(const std::vector<tree_experience*> &instances,
00941 float *bestConstant, std::vector<float> *bestCoefficients){
00942 if(DTDEBUG || LMDEBUG || SPLITDEBUG) cout << " fitSimpleLinearModel, "
00943 << ",nInstances:" << instances.size() << endl;
00944
00945
00946 if (bestCoefficients->size() != instances[0]->input.size()){
00947 bestCoefficients->resize(instances[0]->input.size(), 0);
00948 }
00949
00950
00951
00952 int bestFeat = -1;
00953 float bestCoeff = -1;
00954 float bestError = 100000;
00955 *bestConstant = -1;
00956
00957
00958 if (instances.size() == 1){
00959 bestFeat = 0;
00960 *bestConstant = instances[0]->output;
00961 for (unsigned i = 0; i < bestCoefficients->size(); i++){
00962 (*bestCoefficients)[i] = 0.0;
00963 }
00964 bestError = 0;
00965 if (LMDEBUG || SPLITDEBUG){
00966 cout << " Singleton constant: "
00967 << *bestConstant << " avgE: " << bestError << endl;
00968 }
00969 return 0;
00970 }
00971
00972 std::vector<float> xsum(instances[0]->input.size(), 0);
00973 std::vector<float> xysum(instances[0]->input.size(), 0);
00974 std::vector<float> x2sum(instances[0]->input.size(), 0);
00975 float ysum = 0;
00976
00977 int nObs = (int)instances.size();
00978 for (int i = 0; i < nObs; i++){
00979 tree_experience *e = instances[i];
00980 if (LMDEBUG) cout << "Obs: " << i;
00981
00982
00983 for (unsigned j = 0; j < instances[0]->input.size(); j++){
00984 if (LMDEBUG) cout << ", F" << j << ": " << e->input[j];
00985 xsum[j] += e->input[j];
00986 xysum[j] += (e->input[j] * e->output);
00987 x2sum[j] += (e->input[j] * e->input[j]);
00988 }
00989 ysum += e->output;
00990 if (LMDEBUG) cout << ", out: " << e->output << endl;
00991 }
00992
00993
00994 for (unsigned j = 0; j < instances[0]->input.size(); j++){
00995 float coeff = (xysum[j] - xsum[j]*ysum/nObs)/(x2sum[j]-(xsum[j]*xsum[j])/nObs);
00996 float constant = (ysum/nObs) - coeff*(xsum[j]/nObs);
00997
00998
00999 if (fabs(coeff) < 1e-5){
01000 coeff = 0.0;
01001 }
01002 if (fabs(constant) < 1e-5){
01003 constant = 0.0;
01004 }
01005
01006 if (LMDEBUG) {
01007 cout << "Feat " << j << " coeff: " << coeff << ", constant: "
01008 << constant << endl;
01009 }
01010
01011
01012 float errorSum = 0;
01013 for (int i = 0; i < nObs; i++){
01014 tree_experience *e = instances[i];
01015 float pred = constant + coeff * e->input[j];
01016 float error = fabs(pred - e->output);
01017 if (LMDEBUG) cout << "Instance " << i << " error: " << error << endl;
01018 errorSum += error;
01019 }
01020 float avgError = errorSum / (float)nObs;
01021 if (LMDEBUG) cout << "avgError: " << avgError << endl;
01022
01023
01024 if (avgError < bestError){
01025 bestError = avgError;
01026 bestFeat = j;
01027 *bestConstant = constant;
01028 bestCoeff = coeff;
01029 }
01030 }
01031
01032 if (LMDEBUG || SPLITDEBUG){
01033 cout << " LST feat: " << bestFeat << " coeff: "
01034 << bestCoeff << " constant: "
01035 << *bestConstant << " avgE: " << bestError << endl;
01036 }
01037
01038 if (bestFeat < 0 || bestFeat > (int)instances[0]->input.size()){
01039 for (unsigned i = 0; i < bestCoefficients->size(); i++){
01040 (*bestCoefficients)[i] = 0.0;
01041 }
01042 *bestConstant = 0.0;
01043 return 100000;
01044 }
01045
01046
01047 for (unsigned i = 0; i < bestCoefficients->size(); i++){
01048 if (i == (unsigned)bestFeat)
01049 (*bestCoefficients)[i] = bestCoeff;
01050 else
01051 (*bestCoefficients)[i] = 0.0;
01052 }
01053
01054 return bestError;
01055
01056 }
01057
01058
01059
01060
01061 void LinearSplitsTree::implementSplit(tree_node* node,
01062 const std::vector<tree_experience*> &instances,
01063 float bestER, int bestDim,
01064 float bestVal,
01065 const std::vector<tree_experience*> &bestLeft,
01066 const std::vector<tree_experience*> &bestRight,
01067 bool changed, float leftError, float rightError){
01068 if (DTDEBUG) cout << "implementSplit node=" << node->id
01069 << ",er=" << bestER
01070 << ",dim=" << bestDim
01071 << ",val=" << bestVal
01072 << ",chg=" << changed << endl;
01073
01074
01075
01076 if (bestER < MIN_ER){
01077 makeLeaf(node, instances);
01078
01079 if (SPLITDEBUG || STOCH_DEBUG){
01080 cout << " DT " << id << " Node " << node->id << " Poor er "
01081 << node->nInstances
01082 << " instances classified at leaf " << node->id
01083 << " with er " << bestER << " constant: " << node->constant << endl;
01084 }
01085 return;
01086 }
01087
01088
01089
01090
01091
01092 if (!changed && node->dim == bestDim && node->val == bestVal
01093 && !node->leaf && node->l != NULL && node->r != NULL){
01094
01095 if (DTDEBUG || SPLITDEBUG) cout << "Same split as before " << node->id << endl;
01096 node->l->avgError = leftError;
01097 node->r->avgError = rightError;
01098
01099
01100
01101 if (bestLeft.size() > (unsigned)node->l->nInstances){
01102
01103 if (DTDEBUG) cout << "Rebuild left side of tree" << endl;
01104 buildTree(node->l, bestLeft, changed);
01105 }
01106
01107 if (bestRight.size() > (unsigned)node->r->nInstances){
01108
01109 if (DTDEBUG) cout << "Rebuild right side of tree" << endl;
01110 buildTree(node->r, bestRight, changed);
01111 }
01112 return;
01113 }
01114
01115
01116
01117
01118 node->leaf = false;
01119 node->dim = bestDim;
01120 node->val = bestVal;
01121
01122 if (SPLITDEBUG) cout << "Best split was cut with val " << node->val
01123 << " on dim " << node->dim
01124 << " with er: " << bestER << endl;
01125
01126 if (DTDEBUG) cout << "Left has " << bestLeft.size()
01127 << ", right has " << bestRight.size() << endl;
01128
01129
01130 if (bestLeft.size() == 0 || bestRight.size() == 0){
01131 cout << "ERROR: DT " << id << " node " << node->id << " has 0 instances: left: " << bestLeft.size()
01132 << " right: " << bestRight.size() << endl;
01133 cout << "Split was cut with val " << node->val
01134 << " on dim " << node->dim
01135 << " with er: " << bestER << endl;
01136 exit(-1);
01137 }
01138
01139
01140
01141 if (node->l == NULL){
01142 if (DTDEBUG) cout << "Init new left tree nodes " << endl;
01143 node->l = allocateNode();
01144 }
01145 if (node->r == NULL){
01146 if (DTDEBUG) cout << "Init new right tree nodes " << endl;
01147 node->r = allocateNode();
01148 }
01149
01150
01151 if (DTDEBUG) cout << "Building left tree for node " << node->id << endl;
01152 node->l->avgError = leftError;
01153 buildTree(node->l, bestLeft, true);
01154 if (DTDEBUG) cout << "Building right tree for node " << node->id << endl;
01155 node->r->avgError = rightError;
01156 buildTree(node->r, bestRight, true);
01157
01158 }
01159
01160
01161 void LinearSplitsTree::testPossibleSplits(float avgError, const std::vector<tree_experience*> &instances,
01162 float *bestER, int *bestDim,
01163 float *bestVal,
01164 std::vector<tree_experience*> *bestLeft,
01165 std::vector<tree_experience*> *bestRight,
01166 float *bestLeftError, float *bestRightError) {
01167 if (DTDEBUG || SPLITDEBUG) cout << "testPossibleSplits, error=" << avgError << endl;
01168
01169 int nties = 0;
01170
01171
01172 for (unsigned idim = 0; idim < instances[0]->input.size(); idim++){
01173
01174
01175 float minVal, maxVal;
01176 std::set<float> uniques = getUniques(idim, instances, minVal, maxVal);
01177
01178 for (std::set<float>::iterator j = uniques.begin(); j != uniques.end(); j++){
01179
01180
01181 if ((*j) == maxVal)
01182 continue;
01183
01184
01185
01186 if (rng.uniform() < featPct)
01187 continue;
01188
01189 std::vector<tree_experience*> left;
01190 std::vector<tree_experience*> right;
01191 float leftError = 10000;
01192 float rightError = 10000;
01193
01194
01195 float splitval = (*j);
01196 float er = calcER(idim, splitval, instances, avgError, left, right, &leftError, &rightError);
01197
01198 if (SPLITDEBUG){
01199 cout << id << " CUT split val " << splitval
01200 << " on dim: " << idim << " had er "
01201 << er << endl;
01202 }
01203
01204
01205 compareSplits(er, idim, splitval, left, right, &nties, leftError, rightError,
01206 bestER, bestDim, bestVal,bestLeft, bestRight, bestLeftError, bestRightError);
01207
01208
01209 }
01210 }
01211 }
01212
01213
01214
01216 void LinearSplitsTree::compareSplits(float er, int dim, float val,
01217 const std::vector<tree_experience*> &left,
01218 const std::vector<tree_experience*> &right,
01219 int *nties, float leftError, float rightError,
01220 float *bestER, int *bestDim,
01221 float *bestVal,
01222 std::vector<tree_experience*> *bestLeft,
01223 std::vector<tree_experience*> *bestRight,
01224 float *bestLeftError, float *bestRightError){
01225 if (DTDEBUG) cout << "compareSplits er=" << er << ",dim=" << dim
01226 << ",val=" << val <<endl;
01227
01228
01229 bool newBest = false;
01230
01231
01232 if (fabs(*bestER - er) < SPLIT_MARGIN){
01233
01234
01235 (*nties)++;
01236 float randomval = rng.uniform(0,1);
01237 float newsplitprob = (1.0 / (float)*nties);
01238
01239 if (randomval < newsplitprob){
01240 newBest = true;
01241 if (SPLITDEBUG) cout << " Tie on split. DT: " << id << " rand: " << randomval
01242 << " splitProb: " << newsplitprob << ", selecting new split " << endl;
01243 }
01244 else
01245 if (SPLITDEBUG) cout << " Tie on split. DT: " << id << " rand: " << randomval
01246 << " splitProb: " << newsplitprob << ", staying with old split " << endl;
01247 }
01248
01249
01250 else if (er > *bestER){
01251 newBest = true;
01252 *nties = 1;
01253 }
01254
01255
01256
01257 if (newBest){
01258 *bestER = er;
01259 *bestDim = dim;
01260 *bestVal = val;
01261 *bestLeft = left;
01262 *bestRight = right;
01263 *bestLeftError = leftError;
01264 *bestRightError = rightError;
01265 if (SPLITDEBUG){
01266 cout << " New best er: " << *bestER
01267 << " with val " << *bestVal
01268 << " on dim " << *bestDim << endl;
01269 }
01270 }
01271 }
01272
01274 float LinearSplitsTree::calcER(int dim, float val,
01275 const std::vector<tree_experience*> &instances,
01276 float avgError,
01277 std::vector<tree_experience*> &left,
01278 std::vector<tree_experience*> &right,
01279 float *leftError, float *rightError){
01280 if (DTDEBUG || SPLITDEBUG) cout << "calcER, dim=" << dim
01281 << " val=" << val
01282 << " err=" << avgError
01283 << " nInstances= " << instances.size() << endl;
01284
01285 left.clear();
01286 right.clear();
01287
01288
01289 for (unsigned i = 0; i < instances.size(); i++){
01290 if (DTDEBUG) cout << " calcER - Classify instance " << i << " on new split " << endl;
01291
01292 if (passTest(dim, val, instances[i]->input)){
01293 left.push_back(instances[i]);
01294 }
01295 else{
01296 right.push_back(instances[i]);
01297 }
01298 }
01299
01300 if (DTDEBUG) cout << "Left has " << left.size()
01301 << ", right has " << right.size() << endl;
01302
01303
01304 *leftError = calcAvgErrorforSet(left);
01305 *rightError = calcAvgErrorforSet(right);
01306
01307 float leftRatio = (float)left.size() / (float)instances.size();
01308 float rightRatio = (float)right.size() / (float)instances.size();
01309 float newError = (leftRatio * (*leftError) + rightRatio * (*rightError));
01310
01311 float er = avgError - newError;
01312
01313 if (DTDEBUG || SPLITDEBUG){
01314 cout << "LeftError: " << *leftError
01315 << " RightError: " << *rightError
01316 << " NewError: " << newError
01317 << " NodeError: " << avgError
01318 << " ER: " << er
01319 << endl;
01320 }
01321
01322 return er;
01323
01324 }
01325
01327 float LinearSplitsTree::calcAvgErrorforSet(const std::vector<tree_experience*> &instances){
01328 if (DTDEBUG) cout << "calcAvgErrorforSet" << endl;
01329
01330 int n = instances.size();
01331
01332 if (n == 0)
01333 return 0;
01334
01335
01336
01337 float constant;
01338 float avgError = 0.0;
01339 std::vector<float> coeff;
01340 if (SIMPLE)
01341 avgError = fitSimpleLinearModel(instances, &constant, &coeff);
01342 else
01343 avgError = fitMultiLinearModel(instances, &constant, &coeff);
01344
01345 return avgError;
01346 }
01347
01348
01350 std::set<float> LinearSplitsTree::getUniques(int dim, const std::vector<tree_experience*> &instances, float& minVal, float& maxVal){
01351 if (DTDEBUG) cout << "getUniques,dim = " << dim;
01352
01353 std::set<float> uniques;
01354
01355 for (int i = 0; i < (int)instances.size(); i++){
01356 if (i == 0 || instances[i]->input[dim] < minVal)
01357 minVal = instances[i]->input[dim];
01358 if (i == 0 || instances[i]->input[dim] > maxVal)
01359 maxVal = instances[i]->input[dim];
01360
01361 uniques.insert(instances[i]->input[dim]);
01362 }
01363
01364
01365
01366 if (uniques.size() > 100){
01367 float rangeInc = (maxVal - minVal) / 100.0;
01368 uniques.clear();
01369 for (float i = minVal; i < maxVal; i+= rangeInc){
01370 uniques.insert(i);
01371 }
01372 }
01373 uniques.insert(maxVal);
01374
01375
01376 if (DTDEBUG) cout << " #: " << uniques.size() << endl;
01377 return uniques;
01378 }
01379
01380
01383 float* LinearSplitsTree::sortOnDim(int dim, const std::vector<tree_experience*> &instances){
01384 if (DTDEBUG) cout << "sortOnDim,dim = " << dim << endl;
01385
01386 float* values = new float[instances.size()];
01387
01388 for (int i = 0; i < (int)instances.size(); i++){
01389
01390
01391 float val = instances[i]->input[dim];
01392
01393
01394
01395 for (int j = 0; j <= i; j++){
01396
01397
01398
01399 if (j==i){
01400 values[j] = val;
01401
01402 }
01403
01404
01405 else if (val < values[j]){
01406
01407
01408
01409 for (int k = i; k > j; k--){
01410
01411 values[k] = values[k-1];
01412 }
01413
01414
01415
01416 values[j] = val;
01417
01418
01419 break;
01420 }
01421
01422 }
01423 }
01424
01425 if (DTDEBUG){
01426 cout << "Sorted array: " << values[0];
01427 for (int i = 1; i < (int)instances.size(); i++){
01428 cout << ", " << values[i];
01429 }
01430 cout << endl;
01431 }
01432
01433 return values;
01434
01435 }
01436
01437
01439 void LinearSplitsTree::printTree(tree_node *t, int level){
01440
01441 for (int i = 0; i < level; i++){
01442 cout << ".";
01443 }
01444
01445 cout << "Node " << t->id << " nInstances: " << t->nInstances ;
01446
01447
01448 if (t->leaf){
01449 cout << " Constant: " << t->constant << " coeff: ";
01450 for (unsigned j = 0; j < t->coefficients.size(); j++){
01451 if (!SIMPLE)
01452 cout << t->coefficients[j] << ", ";
01453 if (SIMPLE && t->coefficients[j] != 0)
01454 cout << " on feat " << j << ": " << t->coefficients[j];
01455 }
01456 cout << endl;
01457 return;
01458 }
01459
01460
01461
01462 cout << " Type: CUT";
01463 cout << " Dim: " << t->dim << " Val: " << t->val;
01464 cout << " Left: " << t->l->id << " Right: " << t->r->id << endl;
01465
01466
01467 if (t->dim != -1 && !t->leaf){
01468 printTree(t->l, level+1);
01469 printTree(t->r, level+1);
01470 }
01471
01472 }
01473
01474
01475
01476 void LinearSplitsTree::leafPrediction(tree_node* leaf, const std::vector<float> &input,
01477 std::map<float, float>* retval){
01478 if (DTDEBUG)
01479 cout << "Calculating output for leaf " << leaf->id << endl;
01480
01481 float prediction = leaf->constant;
01482 if (DTDEBUG) cout << "leaf constant: " << leaf->constant << endl;
01483
01484
01485 for (unsigned i = 0; i < input.size(); i++){
01486 prediction += leaf->coefficients[i] * input[i];
01487 if (DTDEBUG) {
01488 cout << "feat " << i << " coeff: " << leaf->coefficients[i]
01489 << " on input " << input[i] << endl;
01490 }
01491 }
01492
01493 if (DTDEBUG) cout << " prediction: " << prediction << endl;
01494
01495 (*retval)[prediction] = 1.0;
01496
01497 }
01498
01499
01500 void LinearSplitsTree::initNodes(){
01501
01502 for (int i = 0; i < N_LS_NODES; i++){
01503 initTreeNode(&(allNodes[i]));
01504 freeNodes.push_back(i);
01505 if (NODEDEBUG)
01506 cout << "init node " << i << " with id " << allNodes[i].id
01507 << ", now " << freeNodes.size() << " free nodes." << endl;
01508 }
01509
01510 }
01511
01512 LinearSplitsTree::tree_node* LinearSplitsTree::allocateNode(){
01513 if (freeNodes.empty()){
01514 tree_node* newNode = new tree_node;
01515 initTreeNode(newNode);
01516 if (NODEDEBUG)
01517 cout << id << " PROBLEM: No more pre-allocated nodes!!!" << endl
01518 << "return new node " << newNode->id
01519 << ", now " << freeNodes.size() << " free nodes." << endl;
01520 return newNode;
01521 }
01522
01523 int i = freeNodes.back();
01524 freeNodes.pop_back();
01525 if (NODEDEBUG)
01526 cout << id << " allocate node " << i << " with id " << allNodes[i].id
01527 << ", now " << freeNodes.size() << " free nodes." << endl;
01528 return &(allNodes[i]);
01529 }
01530
01531 void LinearSplitsTree::deallocateNode(tree_node* node){
01532 if (node->id >= N_LS_NODES){
01533 if (NODEDEBUG)
01534 cout << id << " dealloc extra node id " << node->id
01535 << ", now " << freeNodes.size() << " free nodes." << endl;
01536 delete node;
01537 return;
01538 }
01539
01540 freeNodes.push_back(node->id);
01541 if (NODEDEBUG)
01542 cout << id << " dealloc node " << node->id
01543 << ", now " << freeNodes.size() << " free nodes." << endl;
01544 }