M5Tree.cc
Go to the documentation of this file.
00001 
00008 #include "M5Tree.hh"
00009 
00010 
00011 // Include stuff for newmat matrix libraries
00012 
00013 #define WANT_MATH                    // include.h will get math fns
00014                                      // newmatap.h will get include.h
00015 #include "../newmat/newmatap.h"      // need matrix applications
00016 #ifdef use_namespace
00017 using namespace NEWMAT;              // access NEWMAT namespace
00018 #endif
00019 
00020 
00021 
00022 M5Tree::M5Tree(int id, int trainMode, int trainFreq, int m,
00023                float featPct, bool simple, bool allowAllFeats, 
00024                float min_sdr, Random rng):
00025   id(id), mode(trainMode), freq(trainFreq), M(m),
00026   featPct(featPct), SIMPLE(simple), ALLOW_ALL_FEATS(allowAllFeats),
00027   MIN_SDR(min_sdr), rng(rng)
00028 {
00029 
00030   nnodes = 0;
00031   nOutput = 0;
00032   nExperiences = 0;
00033   hadError = false;
00034   totalnodes = 0;
00035   maxnodes = N_M5_NODES;
00036 
00037 
00038   // how close a split has to be to be randomly selected
00039   SPLIT_MARGIN = 0.0; //0.02; //5; //01; //0.05; //0.2; //0.05;
00040 
00041   LMDEBUG = false;
00042   DTDEBUG = false;
00043   SPLITDEBUG = false;//true;
00044   STOCH_DEBUG = false; //true; //false; //true;
00045   INCDEBUG = false; //true; //false; //true;
00046   NODEDEBUG = false;
00047   COPYDEBUG = false; //true;
00048   nfeat = 4;
00049 
00050   cout << "Created m5 decision tree " << id;
00051   if (SIMPLE) cout << " simple regression";
00052   else cout << " multivariate regression";
00053   if (ALLOW_ALL_FEATS) cout << " (all feats)";
00054   else cout << " (subtree feats)";
00055   if (DTDEBUG){
00056     cout << " mode: " << mode << " freq: " << freq << endl;
00057   }
00058   cout << " MIN_SDR: " << MIN_SDR << endl;
00059 
00060   initNodes();
00061   initTree();
00062 
00063 
00064 }
00065 
00066 M5Tree::M5Tree(const M5Tree& m5):
00067   id(m5.id), mode(m5.mode), freq(m5.freq), M(m5.M),
00068   featPct(m5.featPct), SIMPLE(m5.SIMPLE), ALLOW_ALL_FEATS(m5.ALLOW_ALL_FEATS),
00069   MIN_SDR(m5.MIN_SDR), rng(m5.rng)
00070 {
00071   COPYDEBUG = m5.COPYDEBUG;
00072   if (COPYDEBUG) cout << "m5 copy " << id << endl;
00073   nnodes = 0;
00074   nOutput = m5.nOutput;
00075   nExperiences = m5.nExperiences;
00076   hadError = m5.hadError;
00077   totalnodes = 0;
00078   maxnodes = m5.maxnodes;
00079   SPLIT_MARGIN = m5.SPLIT_MARGIN; 
00080   LMDEBUG = m5.LMDEBUG;
00081   DTDEBUG = m5.DTDEBUG;
00082   SPLITDEBUG = m5.SPLITDEBUG;
00083   STOCH_DEBUG = m5.STOCH_DEBUG; 
00084   INCDEBUG = m5.INCDEBUG; 
00085   NODEDEBUG = m5.NODEDEBUG;
00086   nfeat = m5.nfeat;
00087 
00088   if (COPYDEBUG) cout << "   M5 copy nodes, experiences, root, etc" << endl;
00089   // copy all experiences
00090   for (int i = 0; i < N_M5_EXP; i++){
00091     allExp[i] = m5.allExp[i];
00092   }
00093   if (COPYDEBUG) cout << "   M5 copied exp array" << endl;
00094 
00095   // set experience pointers
00096   experiences.resize(m5.experiences.size());
00097   for (unsigned i = 0; i < m5.experiences.size(); i++){
00098     experiences[i] = &(allExp[i]);
00099   }
00100   if (COPYDEBUG) cout << "   M5 set experience pointers" << endl;
00101 
00102   // now the tricky part, set the pointers inside the tree nodes correctly
00103   initNodes();
00104 
00105   if (COPYDEBUG) cout << "   M5 copy tree " << endl;
00106   root = allocateNode();
00107   lastNode = root;
00108   copyTree(root, m5.root);
00109   if (COPYDEBUG) cout << "   M5 tree copy done" << endl;
00110    
00111   if (COPYDEBUG) {
00112     cout << endl << "New tree: " << endl;
00113     printTree(root, 0);
00114     cout << endl;
00115     cout << "  m5 copy done" << endl;
00116   }
00117 
00118 }
00119 
00120 void M5Tree::copyTree(tree_node* newNode, tree_node* origNode){
00121 
00122   int nodeId = newNode->id;
00123 
00124   if (COPYDEBUG) {
00125     cout << "    Copy node " << newNode->id << " from node " << origNode->id << endl;
00126     cout << "    NewAddy " << newNode << ", old: " << origNode << endl;
00127   }
00128 
00129   // copy node from t
00130   *newNode = *origNode;
00131   newNode->id = nodeId;
00132 
00133   // if it has children, allocate and copy them too
00134   if (origNode->l != NULL && !newNode->leaf){
00135     newNode->l = allocateNode();
00136     if (COPYDEBUG) cout << "     Copy left node " << newNode->l->id << " from " << origNode->l->id << endl;
00137     copyTree(newNode->l, origNode->l);
00138   } else {
00139     newNode->l = NULL;
00140   }
00141 
00142   if (origNode->r != NULL && !newNode->leaf){
00143     newNode->r = allocateNode();
00144     if (COPYDEBUG) cout << "     Copy right node " << newNode->r->id << " from " << origNode->r->id << endl;
00145     copyTree(newNode->r, origNode->r);
00146   } else {
00147     newNode->r = NULL;
00148   }
00149 }
00150 
00151 M5Tree* M5Tree::getCopy(){
00152   M5Tree* copy = new M5Tree(*this);
00153   return copy;
00154 }
00155 
00156 M5Tree::~M5Tree() {
00157   deleteTree(root);
00158   for (unsigned i = N_M5_EXP; i < experiences.size(); i++){
00159     delete experiences[i];
00160   }
00161   experiences.clear();
00162 }
00163 
00164 // here the target output will be a single value
00165 bool M5Tree::trainInstance(classPair &instance){
00166 
00167   if (DTDEBUG) cout << id << " trainInstance" << endl;
00168 
00169   //  featPct *= 0.99;
00170 
00171   nfeat = instance.in.size();
00172 
00173   bool modelChanged = false;
00174 
00175   // simply add this instance to the set of experiences
00176 
00177   // take from static array until we run out
00178   tree_experience *e;
00179   if (nExperiences < N_M5_EXP){
00180     // from statically created set of experiences
00181     e = &(allExp[nExperiences]);
00182 
00183   } else {
00184     // dynamically create experience
00185     e = new tree_experience;
00186   }
00187 
00188 
00189   e->input = instance.in;
00190   e->output = instance.out;
00191   experiences.push_back(e);
00192   nExperiences++;
00193 
00194   if (nExperiences == 1000000){
00195     cout << "Reached limit of # experiences allowed." << endl;
00196     return false;
00197   }
00198 
00199   if (nExperiences != (int)experiences.size())
00200     cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00201 
00202   //cout << nExperiences << endl << flush;
00203   //if (nExperiences == 503 && id == 10){
00204   //  DTDEBUG = true;
00205   //  SPLITDEBUG = true;
00206   //  INCDEBUG = true;
00207   //}
00208 
00209   if ( DTDEBUG) {
00210     cout << "Original input: ";
00211     for (unsigned i = 0; i < instance.in.size(); i++){
00212       cout << instance.in[i] << ", ";
00213     }
00214     cout << endl << " Original output: " << instance.out << endl;
00215     cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00216     cout << "Address: " << e << " Input : ";
00217     for (unsigned i = 0; i < e->input.size(); i++){
00218       cout << e->input[i] << ", ";
00219     }
00220     cout << endl << " Now have " << nExperiences << " experiences." << endl;
00221   }
00222 
00223   // depending on mode/etc, maybe re-build tree
00224 
00225   // mode 0: re-build every step
00226   if (mode == BUILD_EVERY || nExperiences <= 1){
00227     rebuildTree();
00228     modelChanged = true;
00229   }
00230 
00231   // mode 1: re-build on error only
00232   else if (mode == BUILD_ON_ERROR){
00233 
00234     // build on misclassification
00235     // check for misclassify
00236     std::map<float, float> answer;
00237     testInstance(e->input, &answer);
00238     float val = answer.begin()->first;
00239     float error = fabs(val - e->output);
00240 
00241     if (error > 0.0){
00242       rebuildTree();
00243       modelChanged = true;
00244     }
00245   }
00246 
00247   // mode 2: re-build every FREQ steps
00248   else if (mode == BUILD_EVERY_N){
00249     // build every freq steps
00250     if (!modelChanged && (nExperiences % freq) == 0){
00251       rebuildTree();
00252       modelChanged = true;
00253     }
00254   }
00255 
00256   if (modelChanged){
00257     if (DTDEBUG) cout << "DT " << id << " tree re-built." << endl;
00258 
00259     if (DTDEBUG){
00260       cout << endl << "DT: " << id << endl;
00261       printTree(root, 0);
00262       cout << "Done printing tree" << endl;
00263     }
00264   }
00265 
00266   return modelChanged;
00267 
00268 }
00269 
00270 
00271 // here the target output will be a single value
00272 bool M5Tree::trainInstances(std::vector<classPair> &instances){
00273   if (DTDEBUG) cout << id << " DT trainInstances: " 
00274                             << instances.size() 
00275                             << " nExp: " << nExperiences << endl;
00276   nfeat = instances[0].in.size();
00277   
00278   //  featPct *= 0.99;
00279   
00280   bool modelChanged = false;
00281 
00282   bool doBuild = false;
00283 
00284   // loop through instances, possibly checking for errors
00285   for (unsigned a = 0; a < instances.size(); a++){
00286     classPair instance = instances[a];
00287 
00288     // simply add this instance to the set of experiences
00289 
00290     // take from static array until we run out
00291     tree_experience *e;
00292     if (nExperiences < N_M5_EXP){
00293       // from statically created set of experiences
00294       e = &(allExp[nExperiences]);
00295 
00296     } else {
00297       // dynamically create experience
00298       e = new tree_experience;
00299     }
00300 
00301 
00302     e->input = instance.in;
00303     e->output = instance.out;
00304     experiences.push_back(e);
00305     nExperiences++;
00306 
00307     if (nExperiences == 1000000){
00308       cout << "Reached limit of # experiences allowed." << endl;
00309       return false;
00310     }
00311 
00312     if (nExperiences != (int)experiences.size())
00313       cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00314 
00315     if (DTDEBUG) {
00316       cout << "Original input: ";
00317       for (unsigned i = 0; i < instance.in.size(); i++){
00318         cout << instance.in[i] << ", ";
00319       }
00320       cout << endl << " Original output: " << instance.out << endl;
00321       cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00322       cout << "Address: " << e << " Input : ";
00323       for (unsigned i = 0; i < e->input.size(); i++){
00324         cout << e->input[i] << ", ";
00325       }
00326       cout << endl << " Now have " << nExperiences << " experiences." << endl;
00327     }
00328 
00329     // depending on mode/etc, maybe re-build tree
00330 
00331     // don't need to check if we've already decided
00332     if (doBuild) continue;
00333 
00334     // mode 0: re-build every step
00335     if (mode == BUILD_EVERY || nExperiences <= 1){
00336       doBuild = true;
00337     }
00338 
00339     // mode 1: re-build on error only
00340     else if (mode == BUILD_ON_ERROR){
00341 
00342       // build on misclassification
00343       // check for misclassify
00344       std::map<float, float> answer;
00345       testInstance(e->input, &answer);
00346       float val = answer.begin()->first;
00347       float error = fabs(val - e->output);
00348 
00349       if (error > 0.0){
00350         doBuild = true;
00351       }
00352 
00353     }
00354 
00355     // mode 2: re-build every FREQ steps
00356     else if (mode == BUILD_EVERY_N){
00357       // build every freq steps
00358       if (!modelChanged && (nExperiences % freq) == 0){
00359         doBuild = true;
00360       }
00361     }
00362 
00363   } // loop of new instances
00364 
00365   if (DTDEBUG) cout << "Added " << instances.size() << " new instances. doBuild = " << doBuild << endl;
00366 
00367   if (doBuild){
00368     rebuildTree();
00369     modelChanged = true;
00370   }
00371 
00372   if (modelChanged){
00373     if (DTDEBUG) cout << "DT " << id << " tree re-built." << endl;
00374 
00375     if (DTDEBUG){
00376       cout << endl << "DT: " << id << endl;
00377       printTree(root, 0);
00378       cout << "Done printing tree" << endl;
00379     }
00380   }
00381 
00382   return modelChanged;
00383 
00384 }
00385 
00386 
00387 void M5Tree::rebuildTree(){
00388   //cout << "rebuild tree " << id << " on exp: " << nExperiences << endl;
00389   //  deleteTree(root);
00390   buildTree(root, experiences, false);
00391   //cout << "tree " << id << " rebuilt. " << endl;
00392 }
00393 
00394 
00395 // TODO: here we want to return the probability of the output value being each of the possible values, in the stochastic case
00396 void M5Tree::testInstance(const std::vector<float> &input, std::map<float, float>* retval){
00397   if (DTDEBUG) cout << "testInstance" << endl;
00398 
00399   retval->clear();
00400 
00401   // in case the tree is empty
00402   if (experiences.size() == 0){
00403     (*retval)[0.0] = 1.0;
00404     return;
00405   }
00406 
00407   // follow through tree to leaf
00408   tree_node* leaf = traverseTree(root, input);
00409   lastNode = leaf;
00410 
00411   // and return mapping of outputs and their probabilities
00412   leafPrediction(leaf, input, retval);
00413 
00414 }
00415 
00416 float M5Tree::getConf(const std::vector<float> &input){
00417   if (DTDEBUG) cout << "numVisits" << endl;
00418 
00419   // in case the tree is empty
00420   if (experiences.size() == 0){
00421     return 0;
00422   }
00423 
00424   if (lastNode == NULL){
00425     return 0;
00426   }
00427 
00428   // follow through tree to leaf
00429   //tree_node* leaf = traverseTree(root, input);
00430 
00431   // and return # in this leaf
00432   float conf = (float)lastNode->nInstances / (float)(2.0*M);
00433   if (conf > 1.0)
00434     return 1.0;
00435   else
00436     return conf;
00437 
00438 }
00439 
00440 // check to see if this state is one we should explore
00441 // to get more info on potential splits
00442 
00443 
00444 // init the tree
00445 void M5Tree::initTree(){
00446   if (DTDEBUG) cout << "initTree()" << endl;
00447   root = allocateNode();
00448 
00449   if (DTDEBUG) cout << "   root id = " << root->id << endl;
00450 
00451   // just to ensure the diff models are on different random values
00452   for (int i = 0; i < id; i++){
00453     rng.uniform(0, 1);
00454   }
00455 
00456 }
00457 
00458 
00459 
00460 // init a tree node
00461 void M5Tree::initTreeNode(tree_node* node){
00462   if (DTDEBUG) cout << "initTreeNode()";
00463 
00464   node->id = nnodes++;
00465   if (DTDEBUG) cout << " id = " << node->id << endl;
00466 
00467   totalnodes++;
00468   if (totalnodes > maxnodes){
00469     maxnodes = totalnodes;
00470     if (DTDEBUG) cout << id << " M5 MAX nodes: " << maxnodes << endl;
00471   }
00472 
00473   // split criterion
00474   node->dim = -1;
00475   node->val = -1;
00476 
00477   // current data
00478   node->nInstances = 0;
00479   node->constant = 0;
00480 
00481   // coefficients will get resized later
00482   //  node->coefficients.resize(2,0);
00483 
00484   // next nodes in tree
00485   node->l = NULL;
00486   node->r = NULL;
00487 
00488   node->leaf = true;
00489 
00490 }
00491 
00492 void M5Tree::deleteTree(tree_node* node){
00493   if (DTDEBUG) cout << "deleteTree, node=" << node->id << endl;
00494 
00495   if (node==NULL)
00496     return;
00497 
00498   totalnodes--;
00499 
00500   node->nInstances = 0;
00501   node->coefficients.clear();
00502 
00503   //recursively call deleteTree on children
00504   // then delete them
00505   if (!node->leaf){
00506     // left child
00507     if (node->l != NULL){
00508       deleteTree(node->l);
00509       deallocateNode(node->l);
00510       node->l = NULL;
00511     }
00512 
00513     // right child
00514     if (node->r != NULL){
00515       deleteTree(node->r);
00516       deallocateNode(node->r);
00517       node->r = NULL;
00518     }
00519   }
00520 
00521   node->leaf  = true;
00522   node->dim = -1;
00523   node->val = -1;
00524   node->constant = 0;
00525 }
00526 
00527 
00528 M5Tree::tree_node* M5Tree::getCorrectChild(tree_node* node,
00529                                            const std::vector<float> &input){
00530 
00531   if (DTDEBUG) cout << "getCorrectChild, node=" << node->id << endl;
00532 
00533   if (passTest(node->dim, node->val, input))
00534     return node->l;
00535   else
00536     return node->r;
00537 
00538 }
00539 
00540 M5Tree::tree_node* M5Tree::traverseTree(tree_node* node,
00541                                         const std::vector<float> &input){
00542 
00543   if (DTDEBUG) cout << "traverseTree, node=" << node->id << endl;
00544 
00545   while (!node->leaf){
00546     node = getCorrectChild(node, input);
00547   }
00548 
00549   return node;
00550 }
00551 
00552 
00553 bool M5Tree::passTest(int dim, float val, const std::vector<float> &input){
00554   if (DTDEBUG) cout << "passTest, dim=" << dim << ",val=" << val 
00555                     << ",input["<<dim<<"]=" << input[dim] <<endl;
00556 
00557   if (input[dim] > val)
00558     return false;
00559   else
00560     return true;
00561 
00562 }
00563 
00564 
00565 void M5Tree::buildTree(tree_node *node,
00566                        const std::vector<tree_experience*> &instances,
00567                        bool changed){
00568   if(DTDEBUG) cout << "buildTree, node=" << node->id
00569                    << ",nInstances:" << instances.size()
00570                    << ",chg:" << changed << endl;
00571 
00572   if (instances.size() == 0){
00573     cout << "Error: buildTree called on tree " << id << " node " << node->id << " with no instances." << endl;
00574     exit(-1);
00575   }
00576 
00577 
00578   // TODO: what about stochastic data?
00579   //std::vector<float> chiSquare = calcChiSquare(instances);
00580 
00581   // first, add instances to tree
00582   node->nInstances = instances.size();
00583 
00584   bool allSame = true;
00585   float val0 = instances[0]->output;
00586   for (unsigned i = 1; i < instances.size(); i++){
00587     if (instances[i]->output != val0){
00588       allSame = false;
00589       break;
00590     }
00591   }
00592 
00593   // see if they're all the same
00594   if (allSame){
00595     makeLeaf(node);
00596     node->constant = instances[0]->output;
00597     if (DTDEBUG){
00598       cout << "Tree " << id << " node " << node->id 
00599            << " All " << node->nInstances
00600            << " classified with output "
00601            << instances[0]->output << endl;
00602     }
00603     return;
00604   }
00605 
00606   // check if this is a leaf and linear model has no error
00607   if (node->leaf && node->coefficients.size() > 0) {
00608     //cout << "Node " << node->id << " is leaf, checking lm" << endl;
00609     float errorSum = 0;
00610     for (unsigned i = 0; i < instances.size(); i++){
00611       // get prediction for instance and compare with actual output
00612       tree_node* leaf = traverseTree(node, instances[i]->input);
00613       std::map<float, float> retval;
00614       leafPrediction(leaf, instances[i]->input, &retval);
00615       float prediction = retval.begin()->first;
00616       float absError = fabs(prediction - instances[i]->output);
00617       errorSum += absError;
00618       //cout << "instance " << i << " leaf predicted: " << prediction
00619       //     << " actual: " << instances[i]->output
00620       //   << " error: " << absError << endl;
00621     }
00622     float avgError = errorSum / (float)instances.size();
00623     if (avgError < 0.001){
00624       //      cout << "stick with linear model" << endl;
00625       return;
00626     }
00627   }
00628 
00629   // if not, calculate SDR to determine best split
00630   if (SPLITDEBUG) cout << endl << "Creating new decision node" << endl;
00631 
00632   node->leaf = false;
00633   //node->nInstances++;
00634 
00635   float bestSDR = -1.0;
00636   int bestDim = -1;
00637   float bestVal = -1;
00638   std::vector<tree_experience*> bestLeft;
00639   std::vector<tree_experience*> bestRight;
00640 
00641   testPossibleSplits(instances, &bestSDR, &bestDim, &bestVal, &bestLeft, &bestRight);
00642 
00643   implementSplit(node, instances, bestSDR, bestDim, bestVal, bestLeft, bestRight, changed);
00644 
00645   // possibly replace split node with linear regression model
00646   pruneTree(node, instances);
00647 
00648 }
00649 
00650 
00651 void M5Tree::makeLeaf(tree_node* node){
00652 
00653   removeChildren(node);
00654 
00655   // make sure there are enough coefficients for all the features
00656   // and that they are 0
00657   if (node->coefficients.size() != (unsigned)nfeat){
00658     node->coefficients.resize(nfeat, 0);
00659   }
00660   for (unsigned i = 0; i < node->coefficients.size(); i++){
00661     node->coefficients[i] = 0;
00662   }
00663 
00664 }
00665 
00666 
00667 void M5Tree::removeChildren(tree_node* node){
00668   // check on children
00669   if (node->l != NULL){
00670     deleteTree(node->l);
00671     deallocateNode(node->l);
00672     node->l = NULL;
00673   }
00674 
00675   if (node->r != NULL){
00676     deleteTree(node->r);
00677     deallocateNode(node->r);
00678     node->r = NULL;
00679   }
00680 
00681   node->leaf = true;
00682 }
00683 
00684 
00685 void M5Tree::pruneTree(tree_node *node,
00686                        const std::vector<tree_experience*> &instances){
00687   if (LMDEBUG || DTDEBUG){
00688     printTree(root, 0);
00689     cout << "pruneTree, node=" << node->id
00690          << ",nInstances:" << instances.size() << endl;
00691   }
00692 
00693   // TODO: no pruning right now
00694   //  return;
00695 
00696   // calculate error of current subtree
00697   float subtreeErrorSum = 0;
00698   for (unsigned i = 0; i < instances.size(); i++){
00699     
00700     // get prediction for instance and compare with actual output
00701     tree_node* leaf = traverseTree(node, instances[i]->input);
00702     std::map<float, float> retval;
00703     leafPrediction(leaf, instances[i]->input, &retval);
00704     float prediction = retval.begin()->first;
00705     float absError = fabs(prediction - instances[i]->output);
00706     subtreeErrorSum += absError;
00707     if (LMDEBUG || DTDEBUG){
00708       cout << "instance " << i << " subtree predicted: " << prediction
00709            << " actual: " << instances[i]->output
00710            << " error: " << absError << endl;
00711     }
00712   }
00713   if (instances.size() < 3){
00714     if (LMDEBUG || DTDEBUG) cout << "instances size <= 2!!!" << endl;
00715     return;
00716   }
00717 
00718   float avgTreeError = subtreeErrorSum / (float)instances.size();
00719 
00720   // if this is zero error, we're not going to replace it
00721   if (false && avgTreeError <= 0.0001){
00722     if (LMDEBUG || DTDEBUG) 
00723       cout << "Sub-tree is perfect (" << avgTreeError
00724            << "), do not attempt LM" << endl;
00725     return;
00726   }
00727                                                                 
00728   // figure out tree feats used
00729   std::vector<bool> treeFeatsUsed(instances[0]->input.size(), false);
00730   getFeatsUsed(node, &treeFeatsUsed);
00731   int nTreeFeatsUsed = 0;
00732   for (unsigned i = 0; i < treeFeatsUsed.size(); i++){
00733     if (treeFeatsUsed[i])
00734       nTreeFeatsUsed++;
00735   }
00736 
00737   // Just use them all... otherwise we ignore some that weren't good enough
00738   // for splitting, but are good here
00739   std::vector<bool> featsUsed(instances[0]->input.size(), true);
00740   int nFeatsUsed = featsUsed.size();
00741 
00742   // or just use ones allowed by subtree
00743   if (!ALLOW_ALL_FEATS){
00744     featsUsed = treeFeatsUsed;
00745     nFeatsUsed = nTreeFeatsUsed;
00746   }
00747 
00748   // no features to build linear model on
00749   if (nFeatsUsed == 0){
00750     if (LMDEBUG || DTDEBUG) cout << "No features for LM" << endl;
00751     return;
00752   }
00753 
00754   // add on error bonus based on nInstances and nParams
00755   float treeErrorEst = avgTreeError;
00756   float denom = (instances.size() - nTreeFeatsUsed);
00757   if (denom < 1){
00758     denom = 0.5;
00759     if (LMDEBUG) {
00760       cout << "denom of tree error factor is " << denom 
00761            << " with nInst " << instances.size() 
00762            << " nfeats: " << nTreeFeatsUsed << endl;
00763     }
00764   }
00765   treeErrorEst *= (instances.size() + nTreeFeatsUsed) / denom;
00766 
00767   // fit linear model to this set of instances
00768   float lmErrorSum = 0;
00769   int nlmFeats = 0;
00770   if (SIMPLE)
00771     nlmFeats = fitSimpleLinearModel(node, instances, featsUsed, nFeatsUsed, &lmErrorSum);
00772   else 
00773     nlmFeats = fitLinearModel(node, instances, featsUsed, nFeatsUsed, &lmErrorSum);
00774 
00775   float avgLMError = lmErrorSum / (float)instances.size();
00776 
00777   float lmErrorEst = avgLMError;
00778   float denom2 = (instances.size() - nlmFeats);
00779   if (denom2 < 1){
00780     denom2 = 0.5;
00781     if (LMDEBUG) {
00782       cout << "denom2 of lm error factor is " << denom2
00783            << " with nInst " << instances.size() 
00784            << " nfeats: " << nlmFeats << endl;
00785     }
00786   }
00787   lmErrorEst *= (instances.size() + nlmFeats) / denom2; 
00788 
00789   // replace subtree with linear model?
00790   if (LMDEBUG || DTDEBUG) {
00791     cout << "Sub-Tree Error: " << treeErrorEst << ", lm Error: "
00792          << lmErrorEst << endl;
00793   }
00794   //  if (lmErrorEst < (treeErrorEst + 0.0001)){
00795   if (lmErrorEst < (treeErrorEst + 0.1*MIN_SDR)){
00796   //if (lmErrorEst < treeErrorEst){
00797     if (LMDEBUG || DTDEBUG)
00798       cout << node->id << " replace tree with linear model" << endl;
00799     removeChildren(node);
00800   } else {
00801     // remove coefficients again, for memory
00802     //    node->coefficients.clear();
00803   }
00804 
00805 }
00806 
00807 
00808 int M5Tree::fitLinearModel(tree_node *node,
00809                            const std::vector<tree_experience*> &instances,
00810                            std::vector<bool> featureMask,
00811                            int nFeats, float* resSum){
00812   
00813   if(DTDEBUG || LMDEBUG) cout << "fitLinearModel, node=" << node->id
00814                               << ",nInstances:" << instances.size() << endl;
00815 
00816   // make sure there are enough coefficients for all the features
00817   if (node->coefficients.size() != instances[0]->input.size()){
00818     node->coefficients.resize(instances[0]->input.size(), 0);
00819   }
00820 
00821   node->constant = 0.0;
00822   bool doRegression = true;
00823   int ntimes = 0;
00824   int nlmFeats = 10;
00825   (*resSum) = 1000000;
00826 
00827   while (doRegression){
00828     //cout << id << " Attempt linear model " << ntimes << endl;
00829     ntimes++;
00830 
00831     int nObs = (int)instances.size();
00832 
00833     //cout << "with nObs: " << nObs << " and nFeats: " << nFeats << endl;
00834 
00835     // no feats or obs, no model to build
00836     if (nObs == 0 || nFeats == 0)
00837       break;
00838 
00839     Matrix X(nObs, nFeats);
00840     ColumnVector Y(nObs);
00841 
00842     std::vector<int> featIndices(nFeats);
00843 
00844     std::vector<bool> constants(nFeats,true);
00845     bool foundProblem = false;
00846     
00847     // load up matrices
00848     for (int i = 0; i < nObs; i++){
00849       tree_experience *e = instances[i];
00850       if (LMDEBUG) cout << "Obs: " << i;
00851 
00852       // go through all features
00853       int featIndex = 1;
00854       for (unsigned j = 0; j < featureMask.size(); j++){
00855         node->coefficients[j] = 0;
00856         if (!featureMask[j])
00857           continue;
00858       
00859         if (constants[j] && e->input[j] != instances[0]->input[j]){
00860           constants[j] = false;
00861         }
00862 
00863         /*
00864         if (rng.uniform() < featPct){
00865           featureMask[j] = false;
00866           nFeats--;
00867           if (nFeats > 0)
00868             continue;
00869           else
00870             break;
00871         }
00872         */
00873 
00874         if (i == nObs-1 && constants[j]){
00875           //cout << "PROBLEM: feat " << j << " is constant!" << endl;
00876           foundProblem = true;
00877           featureMask[j] = false;
00878           nFeats--;
00879           if (nFeats > 0)
00880             continue;
00881           else
00882             break;
00883         }
00884 
00885         featIndices[featIndex-1] = j;
00886         // HACK: I'm adding random noise here to prevent colinear features
00887         X(i+1,featIndex) = e->input[j]; // + rng.uniform(-0.00001, 0.00001);
00888         if (LMDEBUG){
00889           cout << " Feat " << featIndex << " index " << j
00890                << " val " << X(i+1,featIndex) << ",";
00891         }
00892         featIndex++;
00893       }
00894 
00895       Y(i+1) = e->output;
00896       if (LMDEBUG) cout << " out: " << e->output << endl;
00897     }
00898 
00899     if (foundProblem || nFeats == 0)
00900       continue;
00901 
00902     // make vector of 1s
00903     ColumnVector Ones(nObs); Ones = 1.0;
00904 
00905     // calculate means (averages) of x1 and x2 [ .t() takes transpose]
00906     RowVector Mrow = Ones.t() * X / nObs;
00907 
00908     // and subtract means from x1 and x1
00909     Matrix XC(nObs,nFeats);
00910     XC = X - Ones * Mrow;
00911 
00912     // do the same to Y [use Sum to get sum of elements]
00913     ColumnVector YC(nObs);
00914     Real mval = Sum(Y) / nObs;
00915     YC = Y - Ones * mval;
00916 
00917     Try {
00918 
00919       // form sum of squares and product matrix
00920       //    [use << rather than = for copying Matrix into SymmetricMatrix]
00921       SymmetricMatrix SSQ;
00922       SSQ << XC.t() * XC;
00923       
00925       // Cholesky Method
00926       LowerTriangularMatrix L = Cholesky(SSQ);
00927 
00928       // calculate estimate
00929       ColumnVector A = L.t().i() * (L.i() * (XC.t() * YC));
00931       
00933       // Least Squares Method
00934       // calculate estimate
00935       //    [bracket last two terms to force this multiplication first]
00936       //    [ .i() means inverse, but inverse is not explicity calculated]
00937       //ColumnVector A = SSQ.i() * (XC.t() * YC);
00939   
00940   
00941       // calculate estimate of constant term
00942       //    [AsScalar converts 1x1 matrix to Real]
00943       Real a = mval - (Mrow * A).AsScalar();
00944 
00945       // Calculate fitted values and residuals
00946       //int npred1 = nFeats+1;
00947       ColumnVector Fitted = X * A + a;
00948       ColumnVector Residual = Y - Fitted;
00949       //Real ResVar = Residual.SumSquare() / (nObs-npred1);
00950 
00951       // print out answers
00952       // for each instance
00953       (*resSum) = 0;
00954       for (int i = 0; i < nObs; i++){
00955         if (DTDEBUG || LMDEBUG){
00956           cout << "instance " << i << " linear model predicted: " << Fitted(i+1)
00957                << " actual: " << instances[i]->output
00958                << " error: " << Residual(i+1) << endl;
00959         }
00960         (*resSum) += fabs(Residual(i+1));
00961       }
00962 
00963 
00964       // coeff
00965       nlmFeats = 0;
00966       for (int i = 0; i < nFeats; i++){
00967         if (DTDEBUG || LMDEBUG) cout << "Coeff " << i << " on feat: " << featIndices[i] << " is " << A(i+1) << endl;
00968         node->coefficients[featIndices[i]] = A(i+1);
00969         if (A(i+1) != 0)
00970           nlmFeats++;
00971       }
00972       if (DTDEBUG || LMDEBUG) cout << "constant is " << a << endl;
00973       node->constant = a;
00974 
00975 
00976     }
00977 
00978     CatchAll {
00979       // had an error trying the linear regression.
00980       // HACK TODO: for now, turn off one variable
00981       //cout << ntimes << " linear regression had exception" << endl;
00982       //<< BaseException::what() <<endl;
00983 
00984       // tried this already, stop now
00985       if (ntimes > 1 || nFeats < 2){
00986         //cout << "max regression" << endl;
00987         doRegression = false;
00988         break;
00989       }
00990       for (unsigned j = 0; j < featureMask.size(); j++){
00991         if (featureMask[j]){
00992           //cout << "remove feature " << j << endl;
00993           featureMask[j] = false;
00994           nFeats--;
00995           break;
00996         }
00997       }
00998       continue;
00999     }
01000 
01001     // it worked, dont need to do it again
01002     doRegression = false;
01003 
01004   }
01005 
01006   // return # features used
01007   return nlmFeats;
01008 
01009 }
01010 
01011 int M5Tree::fitSimpleLinearModel(tree_node *node,
01012                                  const std::vector<tree_experience*> &instances,
01013                                  std::vector<bool> featureMask,
01014                                  int nFeats, float* resSum){
01015   if(DTDEBUG || LMDEBUG) cout << "fitSimpleLinearModel, node=" << node->id
01016                               << ",nInstances:" << instances.size() << endl;
01017   
01018   // make sure there are enough coefficients for all the features
01019   if (node->coefficients.size() != instances[0]->input.size()){
01020     node->coefficients.resize(instances[0]->input.size(), 0);
01021   }
01022 
01023   // loop through all features, try simple single variable regression
01024   // keep track of error/coefficient of each
01025   int bestFeat = -1;
01026   float bestCoeff = -1;
01027   float bestError = 1000000;
01028   float bestConstant = -1;
01029 
01030   std::vector<float> xsum(instances[0]->input.size(), 0);
01031   std::vector<float> xysum(instances[0]->input.size(), 0);
01032   std::vector<float> x2sum(instances[0]->input.size(), 0);
01033   float ysum = 0;
01034 
01035   int nObs = (int)instances.size();
01036   for (int i = 0; i < nObs; i++){
01037     tree_experience *e = instances[i];
01038     if (LMDEBUG) cout << "Obs: " << i;
01039     
01040     // go through all features
01041     for (unsigned j = 0; j < instances[0]->input.size(); j++){
01042       if (!featureMask[j]) continue;
01043       if (LMDEBUG) cout << ", F" << j << ": " << e->input[j];
01044       xsum[j] += e->input[j];
01045       xysum[j] += (e->input[j] * e->output);
01046       x2sum[j] += (e->input[j] * e->input[j]);
01047     }
01048     ysum += e->output;
01049     if (LMDEBUG) cout << ", out: " << e->output << endl;
01050   }
01051   
01052   // now go through all features and calc coeff and constant
01053   for (unsigned j = 0; j < instances[0]->input.size(); j++){
01054     if (!featureMask[j]) continue;
01055     /*
01056       if (rng.uniform() < featPct){
01057       continue;
01058       }
01059     */
01060     float coeff = (xysum[j] - xsum[j]*ysum/nObs)/(x2sum[j]-(xsum[j]*xsum[j])/nObs);
01061     float constant = (ysum/nObs) - coeff*(xsum[j]/nObs);
01062     
01063     if (LMDEBUG) {
01064       cout << "Feat " << j << " coeff: " << coeff << ", constant: " 
01065            << constant << " mask: " << featureMask[j] << endl;
01066     }
01067 
01068     // now try to make predictions and see what error is
01069     float errorSum = 0;
01070     for (int i = 0; i < nObs; i++){
01071       tree_experience *e = instances[i];
01072       float pred = constant + coeff * e->input[j];
01073       float error = fabs(pred - e->output);
01074       if (LMDEBUG) cout << "Instance " << i << " error: " << error << endl;
01075       errorSum += error;
01076     }
01077     if (LMDEBUG) cout << "eSum: " << errorSum << endl;
01078 
01079     // check if this is the best
01080     if (errorSum < bestError){
01081       bestError = errorSum;
01082       bestFeat = j;
01083       bestConstant = constant;
01084       bestCoeff = coeff;
01085     }
01086   }
01087   
01088   if (LMDEBUG){
01089     cout << "SLM feat: " << bestFeat << " coeff: " 
01090          << bestCoeff << " constant: " 
01091          << bestConstant << " avgE: " << (bestError/nObs) << endl;
01092   }
01093 
01094   if (bestFeat < 0 || bestFeat > (int)instances[0]->input.size()){
01095     node->constant = 0.0;
01096     for (unsigned i = 0; i < node->coefficients.size(); i++){
01097       node->coefficients[i] = 0.0;
01098     }
01099     (*resSum) = 1000000;
01100     return 10;
01101   }
01102 
01103   // pick best feature somehow
01104   node->constant = bestConstant;
01105   for (unsigned i = 0; i < node->coefficients.size(); i++){
01106     if (i == (unsigned)bestFeat){
01107       node->coefficients[i] = bestCoeff;
01108       if (LMDEBUG) cout << "Set coefficient on feat " << i << " to " << bestCoeff << endl;
01109     }
01110     else {
01111       node->coefficients[i] = 0.0;
01112     }
01113   }
01114 
01115   (*resSum) = bestError;
01116 
01117   // only use 1 input feature this way
01118   return 1;
01119  
01120 }
01121 
01122 
01123 
01124 
01125 void M5Tree::implementSplit(tree_node* node, 
01126                             const std::vector<tree_experience*> &instances,
01127                             float bestSDR, int bestDim,
01128                             float bestVal, 
01129                             const std::vector<tree_experience*> &bestLeft,
01130                             const std::vector<tree_experience*> &bestRight,
01131                             bool changed){
01132   if (DTDEBUG) cout << "implementSplit node=" << node->id
01133                     << ",sdr=" << bestSDR
01134                     << ",dim=" << bestDim
01135                     << ",val=" << bestVal 
01136                     << ",chg=" << changed << endl;
01137 
01138 
01139   // see if this should still be a leaf node
01140   if (bestSDR < MIN_SDR){
01141     makeLeaf(node);
01142     float valSum = 0;
01143     for (unsigned i = 0; i < instances.size(); i++){
01144       valSum += instances[i]->output;
01145     }
01146     float avg = valSum / (float)(instances.size());
01147     node->constant = avg;
01148     if (SPLITDEBUG || STOCH_DEBUG){
01149       cout << "DT " << id << " Node " << node->id << " Poor sdr "
01150            << node->nInstances
01151            << " instances classified at leaf " << node->id
01152            << " with multiple outputs " << endl;
01153     }
01154     return;
01155   }
01156 
01157   // see if this split changed or not
01158   // assuming no changes above
01159   if (!changed && node->dim == bestDim && node->val == bestVal
01160        && !node->leaf && node->l != NULL && node->r != NULL){
01161     // same split as before.
01162     if (DTDEBUG || SPLITDEBUG) cout << "Same split as before" << endl;
01163 
01164     // see which leaf changed
01165     if (bestLeft.size() > (unsigned)node->l->nInstances){
01166       // redo left side
01167       if (DTDEBUG) cout << "Rebuild left side of tree" << endl;
01168       buildTree(node->l, bestLeft, changed);
01169     }
01170 
01171     if (bestRight.size() > (unsigned)node->r->nInstances){
01172       // redo right side
01173       if (DTDEBUG) cout << "Rebuild right side of tree" << endl;
01174       buildTree(node->r, bestRight, changed);
01175     }
01176     return;
01177   }
01178 
01179 
01180   // totally new
01181   // set the best split here
01182   node->leaf = false;
01183   node->dim = bestDim;
01184   node->val = bestVal;
01185 
01186   if (SPLITDEBUG) cout << "Best split was cut with val " << node->val
01187                        << " on dim " << node->dim
01188                        << " with sdr: " << bestSDR << endl;
01189 
01190   if (DTDEBUG) cout << "Left has " << bestLeft.size()
01191                     << ", right has " << bestRight.size() << endl;
01192 
01193   // make sure both instances
01194   if (bestLeft.size() == 0 || bestRight.size() == 0){
01195     cout << "ERROR: DT " << id << " node " << node->id << " has 0 instances: left: " << bestLeft.size()
01196          << " right: " << bestRight.size() << endl;
01197     cout << "Split was cut with val " << node->val
01198          << " on dim " << node->dim
01199          << " with sdr: " << bestSDR << endl;
01200     exit(-1);
01201   }
01202 
01203 
01204   // check if these already exist
01205   if (node->l == NULL){
01206     if (DTDEBUG) cout << "Init new left tree nodes " << endl;
01207     node->l = allocateNode();
01208   }
01209   if (node->r == NULL){
01210     if (DTDEBUG) cout << "Init new right tree nodes " << endl;
01211     node->r = allocateNode();
01212   }
01213 
01214   // recursively build the sub-trees to this one
01215   if (DTDEBUG) cout << "Building left tree for node " << node->id << endl;
01216   buildTree(node->l, bestLeft, true);
01217   if (DTDEBUG) cout << "Building right tree for node " << node->id << endl;
01218   buildTree(node->r, bestRight, true);
01219 
01220 }
01221 
01222 
01223 void M5Tree::getFeatsUsed(tree_node* node, std::vector<bool> *featsUsed){
01224 
01225   // if leaf, ones from linear model
01226   if (node->leaf){
01227     //cout << "coeff size: " << node->coefficients.size() << endl;
01228     for (unsigned i = 0; i < node->coefficients.size(); i++){
01229       if (node->coefficients[i] != 0){
01230         if (LMDEBUG || DTDEBUG) cout << "Leaf node, used coeff " << i << endl;
01231         (*featsUsed)[i] = true;
01232       }
01233     }
01234     return;
01235   }
01236 
01237   // otherwise see what split was used
01238   // and call for left and right sub-trees
01239   (*featsUsed)[node->dim] = true;
01240   if (LMDEBUG || DTDEBUG) cout << "Split node, used feat " << node->dim << endl;
01241 
01242   getFeatsUsed(node->l, featsUsed);
01243   getFeatsUsed(node->r, featsUsed);
01244 
01245   return;
01246 }
01247 
01248 
01249 void M5Tree::testPossibleSplits(const std::vector<tree_experience*> &instances,
01250                                 float *bestSDR, int *bestDim,
01251                                 float *bestVal, 
01252                                 std::vector<tree_experience*> *bestLeft,
01253                                 std::vector<tree_experience*> *bestRight) {
01254   if (DTDEBUG) cout << "testPossibleSplits" << endl;
01255 
01256 
01257   // calculate sd for the set
01258   float sd = calcSDforSet(instances);
01259   //if (DTDEBUG) cout << "I: " << I << endl;
01260 
01261   int nties = 0;
01262 
01263   // for each possible split, calc standard deviation reduction
01264   for (unsigned idim = 0; idim < instances[0]->input.size(); idim++){
01265 
01266     //float* sorted = sortOnDim(idim, instances);
01267     float minVal, maxVal;
01268     std::set<float> uniques = getUniques(idim, instances, minVal, maxVal);
01269 
01270     for (std::set<float>::iterator j = uniques.begin(); j != uniques.end(); j++){
01271 
01272       // skip max val, not a valid cut for either
01273       if ((*j) == maxVal)
01274         continue;
01275 
01276       // if this is a random forest, we eliminate some random number of splits
01277       // here (decision is taken from the random set that are left)
01278       if (rng.uniform() < featPct)
01279         continue;
01280 
01281       std::vector<tree_experience*> left;
01282       std::vector<tree_experience*> right;
01283 
01284       // splits that are cuts
01285       float splitval = (*j);
01286       float sdr = calcSDR(idim, splitval, instances, sd, left, right);
01287 
01288       if (SPLITDEBUG) cout << " CUT split val " << splitval
01289                            << " on dim: " << idim << " had sdr "
01290                            << sdr << endl;
01291 
01292       // see if this is the new best sdr
01293       compareSplits(sdr, idim, splitval, left, right, &nties,
01294                     bestSDR, bestDim, bestVal, bestLeft, bestRight);
01295 
01296 
01297     } // j loop
01298   }
01299 }
01300 
01301 
01302 
01303 void M5Tree::compareSplits(float sdr, int dim, float val, 
01304                            const std::vector<tree_experience*> &left, 
01305                            const std::vector<tree_experience*> &right,
01306                            int *nties, float *bestSDR, int *bestDim,
01307                            float *bestVal, 
01308                            std::vector<tree_experience*> *bestLeft,
01309                            std::vector<tree_experience*> *bestRight){
01310   if (DTDEBUG) cout << "compareSplits sdr=" << sdr << ",dim=" << dim
01311                     << ",val=" << val  <<endl;
01312 
01313 
01314   bool newBest = false;
01315 
01316   // if its a virtual tie, break it randomly
01317   if (fabs(*bestSDR - sdr) < SPLIT_MARGIN){
01318     //cout << "Split tie, making random decision" << endl;
01319 
01320     (*nties)++;
01321     float randomval = rng.uniform(0,1);
01322     float newsplitprob = (1.0 / (float)*nties);
01323 
01324     if (randomval < newsplitprob){
01325       newBest = true;
01326       if (SPLITDEBUG) cout << "   Tie on split. DT: " << id << " rand: " << randomval
01327                            << " splitProb: " << newsplitprob << ", selecting new split " << endl;
01328     }
01329     else
01330       if (SPLITDEBUG) cout << "   Tie on split. DT: " << id << " rand: " << randomval
01331                            << " splitProb: " << newsplitprob << ", staying with old split " << endl;
01332   }
01333 
01334   // if its clearly better, set this as the best split
01335   else if (sdr > *bestSDR){
01336     newBest = true;
01337     *nties = 1;
01338   }
01339 
01340 
01341   // set the split features
01342   if (newBest){
01343     *bestSDR = sdr;
01344     *bestDim = dim;
01345     *bestVal = val;
01346     *bestLeft = left;
01347     *bestRight = right;
01348     if (SPLITDEBUG){
01349       cout << "  New best sdr: " << *bestSDR
01350            << " with val " << *bestVal
01351            << " on dim " << *bestDim << endl;
01352     }
01353   } // newbest
01354 }
01355 
01356 float M5Tree::calcSDR(int dim, float val, 
01357                       const std::vector<tree_experience*> &instances,
01358                       float sd,
01359                       std::vector<tree_experience*> &left,
01360                       std::vector<tree_experience*> &right){
01361   if (DTDEBUG) cout << "calcSDR, dim=" << dim
01362                     << " val=" << val
01363                     << " sd=" << sd
01364                     << " nInstances= " << instances.size() << endl;
01365 
01366   left.clear();
01367   right.clear();
01368 
01369   // split into two sides
01370   for (unsigned i = 0; i < instances.size(); i++){
01371     if (DTDEBUG) cout << "calcSDR - Classify instance " << i 
01372                       << " on new split " << endl;
01373 
01374     if (passTest(dim, val, instances[i]->input)){
01375       left.push_back(instances[i]);
01376     }
01377     else{
01378       right.push_back(instances[i]);
01379     }
01380   }
01381 
01382   if (DTDEBUG) cout << "Left has " << left.size()
01383                     << ", right has " << right.size() << endl;
01384 
01385   // get sd for both sides
01386   float sdLeft = calcSDforSet(left);
01387   float sdRight = calcSDforSet(right);
01388 
01389   float leftRatio = (float)left.size() / (float)instances.size();
01390   float rightRatio = (float)right.size() / (float)instances.size();
01391 
01392   float sdr = sd - (leftRatio * sdLeft + rightRatio * sdRight);
01393 
01394   if (DTDEBUG){
01395     cout << "LeftSD: " << sdLeft
01396          << " RightSD: " << sdRight
01397          << " SD: " << sd
01398          << " SDR: " << sdr
01399          << endl;
01400   }
01401 
01402   return sdr;
01403 
01404 }
01405 
01406 float M5Tree::calcSDforSet(const std::vector<tree_experience*> &instances){
01407   if (DTDEBUG) cout << "calcSDforSet" << endl;
01408 
01409   int n = instances.size();
01410 
01411   if (n == 0)
01412     return 0;
01413 
01414   double sum = 0;
01415   double sumSqr = 0;
01416 
01417   // go through instances and calculate sums, sum of squares
01418   for (unsigned i = 0; i < instances.size(); i++){
01419     float val = instances[i]->output;
01420     sum += val;
01421     sumSqr += (val * val);
01422   }
01423 
01424   double mean = sum / (double)n;
01425   double variance = (sumSqr - sum*mean)/(double)n;
01426   float sd = sqrt(variance);
01427 
01428   return sd;
01429 
01430 }
01431 
01432 std::set<float> M5Tree::getUniques(int dim, const std::vector<tree_experience*> &instances, float& minVal, float& maxVal){
01433   if (DTDEBUG) cout << "getUniques,dim = " << dim;
01434 
01435   std::set<float> uniques;
01436 
01437   for (int i = 0; i < (int)instances.size(); i++){
01438     if (i == 0 || instances[i]->input[dim] < minVal)
01439       minVal = instances[i]->input[dim];
01440     if (i == 0 || instances[i]->input[dim] > maxVal)
01441       maxVal = instances[i]->input[dim];
01442 
01443     uniques.insert(instances[i]->input[dim]);
01444   }
01445 
01446   // lets not try more than 100 possible splits per dimension
01447   if (uniques.size() > 100){
01448     float rangeInc = (maxVal - minVal) / 100.0;
01449     uniques.clear();
01450     for (float i = minVal; i < maxVal; i+= rangeInc){
01451       uniques.insert(i);
01452     }
01453   }
01454   uniques.insert(maxVal);
01455 
01456   if (DTDEBUG) cout << " #: " << uniques.size() << endl;
01457   return uniques;
01458 }
01459 
01460 
01461 float* M5Tree::sortOnDim(int dim, const std::vector<tree_experience*> &instances){
01462   if (DTDEBUG) cout << "sortOnDim,dim = " << dim << endl;
01463 
01464   float* values = new float[instances.size()];
01465 
01466   for (int i = 0; i < (int)instances.size(); i++){
01467     //cout << "Instance " << i << endl;
01468 
01469     float val = instances[i]->input[dim];
01470     //cout << " val: " << val << endl;
01471 
01472     // find where this should go
01473     for (int j = 0; j <= i; j++){
01474       //cout << " j: " << j << endl;
01475 
01476       // get to i, this is the spot then
01477       if (j==i){
01478         values[j] = val;
01479         //cout << "  At i, putting value in slot j: " << j << endl;
01480       }
01481 
01482       // if this is the spot
01483       else if (val < values[j]){
01484         //cout << "  Found slot at j: " << j << endl;
01485 
01486         // slide everything forward to make room
01487         for (int k = i; k > j; k--){
01488           //cout << "   k = " << k << " Sliding value from k-1 to k" << endl;
01489           values[k] = values[k-1];
01490         }
01491 
01492         // put value in its spot at j
01493         //cout << "  Putting value at slot j: " << j << endl;
01494         values[j] = val;
01495 
01496         // break
01497         break;
01498       }
01499 
01500     }
01501   }
01502 
01503   if (DTDEBUG){
01504     cout << "Sorted array: " << values[0];
01505     for (int i = 1; i < (int)instances.size(); i++){
01506       cout << ", " << values[i];
01507     }
01508     cout << endl;
01509   }
01510 
01511   return values;
01512 
01513 }
01514 
01515 
01516 void M5Tree::printTree(tree_node *t, int level){
01517 
01518   for (int i = 0; i < level; i++){
01519     cout << ".";
01520   }
01521 
01522   cout << "Node " << t->id  << " nInstances: " << t->nInstances ;
01523 
01524   // Leaf, print regression stuff
01525   if (t->leaf){
01526     cout << " Constant: " << t->constant << " coeff: ";
01527     for (unsigned j = 0; j < t->coefficients.size(); j++){
01528       if (!SIMPLE)
01529         cout << t->coefficients[j] << ", ";
01530       if (SIMPLE && t->coefficients[j] != 0)
01531         cout << " on feat " << j << ": " << t->coefficients[j];
01532     }
01533     cout << endl;
01534     return;
01535   }
01536 
01537   // otherwise, print split info, next nodes
01538 
01539   cout << " Type: CUT";
01540   cout << " Dim: " << t->dim << " Val: " << t->val;
01541   cout << " Left: " << t->l->id << " Right: " << t->r->id << endl;
01542 
01543   // print children
01544   if (t->dim != -1 && !t->leaf){
01545     printTree(t->l, level+1);
01546     printTree(t->r, level+1);
01547   }
01548 
01549 }
01550 
01551 
01552 // output a map of outcomes and their probabilities for this leaf node
01553 void M5Tree::leafPrediction(tree_node* leaf, const std::vector<float> &input, std::map<float, float>* retval){
01554   if (DTDEBUG)
01555     cout << "Calculating output for leaf " << leaf->id << endl;
01556 
01557   float prediction = leaf->constant;
01558   if (DTDEBUG) cout << "leaf constant: " << leaf->constant << endl;
01559 
01560   // plus each coefficient
01561   for (unsigned i = 0; i < input.size(); i++){
01562     prediction += leaf->coefficients[i] * input[i];
01563     if (DTDEBUG) {
01564       cout << "feat " << i << " coeff: " << leaf->coefficients[i]
01565            << " on input " << input[i] << endl;
01566     }
01567   }
01568 
01569   if (DTDEBUG) cout << " prediction: " << prediction << endl;
01570 
01571   (*retval)[prediction] = 1.0;
01572 }
01573 
01574 
01575 void M5Tree::initNodes(){
01576 
01577   for (int i = 0; i < N_M5_NODES; i++){
01578     initTreeNode(&(allNodes[i]));
01579     freeNodes.push_back(i);
01580     if (NODEDEBUG) 
01581       cout << "init node " << i << " with id " << allNodes[i].id 
01582            << ", now " << freeNodes.size() << " free nodes." << endl;
01583   }
01584 
01585 }
01586 
01587 M5Tree::tree_node* M5Tree::allocateNode(){
01588   if (freeNodes.empty()){
01589     tree_node* newNode = new tree_node;
01590     initTreeNode(newNode);
01591     if (NODEDEBUG) 
01592       cout << "PROBLEM: No more pre-allocated nodes!!!" << endl
01593            << "return new node " << newNode->id 
01594            << ", now " << freeNodes.size() << " free nodes." << endl;
01595     return newNode;
01596   }
01597 
01598   int i = freeNodes.back();
01599   freeNodes.pop_back();
01600   if (NODEDEBUG) 
01601     cout << "allocate node " << i << " with id " << allNodes[i].id 
01602          << ", now " << freeNodes.size() << " free nodes." << endl;
01603   return &(allNodes[i]);
01604 }
01605 
01606 void M5Tree::deallocateNode(tree_node* node){
01607   if (node->id >= N_M5_NODES){
01608     if (NODEDEBUG) 
01609       cout << "dealloc extra node id " << node->id 
01610            << ", now " << freeNodes.size() << " free nodes." << endl;
01611     delete node;
01612     return;
01613   }
01614 
01615   freeNodes.push_back(node->id);
01616   if (NODEDEBUG) 
01617     cout << "dealloc node " << node->id 
01618          << ", now " << freeNodes.size() << " free nodes." << endl;
01619 }


rl_agent
Author(s): Todd Hester
autogenerated on Thu Jun 6 2019 22:00:13