00001
00008 #include "C45Tree.hh"
00009
00010
00011
00012 C45Tree::C45Tree(int id, int trainMode, int trainFreq, int m,
00013 float featPct, Random rng):
00014 id(id), mode(trainMode), freq(trainFreq), M(m),
00015 featPct(featPct), ALLOW_ONLY_SPLITS(true), rng(rng)
00016 {
00017
00018 nnodes = 0;
00019 nOutput = 0;
00020 nExperiences = 0;
00021 hadError = false;
00022 maxnodes = N_C45_NODES;
00023 totalnodes = 0;
00024
00025
00026 SPLIT_MARGIN = 0.0;
00027
00028 MIN_GAIN_RATIO = 0.0001;
00029
00030 DTDEBUG = false;
00031 SPLITDEBUG = false;
00032 STOCH_DEBUG = false;
00033 INCDEBUG = false;
00034 NODEDEBUG = false;
00035 COPYDEBUG = false;
00036
00037 cout << "Created C4.5 decision tree " << id;
00038 if (ALLOW_ONLY_SPLITS) cout << " with == and > splits" << endl;
00039 else cout << " with > splits only" << endl;
00040 if (DTDEBUG) {
00041 cout << " mode: " << mode << " freq: " << freq << endl;
00042 }
00043
00044 initNodes();
00045 initTree();
00046
00047 }
00048
00049 C45Tree::C45Tree(const C45Tree &t):
00050 id(t.id), mode(t.mode), freq(t.freq), M(t.M),
00051 featPct(t.featPct), ALLOW_ONLY_SPLITS(t.ALLOW_ONLY_SPLITS), rng(t.rng)
00052 {
00053 COPYDEBUG = t.COPYDEBUG;
00054 if (COPYDEBUG) cout << " C4.5 tree copy constructor id " << id << endl;
00055 nnodes = 0;
00056 nOutput = t.nOutput;
00057 nExperiences = t.nExperiences;
00058 hadError = t.hadError;
00059 maxnodes = t.maxnodes;
00060 totalnodes = 0;
00061
00062 SPLIT_MARGIN = t.SPLIT_MARGIN;
00063 MIN_GAIN_RATIO = t.MIN_GAIN_RATIO;
00064 DTDEBUG = t.DTDEBUG;
00065 SPLITDEBUG = t.SPLITDEBUG;
00066 STOCH_DEBUG = t.STOCH_DEBUG;
00067 INCDEBUG = t.INCDEBUG;
00068 NODEDEBUG = t.NODEDEBUG;
00069
00070
00071 if (COPYDEBUG) cout << " C4.5 copy nodes, experiences, root, etc" << endl;
00072
00073 for (int i = 0; i < N_C45_EXP; i++){
00074 allExp[i] = t.allExp[i];
00075 }
00076 if (COPYDEBUG) cout << " C4.5 copied exp array" << endl;
00077
00078
00079 experiences.resize(t.experiences.size());
00080 for (unsigned i = 0; i < t.experiences.size(); i++){
00081 experiences[i] = &(allExp[i]);
00082 }
00083 if (COPYDEBUG) cout << " C4.5 set experience pointers" << endl;
00084
00085
00086 initNodes();
00087
00088 if (COPYDEBUG) cout << " C4.5 copy tree " << endl;
00089 root = allocateNode();
00090 lastNode = root;
00091 copyTree(root, t.root);
00092 if (COPYDEBUG) cout << " C4.5 tree copy done" << endl;
00093
00094 if (COPYDEBUG) {
00095 cout << endl << "New tree: " << endl;
00096 printTree(root, 0);
00097 cout << endl;
00098 cout << " c4.5 copy done" << endl;
00099 }
00100
00101 }
00102
00103 C45Tree* C45Tree::getCopy(){
00104
00105 C45Tree* copy = new C45Tree(*this);
00106 return copy;
00107
00108 }
00109
00110 void C45Tree::copyTree(tree_node* newNode, tree_node* origNode){
00111
00112 int nodeId = newNode->id;
00113
00114 if (COPYDEBUG) {
00115 cout << " Copy node " << newNode->id << " from node " << origNode->id << endl;
00116 cout << " NewAddy " << newNode << ", old: " << origNode << endl;
00117 }
00118
00119
00120 *newNode = *origNode;
00121 newNode->id = nodeId;
00122
00123
00124 if (origNode->l != NULL && !newNode->leaf){
00125 newNode->l = allocateNode();
00126 if (COPYDEBUG) cout << " Copy left node " << newNode->l->id << " from " << origNode->l->id << endl;
00127 copyTree(newNode->l, origNode->l);
00128 } else {
00129 newNode->l = NULL;
00130 }
00131
00132 if (origNode->r != NULL && !newNode->leaf){
00133 newNode->r = allocateNode();
00134 if (COPYDEBUG) cout << " Copy right node " << newNode->r->id << " from " << origNode->r->id << endl;
00135 copyTree(newNode->r, origNode->r);
00136 } else {
00137 newNode->r = NULL;
00138 }
00139 }
00140
00141
00142 C45Tree::~C45Tree() {
00143 deleteTree(root);
00144 for (unsigned i = N_C45_EXP; i < experiences.size(); i++){
00145 delete experiences[i];
00146 }
00147 experiences.clear();
00148 }
00149
00150
00151 bool C45Tree::trainInstance(classPair &instance){
00152
00153 if (DTDEBUG) cout << "trainInstance" << endl;
00154
00155 bool modelChanged = false;
00156
00157
00158
00159
00160 tree_experience *e;
00161 if (nExperiences < N_C45_EXP){
00162
00163 e = &(allExp[nExperiences]);
00164
00165 } else {
00166
00167 e = new tree_experience;
00168 }
00169
00170
00171 e->input = instance.in;
00172 e->output = instance.out;
00173 experiences.push_back(e);
00174 nExperiences++;
00175
00176 if (nExperiences == 1000000){
00177 cout << "Reached limit of # experiences allowed." << endl;
00178 return false;
00179 }
00180
00181 if (nExperiences != (int)experiences.size())
00182 cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00183
00184
00185
00186
00187
00188
00189
00190
00191 if (DTDEBUG) {
00192 cout << "Original input: ";
00193 for (unsigned i = 0; i < instance.in.size(); i++){
00194 cout << instance.in[i] << ", ";
00195 }
00196 cout << endl << " Original output: " << instance.out << endl;
00197 cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00198 cout << "Address: " << e << " Input : ";
00199 for (unsigned i = 0; i < e->input.size(); i++){
00200 cout << e->input[i] << ", ";
00201 }
00202 cout << endl << " Now have " << nExperiences << " experiences." << endl;
00203 }
00204
00205
00206
00207
00208 if (mode == BUILD_EVERY || nExperiences <= 1){
00209 modelChanged = rebuildTree();
00210 }
00211
00212
00213 else if (mode == BUILD_ON_ERROR){
00214
00215
00216
00217
00218 tree_node* leaf = traverseTree(root, e->input);
00219
00220 float count = (float)leaf->outputs[e->output];
00221 float outputProb = count / (float)leaf->nInstances;
00222
00223 if (outputProb < 0.75){
00224 modelChanged = rebuildTree();
00225 modelChanged = true;
00226 }
00227 }
00228
00229
00230 else if (mode == BUILD_EVERY_N){
00231
00232 if (!modelChanged && (nExperiences % freq) == 0){
00233 modelChanged = rebuildTree();
00234 }
00235 }
00236
00237 if (modelChanged){
00238 if (DTDEBUG) cout << "DT " << id << " tree re-built." << endl;
00239
00240 if (DTDEBUG){
00241 cout << endl << "DT: " << id << endl;
00242 printTree(root, 0);
00243 cout << "Done printing tree" << endl;
00244 }
00245 }
00246
00247
00248
00249
00250
00251
00252
00253
00254
00255 return modelChanged;
00256
00257 }
00258
00259
00260
00261 bool C45Tree::trainInstances(std::vector<classPair> &instances){
00262 if (DTDEBUG) cout << "DT " << id << " trainInstances: " << instances.size() << endl;
00263
00264 bool modelChanged = false;
00265
00266 bool doBuild = false;
00267
00268 bool buildOnError = false;
00269
00270
00271 for (unsigned a = 0; a < instances.size(); a++){
00272 classPair instance = instances[a];
00273
00274
00275
00276
00277 tree_experience *e;
00278 if (nExperiences < N_C45_EXP){
00279
00280 e = &(allExp[nExperiences]);
00281
00282 } else {
00283
00284 e = new tree_experience;
00285 }
00286
00287
00288 e->input = instance.in;
00289 e->output = instance.out;
00290 experiences.push_back(e);
00291 nExperiences++;
00292
00293 if (nExperiences == 1000000){
00294 cout << "Reached limit of # experiences allowed." << endl;
00295 return false;
00296 }
00297
00298 if (nExperiences != (int)experiences.size())
00299 cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00300
00301 if (DTDEBUG) {
00302 cout << "Original input: ";
00303 for (unsigned i = 0; i < instance.in.size(); i++){
00304 cout << instance.in[i] << ", ";
00305 }
00306 cout << endl << " Original output: " << instance.out << endl;
00307 cout << "Added exp id: " << nExperiences << " output: " << e->output << endl;
00308 cout << "Address: " << e << " Input : ";
00309 for (unsigned i = 0; i < e->input.size(); i++){
00310 cout << e->input[i] << ", ";
00311 }
00312 cout << endl << " Now have " << nExperiences << " experiences." << endl;
00313 }
00314
00315
00316
00317
00318 if (doBuild) continue;
00319
00320
00321 if (mode == BUILD_EVERY || nExperiences <= 1){
00322 doBuild = true;
00323 }
00324
00325
00326 else if (mode == BUILD_ON_ERROR){
00327
00328
00329
00330
00331 tree_node* leaf = traverseTree(root, e->input);
00332
00333 float count = (float)leaf->outputs[e->output];
00334 float outputProb = count / (float)leaf->nInstances;
00335
00336 if (outputProb < 0.75){
00337 doBuild = true;
00338 buildOnError = true;
00339 }
00340 }
00341
00342
00343 else if (mode == BUILD_EVERY_N){
00344
00345 if (!modelChanged && (nExperiences % freq) == 0){
00346 doBuild = true;
00347 }
00348 }
00349
00350 }
00351
00352 if (DTDEBUG) cout << "Added " << instances.size() << " new instances. doBuild = " << doBuild << endl;
00353
00354 if (doBuild){
00355 modelChanged = rebuildTree();
00356 }
00357
00358 if (modelChanged){
00359 if (DTDEBUG) cout << "DT " << id << " tree re-built." << endl;
00360
00361 if (DTDEBUG){
00362 cout << endl << "DT: " << id << endl;
00363 printTree(root, 0);
00364 cout << "Done printing tree" << endl;
00365 }
00366 }
00367
00368 return (modelChanged || buildOnError);
00369
00370 }
00371
00372
00373 bool C45Tree::rebuildTree(){
00374 return buildTree(root, experiences, false);
00375 }
00376
00377
00378
00379 void C45Tree::testInstance(const std::vector<float> &input, std::map<float, float>* retval){
00380 if (DTDEBUG) cout << "testInstance" << endl;
00381
00382 retval->clear();
00383
00384
00385 if (experiences.size() == 0){
00386 (*retval)[0.0] = 1.0;
00387 return;
00388 }
00389
00390
00391 tree_node* leaf = traverseTree(root, input);
00392 lastNode = leaf;
00393
00394
00395 outputProbabilities(leaf, retval);
00396
00397 }
00398
00399 float C45Tree::getConf(const std::vector<float> &input){
00400 if (DTDEBUG) cout << "numVisits" << endl;
00401
00402
00403 if (experiences.size() == 0){
00404 return 0;
00405 }
00406
00407
00408
00409
00410
00411 float conf = (float)lastNode->nInstances / (float)(2.0*M);
00412 if (conf > 1.0)
00413 return 1.0;
00414 else
00415 return conf;
00416
00417 }
00418
00419
00420
00421
00422
00423
00424 void C45Tree::initTree(){
00425 if (DTDEBUG) cout << "initTree()" << endl;
00426 root = allocateNode();
00427
00428 if (DTDEBUG) cout << " root id = " << root->id << endl;
00429
00430
00431 for (int i = 0; i < id; i++){
00432 rng.uniform(0, 1);
00433 }
00434
00435 }
00436
00437
00438
00439
00440 void C45Tree::initTreeNode(tree_node* node){
00441 if (DTDEBUG) cout << "initTreeNode()";
00442
00443 node->id = nnodes++;
00444 if (DTDEBUG) cout << " id = " << node->id << endl;
00445
00446 totalnodes++;
00447 if (totalnodes > maxnodes){
00448 maxnodes = totalnodes;
00449 if (DTDEBUG) cout << id << " C4.5 MAX nodes: " << maxnodes << endl;
00450 }
00451
00452
00453 node->dim = -1;
00454 node->val = -1;
00455 node->type = -1;
00456
00457
00458 node->nInstances = 0;
00459 node->outputs.clear();
00460
00461
00462 node->l = NULL;
00463 node->r = NULL;
00464
00465 node->leaf = true;
00466
00467 }
00468
00469 void C45Tree::deleteTree(tree_node* node){
00470 if (DTDEBUG) cout << "deleteTree, node=" << node->id << endl;
00471
00472 if (node==NULL)
00473 return;
00474
00475 totalnodes--;
00476
00477 node->nInstances = 0;
00478 node->outputs.clear();
00479
00480
00481
00482 if (!node->leaf){
00483
00484 if (node->l != NULL){
00485 deleteTree(node->l);
00486 deallocateNode(node->l);
00487 node->l = NULL;
00488 }
00489
00490
00491 if (node->r != NULL){
00492 deleteTree(node->r);
00493 deallocateNode(node->r);
00494 node->r = NULL;
00495 }
00496 }
00497
00498 node->leaf = true;
00499 node->dim = -1;
00500
00501 }
00502
00503
00504 C45Tree::tree_node* C45Tree::getCorrectChild(tree_node* node,
00505 const std::vector<float> &input){
00506
00507 if (DTDEBUG) cout << "getCorrectChild, node=" << node->id << endl;
00508
00509 if (passTest(node->dim, node->val, node->type, input))
00510 return node->l;
00511 else
00512 return node->r;
00513
00514 }
00515
00516 C45Tree::tree_node* C45Tree::traverseTree(tree_node* node,
00517 const std::vector<float> &input){
00518
00519 if (DTDEBUG) cout << "traverseTree, node=" << node->id << endl;
00520
00521 while (!node->leaf){
00522 node = getCorrectChild(node, input);
00523 }
00524
00525 return node;
00526 }
00527
00528
00529 bool C45Tree::passTest(int dim, float val, bool type, const std::vector<float> &input){
00530 if (DTDEBUG) cout << "passTest, dim=" << dim << ",val=" << val << ",type=" << type
00531 << ",input["<<dim<<"]=" << input[dim] <<endl;
00532
00533 if (type == CUT){
00534 if (input[dim] > val)
00535 return false;
00536 else
00537 return true;
00538 } else if (type == ONLY){
00539 if (input[dim] == val)
00540 return false;
00541 else
00542 return true;
00543 } else {
00544 return false;
00545 }
00546
00547 }
00548
00549
00550 bool C45Tree::buildTree(tree_node *node,
00551 const std::vector<tree_experience*> &instances,
00552 bool changed){
00553 if(DTDEBUG) cout << "buildTree, node=" << node->id
00554 << ",nInstances:" << instances.size()
00555 << ",chg:" << changed << endl;
00556
00557 if (instances.size() == 0){
00558 cout << "Error: buildTree called on tree " << id << " node " << node->id << " with no instances." << endl;
00559 exit(-1);
00560 }
00561
00562
00563
00564
00565
00566
00567 node->nInstances = instances.size();
00568
00569
00570 node->outputs.clear();
00571 for (unsigned i = 0; i < instances.size(); i++){
00572 node->outputs[instances[i]->output]++;
00573 }
00574
00575
00576 if (node->outputs.size() == 1){
00577 bool change = makeLeaf(node);
00578 if (DTDEBUG){
00579 cout << "All " << node->nInstances
00580 << " classified with output "
00581 << instances[0]->output << endl;
00582 }
00583 return change;
00584 }
00585
00586
00587 else {
00588
00589 if (SPLITDEBUG) cout << endl << "Creating new decision node" << endl;
00590
00591
00592
00593
00594 float bestGainRatio = -1.0;
00595 int bestDim = -1;
00596 float bestVal = -1;
00597 bool bestType = 0;
00598 std::vector<tree_experience*> bestLeft;
00599 std::vector<tree_experience*> bestRight;
00600
00601 testPossibleSplits(instances, &bestGainRatio, &bestDim, &bestVal, &bestType, &bestLeft, &bestRight);
00602
00603 return implementSplit(node, bestGainRatio, bestDim, bestVal, bestType, bestLeft, bestRight, changed);
00604
00605 }
00606
00607 }
00608
00609
00610 bool C45Tree::makeLeaf(tree_node* node){
00611
00612
00613 if (node->l != NULL){
00614 deleteTree(node->l);
00615 deallocateNode(node->l);
00616 node->l = NULL;
00617 }
00618
00619 if (node->r != NULL){
00620 deleteTree(node->r);
00621 deallocateNode(node->r);
00622 node->r = NULL;
00623 }
00624
00625
00626 bool change = (!node->leaf || node->type < 0);
00627
00628 node->leaf = true;
00629 node->type = 0;
00630
00631 return change;
00632 }
00633
00634 bool C45Tree::implementSplit(tree_node* node, float bestGainRatio, int bestDim,
00635 float bestVal, bool bestType,
00636 const std::vector<tree_experience*> &bestLeft,
00637 const std::vector<tree_experience*> &bestRight,
00638 bool changed){
00639 if (DTDEBUG) cout << "implementSplit node=" << node->id << ",gainRatio=" << bestGainRatio
00640 << ",dim=" << bestDim
00641 << ",val=" << bestVal << ",type=" << bestType
00642 << ",chg=" << changed << endl;
00643
00644
00645
00646 if (bestGainRatio < MIN_GAIN_RATIO){
00647 bool change = makeLeaf(node);
00648 if (SPLITDEBUG || STOCH_DEBUG){
00649 cout << "DT " << id << " Node " << node->id << " Poor gain ratio: "
00650 << bestGainRatio << ", " << node->nInstances
00651 << " instances classified at leaf " << node->id
00652 << " with multiple outputs " << endl;
00653 }
00654 return change;
00655 }
00656
00657
00658
00659 if (!changed && node->dim == bestDim && node->val == bestVal
00660 && node->type == bestType && !node->leaf
00661 && node->l != NULL && node->r != NULL){
00662
00663 if (DTDEBUG || SPLITDEBUG) cout << "Same split as before" << endl;
00664 bool changeL = false;
00665 bool changeR = false;
00666
00667
00668 if (bestLeft.size() > (unsigned)node->l->nInstances){
00669
00670 if (DTDEBUG) cout << "Rebuild left side of tree" << endl;
00671 changeL = buildTree(node->l, bestLeft, changed);
00672 }
00673
00674 if (bestRight.size() > (unsigned)node->r->nInstances){
00675
00676 if (DTDEBUG) cout << "Rebuild right side of tree" << endl;
00677 changeR = buildTree(node->r, bestRight, changed);
00678 }
00679
00680
00681 return (changeL || changeR);
00682 }
00683
00684
00685
00686 node->leaf = false;
00687 node->dim = bestDim;
00688 node->val = bestVal;
00689 node->type = bestType;
00690
00691 if (SPLITDEBUG) cout << "Best split was type " << node->type
00692 << " with val " << node->val
00693 << " on dim " << node->dim
00694 << " with gainratio: " << bestGainRatio << endl;
00695
00696 if (DTDEBUG) cout << "Left has " << bestLeft.size()
00697 << ", right has " << bestRight.size() << endl;
00698
00699
00700 if (bestLeft.size() == 0 || bestRight.size() == 0){
00701 cout << "ERROR: DT " << id << " node " << node->id << " has 0 instances: left: " << bestLeft.size()
00702 << " right: " << bestRight.size() << endl;
00703 cout << "Split was type " << node->type
00704 << " with val " << node->val
00705 << " on dim " << node->dim
00706 << " with gainratio: " << bestGainRatio << endl;
00707 exit(-1);
00708 }
00709
00710
00711
00712 if (node->l == NULL){
00713 if (DTDEBUG) cout << "Init new left tree nodes " << endl;
00714 node->l = allocateNode();
00715 }
00716 if (node->r == NULL){
00717 if (DTDEBUG) cout << "Init new right tree nodes " << endl;
00718 node->r = allocateNode();
00719 }
00720
00721
00722 if (DTDEBUG) cout << "Building left tree for node " << node->id << endl;
00723 buildTree(node->l, bestLeft, true);
00724 if (DTDEBUG) cout << "Building right tree for node " << node->id << endl;
00725 buildTree(node->r, bestRight, true);
00726
00727
00728 return true;
00729
00730 }
00731
00732
00733
00734 void C45Tree::testPossibleSplits(const std::vector<tree_experience*> &instances,
00735 float *bestGainRatio, int *bestDim,
00736 float *bestVal, bool *bestType,
00737 std::vector<tree_experience*> *bestLeft,
00738 std::vector<tree_experience*> *bestRight) {
00739 if (DTDEBUG) cout << "testPossibleSplits" << endl;
00740
00741
00742
00743 float I = calcIforSet(instances);
00744
00745
00746 int nties = 0;
00747
00748
00749 for (unsigned idim = 0; idim < instances[0]->input.size(); idim++){
00750
00751
00752 float minVal, maxVal;
00753 std::set<float> uniques = getUniques(idim, instances, minVal, maxVal);
00754
00755 for (std::set<float>::iterator j = uniques.begin(); j != uniques.end(); j++){
00756
00757
00758 if ((*j) == maxVal)
00759 continue;
00760
00761
00762
00763 if (rng.uniform() < featPct)
00764 continue;
00765
00766 std::vector<tree_experience*> left;
00767 std::vector<tree_experience*> right;
00768
00769
00770 float splitval = (*j);
00771 float gainRatio = calcGainRatio(idim, splitval, CUT, instances, I, left, right);
00772
00773 if (SPLITDEBUG) cout << " CUT split val " << splitval
00774 << " on dim: " << idim << " had gain ratio "
00775 << gainRatio << endl;
00776
00777
00778 compareSplits(gainRatio, idim, splitval, CUT, left, right, &nties,
00779 bestGainRatio, bestDim, bestVal, bestType, bestLeft, bestRight);
00780
00781
00782
00783 if (ALLOW_ONLY_SPLITS && (*j) != minVal){
00784
00785 float splitval = (*j);
00786
00787 float gainRatio = calcGainRatio(idim, splitval, ONLY, instances, I, left, right);
00788
00789 if (SPLITDEBUG) cout << " ONLY split val " << splitval
00790 << " on dim: " << idim << " had gain ratio "
00791 << gainRatio << endl;
00792
00793
00794 compareSplits(gainRatio, idim, splitval, ONLY, left, right, &nties,
00795 bestGainRatio, bestDim, bestVal, bestType, bestLeft, bestRight);
00796
00797 }
00798
00799 }
00800 }
00801 }
00802
00803
00804
00805 void C45Tree::compareSplits(float gainRatio, int dim, float val, bool type,
00806 const std::vector<tree_experience*> &left,
00807 const std::vector<tree_experience*> &right,
00808 int *nties, float *bestGainRatio, int *bestDim,
00809 float *bestVal, bool *bestType,
00810 std::vector<tree_experience*> *bestLeft, std::vector<tree_experience*> *bestRight){
00811 if (DTDEBUG) cout << "compareSplits gainRatio=" << gainRatio << ",dim=" << dim
00812 << ",val=" << val << ",type= " << type <<endl;
00813
00814
00815 bool newBest = false;
00816
00817
00818 if (fabs(*bestGainRatio - gainRatio) < SPLIT_MARGIN){
00819
00820
00821 (*nties)++;
00822 float randomval = rng.uniform(0,1);
00823 float newsplitprob = (1.0 / (float)*nties);
00824
00825 if (randomval < newsplitprob){
00826 newBest = true;
00827 if (SPLITDEBUG) cout << " Tie on split. DT: " << id << " rand: " << randomval
00828 << " splitProb: " << newsplitprob << ", selecting new split " << endl;
00829 }
00830 else
00831 if (SPLITDEBUG) cout << " Tie on split. DT: " << id << " rand: " << randomval
00832 << " splitProb: " << newsplitprob << ", staying with old split " << endl;
00833 }
00834
00835
00836 else if (gainRatio > *bestGainRatio){
00837 newBest = true;
00838 *nties = 1;
00839 }
00840
00841
00842
00843 if (newBest){
00844 *bestGainRatio = gainRatio;
00845 *bestDim = dim;
00846 *bestVal = val;
00847 *bestType = type;
00848 *bestLeft = left;
00849 *bestRight = right;
00850 if (SPLITDEBUG){
00851 cout << " New best gain ratio: " << *bestGainRatio
00852 << ": type " << *bestType
00853 << " with val " << *bestVal
00854 << " on dim " << *bestDim << endl;
00855 }
00856 }
00857 }
00858
00859 float C45Tree::calcGainRatio(int dim, float val, bool type,
00860 const std::vector<tree_experience*> &instances,
00861 float I,
00862 std::vector<tree_experience*> &left,
00863 std::vector<tree_experience*> &right){
00864 if (DTDEBUG) cout << "calcGainRatio, dim=" << dim
00865 << " val=" << val
00866 << " I=" << I
00867 << " nInstances= " << instances.size() << endl;
00868
00869 left.clear();
00870 right.clear();
00871
00872
00873 float D[2];
00874
00875
00876
00877
00878 float Info;
00879
00880
00881 float Gain;
00882
00883
00884 float SplitInfo;
00885
00886
00887 float GainRatio;
00888
00889
00890 for (unsigned i = 0; i < instances.size(); i++){
00891 if (DTDEBUG) cout << "calcGainRatio - Classify instance " << i
00892 << " on new split " << endl;
00893
00894 if (passTest(dim, val, type, instances[i]->input)){
00895 left.push_back(instances[i]);
00896 }
00897 else{
00898 right.push_back(instances[i]);
00899 }
00900 }
00901
00902 if (DTDEBUG) cout << "Left has " << left.size()
00903 << ", right has " << right.size() << endl;
00904
00905 D[0] = (float)left.size() / (float)instances.size();
00906 D[1] = (float)right.size() / (float)instances.size();
00907 float leftInfo = calcIforSet(left);
00908 float rightInfo = calcIforSet(right);
00909 Info = D[0] * leftInfo + D[1] * rightInfo;
00910 Gain = I - Info;
00911 SplitInfo = calcIofP((float*)&D, 2);
00912 GainRatio = Gain / SplitInfo;
00913
00914 if (DTDEBUG){
00915 cout << "LeftInfo: " << leftInfo
00916 << " RightInfo: " << rightInfo
00917 << " Info: " << Info
00918 << " Gain: " << Gain
00919 << " SplitInfo: " << SplitInfo
00920 << " GainRatio: " << GainRatio
00921 << endl;
00922 }
00923
00924 return GainRatio;
00925
00926 }
00927
00928 float C45Tree::calcIofP(float* P, int size){
00929 if (DTDEBUG) cout << "calcIofP, size=" << size << endl;
00930 float I = 0;
00931 for (int i = 0; i < size; i++){
00932 I -= P[i] * log(P[i]);
00933 }
00934 return I;
00935 }
00936
00937 float C45Tree::calcIforSet(const std::vector<tree_experience*> &instances){
00938 if (DTDEBUG) cout << "calcIforSet" << endl;
00939
00940 std::map<float, int> classes;
00941
00942
00943 for (unsigned i = 0; i < instances.size(); i++){
00944
00945 float val = instances[i]->output;
00946 classes[val]++;
00947 }
00948
00949
00950 float Pval;
00951 float I = 0;
00952 for (std::map<float, int>::iterator i = classes.begin(); i != classes.end(); i++){
00953 Pval = (float)(*i).second / (float)instances.size();
00954
00955 I -= Pval * log(Pval);
00956 }
00957
00958 return I;
00959
00960 }
00961
00962 std::set<float> C45Tree::getUniques(int dim, const std::vector<tree_experience*> &instances, float& minVal, float& maxVal){
00963 if (DTDEBUG) cout << "getUniques,dim = " << dim;
00964
00965 std::set<float> uniques;
00966
00967 for (int i = 0; i < (int)instances.size(); i++){
00968 if (i == 0 || instances[i]->input[dim] < minVal)
00969 minVal = instances[i]->input[dim];
00970 if (i == 0 || instances[i]->input[dim] > maxVal)
00971 maxVal = instances[i]->input[dim];
00972
00973 uniques.insert(instances[i]->input[dim]);
00974 }
00975
00976 if (DTDEBUG) cout << " #: " << uniques.size() << endl;
00977 return uniques;
00978 }
00979
00980
00981 float* C45Tree::sortOnDim(int dim, const std::vector<tree_experience*> &instances){
00982 if (DTDEBUG) cout << "sortOnDim,dim = " << dim << endl;
00983
00984 float* values = new float[instances.size()];
00985
00986 for (int i = 0; i < (int)instances.size(); i++){
00987
00988
00989 float val = instances[i]->input[dim];
00990
00991
00992
00993 for (int j = 0; j <= i; j++){
00994
00995
00996
00997 if (j==i){
00998 values[j] = val;
00999
01000 }
01001
01002
01003 else if (val < values[j]){
01004
01005
01006
01007 for (int k = i; k > j; k--){
01008
01009 values[k] = values[k-1];
01010 }
01011
01012
01013
01014 values[j] = val;
01015
01016
01017 break;
01018 }
01019
01020 }
01021 }
01022
01023 if (DTDEBUG){
01024 cout << "Sorted array: " << values[0];
01025 for (int i = 1; i < (int)instances.size(); i++){
01026 cout << ", " << values[i];
01027 }
01028 cout << endl;
01029 }
01030
01031 return values;
01032
01033 }
01034
01035
01036 void C45Tree::printTree(tree_node *t, int level){
01037
01038 for (int i = 0; i < level; i++){
01039 cout << ".";
01040 }
01041
01042 cout << "Node " << t->id;
01043 if (t->type == C45Tree::CUT) cout << " Type: CUT";
01044 else cout << " Type: ONLY";
01045 cout << " Dim: " << t->dim << " Val: " << t->val
01046 << " nInstances: " << t->nInstances ;
01047
01048 if (t->leaf){
01049 cout << " Outputs: ";
01050 for (std::map<float, int>::iterator j = t->outputs.begin();
01051 j != t->outputs.end(); j++){
01052 cout << (*j).first << ": " << (*j).second << ", ";
01053 }
01054 cout << endl;
01055 }
01056 else
01057 cout << " Left: " << t->l->id << " Right: " << t->r->id << endl;
01058
01059
01060
01061 if (t->dim != -1 && !t->leaf){
01062 printTree(t->l, level+1);
01063 printTree(t->r, level+1);
01064 }
01065
01066 }
01067
01068
01069
01070
01071
01072
01073 void C45Tree::outputProbabilities(tree_node* leaf, std::map<float, float>* retval){
01074 if (STOCH_DEBUG) cout << "Calculating output probs for leaf " << leaf->id << endl;
01075
01076
01077 for (std::map<float, int>::iterator it = leaf->outputs.begin();
01078 it != leaf->outputs.end(); it++){
01079
01080 float val = (*it).first;
01081 float count = (float)(*it).second;
01082 if (count > 0)
01083 (*retval)[val] = count / (float)leaf->nInstances;
01084
01085 if (STOCH_DEBUG)
01086 cout << "Output value " << val << " had count of " << count << " on "
01087 << leaf->nInstances <<" instances and prob of "
01088 << (*retval)[val] << endl;
01089 }
01090
01091 }
01092
01093
01094 void C45Tree::initNodes(){
01095
01096 for (int i = 0; i < N_C45_NODES; i++){
01097 initTreeNode(&(allNodes[i]));
01098 freeNodes.push_back(i);
01099 if (NODEDEBUG)
01100 cout << "init node " << i << " with id " << allNodes[i].id
01101 << ", now " << freeNodes.size() << " free nodes." << endl;
01102 }
01103
01104 }
01105
01106 C45Tree::tree_node* C45Tree::allocateNode(){
01107 if (freeNodes.empty()){
01108 tree_node* newNode = new tree_node;
01109 initTreeNode(newNode);
01110 if (NODEDEBUG)
01111 cout << id << " PROBLEM: No more pre-allocated nodes!!!" << endl
01112 << "return new node " << newNode->id
01113 << ", now " << freeNodes.size() << " free nodes." << endl;
01114 return newNode;
01115 }
01116
01117 int i = freeNodes.back();
01118 freeNodes.pop_back();
01119 if (NODEDEBUG)
01120 cout << id << " allocate node " << i << " with id " << allNodes[i].id
01121 << ", now " << freeNodes.size() << " free nodes." << endl;
01122 return &(allNodes[i]);
01123 }
01124
01125 void C45Tree::deallocateNode(tree_node* node){
01126
01127 if (node->id >= N_C45_NODES){
01128 if (NODEDEBUG)
01129 cout << id << " dealloc extra node id " << node->id
01130 << ", now " << freeNodes.size() << " free nodes." << endl;
01131 delete node;
01132 return;
01133 }
01134
01135 freeNodes.push_back(node->id);
01136 if (NODEDEBUG)
01137 cout << id << " dealloc node " << node->id
01138 << ", now " << freeNodes.size() << " free nodes." << endl;
01139 }
01140