00001 #include "Stump.hh"
00002
00003
00004
00005 Stump::Stump(int id, int trainMode, int trainFreq, int m, float featPct, Random rng):
00006 id(id), mode(trainMode), freq(trainFreq), M(m), featPct(featPct), rng(rng)
00007 {
00008
00009 nOutput = 0;
00010 nExperiences = 0;
00011
00012
00013 SPLIT_MARGIN = 0.02;
00014
00015 LOSS_MARGIN = 0.0;
00016 REBUILD_RATIO = 0.05;
00017
00018 MIN_GAIN_RATIO = 0.0001;
00019
00020 ALLOW_ONLY_SPLITS = true;
00021
00022 STDEBUG = false;
00023 SPLITDEBUG = false;
00024
00025 if (STDEBUG) {
00026 cout << "Created decision stump " << id << endl;
00027 cout << " mode: " << mode
00028 << " freq: " << freq << endl;
00029 }
00030
00031 initStump();
00032
00033 }
00034
00035 Stump::Stump(const Stump& s):
00036 id(s.id), mode(s.mode), freq(s.freq), M(s.M), featPct(s.featPct), rng(s.rng)
00037 {
00038 nOutput = s.nOutput;
00039 nExperiences = s.nExperiences;
00040 SPLIT_MARGIN = s.SPLIT_MARGIN;
00041 LOSS_MARGIN = s.LOSS_MARGIN;
00042 REBUILD_RATIO = s.REBUILD_RATIO;
00043 MIN_GAIN_RATIO = s.MIN_GAIN_RATIO;
00044 ALLOW_ONLY_SPLITS = s.ALLOW_ONLY_SPLITS;
00045 STDEBUG = s.STDEBUG;
00046 SPLITDEBUG = s.SPLITDEBUG;
00047
00048 for (int i = 0; i < N_STUMP_EXP; i++){
00049 allExp[i] = s.allExp[i];
00050 }
00051
00052
00053 experiences.resize(s.experiences.size());
00054 for (unsigned i = 0; i < s.experiences.size(); i++){
00055 experiences[i] = &(allExp[i]);
00056 }
00057
00058
00059 dim = s.dim;
00060 val = s.val;
00061 type = s.type;
00062 gainRatio = s.gainRatio;
00063 leftOutputs = s.leftOutputs;
00064 rightOutputs = s.rightOutputs;
00065 }
00066
00067 Stump* Stump::getCopy(){
00068 Stump* copy = new Stump(*this);
00069 return copy;
00070 }
00071
00072 Stump::~Stump() {
00073 for (unsigned i = N_STUMP_EXP; i < experiences.size(); i++){
00074 delete experiences[i];
00075 }
00076 experiences.clear();
00077 }
00078
00079
00080
00081 bool Stump::trainInstances(std::vector<classPair> &instances){
00082 if (STDEBUG) cout << "trainInstances: " << instances.size() << endl;
00083
00084 bool modelChanged = false;
00085 bool doBuild = false;
00086
00087
00088 for (unsigned a = 0; a < instances.size(); a++){
00089 classPair instance = instances[a];
00090
00091
00092 stump_experience *e;
00093 if (nExperiences < N_STUMP_EXP){
00094
00095 e = &(allExp[nExperiences]);
00096
00097 } else {
00098
00099 e = new stump_experience;
00100 }
00101
00102 e->input = instance.in;
00103 e->output = instance.out;
00104 e->id = nExperiences;
00105 experiences.push_back(e);
00106 nExperiences++;
00107
00108 if (nExperiences == 1000000){
00109 cout << "Reached limit of # experiences allowed." << endl;
00110 return false;
00111 }
00112
00113 if (nExperiences != (int)experiences.size())
00114 cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00115
00116
00117
00118
00119
00120
00121
00122
00123 if (STDEBUG) {
00124 cout << "Original input: ";
00125 for (unsigned i = 0; i < instance.in.size(); i++){
00126 cout << instance.in[i] << ", ";
00127 }
00128 cout << endl << " Original output: " << instance.out << endl;
00129 cout << "Added exp id: " << e->id << " output: " << e->output << endl;
00130 cout << "Address: " << e << " Input : ";
00131 for (unsigned i = 0; i < e->input.size(); i++){
00132 cout << e->input[i] << ", ";
00133 }
00134 cout << endl << " Now have " << nExperiences << " experiences." << endl;
00135 }
00136
00137
00138 if (mode == 0 || mode == 1 || nExperiences <= 1){
00139
00140 doBuild = true;
00141 }
00142
00143
00144 else if (mode == 2){
00145
00146 if (!modelChanged && (nExperiences % freq) == 0){
00147 doBuild = true;
00148 }
00149 }
00150
00151 }
00152
00153 if (doBuild){
00154 buildStump();
00155 modelChanged = true;
00156 }
00157
00158 if (modelChanged){
00159 if (STDEBUG) cout << "ST " << id << " stump re-built." << endl;
00160
00161 if (STDEBUG){
00162 cout << endl << "ST: " << id << endl;
00163 printStump();
00164 cout << "Done printing stump" << endl;
00165 }
00166 }
00167
00168 return modelChanged;
00169 }
00170
00171
00172
00173 bool Stump::trainInstance(classPair &instance){
00174 if (STDEBUG) cout << "trainInstance" << endl;
00175
00176 bool modelChanged = false;
00177
00178
00179
00180
00181 stump_experience *e;
00182 if (nExperiences < N_STUMP_EXP){
00183
00184 e = &(allExp[nExperiences]);
00185
00186 } else {
00187
00188 e = new stump_experience;
00189 }
00190
00191
00192 e->input = instance.in;
00193 e->output = instance.out;
00194 e->id = nExperiences;
00195 experiences.push_back(e);
00196 nExperiences++;
00197
00198 if (nExperiences == 1000000){
00199 cout << "Reached limit of # experiences allowed." << endl;
00200 return false;
00201 }
00202
00203 if (nExperiences != (int)experiences.size())
00204 cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00205
00206
00207
00208
00209
00210
00211
00212
00213 if (STDEBUG) {
00214 cout << "Original input: ";
00215 for (unsigned i = 0; i < instance.in.size(); i++){
00216 cout << instance.in[i] << ", ";
00217 }
00218 cout << endl << " Original output: " << instance.out << endl;
00219 cout << "Added exp id: " << e->id << " output: " << e->output << endl;
00220 cout << "Address: " << e << " Input : ";
00221 for (unsigned i = 0; i < e->input.size(); i++){
00222 cout << e->input[i] << ", ";
00223 }
00224 cout << endl << " Now have " << nExperiences << " experiences." << endl;
00225 }
00226
00227
00228
00229
00230 if (mode == 0 || mode == 1 || nExperiences <= 1){
00231
00232
00233 buildStump();
00234 modelChanged = true;
00235
00236 }
00237
00238
00239 else if (mode == 2){
00240
00241 if (!modelChanged && (nExperiences % freq) == 0){
00242 buildStump();
00243 modelChanged = true;
00244
00245 }
00246 }
00247
00248 if (modelChanged){
00249 if (STDEBUG) cout << "ST " << id << " stump re-built." << endl;
00250
00251 if (STDEBUG){
00252 cout << endl << "ST: " << id << endl;
00253 printStump();
00254 cout << "Done printing stump" << endl;
00255 }
00256 }
00257
00258 return modelChanged;
00259
00260 }
00261
00262
00263
00264
00265 void Stump::testInstance(const std::vector<float> &input, std::map<float, float>* retval){
00266 if (STDEBUG) cout << "testInstance ST: " << id << endl;
00267
00268 retval->clear();
00269
00270
00271 if (experiences.size() == 0){
00272 (*retval)[0.0] = 1.0;
00273 if (STDEBUG) cout << "no experiences, return 1.0 prob of 0.0" << endl;
00274 return;
00275 }
00276
00277
00278 if (passTest(dim, val, type, input))
00279 outputProbabilities(leftOutputs, retval);
00280 else
00281 outputProbabilities(rightOutputs, retval);
00282 }
00283
00284 float Stump::getConf(const std::vector<float> &input){
00285 if (STDEBUG) cout << "numVisits" << endl;
00286
00287
00288 if (experiences.size() == 0){
00289 return 0;
00290 }
00291
00292
00293 float conf = (float)nExperiences / (float)(2.0 *M);
00294 if (conf > 1.0)
00295 return 1.0;
00296 else
00297 return conf;
00298
00299 }
00300
00301
00302 int Stump::findMatching(const std::vector<stump_experience*> &instances, int dim,
00303 int val, int minConf){
00304
00305 int count = 0;
00306
00307 for (unsigned i = 0; i < instances.size(); i++){
00308 if (!passTest(dim, val, ONLY, instances[i]->input)){
00309 count++;
00310
00311 if (count >= minConf)
00312 return count;
00313 }
00314 }
00315 return count;
00316 }
00317
00318
00319
00320
00321
00322 void Stump::initStump(){
00323 if (STDEBUG) cout << "initStump()" << endl;
00324
00325 dim = -1;
00326 val = -1;
00327 type = -1;
00328
00329
00330 leftOutputs.clear();
00331 rightOutputs.clear();
00332
00333
00334 for (int i = 0; i < id; i++){
00335 rng.uniform(0, 1);
00336 }
00337
00338 }
00339
00340
00342 bool Stump::passTest(int dim, float val, int type, const std::vector<float> &input){
00343 if (STDEBUG) cout << "passTest, dim=" << dim << ",val=" << val << ",type=" << type
00344 << ",input["<<dim<<"]=" << input[dim] <<endl;
00345
00346
00347
00348 if (type == CUT){
00349 if (input[dim] > val)
00350 return false;
00351 else
00352 return true;
00353 } else if (type == ONLY){
00354 if (input[dim] == val)
00355 return false;
00356 else
00357 return true;
00358 } else {
00359 return false;
00360 }
00361
00362 }
00363
00364
00366 void Stump::buildStump(){
00367 if(STDEBUG) cout << "buildStump" << endl;
00368
00369 if (experiences.size() == 0){
00370 cout << "Error: buildStump called on stump " << id << " with no instances." << endl;
00371 exit(-1);
00372 }
00373
00374 if (SPLITDEBUG) cout << endl << "Creating new decision node" << endl;
00375
00376
00377
00378 float bestGainRatio = -1.0;
00379 int bestDim = -1;
00380 float bestVal = -1;
00381 int bestType = -1;
00382
00383 testPossibleSplits(&bestGainRatio, &bestDim, &bestVal, &bestType);
00384
00385 implementSplit(bestGainRatio, bestDim, bestVal, bestType);
00386
00387 }
00388
00389
00390 void Stump::implementSplit(float bestGainRatio, int bestDim,
00391 float bestVal, int bestType){
00392 if (STDEBUG) cout << "implementSplit gainRatio=" << bestGainRatio
00393 << ",dim=" << bestDim
00394 << ",val=" << bestVal << ",type=" << bestType << endl;
00395
00396
00397
00398 dim = bestDim;
00399 val = bestVal;
00400 type = bestType;
00401 gainRatio = bestGainRatio;
00402 if (SPLITDEBUG) cout << "Best split was type " << type
00403 << " with val " << val
00404 << " on dim " << dim
00405 << " with gainratio: " << bestGainRatio << endl;
00406
00407
00408
00409 std::vector<stump_experience*> left;
00410 std::vector<stump_experience*> right;
00411 leftOutputs.clear();
00412 rightOutputs.clear();
00413 for (unsigned i = 0; i < experiences.size(); i++){
00414 if (STDEBUG) cout << "implmentSplit - Classify instance " << i << " id: "
00415 << experiences[i]->id << " on new split " << endl;
00416 if (passTest(dim, val, type, experiences[i]->input)){
00417 left.push_back(experiences[i]);
00418 leftOutputs.insert(experiences[i]->output);
00419 }
00420 else{
00421 right.push_back(experiences[i]);
00422 rightOutputs.insert(experiences[i]->output);
00423 }
00424 }
00425
00426 if (STDEBUG) cout << "Left has " << left.size()
00427 << ", right has " << right.size() << endl;
00428
00429 }
00430
00431
00432
00433 void Stump::testPossibleSplits(float *bestGainRatio, int *bestDim,
00434 float *bestVal, int *bestType) {
00435 if (STDEBUG) cout << "testPossibleSplits" << endl;
00436
00437
00438
00439 float I = calcIforSet(experiences);
00440
00441
00442 int nties = 0;
00443
00444
00445 for (unsigned idim = 0; idim < experiences[0]->input.size(); idim++){
00446
00447
00448
00449 if (rng.uniform() < featPct)
00450 continue;
00451
00452
00453 float* sorted = sortOnDim(idim);
00454
00455 for (int j = 0; j < (int)experiences.size(); j++){
00456
00457
00458
00459 if (j > 0 && sorted[j] != sorted[j-1]){
00460 float splitval = (sorted[j] + sorted[j-1]) / 2.0;
00461 float gainRatio = calcGainRatio(idim, splitval, CUT, I);
00462
00463 if (SPLITDEBUG) cout << " CUT split val " << splitval
00464 << " on dim: " << idim << " had gain ratio "
00465 << gainRatio << endl;
00466
00467
00468 compareSplits(gainRatio, idim, splitval, CUT, &nties,
00469 bestGainRatio, bestDim, bestVal, bestType);
00470
00471
00472 }
00473
00474 if (ALLOW_ONLY_SPLITS){
00475
00476 if (j == 0 || sorted[j] != sorted[j-1]){
00477 float splitval = sorted[j];
00478
00479 float gainRatio = calcGainRatio(idim, splitval, ONLY, I);
00480
00481 if (SPLITDEBUG) cout << " ONLY split val " << splitval
00482 << " on dim: " << idim << " had gain ratio "
00483 << gainRatio << endl;
00484
00485
00486 compareSplits(gainRatio, idim, splitval, ONLY, &nties,
00487 bestGainRatio, bestDim, bestVal, bestType);
00488
00489 }
00490 }
00491
00492 }
00493 delete[] sorted;
00494
00495 }
00496 }
00497
00498
00500 void Stump::compareSplits(float gainRatio, int dim, float val, int type,
00501 int *nties, float *bestGainRatio, int *bestDim,
00502 float *bestVal, int *bestType){
00503 if (STDEBUG) cout << "compareSplits gainRatio=" << gainRatio << ",dim=" << dim
00504 << ",val=" << val << ",type= " << type <<endl;
00505
00506
00507 bool newBest = false;
00508
00509
00510 if (fabs(*bestGainRatio - gainRatio) < SPLIT_MARGIN){
00511
00512
00513 (*nties)++;
00514 float randomval = rng.uniform(0,1);
00515 float newsplitprob = (1.0 / (float)*nties);
00516
00517 if (randomval < newsplitprob){
00518 newBest = true;
00519 if (SPLITDEBUG) cout << " Tie on split. ST: " << id << " rand: " << randomval
00520 << " splitProb: " << newsplitprob << ", selecting new split " << endl;
00521 }
00522 else
00523 if (SPLITDEBUG) cout << " Tie on split. ST: " << id << " rand: " << randomval
00524 << " splitProb: " << newsplitprob << ", staying with old split " << endl;
00525 }
00526
00527
00528 else if (gainRatio > *bestGainRatio){
00529 newBest = true;
00530 *nties = 1;
00531 }
00532
00533
00534
00535 if (newBest){
00536 *bestGainRatio = gainRatio;
00537 *bestDim = dim;
00538 *bestVal = val;
00539 *bestType = type;
00540 if (SPLITDEBUG){
00541 cout << " New best gain ratio: " << *bestGainRatio
00542 << ": type " << *bestType
00543 << " with val " << *bestVal
00544 << " on dim " << *bestDim << endl;
00545 }
00546 }
00547 }
00548
00550 float Stump::calcGainRatio(int dim, float val, int type,
00551 float I){
00552 if (STDEBUG) cout << "calcGainRatio, dim=" << dim
00553 << " val=" << val
00554 << " I=" << I
00555 << " nInstances= " << experiences.size() << endl;
00556
00557 std::vector<stump_experience*> left;
00558 std::vector<stump_experience*> right;
00559
00560
00561
00562 float D[2];
00563
00564
00565
00566
00567 float Info;
00568
00569
00570 float Gain;
00571
00572
00573 float SplitInfo;
00574
00575
00576 float GainRatio;
00577
00578
00579 for (unsigned i = 0; i < experiences.size(); i++){
00580 if (STDEBUG) cout << "calcGainRatio - Classify instance " << i << " id: "
00581 << experiences[i]->id << " on new split " << endl;
00582
00583 if (passTest(dim, val, type, experiences[i]->input)){
00584 left.push_back(experiences[i]);
00585 }
00586 else{
00587 right.push_back(experiences[i]);
00588 }
00589 }
00590
00591 if (STDEBUG) cout << "Left has " << left.size()
00592 << ", right has " << right.size() << endl;
00593
00594 D[0] = (float)left.size() / (float)experiences.size();
00595 D[1] = (float)right.size() / (float)experiences.size();
00596 float leftInfo = calcIforSet(left);
00597 float rightInfo = calcIforSet(right);
00598 Info = D[0] * leftInfo + D[1] * rightInfo;
00599 Gain = I - Info;
00600 SplitInfo = calcIofP((float*)&D, 2);
00601 GainRatio = Gain / SplitInfo;
00602
00603 if (STDEBUG){
00604 cout << "LeftInfo: " << leftInfo
00605 << " RightInfo: " << rightInfo
00606 << " Info: " << Info
00607 << " Gain: " << Gain
00608 << " SplitInfo: " << SplitInfo
00609 << " GainRatio: " << GainRatio
00610 << endl;
00611 }
00612
00613 return GainRatio;
00614
00615 }
00616
00617 float Stump::calcIofP(float* P, int size){
00618 if (STDEBUG) cout << "calcIofP, size=" << size << endl;
00619 float I = 0;
00620 for (int i = 0; i < size; i++){
00621 I -= P[i] * log(P[i]);
00622 }
00623 return I;
00624 }
00625
00627 float Stump::calcIforSet(const std::vector<stump_experience*> &instances){
00628 if (STDEBUG) cout << "calcIforSet" << endl;
00629
00630 std::vector<float> classes;
00631 std::vector<int> classCounts;
00632
00633
00634 for (unsigned i = 0; i < instances.size(); i++){
00635
00636 float val = instances[i]->output;
00637 bool newValue = true;
00638
00639 for (unsigned j = 0; j < classes.size(); j++){
00640
00641 if (val == classes[j]){
00642 newValue = false;
00643 classCounts[j]++;
00644 break;
00645 }
00646 }
00647
00648
00649 if (newValue){
00650 classes.push_back(val);
00651 classCounts.push_back(1);
00652 if (STDEBUG) cout << "found new class with val " << val << endl;
00653 }
00654 }
00655
00656
00657 float *P = new float[classes.size()];
00658 for (int i = 0; i < (int)classCounts.size(); i++){
00659 P[i] = (float)classCounts[i] / (float)instances.size();
00660 }
00661
00662
00663 float I = calcIofP(P, classes.size());
00664 delete [] P;
00665
00666 return I;
00667
00668 }
00669
00672 float* Stump::sortOnDim(int dim){
00673 if (STDEBUG) cout << "sortOnDim,dim = " << dim << endl;
00674
00675 float* values = new float[experiences.size()];
00676
00677 for (int i = 0; i < (int)experiences.size(); i++){
00678
00679
00680 float val = experiences[i]->input[dim];
00681
00682
00683
00684 for (int j = 0; j <= i; j++){
00685
00686
00687
00688 if (j==i){
00689 values[j] = val;
00690
00691 }
00692
00693
00694 else if (val < values[j]){
00695
00696
00697
00698 for (int k = i; k > j; k--){
00699
00700 values[k] = values[k-1];
00701 }
00702
00703
00704
00705 values[j] = val;
00706
00707
00708 break;
00709 }
00710
00711 }
00712 }
00713
00714 if (STDEBUG){
00715 cout << "Sorted array: " << values[0];
00716 for (int i = 1; i < (int)experiences.size(); i++){
00717 cout << ", " << values[i];
00718 }
00719 cout << endl;
00720 }
00721
00722 return values;
00723
00724 }
00725
00726
00728 void Stump::printStump(){
00729
00730 cout << "Type: " << type
00731 << " Dim: " << dim << " Val: " << val
00732 << " nExperiences: " << nExperiences ;
00733
00734 cout << " Left Outputs: ";
00735 for (std::multiset<float>::iterator j = leftOutputs.begin();
00736 j != leftOutputs.end(); j++){
00737 cout << *j << ", ";
00738 }
00739 cout << endl;
00740 cout << " Right Outputs: ";
00741 for (std::multiset<float>::iterator j = rightOutputs.begin();
00742 j != rightOutputs.end(); j++){
00743 cout << *j << ", ";
00744 }
00745 cout << endl;
00746
00747 }
00748
00751
00752
00753
00754
00755
00756
00757
00758
00759
00760
00761
00762
00763
00764
00765
00766
00767
00768
00769
00770
00771
00772
00773
00774
00775
00776
00777
00778
00779
00780
00781
00782
00783
00784
00785
00786
00787
00788
00789
00790
00791
00792
00793
00794
00795
00796
00797
00798
00799
00800
00801
00802
00803
00804
00805
00806
00807
00808
00809
00810
00811
00812
00813
00814
00815
00816
00817
00818
00819
00820
00821
00822
00823
00824
00825
00826
00827
00828
00829
00830
00831
00832
00833
00834
00835
00836
00837
00838
00839
00840
00841
00842
00843
00844
00845
00846
00847
00848
00849
00850
00851
00852
00853
00854
00855 void Stump::outputProbabilities(std::multiset<float> outputs, std::map<float, float>* retval){
00856 if (STDEBUG) cout << " Calculating output probs" << endl;
00857
00858
00859 for (std::multiset<float>::iterator it = outputs.begin();
00860 it != outputs.end(); it = outputs.upper_bound(*it)){
00861
00862
00863 float val = *it;
00864 int count = outputs.count(val);
00865 if (count > 0){
00866 (*retval)[val] = (float) count / (float)outputs.size();
00867 if (STDEBUG) cout << " Output value " << val << " had count of " << count
00868 << " on "
00869 << outputs.size() << " experiences and prob of "
00870 << (*retval)[val] << endl;
00871 }
00872 }
00873
00874 }