Stump.cc
Go to the documentation of this file.
00001 #include "Stump.hh"
00002 
00003 
00004 
00005 Stump::Stump(int id, int trainMode, int trainFreq, int m, float featPct, Random rng):
00006   id(id), mode(trainMode), freq(trainFreq), M(m), featPct(featPct), rng(rng)
00007 {
00008 
00009   nOutput = 0;
00010   nExperiences = 0;
00011 
00012   // how close a split has to be to be randomly selected
00013   SPLIT_MARGIN = 0.02; //5; //01; //0.05; //0.2; //0.05;
00014 
00015   LOSS_MARGIN = 0.0; //0.02;
00016   REBUILD_RATIO = 0.05;
00017 
00018   MIN_GAIN_RATIO = 0.0001; //0.0004; //0.001; //0.0002; //0.001;
00019 
00020   ALLOW_ONLY_SPLITS = true; //false;
00021 
00022   STDEBUG = false;
00023   SPLITDEBUG = false; //true;
00024 
00025   if (STDEBUG) {
00026     cout << "Created decision stump " << id << endl;
00027     cout << " mode: " << mode
00028          << " freq: " << freq << endl;
00029   }
00030 
00031   initStump();
00032 
00033 }
00034 
00035 Stump::Stump(const Stump& s):
00036 id(s.id), mode(s.mode), freq(s.freq), M(s.M), featPct(s.featPct), rng(s.rng)
00037 {
00038   nOutput = s.nOutput;
00039   nExperiences = s.nExperiences;
00040   SPLIT_MARGIN = s.SPLIT_MARGIN;
00041   LOSS_MARGIN = s.LOSS_MARGIN;
00042   REBUILD_RATIO = s.REBUILD_RATIO;
00043   MIN_GAIN_RATIO = s.MIN_GAIN_RATIO;
00044   ALLOW_ONLY_SPLITS = s.ALLOW_ONLY_SPLITS;
00045   STDEBUG = s.STDEBUG;
00046   SPLITDEBUG = s.SPLITDEBUG;
00047 
00048   for (int i = 0; i < N_STUMP_EXP; i++){
00049     allExp[i] = s.allExp[i];
00050   }
00051 
00052   // set experience pointers
00053   experiences.resize(s.experiences.size());
00054   for (unsigned i = 0; i < s.experiences.size(); i++){
00055     experiences[i] = &(allExp[i]);
00056   }
00057 
00058   // actual split
00059   dim = s.dim;
00060   val = s.val;
00061   type = s.type;
00062   gainRatio = s.gainRatio;
00063   leftOutputs = s.leftOutputs;
00064   rightOutputs = s.rightOutputs;
00065 }
00066 
00067 Stump* Stump::getCopy(){
00068   Stump* copy = new Stump(*this);
00069   return copy;
00070 }
00071 
00072 Stump::~Stump() {
00073   for (unsigned i = N_STUMP_EXP; i < experiences.size(); i++){
00074     delete experiences[i];
00075   }
00076   experiences.clear();
00077 }
00078 
00079 
00080 // multiple instances at once
00081 bool Stump::trainInstances(std::vector<classPair> &instances){
00082   if (STDEBUG) cout << "trainInstances: " << instances.size() << endl;
00083 
00084   bool modelChanged = false;
00085   bool doBuild = false;
00086 
00087   // loop through instances, possibly checking for errors
00088   for (unsigned a = 0; a < instances.size(); a++){
00089     classPair instance = instances[a];
00090 
00091     // take from static array until we run out
00092     stump_experience *e;
00093     if (nExperiences < N_STUMP_EXP){
00094       // from statically created set of experiences
00095       e = &(allExp[nExperiences]);
00096 
00097     } else {
00098       // dynamically create experience
00099       e = new stump_experience;
00100     }
00101 
00102     e->input = instance.in;
00103     e->output = instance.out;
00104     e->id = nExperiences;
00105     experiences.push_back(e);
00106     nExperiences++;
00107 
00108     if (nExperiences == 1000000){
00109       cout << "Reached limit of # experiences allowed." << endl;
00110       return false;
00111     }
00112 
00113     if (nExperiences != (int)experiences.size())
00114       cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00115 
00116     //cout << nExperiences << endl << flush;
00117     //if (nExperiences == 503 && id == 10){
00118     //  STDEBUG = true;
00119     //  SPLITDEBUG = true;
00120     //  INCDEBUG = true;
00121     //}
00122 
00123     if (STDEBUG) {
00124       cout << "Original input: ";
00125       for (unsigned i = 0; i < instance.in.size(); i++){
00126         cout << instance.in[i] << ", ";
00127       }
00128       cout << endl << " Original output: " << instance.out << endl;
00129       cout << "Added exp id: " << e->id << " output: " << e->output << endl;
00130       cout << "Address: " << e << " Input : ";
00131       for (unsigned i = 0; i < e->input.size(); i++){
00132         cout << e->input[i] << ", ";
00133       }
00134       cout << endl << " Now have " << nExperiences << " experiences." << endl;
00135     }
00136 
00137     // mode 0: re-build every step
00138     if (mode == 0 || mode == 1 || nExperiences <= 1){
00139       // build every step
00140       doBuild = true;
00141     }
00142 
00143     // mode 2: re-build every FREQ steps
00144     else if (mode == 2){
00145       // build every freq steps
00146       if (!modelChanged && (nExperiences % freq) == 0){
00147         doBuild = true;
00148       }
00149     }
00150 
00151   } // end instance loop
00152 
00153   if (doBuild){
00154     buildStump();
00155     modelChanged = true;
00156   }
00157 
00158   if (modelChanged){
00159     if (STDEBUG) cout << "ST " << id << " stump re-built." << endl;
00160 
00161     if (STDEBUG){
00162       cout << endl << "ST: " << id << endl;
00163       printStump();
00164       cout << "Done printing stump" << endl;
00165     }
00166   }
00167 
00168   return modelChanged;
00169 }
00170 
00171 
00172 // here the target output will be a single value
00173 bool Stump::trainInstance(classPair &instance){
00174   if (STDEBUG) cout << "trainInstance" << endl;
00175 
00176   bool modelChanged = false;
00177 
00178   // simply add this instance to the set of experiences
00179 
00180   // take from static array until we run out
00181   stump_experience *e;
00182   if (nExperiences < N_STUMP_EXP){
00183     // from statically created set of experiences
00184     e = &(allExp[nExperiences]);
00185 
00186   } else {
00187     // dynamically create experience
00188     e = new stump_experience;
00189   }
00190 
00191 
00192   e->input = instance.in;
00193   e->output = instance.out;
00194   e->id = nExperiences;
00195   experiences.push_back(e);
00196   nExperiences++;
00197 
00198   if (nExperiences == 1000000){
00199     cout << "Reached limit of # experiences allowed." << endl;
00200     return false;
00201   }
00202 
00203   if (nExperiences != (int)experiences.size())
00204     cout << "ERROR: experience size mismatch: " << nExperiences << ", " << experiences.size() << endl;
00205 
00206   //cout << nExperiences << endl << flush;
00207   //if (nExperiences == 503 && id == 10){
00208   //  STDEBUG = true;
00209   //  SPLITDEBUG = true;
00210   //  INCDEBUG = true;
00211   //}
00212 
00213   if (STDEBUG) {
00214     cout << "Original input: ";
00215     for (unsigned i = 0; i < instance.in.size(); i++){
00216       cout << instance.in[i] << ", ";
00217     }
00218     cout << endl << " Original output: " << instance.out << endl;
00219     cout << "Added exp id: " << e->id << " output: " << e->output << endl;
00220     cout << "Address: " << e << " Input : ";
00221     for (unsigned i = 0; i < e->input.size(); i++){
00222       cout << e->input[i] << ", ";
00223     }
00224     cout << endl << " Now have " << nExperiences << " experiences." << endl;
00225   }
00226 
00227   // depending on mode/etc, maybe re-build stump
00228 
00229   // mode 0: re-build every step
00230   if (mode == 0 || mode == 1 || nExperiences <= 1){
00231 
00232     // build every step
00233     buildStump();
00234     modelChanged = true;
00235 
00236   }
00237 
00238   // mode 2: re-build every FREQ steps
00239   else if (mode == 2){
00240     // build every freq steps
00241     if (!modelChanged && (nExperiences % freq) == 0){
00242       buildStump();
00243       modelChanged = true;
00244 
00245     }
00246   }
00247 
00248   if (modelChanged){
00249     if (STDEBUG) cout << "ST " << id << " stump re-built." << endl;
00250 
00251     if (STDEBUG){
00252       cout << endl << "ST: " << id << endl;
00253       printStump();
00254       cout << "Done printing stump" << endl;
00255     }
00256   }
00257 
00258   return modelChanged;
00259 
00260 }
00261 
00262 
00263 
00264 // TODO: here we want to return the probability of the output value being each of the possible values, in the stochastic case
00265 void Stump::testInstance(const std::vector<float> &input, std::map<float, float>* retval){
00266   if (STDEBUG) cout << "testInstance ST: " << id << endl;
00267 
00268   retval->clear();
00269 
00270   // in case the stump is empty
00271   if (experiences.size() == 0){
00272     (*retval)[0.0] = 1.0;
00273     if (STDEBUG) cout << "no experiences, return 1.0 prob of 0.0" << endl;
00274     return; 
00275   }
00276 
00277   // and return mapping of outputs and their probabilities
00278   if (passTest(dim, val, type, input))
00279     outputProbabilities(leftOutputs, retval);
00280   else
00281     outputProbabilities(rightOutputs, retval);
00282 }
00283 
00284 float Stump::getConf(const std::vector<float> &input){
00285   if (STDEBUG) cout << "numVisits" << endl;
00286 
00287   // in case the stump is empty
00288   if (experiences.size() == 0){
00289     return 0;
00290   }
00291 
00292   // and return #
00293   float conf = (float)nExperiences / (float)(2.0 *M);
00294   if (conf > 1.0)
00295     return 1.0;
00296   else
00297     return conf;
00298 
00299 }
00300 
00301 
00302 int Stump::findMatching(const std::vector<stump_experience*> &instances, int dim,
00303                         int val, int minConf){
00304 
00305   int count = 0;
00306 
00307   for (unsigned i = 0; i < instances.size(); i++){
00308     if (!passTest(dim, val, ONLY, instances[i]->input)){
00309       count++;
00310       // no need to continue if this won't be the new min
00311       if (count >= minConf)
00312         return count;
00313     }
00314   }
00315   return count;
00316 }
00317 
00318 
00319 
00320 
00321 // init the stump
00322 void Stump::initStump(){
00323   if (STDEBUG) cout << "initStump()" << endl;
00324 
00325   dim = -1;
00326   val = -1;
00327   type = -1;
00328 
00329   // current data
00330   leftOutputs.clear();
00331   rightOutputs.clear();
00332 
00333   // just to ensure the diff models are on different random values
00334   for (int i = 0; i < id; i++){
00335     rng.uniform(0, 1);
00336   }
00337 
00338 }
00339 
00340 
00342 bool Stump::passTest(int dim, float val, int type, const std::vector<float> &input){
00343   if (STDEBUG) cout << "passTest, dim=" << dim << ",val=" << val << ",type=" << type
00344                     << ",input["<<dim<<"]=" << input[dim] <<endl;
00345 
00346   // TODO: deal with categorical attributes in addition to continuous ones
00347 
00348   if (type == CUT){
00349     if (input[dim] > val)
00350       return false;
00351     else
00352       return true;
00353   } else if (type == ONLY){
00354     if (input[dim] == val)
00355       return false;
00356     else
00357       return true;
00358   } else {
00359     return false;
00360   }
00361 
00362 }
00363 
00364 
00366 void Stump::buildStump(){
00367   if(STDEBUG) cout << "buildStump" << endl;
00368 
00369   if (experiences.size() == 0){
00370     cout << "Error: buildStump called on stump " << id << " with no instances." << endl;
00371     exit(-1);
00372   }
00373 
00374   if (SPLITDEBUG) cout << endl << "Creating new decision node" << endl;
00375 
00376   //node->nInstances++;
00377 
00378   float bestGainRatio = -1.0;
00379   int bestDim = -1;
00380   float bestVal = -1;
00381   int bestType = -1;
00382 
00383   testPossibleSplits(&bestGainRatio, &bestDim, &bestVal, &bestType);
00384 
00385   implementSplit(bestGainRatio, bestDim, bestVal, bestType);
00386 
00387 }
00388 
00389 
00390 void Stump::implementSplit(float bestGainRatio, int bestDim,
00391                            float bestVal, int bestType){
00392   if (STDEBUG) cout << "implementSplit gainRatio=" << bestGainRatio
00393                     << ",dim=" << bestDim
00394                     << ",val=" << bestVal << ",type=" << bestType << endl;
00395 
00396 
00397   // set the best split here
00398   dim = bestDim;
00399   val = bestVal;
00400   type = bestType;
00401   gainRatio = bestGainRatio;
00402   if (SPLITDEBUG) cout << "Best split was type " << type
00403                        << " with val " << val
00404                        << " on dim " << dim
00405                        << " with gainratio: " << bestGainRatio << endl;
00406 
00407 
00408   // split up the instances based on this split
00409   std::vector<stump_experience*> left;
00410   std::vector<stump_experience*> right;
00411   leftOutputs.clear();
00412   rightOutputs.clear();
00413   for (unsigned i = 0; i < experiences.size(); i++){
00414     if (STDEBUG) cout << "implmentSplit - Classify instance " << i << " id: "
00415                       << experiences[i]->id << " on new split " << endl;
00416     if (passTest(dim, val, type, experiences[i]->input)){
00417       left.push_back(experiences[i]);
00418       leftOutputs.insert(experiences[i]->output);
00419     }
00420     else{
00421       right.push_back(experiences[i]);
00422       rightOutputs.insert(experiences[i]->output);
00423     }
00424   }
00425 
00426   if (STDEBUG) cout << "Left has " << left.size()
00427                     << ", right has " << right.size() << endl;
00428 
00429 }
00430 
00431 
00432 
00433 void Stump::testPossibleSplits(float *bestGainRatio, int *bestDim,
00434                                float *bestVal, int *bestType) {
00435   if (STDEBUG) cout << "testPossibleSplits" << endl;
00436 
00437 
00438   // pre-calculate some stuff for these splits (namely I, P, C)
00439   float I = calcIforSet(experiences);
00440   //if (STDEBUG) cout << "I: " << I << endl;
00441 
00442   int nties = 0;
00443 
00444   // for each possible split, calc gain ratio
00445   for (unsigned idim = 0; idim < experiences[0]->input.size(); idim++){
00446 
00447     // we eliminate some random number of splits
00448     // here (decision is taken from the random set that are left)
00449     if (rng.uniform() < featPct)
00450       continue;
00451 
00452 
00453     float* sorted = sortOnDim(idim);
00454 
00455     for (int j = 0; j < (int)experiences.size(); j++){
00456 
00457       // splits that are cuts
00458       // if different from last value, try split in between
00459       if (j > 0 && sorted[j] != sorted[j-1]){
00460         float splitval = (sorted[j] + sorted[j-1]) / 2.0;
00461         float gainRatio = calcGainRatio(idim, splitval, CUT, I);
00462 
00463         if (SPLITDEBUG) cout << " CUT split val " << splitval
00464                              << " on dim: " << idim << " had gain ratio "
00465                              << gainRatio << endl;
00466 
00467         // see if this is the new best gain ratio
00468         compareSplits(gainRatio, idim, splitval, CUT, &nties,
00469                       bestGainRatio, bestDim, bestVal, bestType);
00470 
00471 
00472       } // if its a possible cut
00473 
00474       if (ALLOW_ONLY_SPLITS){
00475         // splits that are true only if this value is equal
00476         if (j == 0 || sorted[j] != sorted[j-1]){
00477           float splitval = sorted[j];
00478 
00479           float gainRatio = calcGainRatio(idim, splitval, ONLY, I);
00480 
00481           if (SPLITDEBUG) cout << " ONLY split val " << splitval
00482                                << " on dim: " << idim << " had gain ratio "
00483                                << gainRatio << endl;
00484 
00485           // see if this is the new best gain ratio
00486           compareSplits(gainRatio, idim, splitval, ONLY, &nties,
00487                         bestGainRatio, bestDim, bestVal, bestType);
00488 
00489         } // splits with only
00490       }
00491 
00492     } // j loop
00493     delete[] sorted;
00494 
00495   }
00496 }
00497 
00498 
00500 void Stump::compareSplits(float gainRatio, int dim, float val, int type,
00501                           int *nties, float *bestGainRatio, int *bestDim,
00502                           float *bestVal, int *bestType){
00503   if (STDEBUG) cout << "compareSplits gainRatio=" << gainRatio << ",dim=" << dim
00504                     << ",val=" << val << ",type= " << type <<endl;
00505 
00506 
00507   bool newBest = false;
00508 
00509   // if its a virtual tie, break it randomly
00510   if (fabs(*bestGainRatio - gainRatio) < SPLIT_MARGIN){
00511     //cout << "Split tie, making random decision" << endl;
00512 
00513     (*nties)++;
00514     float randomval = rng.uniform(0,1);
00515     float newsplitprob = (1.0 / (float)*nties);
00516 
00517     if (randomval < newsplitprob){
00518       newBest = true;
00519       if (SPLITDEBUG) cout << "   Tie on split. ST: " << id << " rand: " << randomval
00520                            << " splitProb: " << newsplitprob << ", selecting new split " << endl;
00521     }
00522     else
00523       if (SPLITDEBUG) cout << "   Tie on split. ST: " << id << " rand: " << randomval
00524                            << " splitProb: " << newsplitprob << ", staying with old split " << endl;
00525   }
00526 
00527   // if its clearly better, set this as the best split
00528   else if (gainRatio > *bestGainRatio){
00529     newBest = true;
00530     *nties = 1;
00531   }
00532 
00533 
00534   // set the split features
00535   if (newBest){
00536     *bestGainRatio = gainRatio;
00537     *bestDim = dim;
00538     *bestVal = val;
00539     *bestType = type;
00540     if (SPLITDEBUG){
00541       cout << "  New best gain ratio: " << *bestGainRatio
00542            << ": type " << *bestType
00543            << " with val " << *bestVal
00544            << " on dim " << *bestDim << endl;
00545     }
00546   } // newbest
00547 }
00548 
00550 float Stump::calcGainRatio(int dim, float val, int type,
00551                            float I){
00552   if (STDEBUG) cout << "calcGainRatio, dim=" << dim
00553                     << " val=" << val
00554                     << " I=" << I
00555                     << " nInstances= " << experiences.size() << endl;
00556 
00557   std::vector<stump_experience*> left;
00558   std::vector<stump_experience*> right;
00559 
00560 
00561   // array with percentage positive and negative for this test
00562   float D[2];
00563 
00564   // info(T) = I(P): float I;
00565 
00566   // Info for this split = Info(X,T)
00567   float Info;
00568 
00569   // Gain for this split = Gain(X,T)
00570   float Gain;
00571 
00572   // SplitInfo for this split = I(|pos|/|T|, |neg|/|T|)
00573   float SplitInfo;
00574 
00575   // GainRatio for this split = GainRatio(X,T) = Gain(X,T) / SplitInfo(X,T)
00576   float GainRatio;
00577 
00578   // see where the instances would go with this split
00579   for (unsigned i = 0; i < experiences.size(); i++){
00580     if (STDEBUG) cout << "calcGainRatio - Classify instance " << i << " id: "
00581                       << experiences[i]->id << " on new split " << endl;
00582 
00583     if (passTest(dim, val, type, experiences[i]->input)){
00584       left.push_back(experiences[i]);
00585     }
00586     else{
00587       right.push_back(experiences[i]);
00588     }
00589   }
00590 
00591   if (STDEBUG) cout << "Left has " << left.size()
00592                     << ", right has " << right.size() << endl;
00593 
00594   D[0] = (float)left.size() / (float)experiences.size();
00595   D[1] = (float)right.size() / (float)experiences.size();
00596   float leftInfo = calcIforSet(left);
00597   float rightInfo = calcIforSet(right);
00598   Info = D[0] * leftInfo + D[1] * rightInfo;
00599   Gain = I - Info;
00600   SplitInfo = calcIofP((float*)&D, 2);
00601   GainRatio = Gain / SplitInfo;
00602 
00603   if (STDEBUG){
00604     cout << "LeftInfo: " << leftInfo
00605          << " RightInfo: " << rightInfo
00606          << " Info: " << Info
00607          << " Gain: " << Gain
00608          << " SplitInfo: " << SplitInfo
00609          << " GainRatio: " << GainRatio
00610          << endl;
00611   }
00612 
00613   return GainRatio;
00614 
00615 }
00616 
00617 float Stump::calcIofP(float* P, int size){
00618   if (STDEBUG) cout << "calcIofP, size=" << size << endl;
00619   float I = 0;
00620   for (int i = 0; i < size; i++){
00621     I -= P[i] * log(P[i]);
00622   }
00623   return I;
00624 }
00625 
00627 float Stump::calcIforSet(const std::vector<stump_experience*> &instances){
00628   if (STDEBUG) cout << "calcIforSet" << endl;
00629 
00630   std::vector<float> classes;
00631   std::vector<int> classCounts;
00632 
00633   // go through instances and figure count of each type
00634   for (unsigned i = 0; i < instances.size(); i++){
00635 
00636     float val = instances[i]->output;
00637     bool newValue = true;
00638     // see if this is a new val
00639     for (unsigned j = 0; j < classes.size(); j++){
00640       // not new, increment count for this class
00641       if (val == classes[j]){
00642         newValue = false;
00643         classCounts[j]++;
00644         break;
00645       }
00646     }
00647 
00648     // it is a new value
00649     if (newValue){
00650       classes.push_back(val);
00651       classCounts.push_back(1);
00652       if (STDEBUG) cout << "found new class with val " << val << endl;
00653     }
00654   }
00655 
00656   // now calculate P
00657   float *P = new float[classes.size()];
00658   for (int i = 0; i < (int)classCounts.size(); i++){
00659     P[i] = (float)classCounts[i] / (float)instances.size();
00660   }
00661 
00662   // calculate I(P)
00663   float I = calcIofP(P, classes.size());
00664   delete [] P;
00665 
00666   return I;
00667 
00668 }
00669 
00672 float* Stump::sortOnDim(int dim){
00673   if (STDEBUG) cout << "sortOnDim,dim = " << dim << endl;
00674 
00675   float* values = new float[experiences.size()];
00676 
00677   for (int i = 0; i < (int)experiences.size(); i++){
00678     //cout << "Instance " << i << endl;
00679 
00680     float val = experiences[i]->input[dim];
00681     //cout << " val: " << val << endl;
00682 
00683     // find where this should go
00684     for (int j = 0; j <= i; j++){
00685       //cout << " j: " << j << endl;
00686 
00687       // get to i, this is the spot then
00688       if (j==i){
00689         values[j] = val;
00690         //cout << "  At i, putting value in slot j: " << j << endl;
00691       }
00692 
00693       // if this is the spot
00694       else if (val < values[j]){
00695         //cout << "  Found slot at j: " << j << endl;
00696 
00697         // slide everything forward to make room
00698         for (int k = i; k > j; k--){
00699           //cout << "   k = " << k << " Sliding value from k-1 to k" << endl;
00700           values[k] = values[k-1];
00701         }
00702 
00703         // put value in its spot at j
00704         //cout << "  Putting value at slot j: " << j << endl;
00705         values[j] = val;
00706 
00707         // break
00708         break;
00709       }
00710 
00711     }
00712   }
00713 
00714   if (STDEBUG){
00715     cout << "Sorted array: " << values[0];
00716     for (int i = 1; i < (int)experiences.size(); i++){
00717       cout << ", " << values[i];
00718     }
00719     cout << endl;
00720   }
00721 
00722   return values;
00723 
00724 }
00725 
00726 
00728 void Stump::printStump(){
00729 
00730   cout << "Type: " << type
00731        << " Dim: " << dim << " Val: " << val
00732        << " nExperiences: " << nExperiences ;
00733 
00734   cout << " Left Outputs: ";
00735   for (std::multiset<float>::iterator j = leftOutputs.begin();
00736        j != leftOutputs.end(); j++){
00737     cout << *j << ", ";
00738   }
00739   cout << endl;
00740   cout << " Right Outputs: ";
00741   for (std::multiset<float>::iterator j = rightOutputs.begin();
00742        j != rightOutputs.end(); j++){
00743     cout << *j << ", ";
00744   }
00745   cout << endl;
00746 
00747 }
00748 
00751 /*
00752   std::vector<float> Stump::calcChiSquare(std::vector<experience> experiences){
00753 
00754   std::vector<float> chiSquare;
00755   chiSquare.resize(experiences[0].input.size());
00756 
00757   // for each input attribute
00758   for (int i = 0; i < (int)experiences[0].input.size(); i++){
00759 
00760   // attribute counts
00761   std::vector<float> attribValue;
00762   std::vector<int> attribCount;
00763 
00764   // outcome counts
00765   std::vector<float> outcomeValue;
00766   std::vector<int> outcomeCount;
00767 
00768   for (int k = 0; k < (int)experiences.size(); k++){
00769 
00770   experience* exp = &(experiences[k]);
00771 
00772   // match attribute value
00773   bool attribMatch = false;
00774   for (int l = 0; l < (int)attribValue.size(); l++){
00775   if (attribValue[l] == exp->input[i]){
00776   attribMatch = true;
00777   attribCount[l]++;
00778   break;
00779   }
00780   }
00781   // no match
00782   attribValue.push_back(exp->input[i]);
00783   attribCount.push_back(1);
00784 
00785   // match outcome value
00786   bool outcomeMatch = false;
00787   for (int l = 0; l < (int)outcomeValue.size(); l++){
00788   if (outcomeValue[l] == exp->output){
00789   outcomeMatch = true;
00790   outcomeCount[l]++;
00791   break;
00792   }
00793   }
00794   // no match
00795   outcomeValue.push_back(exp->output);
00796   outcomeCount.push_back(1);
00797 
00798   }
00799 
00800   // table
00801   // TODO
00802 
00803   float** actual = new float*[attribValue.size()];//[outcomeValue.size()];
00804   float** expected = new float*[attribValue.size()];//[outcomeValue.size()];
00805   for (int k = 0; k < (int)attribValue.size(); k++){
00806   actual[k] = new float[outcomeValue.size()];
00807   expected[k] = new float[outcomeValue.size()];
00808   }
00809 
00810   // calculate actual table
00811   for (int k = 0; k < (int)experiences.size(); k++){
00812   experience* exp = &(experiences[k]);
00813 
00814   // match
00815   for (int l = 0; l < (int)attribValue.size(); l++){
00816   if (attribValue[l] != exp->input[i])
00817   continue;
00818   for (int j = 0; j < (int)outcomeValue.size(); j++){
00819   if (outcomeValue[j] != exp->output)
00820   continue;
00821   actual[l][j]++;
00822   }
00823   }
00824   }
00825 
00826   // calculate expected table and compare
00827   float x2 = 0.0;
00828   for (int l = 0; l < (int)attribValue.size(); l++){
00829   for (int j = 0; j < (int)outcomeValue.size(); j++){
00830 
00831   // calculate expected
00832   expected[l][j] = attribCount[l] * outcomeCount[j] /
00833   (float)experiences.size();
00834 
00835   float cell = (actual[l][j] - expected[l][j]) *
00836   (actual[l][j] - expected[l][j]) / expected[i][j];
00837 
00838   x2 += cell;
00839   }
00840   }
00841 
00842   delete actual;
00843   delete expected;
00844 
00845   chiSquare[i] = x2;
00846   cout << " Feature " << i << " Independence: " << chiSquare[i] << endl;
00847 
00848   }
00849   return chiSquare;
00850 
00851   }
00852 */
00853 
00854 // output a map of outcomes and their probabilities for this leaf node
00855 void Stump::outputProbabilities(std::multiset<float> outputs, std::map<float, float>* retval){
00856   if (STDEBUG) cout << " Calculating output probs" << endl;
00857 
00858   // go through all possible output values
00859   for (std::multiset<float>::iterator it = outputs.begin();
00860        it != outputs.end(); it = outputs.upper_bound(*it)){
00861 
00862     // get count for this value
00863     float val = *it;
00864     int count = outputs.count(val);
00865     if (count > 0){
00866       (*retval)[val] = (float) count / (float)outputs.size();
00867       if (STDEBUG) cout << "  Output value " << val << " had count of " << count
00868                         << " on "
00869                         << outputs.size() << " experiences and prob of "
00870                         << (*retval)[val] << endl;
00871     }
00872   }
00873 
00874 }


rl_agent
Author(s): Todd Hester
autogenerated on Thu Jun 6 2019 22:00:13