rl_agent: PO_ETUCT.cc Source File

Go to the documentation of this file.
00001 
00010 #include "PO_ETUCT.hh"
00011 #include <algorithm>
00012 
00013 #include <sys/time.h>
00014 
00015 
00016 PO_ETUCT::PO_ETUCT(int numactions, float gamma, float rrange, float lambda,
00017                    int MAX_ITER, float MAX_TIME, int MAX_DEPTH, int modelType,
00018                    const std::vector<float> &fmax, const std::vector<float> &fmin,
00019                    const std::vector<int> &nstatesPerDim, bool trackActual,
00020                    int historySize, Random r):
00021   numactions(numactions), gamma(gamma), rrange(rrange), lambda(lambda),
00022   MAX_ITER(MAX_ITER), MAX_TIME(MAX_TIME),
00023   MAX_DEPTH(MAX_DEPTH), modelType(modelType), statesPerDim(nstatesPerDim),
00024   trackActual(trackActual), HISTORY_SIZE(historySize),
00025   HISTORY_FL_SIZE(historySize*numactions)//+historySize*fmax.size())
00026 {
00027   rng = r;
00028 
00029   nstates = 0;
00030   nactions = 0;
00031   lastUpdate = -1;
00032   seedMode = false;
00033 
00034   timingType = true;
00035 
00036   model = NULL;
00037   planTime = getSeconds();
00038 
00039   PLANNERDEBUG = false;//true;
00040   ACTDEBUG = false; //true;
00041   MODELDEBUG = false;//true;//false;
00042   UCTDEBUG = false;
00043   REALSTATEDEBUG = false;
00044   HISTORYDEBUG = false;
00045 
00046   featmax = fmax;
00047   featmin = fmin;
00048 
00049   if (statesPerDim[0] > 0){
00050     cout << "Planner PO_ETUCT using discretization of " << statesPerDim[0] << endl;
00051   }
00052   if (trackActual){
00053     cout << "PO_ETUCT tracking real state values" << endl;
00054   }
00055 
00056   if (HISTORY_SIZE == 0){
00057     saHistory.push_back(0.0);
00058   }
00059   else {
00060     if (HISTORYDEBUG) {
00061       cout << "History size of " << HISTORY_SIZE
00062            << " float size of " << HISTORY_FL_SIZE
00063            << " with state size: " << fmin.size()
00064            << " and numact: " << numactions << endl;
00065     }
00066     for (int i = 0; i < HISTORY_FL_SIZE; i++){
00067       saHistory.push_back(0.0);
00068     }
00069   }
00070 
00071 }
00072 
00073 PO_ETUCT::~PO_ETUCT() {
00074   //cout << "planner delete" << endl;
00075   // clear all state info
00076 
00077   for (std::map<state_t, state_info>::iterator i = statedata.begin();
00078        i != statedata.end(); i++){
00079 
00080     // get state's info
00081     //cout << "  planner got info" << endl;
00082     state_info* info = &((*i).second);
00083 
00084     deleteInfo(info);
00085   }
00086 
00087   featmax.clear();
00088   featmin.clear();
00089 
00090   statespace.clear();
00091   statedata.clear();
00092   //cout << "planner done" << endl;
00093 }
00094 
00095 void PO_ETUCT::setModel(MDPModel* m){
00096 
00097   model = m;
00098 
00099 }
00100 
00101 
00103 // Functional functions :) //
00105 
00106 
00107 void PO_ETUCT::initNewState(state_t s){
00108   //if (PLANNERDEBUG) cout << "initNewState(s = " << s
00109   //     << ") size = " << s->size() << endl;
00110 
00111   // create state info and add to hash map
00112   state_info* info = &(statedata[s]);
00113   initStateInfo(s, info);
00114 
00115   // dont init any model info
00116   // we'll get it during search if we have to
00117 
00118 }
00119 
00120 bool PO_ETUCT::updateModelWithExperience(const std::vector<float> &laststate,
00121                                          int lastact,
00122                                          const std::vector<float> &currstate,
00123                                          float reward, bool term){
00124   //  if (PLANNERDEBUG) cout << "updateModelWithExperience(last = " << &laststate
00125   //     << ", curr = " << &currstate
00126   //        << ", lastact = " << lastact
00127   //     << ", r = " << reward
00128   //     << ", term = " << term
00129   //     << ")" << endl;
00130 
00131   if (!timingType)
00132     planTime = getSeconds();
00133 
00134   state_t last = NULL;
00135 
00136   // add one history to last state
00137   if (HISTORY_SIZE > 0){
00138     std::vector<float> modState = laststate;
00139     if (HISTORYDEBUG) {
00140       cout << "Original state vector (size " << modState.size() << ": " << modState[0];
00141       for (unsigned i = 1; i < modState.size(); i++){
00142         cout << "," << modState[i];
00143       }
00144       cout << endl;
00145     }
00146     // add history onto modState
00147     for (int i = 0; i < HISTORY_FL_SIZE; i++){
00148       modState.push_back(saHistory[i]);
00149     }
00150 
00151     if (HISTORYDEBUG) {
00152       cout << "New state vector (size " << modState.size() << ": " << modState[0];
00153       for (unsigned i = 1; i < modState.size(); i++){
00154         cout << "," << modState[i];
00155       }
00156       cout << endl;
00157     }
00158 
00159     last = canonicalize(modState);
00160 
00161     if (!seedMode){
00162       // push this state and action onto the history vector
00163       /*
00164         for (unsigned i = 0; i < last->size(); i++){
00165         saHistory.push_back((*last)[i]);
00166         saHistory.pop_front();
00167         }
00168       */
00169       for (int i = 0; i < numactions; i++){
00170         if (i == lastact)
00171           saHistory.push_back(1.0);
00172         else
00173           saHistory.push_back(0.0);
00174         saHistory.pop_front();
00175       }
00176       if (HISTORYDEBUG) {
00177         cout << "New history vector (size " << saHistory.size() << ": " << saHistory[0];
00178         for (unsigned i = 1; i < saHistory.size(); i++){
00179           cout << "," << saHistory[i];
00180         }
00181         cout << endl;
00182       }
00183     }
00184   }
00185 
00186   // no history
00187   else {
00188 
00189     // canonicalize these things
00190     last = canonicalize(laststate);
00191   }
00192 
00193   prevstate = last;
00194   prevact = lastact;
00195 
00196   // get state info
00197   previnfo = &(statedata[last]);
00198 
00199   // init model?
00200   if (model == NULL){
00201     cout << "ERROR IN MODEL OR MODEL SIZE" << endl;
00202     exit(-1);
00203   }
00204 
00205   if (MODELDEBUG){
00206     cout << "Update with exp from state: ";
00207     for (unsigned i = 0; i < last->size(); i++){
00208       cout << (*last)[i] << ", ";
00209     }
00210     cout << " action: " << lastact;
00211     cout << " to state: ";
00212     for (unsigned i = 0; i < currstate.size(); i++){
00213       cout << currstate[i] << ", ";
00214     }
00215     cout << " and reward: " << reward << endl;
00216   }
00217 
00218   experience e;
00219   e.s = *last;
00220   e.next = currstate;
00221   e.act = lastact;
00222   e.reward = reward;
00223   e.terminal = term;
00224 
00225   bool modelChanged = model->updateWithExperience(e);
00226 
00227   if (timingType)
00228     planTime = getSeconds();
00229 
00230   return modelChanged;
00231 
00232 }
00233 
00234 void PO_ETUCT::updateStateActionFromModel(state_t s, int a, state_info* info){
00235 
00236   StateActionInfo* newModel = NULL;
00237   newModel = &(info->model[a]);
00238 
00239   updateStateActionHistoryFromModel(*s, a, newModel);
00240 
00241 }
00242 
00243 void PO_ETUCT::updateStateActionHistoryFromModel(const std::vector<float> modState, int a, StateActionInfo *newModel){
00244 
00245   // update state info
00246   // get state action info for each action
00247   model->getStateActionInfo(modState, a, newModel);
00248   newModel->frameUpdated = nactions;
00249 
00250   if (HISTORY_SIZE > 0){
00251 
00252     // figure out new history
00253     std::deque<float> newHistory;
00254     int stateSize = modState.size() - HISTORY_FL_SIZE;
00255 
00256     if (HISTORYDEBUG) cout << "input history was: ";
00257     for (int i = 0; i < HISTORY_FL_SIZE; i++){
00258       newHistory.push_back(modState[i+stateSize]);
00259       if (HISTORYDEBUG) cout << modState[i+stateSize] << ", ";
00260     }
00261     if (HISTORYDEBUG) cout << endl;
00262 
00263     // now add on for action
00264     for (int i = 0; i < numactions; i++){
00265       if (i == a)
00266         newHistory.push_back(1.0);
00267       else
00268         newHistory.push_back(0.0);
00269       newHistory.pop_front();
00270     }
00271 
00272     if (HISTORYDEBUG){
00273       cout << "act: " << a << ", new history:";
00274       for (unsigned i = 0; i < newHistory.size(); i++){
00275         cout << newHistory[i] << ", ";
00276       }
00277       cout << endl;
00278     }
00279 
00280     // add outcome histories onto newModel predictions
00281     std::map< std::vector<float>, float> oldProbs = newModel->transitionProbs;
00282     newModel->transitionProbs.clear();
00283 
00284     for (std::map<std::vector<float>, float>::iterator outIt
00285            = oldProbs.begin();
00286          outIt != oldProbs.end(); outIt++){
00287 
00288       float prob = (*outIt).second;
00289       std::vector<float> next = (*outIt).first;
00290 
00291       for (unsigned i = 0; i < newHistory.size(); i++){
00292         next.push_back(newHistory[i]);
00293       }
00294 
00295       if (HISTORYDEBUG){
00296         cout << "add history onto prediction of state: ";
00297         for (unsigned i = 0; i < next.size(); i++){
00298           cout << next[i] << ", ";
00299         }
00300         cout << " with prob " << prob << endl;
00301       }
00302 
00303       newModel->transitionProbs[next] = prob;
00304     }
00305   }
00306 
00307   //canonNextStates(newModel);
00308 
00309 }
00310 
00311 
00312 
00313 void PO_ETUCT::canonNextStates(StateActionInfo* modelInfo){
00314 
00315   // loop through all next states
00316   for (std::map<std::vector<float>, float>::iterator outIt
00317          = modelInfo->transitionProbs.begin();
00318        outIt != modelInfo->transitionProbs.end(); outIt++){
00319 
00320     std::vector<float> nextstate = (*outIt).first;
00321 
00322     // check that it is valid, otherwise replace with current
00323     bool badState = false;
00324     for (unsigned j = 0; j < featmax.size(); j++){
00325       if (nextstate[j] < (featmin[j]-EPSILON)
00326           || nextstate[j] > (featmax[j]+EPSILON)){
00327         //cout << "next state out of range " << nextstate[j] << endl;
00328         badState = true;
00329         break;
00330       }
00331     }
00332 
00333     if (!badState){
00334       canonicalize(nextstate);
00335     }
00336   }
00337 }
00338 
00339 
00340 
00341 
00342 int PO_ETUCT::getBestAction(const std::vector<float> &state){
00343   //  if (PLANNERDEBUG) cout << "getBestAction(s = " << &state << ")" << endl;
00344 
00345   //  resetUCTCounts();
00346 
00347   // add current history on top
00348   std::vector<float> modState = state;
00349   for (int i = 0; i < HISTORY_FL_SIZE; i++){
00350     modState.push_back(saHistory[i]);
00351   }
00352 
00353   state_t s = canonicalize(modState);
00354 
00355   int i = 0;
00356   for (i = 0; i < MAX_ITER; i++){
00357 
00358     uctSearch(modState, s, 0);
00359 
00360     // break after some max time
00361     if ((getSeconds() - planTime) > MAX_TIME){ // && i > 500){
00362       break;
00363     }
00364 
00365   }
00366   double currTime = getSeconds();
00367   if (false || UCTDEBUG){
00368     cout << "Search complete after " << (currTime-planTime) << " seconds and "
00369          << i << " iterations." << endl;
00370   }
00371 
00372   // get state info
00373   state_info* info = &(statedata[s]);
00374 
00375   // Get Q values
00376   std::vector<float> &Q = info->Q;
00377 
00378   // Choose an action
00379   const std::vector<float>::iterator a =
00380     random_max_element(Q.begin(), Q.end()); // Choose maximum
00381 
00382   int act = a - Q.begin();
00383   nactions++;
00384 
00385   if (UCTDEBUG){
00386     cout << "State " << (*s)[0];
00387     for (unsigned i = 1; i < s->size(); i++){
00388       cout << "," << (*s)[i];
00389     }
00390     cout << ", Took action " << act << ", "
00391          << "value: " << *a << endl;
00392   }
00393 
00394   // return index of action
00395   return act;
00396 }
00397 
00398 
00399 
00400 
00401 
00402 
00403 void PO_ETUCT::planOnNewModel(){
00404 
00405   // reset visit counts/q values
00406   resetUCTCounts();
00407 
00408   // for rmax, only s-a's prediction has changed
00409   if (modelType == RMAX){
00410     updateStateActionFromModel(prevstate, prevact, previnfo);
00411   }
00412 
00413   // for other model types, it all could change, clear all cached model predictions
00414   else {
00415 
00416     // still update flagged s-a's
00417     // then less stuff to query while planning
00418     for (std::set<std::vector<float> >::iterator i = statespace.begin();
00419          i != statespace.end(); i++){
00420       state_t s = canonicalize(*i);
00421       state_info* info = &(statedata[s]);
00422       if (info->needsUpdate){
00423         for (int j = 0; j < numactions; j++){
00424           updateStateActionFromModel(s, j, info);
00425         }
00426         info->needsUpdate = false;
00427       }
00428     }
00429     lastUpdate = nactions;
00430   }
00431 
00432 }
00433 
00434 
00435 void PO_ETUCT::resetUCTCounts(){
00436   // if (PLANNERDEBUG) cout << "Reset UCT Counts" << endl;
00437   const int MIN_VISITS = 10;
00438 
00439   // loop through here
00440   for (std::set<std::vector<float> >::iterator i = statespace.begin();
00441        i != statespace.end(); i++){
00442     state_t s = canonicalize(*i);
00443 
00444     state_info* info = &(statedata[s]);
00445 
00446     if (info->uctVisits > (MIN_VISITS * numactions))
00447       info->uctVisits = MIN_VISITS * numactions;
00448 
00449     for (int j = 0; j < numactions; j++){
00450       if (info->uctActions[j] > MIN_VISITS)
00451         info->uctActions[j] = MIN_VISITS;
00452     }
00453 
00454   }
00455 
00456 }
00457 
00458 
00459 
00460 
00462 // Helper Functions       //
00464 
00465 PO_ETUCT::state_t PO_ETUCT::canonicalize(const std::vector<float> &s) {
00466   //if (PLANNERDEBUG) cout << "canonicalize(s = " << s[0] << ", "
00467   //                     << s[1] << ")" << endl;
00468 
00469   // discretize it
00470   std::vector<float> s2;
00471   if (statesPerDim[0] > 0){
00472     s2 = discretizeState(s);
00473   } else {
00474     s2 = s;
00475   }
00476 
00477   // get state_t for pointer if its in statespace
00478   const std::pair<std::set<std::vector<float> >::iterator, bool> result =
00479     statespace.insert(s2);
00480   state_t retval = &*result.first; // Dereference iterator then get pointer
00481 
00482   //if (PLANNERDEBUG) cout << " returns " << retval
00483   //       << " New: " << result.second << endl;
00484 
00485   // if not, init this new state
00486   if (result.second) { // s is new, so initialize Q(s,a) for all a
00487     initNewState(retval);
00488     if (PLANNERDEBUG) {
00489       cout << " New state initialized "
00490            << " orig:(" << s[0] << "," << s[1] << ")"
00491            << " disc:(" << s2[0] << "," << s2[1] << ")" << endl;
00492     }
00493   }
00494 
00495 
00496   return retval;
00497 }
00498 
00499 
00500 // init state info
00501 void PO_ETUCT::initStateInfo(state_t s, state_info* info){
00502   //if (PLANNERDEBUG) cout << "initStateInfo()";
00503 
00504   info->id = nstates++;
00505   if (PLANNERDEBUG){
00506     cout << " id = " << info->id;
00507     cout << ", (" << (*s)[0] << "," << (*s)[1] << ")" << endl;
00508   }
00509 
00510   info->model = new StateActionInfo[numactions];
00511 
00512   // model q values, visit counts
00513   info->Q.resize(numactions, 0);
00514   info->uctActions.resize(numactions, 1);
00515   info->uctVisits = 1;
00516   info->visited = 0; //false;
00517   info->needsUpdate = true;
00518 
00519   for (int i = 0; i < numactions; i++){
00520     info->Q[i] = rng.uniform(0,0.01);
00521   }
00522 
00523   //if (PLANNERDEBUG) cout << "done with initStateInfo()" << endl;
00524 
00525 }
00526 
00527 
00528 void PO_ETUCT::printStates(){
00529 
00530   for (std::set< std::vector<float> >::iterator i = statespace.begin();
00531        i != statespace.end(); i++){
00532 
00533     state_t s = canonicalize(*i);
00534 
00535     state_info* info = &(statedata[s]);
00536 
00537     cout << "State " << info->id << ": ";
00538     for (unsigned j = 0; j < s->size(); j++){
00539       cout << (*s)[j] << ", ";
00540     }
00541     cout << endl;
00542 
00543     for (int act = 0; act < numactions; act++){
00544       cout << " Q: " << info->Q[act] << endl;
00545     }
00546 
00547   }
00548 }
00549 
00550 
00551 void PO_ETUCT::deleteInfo(state_info* info){
00552 
00553   delete [] info->model;
00554 
00555 }
00556 
00557 
00558 
00559 double PO_ETUCT::getSeconds(){
00560   struct timezone tz;
00561   timeval timeT;
00562   gettimeofday(&timeT, &tz);
00563   return  timeT.tv_sec + (timeT.tv_usec / 1000000.0);
00564 }
00565 
00566 
00567 
00568 float PO_ETUCT::uctSearch(const std::vector<float> &actS, state_t discS, int depth){
00569   if (UCTDEBUG){
00570     cout << " uctSearch state ";
00571     for (unsigned i = 0; i < actS.size(); i++){
00572       cout << actS[i] << ", ";
00573     }
00574     cout << " at depth " << depth << endl;
00575   }
00576 
00577   state_info* info = &(statedata[discS]);
00578 
00579   // if max depth
00580   // iterative deepening (probability inversely proportional to visits)
00581   //float terminateProb = 1.0/(2.0+(float)info->uctVisits);
00582 
00583   // already visited, stop here
00584   if (depth > MAX_DEPTH){
00585     // return max q value here
00586     std::vector<float>::iterator maxAct =
00587       std::max_element(info->Q.begin(),
00588                        info->Q.end());
00589     float maxval = *maxAct;
00590 
00591     if (UCTDEBUG)
00592       cout << "Terminated after depth: " << depth
00593         //   << " prob: " << terminateProb
00594            << " Q: " << maxval
00595            << " visited: " << info->visited << endl;
00596 
00597     return maxval;
00598   }
00599 
00600   // select action
00601   int action = selectUCTAction(info);
00602 
00603   // simulate action to get next state and reward
00604   // depending on exploration, may also terminate us
00605   float reward = 0;
00606   bool term = false;
00607 
00608   float learnRate;
00609   //float learnRate = 0.001;
00610   //float learnRate = 1.0 / info->uctActions[action];
00611   //    learnRate = 10.0 / (info->uctActions[action] + 100.0);
00612   learnRate = 10.0 / (info->uctActions[action] + 10.0);
00613   //if (learnRate < 0.001 && MAX_TIME < 0.5)
00614   //learnRate = 0.001;
00615   //learnRate = 0.05;
00616   //learnRate = 1.0;
00617 
00618   // tell model learning thread to update this state since we've visited it
00619   info->needsUpdate = true;
00620 
00621   // simulate next state, reward, terminal
00622   std::vector<float> actualNext = simulateNextState(actS, discS, info, action, &reward, &term);
00623 
00624   // simulate reward from this action
00625   if (term){
00626     // this one terminated
00627     if (UCTDEBUG) cout << "   Terminated on exploration condition" << endl;
00628     info->Q[action] += learnRate * (reward - info->Q[action]);
00629     info->uctVisits++;
00630     info->uctActions[action]++;
00631     if (UCTDEBUG)
00632       cout << " Depth: " << depth << " Selected action " << action
00633            << " r: " << reward
00634            << " StateVisits: " << info->uctVisits
00635            << " ActionVisits: " << info->uctActions[action] << endl;
00636 
00637     return reward;
00638   }
00639 
00640   // get discretized version of next
00641   state_t discNext = canonicalize(actualNext);
00642 
00643   if (UCTDEBUG)
00644     cout << " Depth: " << depth << " Selected action " << action
00645          << " r: " << reward  << endl;
00646 
00647   info->visited++; // = true;
00648 
00649   // new q value
00650   float newQ = reward + gamma * uctSearch(actualNext, discNext, depth+1);
00651 
00652   if (info->visited == 1){
00653 
00654     // update q and visit counts
00655     info->Q[action] += learnRate * (newQ - info->Q[action]);
00656     info->uctVisits++;
00657     info->uctActions[action]++;
00658 
00659     if (UCTDEBUG)
00660       cout << " Depth: " << depth << " newQ: " << newQ
00661            << " StateVisits: " << info->uctVisits
00662            << " ActionVisits: " << info->uctActions[action] << endl;
00663 
00664     if (lambda < 1.0){
00665 
00666       // new idea, return max of Q or new q
00667       std::vector<float>::iterator maxAct =
00668         std::max_element(info->Q.begin(),
00669                          info->Q.end());
00670       float maxval = *maxAct;
00671 
00672       if (UCTDEBUG)
00673         cout << " Replacing newQ: " << newQ;
00674 
00675       // replace with w avg of maxq and new val
00676       newQ = (lambda * newQ) + ((1.0-lambda) * maxval);
00677 
00678       if (UCTDEBUG)
00679         cout << " with wAvg: " << newQ << endl;
00680     }
00681 
00682   }
00683 
00684   info->visited--;
00685 
00686   // return q
00687   return newQ;
00688 
00689 }
00690 
00691 
00692 int PO_ETUCT::selectUCTAction(state_info* info){
00693   //  if (UCTDEBUG) cout << "  selectUCTAction" << endl;
00694 
00695   std::vector<float> &Q = info->Q;
00696 
00697   // loop through
00698   float rewardBound = rrange;
00699   if (rewardBound < 1.0)
00700     rewardBound = 1.0;
00701   rewardBound /= (1.0 - gamma);
00702   if (UCTDEBUG) cout << "Reward bound: " << rewardBound << endl;
00703 
00704   std::vector<float> uctQ(numactions, 0.0);
00705 
00706   for (int i = 0; i < numactions; i++){
00707 
00708     // this actions value is Q + rMax * 2 sqrt (log N(s) / N(s,a))
00709     uctQ[i] = Q[i] +
00710       rewardBound * 2.0 * sqrt(log((float)info->uctVisits) /
00711                                (float)info->uctActions[i]);
00712 
00713     if (UCTDEBUG)
00714       cout << "  Action: " << i << " Q: " << Q[i]
00715            << " visits: " << info->uctActions[i]
00716            << " value: " << uctQ[i] << endl;
00717   }
00718 
00719   // max element of uctQ
00720   std::vector<float>::iterator maxAct =
00721     max_element(uctQ.begin(), uctQ.end());
00722   float maxval = *maxAct;
00723   int act = maxAct - uctQ.begin();
00724 
00725   if (UCTDEBUG)
00726     cout << "  Selected " << act << " val: " << maxval << endl;
00727 
00728   return act;
00729 
00730 }
00731 
00732 
00733 
00734 std::vector<float> PO_ETUCT::simulateNextState(const std::vector<float> &actualState, state_t discState, state_info* info, int action, float* reward, bool* term){
00735 
00736   StateActionInfo* modelInfo = &(info->model[action]);
00737   bool upToDate = modelInfo->frameUpdated >= lastUpdate;
00738 
00739   if (!upToDate){
00740     
00741     updateStateActionHistoryFromModel(*discState, action, modelInfo);
00742     
00743   }
00744 
00745 
00746   *reward = modelInfo->reward;
00747   *term = (rng.uniform() < modelInfo->termProb);
00748 
00749   if (*term){
00750     return actualState;
00751   }
00752 
00753   float randProb = rng.uniform();
00754 
00755   float probSum = 0.0;
00756   std::vector<float> nextstate;
00757 
00758   if (REALSTATEDEBUG) cout << "randProb: " << randProb << " numNext: " << modelInfo->transitionProbs.size() << endl;
00759 
00760   if (modelInfo->transitionProbs.size() == 0)
00761     nextstate = actualState;
00762 
00763   for (std::map<std::vector<float>, float>::iterator outIt
00764          = modelInfo->transitionProbs.begin();
00765        outIt != modelInfo->transitionProbs.end(); outIt++){
00766 
00767     float prob = (*outIt).second;
00768     probSum += prob;
00769     if (REALSTATEDEBUG) cout << randProb << ", " << probSum << ", " << prob << endl;
00770 
00771     if (randProb <= probSum){
00772       nextstate = (*outIt).first;
00773       if (REALSTATEDEBUG) cout << "selected state " << randProb << ", " << probSum << ", " << prob << endl;
00774       break;
00775     }
00776   }
00777 
00778   if (trackActual){
00779 
00780     // find the relative change from discrete center
00781     std::vector<float> relChange = subVec(nextstate, *discState);
00782 
00783     // add that on to actual current state value
00784     nextstate = addVec(actualState, relChange);
00785 
00786 
00787   }
00788 
00789   if (UCTDEBUG){
00790     cout << "initial prediction: ";
00791     for (unsigned i = 0; i < nextstate.size(); i++){
00792       cout << nextstate[i] << ", ";
00793     } 
00794     cout  << endl;
00795   }
00796 
00797   // check that next state is valid
00798   for (unsigned j = 0; j < featmax.size(); j++){
00799     if (nextstate[j] < (featmin[j]-EPSILON)
00800         || nextstate[j] > (featmax[j]+EPSILON)){
00801 
00802       if (HISTORY_SIZE == 0) return actualState;
00803 
00804       // still tack on correct history
00805       std::vector<float> modState = actualState;
00806       int stateOnlySize = modState.size()-HISTORY_FL_SIZE;
00807       for (int i = stateOnlySize; i < (int)modState.size(); i++){
00808         if (action == (i - stateOnlySize))
00809           modState[i] = 1;
00810         else
00811           modState[i] = 0;
00812       }
00813       return modState;
00814     }
00815   }
00816 
00817   if (UCTDEBUG || HISTORYDEBUG){
00818     cout << "predicted next state: ";
00819     for (unsigned i = 0; i < nextstate.size(); i++){
00820       cout << nextstate[i] << ", ";
00821     } 
00822     cout  << endl;
00823   }
00824 
00825   // return new actual state
00826   return nextstate;
00827 
00828 }
00829 
00830 
00831 void PO_ETUCT::savePolicy(const char* filename){
00832 
00833   ofstream policyFile(filename, ios::out | ios::binary | ios::trunc);
00834 
00835   // first part, save the vector size
00836   int fsize = featmin.size();
00837   policyFile.write((char*)&fsize, sizeof(int));
00838 
00839   // save numactions
00840   policyFile.write((char*)&numactions, sizeof(int));
00841 
00842   // go through all states, and save Q values
00843   for (std::set< std::vector<float> >::iterator i = statespace.begin();
00844        i != statespace.end(); i++){
00845 
00846     state_t s = canonicalize(*i);
00847     state_info* info = &(statedata[s]);
00848 
00849     // save state
00850     policyFile.write((char*)&((*i)[0]), sizeof(float)*fsize);
00851 
00852     // save q-values
00853     policyFile.write((char*)&(info->Q[0]), sizeof(float)*numactions);
00854 
00855   }
00856 
00857   policyFile.close();
00858 }
00859 
00860 void PO_ETUCT::logValues(ofstream *of, int xmin, int xmax, int ymin, int ymax){
00861   std::vector<float> state(2, 0.0);
00862   for (int i = xmin ; i < xmax; i++){
00863     for (int j = ymin; j < ymax; j++){
00864       state[0] = j;
00865       state[1] = i;
00866       state_t s = canonicalize(state);
00867       state_info* info = &(statedata[s]);
00868       std::vector<float> &Q_s = info->Q;
00869       const std::vector<float>::iterator max =
00870         random_max_element(Q_s.begin(), Q_s.end());
00871       *of << (*max) << ",";
00872     }
00873   }
00874 }
00875 
00876 
00877 // should do it such that an already discretized state stays the same
00878 // mainly the numerical value of each bin should be the average of that bin
00879 std::vector<float> PO_ETUCT::discretizeState(const std::vector<float> &s){
00880   std::vector<float> ds(s.size());
00881 
00882   for (unsigned i = 0; i < statesPerDim.size(); i++){
00883 
00884     // since i'm sometimes doing this for discrete domains
00885     // want to center bins on 0, not edge on 0
00886     //cout << "feat " << i << " range: " << featmax[i] << " " << featmin[i] << " " << (featmax[i]-featmin[i]) << " n: " << (float)statesPerDim;
00887 
00888     float factor = (featmax[i] - featmin[i]) / (float)statesPerDim[i];
00889     int bin = 0;
00890     if (s[i] > 0){
00891       bin = (int)((s[i]+factor/2) / factor);
00892     } else {
00893       bin = (int)((s[i]-factor/2) / factor);
00894     }
00895 
00896     ds[i] = factor*bin;
00897     //cout << "P factor: " << factor << " bin: " << bin;
00898     //cout << " Original: " << s[i] << " Discrete: " << ds[i] << endl;
00899   }
00900   for (unsigned i = statesPerDim.size(); i < s.size(); i++){
00901     ds[i] = s[i];
00902   }
00903 
00904   return ds;
00905 }
00906 
00907 std::vector<float> PO_ETUCT::addVec(const std::vector<float> &a, const std::vector<float> &b){
00908   if (a.size() != b.size())
00909     cout << "ERROR: add vector sizes wrong " << a.size() << ", " << b.size() << endl;
00910 
00911   std::vector<float> c(a.size(), 0.0);
00912   for (unsigned i = 0; i < a.size(); i++){
00913     c[i] = a[i] + b[i];
00914   }
00915 
00916   return c;
00917 }
00918 
00919 std::vector<float> PO_ETUCT::subVec(const std::vector<float> &a, const std::vector<float> &b){
00920   if (a.size() != b.size())
00921     cout << "ERROR: sub vector sizes wrong " << a.size() << ", " << b.size() << endl;
00922 
00923   std::vector<float> c(a.size(), 0.0);
00924   for (unsigned i = 0; i < a.size(); i++){
00925     c[i] = a[i] - b[i];
00926   }
00927 
00928   return c;
00929 }
00930 
00931 
00932 void PO_ETUCT::setFirst(){
00933   if (HISTORY_SIZE == 0) return;
00934 
00935   if (HISTORYDEBUG) cout << "first action, set sahistory to 0s" << endl;
00936 
00937   // first action, reset history vector
00938   saHistory.resize(saHistory.size(), 0.0);
00939 }
00940 
00941 void PO_ETUCT::setSeeding(bool seeding){
00942 
00943   if (HISTORYDEBUG) cout << "set seed mode to " << seeding << endl;
00944   seedMode = seeding;
00945 
00946 }