ValueIteration.cc
/** \file ValueIteration.cc
    Implements the ValueIteration class, a tabular value iteration planner.
    \author Todd Hester
*/
#include "ValueIteration.hh"
#include <algorithm>

#include <sys/time.h>


ValueIteration::ValueIteration(int numactions, float gamma,
                               int MAX_LOOPS, float MAX_TIME, int modelType,
                               const std::vector<float> &fmax,
                               const std::vector<float> &fmin,
                               const std::vector<int> &n, Random newRng):
  numactions(numactions), gamma(gamma),
  MAX_LOOPS(MAX_LOOPS), MAX_TIME(MAX_TIME), modelType(modelType),
  statesPerDim(n)
{
  rng = newRng;

  nstates = 0;
  nactions = 0;

  timingType = false; //true;

  model = NULL;
  planTime = getSeconds();

  // algorithm options
  MAX_STEPS = 100; //50; //60; //80; //0; //5; //10;

  PLANNERDEBUG = false;
  POLICYDEBUG = false;
  ACTDEBUG = false;
  MODELDEBUG = false;

  featmax = fmax;
  featmin = fmin;

  if (statesPerDim[0] > 0){
    cout << "Planner VI using discretization of " << statesPerDim[0] << endl;
  }

}
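
// Typical use of this planner (a sketch based only on the methods in this
// file; the surrounding agent loop, model construction, and variable names
// below are illustrative assumptions, not code from this package):
//
//   ValueIteration planner(numactions, gamma, MAX_LOOPS, MAX_TIME, modelType,
//                          featmax, featmin, statesPerDim, rng);
//   planner.setModel(model);   // any MDPModel* implementation
//
//   // each step: after taking action a in state s and observing s2, reward r:
//   bool changed = planner.updateModelWithExperience(s, a, s2, r, terminal);
//   if (changed) planner.planOnNewModel();   // re-run value iteration
//   int nextAction = planner.getBestAction(s2);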

ValueIteration::~ValueIteration() {
  for (std::map<state_t, state_info>::iterator i = statedata.begin();
       i != statedata.end(); i++){

    // get state's info
    //cout << "  planner got info" << endl;
    state_info* info = &((*i).second);

    deleteInfo(info);
  }

  statedata.clear();

}

void ValueIteration::setModel(MDPModel* m){

  model = m;

  //  initStates();

}


// canonicalize all the states so we already have them in our statespace
void ValueIteration::initStates(){
  cout << "init states" << endl;
  std::vector<float> s(featmin.size());

  fillInState(s, 0);
  cout << "init states complete" << endl;
}

void ValueIteration::fillInState(std::vector<float> s, int depth){

  // if depth == size, canonicalize and return
  if (depth == (int)featmin.size()){
    canonicalize(s);
    return;
  }

  // go through all features at depth
  for (float i = featmin[depth]; i < featmax[depth]+1; i++){
    s[depth] = i;
    fillInState(s, depth+1);
  }
}
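
// Note: this recursion enumerates every integer-valued feature combination, so
// the number of canonicalized states is the product of the per-dimension
// ranges (e.g. 3 features each spanning 0..9 would give 10^3 = 1000 states),
// which is only practical for small discrete state spaces.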


//////////////////////////////
// Functional functions :)  //
//////////////////////////////

void ValueIteration::initNewState(state_t s){
  if (PLANNERDEBUG) cout << "initNewState(s = " << s
                         << ") size = " << s->size() << endl;

  if (MODELDEBUG) cout << "New State: " << endl;

  // create state info and add to hash map
  state_info* info = &(statedata[s]);
  initStateInfo(info);

  // init these from model
  for (int i = 0; i < numactions; i++){
    model->getStateActionInfo(*s, i, &(info->modelInfo[i]));
  }


  if (PLANNERDEBUG) cout << "done with initNewState()" << endl;

}

bool ValueIteration::updateModelWithExperience(const std::vector<float> &laststate,
                                               int lastact,
                                               const std::vector<float> &currstate,
                                               float reward, bool term){
  if (PLANNERDEBUG) cout << "updateModelWithExperience(last = " << &laststate
                         << ", curr = " << &currstate
                         << ", lastact = " << lastact
                         << ", r = " << reward
                         << ")" << endl;

  if (!timingType)
    planTime = getSeconds();

  // canonicalize these things
  state_t last = canonicalize(laststate);
  state_t curr = canonicalize(currstate);

  prevstate = laststate;
  prevact = lastact;

  // a NULL next state marks a transition to terminal; nothing more to update
  if (curr == NULL)
    return false;

  // get state info
  state_info* info = &(statedata[last]);

  // update the state visit count
  info->visits[lastact]++;

  // make sure we have a model
  if (model == NULL){
    cout << "ERROR IN MODEL OR MODEL SIZE" << endl;
    exit(-1);
  }

  experience e;
  e.s = laststate;
  e.next = currstate;
  e.act = lastact;
  e.reward = reward;
  e.terminal = term;
  bool modelChanged = model->updateWithExperience(e);

  if (PLANNERDEBUG) cout << "VI Added exp: " << modelChanged << endl;
  if (timingType)
    planTime = getSeconds();

  return modelChanged;

}


void ValueIteration::updateStateActionFromModel(const std::vector<float> &state, int a){
  if (PLANNERDEBUG) cout << "updateStateActionFromModel()" << endl;

  state_t s = canonicalize(state);

  // get state's info
  state_info* info = &(statedata[s]);

  // update state info
  // get state action info for this single action
  model->getStateActionInfo(state, a, &(info->modelInfo[a]));

  info->fresh = false;

}


void ValueIteration::updateStatesFromModel(){
  if (PLANNERDEBUG) cout << "updateStatesFromModel()" << endl;

  // for each state
  for (std::set<std::vector<float> >::iterator i = statespace.begin();
       i != statespace.end(); i++){

    if (PLANNERDEBUG){
      cout << "updateStatesFromModel i = " << &(*i) << endl;
      cout << "State is ";
      for (unsigned j = 0; j < (*i).size(); j++){
        cout << (*i)[j] << ", ";
      }
      cout << endl;
    }

    state_t s = canonicalize(*i);

    // get state's info
    state_info* info = &(statedata[s]);

    // update state info
    // get state action info for each action
    for (int j = 0; j < numactions; j++){
      model->getStateActionInfo(*s, j, &(info->modelInfo[j]));
    }

    //s2.clear();

    info->fresh = false;

    if (PLANNERDEBUG) cout << "updateStatesFromModel i = " << &i << " complete" << endl;

  }

  if (PLANNERDEBUG) cout << "updateStatesFromModel " << " totally complete" << endl;

}


int ValueIteration::getBestAction(const std::vector<float> &state){
  if (PLANNERDEBUG) cout << "getBestAction(s = " << &state
                         << ")" << endl;

  state_t s = canonicalize(state);

  // get state info
  state_info* info = &(statedata[s]);

  // Get Q values
  std::vector<float> &Q = info->Q;

  // Choose an action
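  // random_max_element is not defined in this file (it presumably comes from
  // the planner's shared headers); unlike std::max_element, it is used so that
  // ties between equal-valued actions are broken at random.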
  const std::vector<float>::iterator a =
    random_max_element(Q.begin(), Q.end()); // Choose maximum

  int act = a - Q.begin();
  float val = *a;
  if (ACTDEBUG){
    cout << endl << "chooseAction State " << (*s)[0] << "," << (*s)[1]
         << " act: " << act << " val: " << val << endl;
    for (int iAct = 0; iAct < numactions; iAct++){
      cout << " Action: " << iAct
           << " val: " << Q[iAct]
           << " visits: " << info->visits[iAct]
           << " modelsAgree: " << info->modelInfo[iAct].known << endl;
    }
  }

  nactions++;

  // return index of action
  return act;
}



// use VI to compute a new policy using the current model
void ValueIteration::createPolicy(){
  if (PLANNERDEBUG || POLICYDEBUG) cout << endl << "createPolicy()" << endl;

  float maxError = 5000;
  int nloops = 0;

  calculateReachableStates();

  float MIN_ERROR = 0.0001;
  //float initTime = getSeconds();
  //cout << "max time " << MAX_TIME  << " max loops: " << MAX_LOOPS << endl;
  int statesUpdated = 0;

  // iterate until convergence (max Bellman error below MIN_ERROR);
  // the MAX_LOOPS / MAX_TIME cutoffs below are currently commented out
  while (maxError > MIN_ERROR){ // && nloops < MAX_LOOPS){

    //if ((getSeconds() - initTime) > MAX_TIME)
    // break;

    //    if ((getSeconds() - planTime) > MAX_TIME)
    //break;

    if (POLICYDEBUG)
      cout << "max error: " << maxError << " nloops: " << nloops
           << endl;

    maxError = 0;
    nloops++;

    // for all states
    for (std::set<std::vector<float> >::iterator i = statespace.begin();
         i != statespace.end(); i++){

      //      if ((getSeconds() - planTime) > MAX_TIME)
      //break;

      statesUpdated++;
      state_t s = canonicalize(*i);

      // get state's info
      state_info* info = &(statedata[s]);

      if (POLICYDEBUG){
        cout << endl << " State: id: " << info->id << ": ";
        for (unsigned si = 0; si < s->size(); si++){
          cout << (*s)[si] << ",";
        }
        cout << " Steps: " << info->stepsAway << endl;

      }

      // skip states we can't reach
      if (info->stepsAway > 99999){
        if (POLICYDEBUG)
          cout << "State not reachable, ignoring." << endl;
        continue;
      }

      // for each action
      for (int act = 0; act < numactions; act++){

        // get state action info for this action
        StateActionInfo *modelInfo = &(info->modelInfo[act]);

        if (POLICYDEBUG)
          cout << "  Action: " << act
               << " State visits: " << info->visits[act]
               << " reward: " << modelInfo->reward
               << " term: " << modelInfo->termProb << endl;

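        // The full backup computed below is the standard Bellman optimality
        // update:
        //   Q(s,a) = R(s,a) + gamma * sum_{s'} P(s'|s,a) * max_a' Q(s',a')
        // Termination is treated as moving to an absorbing state with value 0,
        // so termProb contributes only to probSum, the sanity check that the
        // outcome probabilities sum to 1.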
        // Q = R + discounted val of next state
        // this is the R part :)
        float newQ = modelInfo->reward;

        float probSum = modelInfo->termProb;

        // for all next states, add discounted value appropriately
        // loop through the next states in this state-action's outcome list
        for (std::map<std::vector<float>, float>::iterator outIt
               = modelInfo->transitionProbs.begin();
             outIt != modelInfo->transitionProbs.end(); outIt++){

          std::vector<float> nextstate = (*outIt).first;

          if (POLICYDEBUG){
            cout << "  Next state was: ";
            for (unsigned oi = 0; oi < nextstate.size(); oi++){
              cout << nextstate[oi] << ",";
            }
            cout << endl;
          }

          // get transition probability
          float transitionProb = (1.0-modelInfo->termProb) *
            modelInfo->transitionProbs[nextstate];

          probSum += transitionProb;

          if (POLICYDEBUG)
            cout << "   prob: " << transitionProb << endl;

          if (transitionProb < 0 || transitionProb > 1.0001){
            cout << "Error with transitionProb: " << transitionProb << endl;
            exit(-1);
          }

          // if there is some probability of this transition
          if (transitionProb > 0.0){

            float maxval = 0.0;

            // make sure it's a real state
            bool realState = true;

            for (unsigned b = 0; b < nextstate.size(); b++){
              if (nextstate[b] < (featmin[b]-EPSILON)
                  || nextstate[b] > (featmax[b]+EPSILON)){
                realState = false;
                if (POLICYDEBUG)
                  cout << "    Next state is not valid (feature "
                       << b << " out of range)" << endl;
                break;
              }
            }

            state_t next;

            // update q values for any states within MAX_STEPS of visited states
            if (info->stepsAway >= MAX_STEPS || !realState){
              next = s;
            } else {
              next = canonicalize(nextstate);
            }

            state_info* nextinfo = &(statedata[next]);
            //nextinfo->fresh = false;

            int newSteps = info->stepsAway + 1;
            if (newSteps < nextinfo->stepsAway){
              if (POLICYDEBUG) {
                cout << "    Setting state to "
                     << newSteps << " steps away." << endl;
              }
              nextinfo->stepsAway = newSteps;
            }

            // find the max value of this next state
            std::vector<float>::iterator maxAct =
              std::max_element(nextinfo->Q.begin(),
                               nextinfo->Q.end());
            maxval = *maxAct;

            nextstate.clear();

            if (POLICYDEBUG) cout << "    Max value: " << maxval << endl;

            // update q value with this value
            newQ += (gamma * transitionProb * maxval);

          } // transition probability > 0

        } // outcome loop


        if (probSum < 0.9999 || probSum > 1.0001){
          cout << "Error: transition probabilities do not add to 1: Sum: "
               << probSum << endl;
          exit(-1);
        }


        // set q value
        float tdError = fabs(info->Q[act] - newQ);
        if (POLICYDEBUG) cout << "  NewQ: " << newQ
                              << " OldQ: " << info->Q[act] << endl;
        info->Q[act] = newQ;

        // check max error
        if (tdError > maxError)
          maxError = tdError;

        if (POLICYDEBUG)
          cout << "  TD error: " << tdError
               << " Max error: " << maxError << endl;

      } // action loop

    } // state loop

  } // while not converged loop

  if (nloops >= MAX_LOOPS){
    cout << nactions << " Policy creation ended with maxError: " << maxError
         << " nloops: " << nloops << " time: " << (getSeconds()-planTime)
         << " states: " << statesUpdated
         << endl;
  }

  // remove unreachable states
  removeUnreachableStates();

  if (POLICYDEBUG) cout << nactions
                        << " policy creation complete: maxError: "
                        << maxError << " nloops: " << nloops
                        << endl;


}


void ValueIteration::planOnNewModel(){

  // update model info
  // can just update one for tabular model
  if (modelType == RMAX){
    updateStateActionFromModel(prevstate, prevact);
  }
  else {
    updateStatesFromModel();
  }

  // run value iteration
  createPolicy();

}


/////////////////////////////
// Helper Functions         //
/////////////////////////////

ValueIteration::state_t ValueIteration::canonicalize(const std::vector<float> &s) {
  if (PLANNERDEBUG) cout << "canonicalize(s = " << s[0] << ", "
                         << s[1] << ")" << endl;

  std::vector<float> s2;
  if (statesPerDim[0] > 0){
    s2 = discretizeState(s);
  } else {
    s2 = s;
  }

  if (PLANNERDEBUG) cout << "discretized(" << s2[0] << ", " << s2[1] << ")" << endl;

  // get the state_t pointer if it's already in our statespace
  const std::pair<std::set<std::vector<float> >::iterator, bool> result =
    statespace.insert(s2);
  state_t retval = &*result.first; // dereference the iterator, then take its address
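  // Pointers to std::set elements stay valid as long as the element itself is
  // not erased (insertions never invalidate them), so handing out state_t
  // pointers into statespace here is safe.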

  if (PLANNERDEBUG) cout << " returns " << retval
                         << " New: " << result.second << endl;

  // if not, init this new state
  if (result.second) { // s is new, so initialize Q(s,a) for all a
    initNewState(retval);
    if (PLANNERDEBUG) cout << " New state initialized" << endl;
  }

  return retval;
}



// init state info
void ValueIteration::initStateInfo(state_info* info){
  if (PLANNERDEBUG) cout << "initStateInfo()";

  info->id = nstates++;
  if (PLANNERDEBUG) cout << " id = " << info->id << endl;

  info->fresh = true;
  info->stepsAway = 100000;

  // model data (transition, reward, known)
  info->modelInfo = new StateActionInfo[numactions];

  // model q values, visit counts
  info->visits.resize(numactions, 0);
  info->Q.resize(numactions, 0);

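  // seed the Q-values with small random numbers, presumably so that ties among
  // untried actions are broken differently in different states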
  for (int i = 0; i < numactions; i++){
    info->Q[i] = rng.uniform(0, 0.01);
  }

  if (PLANNERDEBUG) cout << "done with initStateInfo()" << endl;

}


void ValueIteration::printStates(){

  for (std::set< std::vector<float> >::iterator i = statespace.begin();
       i != statespace.end(); i++){

    state_t s = canonicalize(*i);

    state_info* info = &(statedata[s]);

    cout << "State " << info->id << ": ";
    for (unsigned j = 0; j < s->size(); j++){
      cout << (*s)[j] << ", ";
    }
    cout << endl;

    for (int act = 0; act < numactions; act++){
      cout << " visits[" << act << "] = " << info->visits[act]
           << " Q: " << info->Q[act]
           << " R: " << info->modelInfo[act].reward << endl;
    }

  }
}



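// calculateReachableStates only seeds reachability: states with at least one
// visited action start at stepsAway = 0, everything else at 100000.
// createPolicy() then relaxes stepsAway for successor states during its sweeps
// and skips any state more than MAX_STEPS away from a visited one.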
void ValueIteration::calculateReachableStates(){
  // for all states:
  // mark whether we need to compute q values for this state
  for (std::set<std::vector<float> >::iterator i = statespace.begin();
       i != statespace.end(); i++){

    state_t s = canonicalize(*i);

    // get state's info
    state_info* info = &(statedata[s]);

    info->stepsAway = 100000;

    //if (info->fresh){
    //  info->stepsAway = 0;
    //  continue;
    //}

    for (int j = 0; j < numactions; j++){
      if (info->visits[j] > 0){
        info->stepsAway = 0;
        break;
      }
    }
  }
}

void ValueIteration::removeUnreachableStates(){

  // NOTE: this is currently disabled (the early return below is intentional),
  // so the removal code that follows is never executed.
  return;

  // for all states
  // check the reachability flag to see whether this state should be kept
  for (std::set<std::vector<float> >::iterator i = statespace.begin();
       i != statespace.end(); i++){

    state_t s = canonicalize(*i);

    // get state's info
    state_info* info = &(statedata[s]);

    // state is unreachable
    if (info->stepsAway > MAX_STEPS){

      if (POLICYDEBUG){
        cout << "Removing unreachable state: " << (*s)[0];
        for (unsigned j = 1; j < s->size(); j++){
          cout << ", " << (*s)[j];
        }
        cout << endl;
      }

      // delete state
      deleteInfo(info);

      // remove from statespace
      // (note: erasing here invalidates the loop iterator i; if this code is
      // ever re-enabled, the loop must be restructured to erase safely)
      statespace.erase(*s);

      // remove from statedata hashmap
      statedata.erase(s);

    }
  }
}



void ValueIteration::deleteInfo(state_info* info){

  delete [] info->modelInfo;

}


double ValueIteration::getSeconds(){
  struct timezone tz;
  timeval timeT;
  gettimeofday(&timeT, &tz);
  return timeT.tv_sec + (timeT.tv_usec / 1000000.0);
}


void ValueIteration::savePolicy(const char* filename){

  ofstream policyFile(filename, ios::out | ios::binary | ios::trunc);
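
  // File layout: number of state features (int), number of actions (int),
  // then for each state in statespace its feature vector (fsize floats)
  // followed by its Q-values (numactions floats).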

  // first part, save the vector size
  int fsize = featmin.size();
  policyFile.write((char*)&fsize, sizeof(int));

  // save numactions
  policyFile.write((char*)&numactions, sizeof(int));

  // go through all states, and save Q values
  for (std::set< std::vector<float> >::iterator i = statespace.begin();
       i != statespace.end(); i++){

    state_t s = canonicalize(*i);
    state_info* info = &(statedata[s]);

    // save state
    policyFile.write((char*)&((*i)[0]), sizeof(float)*fsize);

    // save q-values
    policyFile.write((char*)&(info->Q[0]), sizeof(float)*numactions);

  }

  policyFile.close();
}
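
// A matching loader is not part of this file. A minimal sketch of reading the
// format written above (hypothetical code, for illustration only):
//
//   ifstream policyFile(filename, ios::in | ios::binary);
//   int fsize = 0, nacts = 0;
//   policyFile.read((char*)&fsize, sizeof(int));
//   policyFile.read((char*)&nacts, sizeof(int));
//   std::vector<float> state(fsize);
//   std::vector<float> Q(nacts);
//   while (policyFile.read((char*)&(state[0]), sizeof(float)*fsize) &&
//          policyFile.read((char*)&(Q[0]), sizeof(float)*nacts)){
//     // ... install Q as the Q-values for this state ...
//   }
//   policyFile.close();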


// Discretization should be such that an already-discretized state stays the
// same; the numerical value of each bin is the center (average) of that bin.
std::vector<float> ValueIteration::discretizeState(const std::vector<float> &s){
  std::vector<float> ds(s.size());

  for (unsigned i = 0; i < s.size(); i++){

    // since this is sometimes done for discrete domains,
    // center the bins on 0 rather than putting a bin edge at 0
    //cout << "feat " << i << " range: " << featmax[i] << " " << featmin[i] << " " << (featmax[i]-featmin[i]) << " n: " << (float)statesPerDim;

    float factor = (featmax[i] - featmin[i]) / (float)statesPerDim[i];
    int bin = 0;
    if (s[i] > 0){
      bin = (int)((s[i]+factor/2) / factor);
    } else {
      bin = (int)((s[i]-factor/2) / factor);
    }

    ds[i] = factor*bin;
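    // Example: with featmin[i] = 0, featmax[i] = 10, statesPerDim[i] = 5,
    // factor is 2.0, so a raw value of 3.2 falls in bin 2 and discretizes to
    // 4.0, while 2.9 falls in bin 1 and discretizes to 2.0; values snap to the
    // nearest multiple of factor (i.e. to bin centers).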
    //cout << " factor: " << factor << " bin: " << bin;
    //cout << " Original: " << s[i] << " Discrete: " << ds[i] << endl;
  }

  return ds;
}


rl_agent
Author(s): Todd Hester