rl_agent: PolicyIteration.cc Source File

Go to the documentation of this file.
00001 #include "PolicyIteration.hh"
00002 #include <algorithm>
00003 
00004 //#include <time.h>
00005 #include <sys/time.h>
00006 
00007 
00008 PolicyIteration::PolicyIteration(int numactions, float gamma,
00009                                  int MAX_LOOPS, float MAX_TIME, int modelType,
00010                                  const std::vector<float> &fmax, 
00011                                  const std::vector<float> &fmin, 
00012                                  const std::vector<int> &n, Random r):
00013   numactions(numactions), gamma(gamma), 
00014   MAX_LOOPS(MAX_LOOPS), MAX_TIME(MAX_TIME), modelType(modelType),
00015   statesPerDim(n)
00016 {
00017   rng = r;
00018   
00019   nstates = 0;
00020   nactions = 0;
00021 
00022   timingType = false; //true;
00023 
00024   model = NULL;
00025   planTime = getSeconds();
00026 
00027   // algorithm options
00028   MAX_STEPS = 10; //50; //60; //80; //0; //5; //10;
00029 
00030   PLANNERDEBUG =false;
00031   POLICYDEBUG = false; //true; //false;
00032   ACTDEBUG = false; //true;
00033   MODELDEBUG = false; //true;
00034 
00035  if (statesPerDim[0] > 0){
00036     cout << "Planner PI using discretization of " << statesPerDim[0] << endl;
00037  }
00038 
00039   featmax = fmax;
00040   featmin = fmin;
00041 
00042 }
00043 
00044 PolicyIteration::~PolicyIteration() {
00045   for (std::map<state_t, state_info>::iterator i = statedata.begin();
00046        i != statedata.end(); i++){
00047     
00048     // get state's info
00049     //cout << "  planner got info" << endl;
00050     state_info* info = &((*i).second);
00051 
00052     deleteInfo(info);
00053   }
00054 
00055   statedata.clear();
00056 }
00057 
00058 void PolicyIteration::setModel(MDPModel* m){
00059 
00060   model = m;
00061 
00062 }
00063 
00064 
00066 // Functional functions :) //
00068 
00069 
00070 void PolicyIteration::initNewState(state_t s){
00071   if (PLANNERDEBUG) cout << "initNewState(s = " << s 
00072                      << ") size = " << s->size() << endl;
00073 
00074   if (MODELDEBUG) cout << "New State: " << endl;
00075 
00076   // create state info and add to hash map
00077   state_info* info = &(statedata[s]);
00078   initStateInfo(info);
00079 
00080   // init these from model
00081   for (int i = 0; i < numactions; i++){
00082     model->getStateActionInfo(*s, i, &(info->modelInfo[i]));
00083   }
00084 
00085 
00086   if (PLANNERDEBUG) cout << "done with initNewState()" << endl;
00087 
00088 }
00089 
00091 bool PolicyIteration::updateModelWithExperience(const std::vector<float> &laststate, 
00092                                                 int lastact, 
00093                                                 const std::vector<float> &currstate, 
00094                                                 float reward, bool term){
00095   if (PLANNERDEBUG) cout << "updateModelWithExperience(last = " << &laststate
00096                       << ", curr = " << &currstate
00097                       << ", lastact = " << lastact 
00098                       << ", r = " << reward
00099                       << ")" << endl;
00100   
00101   if (!timingType)
00102     planTime = getSeconds();
00103 
00104 
00105   // canonicalize these things
00106   state_t last = canonicalize(laststate);
00107   state_t curr = canonicalize(currstate);
00108 
00109   prevstate = laststate;
00110   prevact = lastact;
00111 
00112   // if not transition to terminal
00113   if (curr == NULL)
00114     return false;
00115 
00116   // get state info
00117   state_info* info = &(statedata[last]);
00118   
00119   // update the state visit count
00120   info->visits[lastact]++;
00121 
00122   // init model?
00123   if (model == NULL){
00124     cout << "ERROR IN MODEL OR MODEL SIZE" << endl;
00125     exit(-1);
00126   }
00127 
00128   experience e;
00129   e.s = *last;
00130   e.next = *curr;
00131   e.act = lastact;
00132   e.reward = reward;
00133   e.terminal = term;
00134   bool modelChanged = model->updateWithExperience(e);
00135 
00136   if (PLANNERDEBUG) cout << "Added exp: " << modelChanged << endl;
00137   if (timingType)
00138     planTime = getSeconds();
00139 
00140   return modelChanged;
00141 
00142 }
00143 
00145 void PolicyIteration::updateStateActionFromModel(const std::vector<float> &state, int a){
00146 if (PLANNERDEBUG) cout << "updateStateActionFromModel()" << endl;
00147 
00148  state_t s = canonicalize(state);
00149     
00150  // get state's info
00151  state_info* info = &(statedata[s]); 
00152  
00153  // update state info
00154  // get state action info for each action
00155  model->getStateActionInfo(state, a, &(info->modelInfo[a]));
00156  
00157  info->fresh = false;
00158 
00159 }
00160 
00162 void PolicyIteration::updateStatesFromModel(){
00163 if (PLANNERDEBUG) cout << "updateStatesFromModel()" << endl;
00164 
00165   // for each state
00166   for (std::set<std::vector<float> >::iterator i = statespace.begin();
00167        i != statespace.end(); i++){
00168     
00169     if (PLANNERDEBUG) cout << "updateStatesFromModel i = " << &i << endl;
00170     if ((getSeconds() - planTime) > MAX_TIME)
00171       break;
00172     
00173     state_t s = canonicalize(*i);
00174     
00175     // get state's info
00176     state_info* info = &(statedata[s]); 
00177 
00178     // update state info
00179     // get state action info for each action
00180     for (int j = 0; j < numactions; j++){
00181       model->getStateActionInfo(*s, j, &(info->modelInfo[j]));
00182     }
00183     
00184     //s2.clear();
00185 
00186     info->fresh = false;
00187     
00188     if (PLANNERDEBUG) cout << "updateStatesFromModel i = " << &i << " complete" << endl;
00189 
00190   }
00191 
00192 }
00193 
00194 
00196 int PolicyIteration::getBestAction(const std::vector<float> &state){
00197   if (PLANNERDEBUG) cout << "getBestAction(s = " << &state 
00198                       << ")" << endl;
00199   
00200   state_t s = canonicalize(state);
00201 
00202   // get state info
00203   state_info* info = &(statedata[s]);
00204   
00205   int act = info->bestAction;
00206 
00207   if (ACTDEBUG){
00208     cout << endl << "chooseAction State " << (*s)[0] << "," << (*s)[1] 
00209          << " act: " << act << " val: " << info->value << endl;
00210     for (int iAct = 0; iAct < numactions; iAct++){
00211       cout << " Action: " << iAct 
00212            << " visits: " << info->visits[iAct] 
00213            << " modelsAgree: " << info->modelInfo[iAct].known << endl;
00214     }
00215   }
00216 
00217   nactions++;
00218 
00219   // return index of action
00220   return act;
00221 }
00222 
00223 
00224 
00225 
00226 
00227 void PolicyIteration::planOnNewModel(){
00228 
00229   // update model info
00230   // can just update one for tabular model
00231   if (modelType == RMAX){
00232     updateStateActionFromModel(prevstate, prevact);
00233   }
00234   else {
00235     updateStatesFromModel();
00236   }
00237 
00238   // run value iteration
00239   createPolicy();
00240 
00241 }
00242 
00243 
00245 // Helper Functions       //
00247 
00248 PolicyIteration::state_t PolicyIteration::canonicalize(const std::vector<float> &s) {
00249   if (PLANNERDEBUG) cout << "canonicalize(s = " << s[0] << ", " 
00250                              << s[1] << ")" << endl; 
00251 
00252   std::vector<float> s2;
00253   if (statesPerDim[0] > 0){
00254     s2 = discretizeState(s);
00255   } else {
00256     s2 = s;
00257   }
00258 
00259   // get state_t for pointer if its in statespace
00260   const std::pair<std::set<std::vector<float> >::iterator, bool> result =
00261     statespace.insert(s2);
00262   state_t retval = &*result.first; // Dereference iterator then get pointer 
00263 
00264   if (PLANNERDEBUG) cout << " returns " << retval 
00265                      << " New: " << result.second << endl;
00266 
00267   // if not, init this new state
00268   if (result.second) { // s is new, so initialize Q(s,a) for all a
00269     initNewState(retval);
00270     if (PLANNERDEBUG) cout << " New state initialized" << endl;
00271   }
00272 
00273   
00274   return retval; 
00275 }
00276 
00277 
00278 // init state info
00279 void PolicyIteration::initStateInfo(state_info* info){
00280   if (PLANNERDEBUG) cout << "initStateInfo()";
00281 
00282   info->id = nstates++;
00283   if (PLANNERDEBUG) cout << " id = " << info->id << endl;
00284 
00285   info->fresh = true;
00286   info->stepsAway = 100000;
00287 
00288   // model data (transition, reward, known)
00289   info->modelInfo = new StateActionInfo[numactions];     
00290 
00291   // model q values, visit counts
00292   info->visits.resize(numactions, 0);
00293 
00294   info->value = 0.0;
00295   info->bestAction = rng.uniformDiscrete(0, numactions-1);
00296 
00297   if (PLANNERDEBUG) cout << "done with initStateInfo()" << endl;
00298 
00299 }
00300 
00301 
00303 void PolicyIteration::printStates(){
00304   
00305   for (std::set< std::vector<float> >::iterator i = statespace.begin();
00306        i != statespace.end(); i++){
00307 
00308     state_t s = canonicalize(*i);
00309 
00310     state_info* info = &(statedata[s]);
00311 
00312     cout << "State " << info->id << ": ";
00313     for (unsigned j = 0; j < s->size(); j++){
00314       cout << (*s)[j] << ", ";
00315     }
00316     cout << endl;
00317     
00318     cout << "Value: " << info->value 
00319          << " bestAction: " << info->bestAction << endl;
00320 
00321     for (int act = 0; act < numactions; act++){
00322       cout << " visits[" << act << "] = " << info->visits[act] 
00323            << " R: " << info->modelInfo[act].reward << endl;
00324     }
00325 
00326   }
00327 }
00328 
00329 
00330 
00331 
00333 void PolicyIteration::calculateReachableStates(){
00334   // for all states
00335   // set plausible flag to see if we need to calc q value for this state
00336   for (std::set<std::vector<float> >::iterator i = statespace.begin();
00337        i != statespace.end(); i++){
00338     
00339     state_t s = canonicalize(*i);
00340     
00341     // get state's info
00342     state_info* info = &(statedata[s]);  
00343 
00344     info->stepsAway = 100000;
00345 
00346     //if (info->fresh){
00347     //  info->stepsAway = 0;
00348     //  continue;
00349     //}
00350 
00351     for (int j = 0; j < numactions; j++){
00352       if (info->visits[j] > 0){
00353         info->stepsAway = 0;
00354         break;
00355       }
00356     }
00357   }
00358 }
00359 
00361 void PolicyIteration::removeUnreachableStates(){
00362 
00363   return;
00364 
00365   // for all states
00366   // set plausible flag to see if we need to calc q value for this state
00367   for (std::set<std::vector<float> >::iterator i = statespace.begin();
00368        i != statespace.end(); i++){
00369     
00370     state_t s = canonicalize(*i);
00371     
00372     // get state's info
00373     state_info* info = &(statedata[s]);  
00374     
00375     // state is unreachable
00376     if (info->stepsAway > MAX_STEPS){
00377 
00378       if (POLICYDEBUG){
00379         cout << "Removing unreachable state: " << (*s)[0];
00380         for (unsigned i = 1; i < s->size(); i++){
00381           cout << ", " << (*s)[i];
00382         }
00383         cout << endl;
00384       }
00385 
00386       // delete state
00387       deleteInfo(info);
00388 
00389       // remove from statespace
00390       statespace.erase(*s);
00391 
00392       // remove from statedata hashmap
00393       statedata.erase(s);
00394 
00395     }
00396   }
00397 }
00398 
00399 
00400 
00401 
00402 
00403 
00404 void PolicyIteration::deleteInfo(state_info* info){
00405 
00406   delete [] info->modelInfo;
00407 
00408 }
00409 
00410 
00411 double PolicyIteration::getSeconds(){
00412   struct timezone tz;
00413   timeval timeT;
00414   gettimeofday(&timeT, &tz);
00415   return  timeT.tv_sec + (timeT.tv_usec / 1000000.0);
00416 }
00417 
00418 
00420 void PolicyIteration::createPolicy(){
00421   if (PLANNERDEBUG || POLICYDEBUG) cout << endl << "createPolicy()" << endl;
00422 
00423   int nloops = 0;
00424 
00425   calculateReachableStates();
00426 
00427   //float initTime = getSeconds();
00428 
00429   bool policyStable = false;
00430 
00431   while (!policyStable && nloops < MAX_LOOPS){
00432 
00433     //if ((getSeconds() - initTime) > MAX_TIME)
00434     //  break;
00435 
00436     if ((getSeconds() - planTime) > MAX_TIME)
00437       break;
00438 
00439     // policy evaluation
00440     policyEvaluation();
00441 
00442     // policy improvement
00443     policyStable = policyImprovement();
00444 
00445     if (POLICYDEBUG)
00446       cout << "Loop " << nloops << " stable: " << policyStable << endl;
00447 
00448     nloops++;
00449 
00450   }
00451 
00452   if (POLICYDEBUG){
00453     cout << "Finished after " << nloops << " loop and " 
00454          << (getSeconds() - planTime)
00455          << " time. " << endl;
00456   }
00457 
00458 }
00459 
00460 
00461 bool PolicyIteration::policyImprovement(){
00462 
00463   if (POLICYDEBUG)
00464     cout << endl << "PolicyImprovement" << endl;
00465 
00466   bool policyStable = true;
00467 
00468   // for all states
00469   for (std::set<std::vector<float> >::iterator i = statespace.begin();
00470        i != statespace.end(); i++){
00471     
00472     if ((getSeconds() - planTime) > MAX_TIME)
00473       break;
00474     
00475     state_t s = canonicalize(*i);
00476     
00477     // get state's info
00478     state_info* info = &(statedata[s]); 
00479 
00480 
00481     if (POLICYDEBUG){
00482       cout << endl << " State: id: " << info->id << ": " ;
00483       for (unsigned si = 0; si < s->size(); si++){
00484         cout << (*s)[si] << ",";
00485       }
00486       cout << " Value: " << info->value  
00487            << " Action: " << info->bestAction << endl;
00488 
00489     }
00490 
00491     int prevAction = info->bestAction;
00492 
00493     std::vector<float> Q(numactions);
00494 
00495     // find action with maximum value
00496     for (int act = 0; act < numactions; act++){
00497 
00498       float val = getActionValue(s, info, act);
00499       Q[act] = val;
00500 
00501     }
00502 
00503     // get max of q's
00504     const std::vector<float>::iterator a =
00505       random_max_element(Q.begin(), Q.end()); // Choose maximum
00506     
00507     int act = a - Q.begin();
00508     float val = *a;
00509 
00510     // if this value is the same as what we were doing, keep old action
00511     if (val != Q[prevAction]){
00512       info->bestAction = act;
00513       info->value = val;
00514 
00515       policyStable = false;
00516 
00517       if (POLICYDEBUG)
00518         cout << "Action changed from " << prevAction << " to " 
00519              << info->bestAction << endl;
00520     }
00521     
00522   }
00523   
00524   return policyStable;
00525 
00526 }
00527 
00528 float PolicyIteration::getActionValue(state_t s, state_info* info, 
00529                                    int act){
00530   
00531 
00532   // get state action info for this action
00533   StateActionInfo *modelInfo = &(info->modelInfo[act]);
00534   
00535   if (POLICYDEBUG)
00536     cout << "  Action: " << act 
00537          << " State visits: " << info->visits[act] << endl;
00538   
00539   // Q = R + discounted val of next state
00540   // this is the R part :)
00541   float newQ = modelInfo->reward;
00542       
00543   float probSum = modelInfo->termProb;
00544   
00545   // for all next states, add discounted value appropriately
00546   // loop through next state's that are in this state-actions list
00547   for (std::map<std::vector<float>, float>::iterator outIt 
00548          = modelInfo->transitionProbs.begin();
00549        outIt != modelInfo->transitionProbs.end(); outIt++){
00550     
00551     std::vector<float> nextstate = (*outIt).first;  
00552     
00553     if (POLICYDEBUG){
00554       cout << "  Next state was: ";
00555       for (unsigned oi = 0; oi < nextstate.size(); oi++){
00556         cout << nextstate[oi] << ",";
00557       }
00558       cout << endl;
00559     }
00560     
00561     // get transition probability
00562     float transitionProb = (1.0-modelInfo->termProb)*
00563       modelInfo->transitionProbs[nextstate];
00564     
00565     probSum += transitionProb;
00566     
00567     if (POLICYDEBUG)
00568       cout << "   prob: " << transitionProb << endl;
00569     
00570     if (transitionProb < 0 || transitionProb > 1.0001){
00571       cout << "Error with transitionProb: " << transitionProb << endl;
00572       exit(-1);
00573     }
00574     
00575     // if there is some probability of this transition
00576     if (transitionProb > 0.0){
00577       
00578       // assume maxval of qmax if we don't know the state
00579       float maxval = 0.0;
00580       
00581       // make sure its a real state 
00582       bool realState = true;
00583       
00584       for (unsigned b = 0; b < nextstate.size(); b++){
00585         if (nextstate[b] < (featmin[b]-EPSILON)
00586             || nextstate[b] > (featmax[b]+EPSILON)){
00587           realState = false;
00588           if (POLICYDEBUG) 
00589             cout << "    Next state is not valid (feature " 
00590                  << b << " out of range)" << endl;
00591           break;
00592         }
00593       }
00594       
00595       // update q values for any states within MAX_STEPS of visited states
00596       if (info->stepsAway < MAX_STEPS && realState){
00597         
00598         state_t next = canonicalize(nextstate);
00599         
00600         state_info* nextinfo = &(statedata[next]);  
00601         //nextinfo->fresh = false;
00602         
00603         int newSteps = info->stepsAway + 1;
00604         if (newSteps < nextinfo->stepsAway){
00605           if (POLICYDEBUG) {
00606             cout << "    Setting state to " 
00607                  << newSteps << " steps away." << endl;
00608           }
00609           nextinfo->stepsAway = newSteps;
00610         }
00611         
00612         // find the max value of this next state
00613         maxval = nextinfo->value;
00614         
00615       } // within max steps
00616       else {
00617         maxval = 0.0;
00618         if (POLICYDEBUG){
00619           cout << "This state is too far away, state: ";
00620           for (unsigned si = 0; si < s->size(); si++){
00621             cout << (*s)[si] << ",";
00622           }
00623           cout << " Action: " << act << endl;
00624         }
00625       }
00626       
00627       nextstate.clear();
00628       
00629       if (POLICYDEBUG) cout << "    Max value: " << maxval << endl;
00630       
00631       // update q value with this value
00632       newQ += (gamma * transitionProb * maxval);
00633       
00634     } // transition probability > 0
00635     
00636   } // outcome loop                             
00637   
00638   
00639   if (probSum < 0.9999 || probSum > 1.0001){
00640     cout << "Error: transition probabilities do not add to 1: Sum: " 
00641          << probSum << endl;
00642     exit(-1);
00643   }
00644   
00645   
00646   return newQ;
00647     
00648 } // end of function
00649 
00650 
00651 
00652 void PolicyIteration::policyEvaluation(){
00653   
00654   if (POLICYDEBUG)
00655     cout << endl << "Policy Evaluation" << endl;
00656 
00657   float maxError = 5000;
00658   float MIN_ERROR = 0.0001;
00659   int nloops = 0;
00660 
00661   // until convergence
00662   while (maxError > MIN_ERROR && nloops < MAX_LOOPS){
00663 
00664 
00665     if (POLICYDEBUG) 
00666       cout << "max error: " << maxError << " nloops: " << nloops << endl;
00667 
00668     maxError = 0;
00669     nloops++;
00670 
00671     // for all states
00672     for (std::set<std::vector<float> >::iterator i = statespace.begin();
00673            i != statespace.end(); i++){
00674       
00675       if ((getSeconds() - planTime) > MAX_TIME)
00676         break;
00677 
00678       state_t s = canonicalize(*i);
00679 
00680       // get state's info
00681       state_info* info = &(statedata[s]);      
00682 
00683       if (POLICYDEBUG){
00684         cout << endl << " State: id: " << info->id << ": " ;
00685         for (unsigned si = 0; si < s->size(); si++){
00686           cout << (*s)[si] << ",";
00687         }
00688         cout << " Value: " << info->value 
00689              << " action: " << info->bestAction << endl;
00690         cout << " Steps: " << info->stepsAway << endl;
00691         
00692       }
00693 
00694       // skip states we can't reach
00695       if (info->stepsAway > 99999){
00696         if (POLICYDEBUG) 
00697           cout << "State not reachable, ignoring." << endl;
00698         continue;
00699       }
00700 
00701       // for action that is taken
00702       int act = info->bestAction;
00703 
00704       float oldValue = info->value;
00705 
00706       float newVal = getActionValue(s, info, act);
00707 
00708       info->value = newVal;
00709 
00710       // set q value
00711       float tdError = fabs(info->value - oldValue);
00712       if (POLICYDEBUG) cout << "  NewVal: " << info->value 
00713                             << " OldVal: " << oldValue << endl;
00714       
00715       // check max error
00716       if (tdError > maxError)
00717         maxError = tdError;
00718       
00719       if (POLICYDEBUG)
00720         cout << "  TD error: " << tdError 
00721              << " Max error: " << maxError << endl;
00722          
00723       
00724     } // state loop
00725     
00726   } // while not converged loop
00727   
00728   if (false || nloops >= MAX_LOOPS){
00729     cout << nactions << " Policy evaluation ended with maxError: " 
00730          << maxError << " nloops: " << nloops << endl;
00731   }
00732   
00733   if (POLICYDEBUG) cout << nactions 
00734                         << " policy eval complete: maxError: " 
00735                         << maxError << " nloops: " << nloops 
00736                         << endl;
00737   
00738     
00739 }
00740 
00741 
00742 void PolicyIteration::savePolicy(const char* filename){
00743 
00744   ofstream policyFile(filename, ios::out | ios::binary | ios::trunc);
00745 
00746   // first part, save the vector size
00747   int fsize = featmin.size();
00748   policyFile.write((char*)&fsize, sizeof(int));
00749 
00750   // save numactions
00751   policyFile.write((char*)&numactions, sizeof(int));
00752 
00753   // go through all states, and save Q values
00754   for (std::set< std::vector<float> >::iterator i = statespace.begin();
00755        i != statespace.end(); i++){
00756 
00757     state_t s = canonicalize(*i);
00758     state_info* info = &(statedata[s]);
00759 
00760     // save state
00761     policyFile.write((char*)&((*i)[0]), sizeof(float)*fsize);
00762 
00763     // save fake q-values (value for best, -1 less for suboptimal)
00764     float badvalue = info->value - 1.0;
00765     for (int j = 0; j < numactions; j++){
00766       if (j == info->bestAction){
00767         policyFile.write((char*)&(info->value), sizeof(float));
00768       } else {
00769         policyFile.write((char*)&(badvalue), sizeof(float));
00770       } 
00771     }
00772 
00773   }
00774 
00775   policyFile.close();
00776 }
00777 
00778 // should do it such that an already discretized state stays the same
00779 // mainly the numerical value of each bin should be the average of that bin
00780 std::vector<float> PolicyIteration::discretizeState(const std::vector<float> &s){
00781   std::vector<float> ds(s.size());
00782 
00783   for (unsigned i = 0; i < s.size(); i++){
00784 
00785     // since i'm sometimes doing this for discrete domains
00786     // want to center bins on 0, not edge on 0
00787     //cout << "feat " << i << " range: " << featmax[i] << " " << featmin[i] << " " << (featmax[i]-featmin[i]) << " n: " << (float)statesPerDim;
00788 
00789     float factor = (featmax[i] - featmin[i]) / (float)statesPerDim[i];
00790     int bin = 0;
00791     if (s[i] > 0){
00792       bin = (int)((s[i]+factor/2) / factor);
00793     } else {
00794       bin = (int)((s[i]-factor/2) / factor);
00795     }
00796 
00797     ds[i] = factor*bin;
00798     //cout << " factor: " << factor << " bin: " << bin;
00799     //cout << " Original: " << s[i] << " Discrete: " << ds[i] << endl;
00800   }
00801 
00802   return ds;
00803 }