rl_agent: Dyna.cc Source File

Go to the documentation of this file.
00001 #include <rl_agent/Dyna.hh>
00002 #include <algorithm>
00003 
00004 #include <sys/time.h>
00005 
00006 
00007 Dyna::Dyna(int numactions, float gamma,
00008            float initialvalue, float alpha, int k, float ep,
00009                    Random rng):
00010   numactions(numactions), gamma(gamma),
00011   initialvalue(initialvalue), alpha(alpha), k(k),
00012   rng(rng), currentq(NULL), laststate(NULL), lastact(0)
00013 {
00014 
00015   epsilon = ep;
00016   ACTDEBUG = false; //true; //false;
00017   cout << "Dyna agent with k:" << k << endl;
00018 
00019 }
00020 
00021 Dyna::~Dyna() {}
00022 
00023 int Dyna::first_action(const std::vector<float> &s) {
00024 
00025   if (ACTDEBUG){
00026     cout << "First - in state: ";
00027     printState(s);
00028     cout << endl;
00029   }
00030 
00031   return getBestAction(s);
00032 
00033 }
00034 
00035 int Dyna::getBestAction(const std::vector<float> &s){
00036   //cout << "get best action" << endl;
00037 
00038   // for some amount of time, update based on randomly sampled experiences
00039   int numExp = (int)experiences.size();
00040   for (int i = 0; i < k && numExp > 0; i++){
00041     
00042     // update from randoml sampled action
00043     int exp = 0;
00044     if (numExp > 1)
00045       exp = rng.uniformDiscrete(0, numExp-1);
00046     //cout << count << " Update exp " << exp << endl;
00047 
00048     dynaExperience e = experiences[exp];
00049 
00050     std::vector<float> &Q_s = Q[e.s];
00051     if (e.term){
00052       Q_s[e.a] += alpha * (e.r - Q_s[e.a]);
00053     } else {
00054       std::vector<float> &Q_next = Q[e.next];
00055       const std::vector<float>::iterator max =
00056         random_max_element(Q_next.begin(), Q_next.end());
00057       Q_s[e.a] += alpha * (e.r + (gamma * *max) - Q_s[e.a]);
00058     }
00059 
00060   }
00061 
00062 
00063   // then do normal action selection
00064   // Get action values
00065   state_t st = canonicalize(s);
00066   std::vector<float> &Q_s = Q[st];
00067 
00068   // Choose an action
00069   const std::vector<float>::iterator a =
00070     rng.uniform() < epsilon
00071     ? Q_s.begin() + rng.uniformDiscrete(0, numactions - 1) // Choose randomly
00072     : random_max_element(Q_s.begin(), Q_s.end()); // Choose maximum
00073 
00074   // Store location to update value later
00075   currentq = &*a;
00076   laststate = st;
00077   lastact = a - Q_s.begin();
00078 
00079   if (ACTDEBUG){
00080     cout << " act: " << (a-Q_s.begin()) << " val: " << *a << endl;
00081     for (int iAct = 0; iAct < numactions; iAct++){
00082       cout << " Action: " << iAct 
00083            << " val: " << Q_s[iAct] << endl;
00084     }
00085     cout << "Took action " << (a-Q_s.begin()) << " from state ";
00086     printState(s);
00087     cout << endl;
00088   }
00089 
00090   return a - Q_s.begin();
00091 }
00092 
00093 void Dyna::addExperience(float r, state_t s, bool term){
00094 
00095   dynaExperience e;
00096   e.s = laststate;
00097   e.a = lastact;
00098   e.next = s;
00099   e.r = r;
00100   e.term = term;
00101 
00102   experiences.push_back(e);
00103 
00104 }
00105 
00106 int Dyna::next_action(float r, const std::vector<float> &s) {
00107 
00108   if (ACTDEBUG){
00109     cout << "Next: got reward " << r << " in state: ";
00110     printState(s);
00111     cout << endl;
00112   }
00113 
00114   state_t st = canonicalize(s);
00115 
00116   addExperience(r,st,false);
00117 
00118   // Get action values
00119   std::vector<float> &Q_s = Q[st];
00120   const std::vector<float>::iterator max =
00121     random_max_element(Q_s.begin(), Q_s.end());
00122 
00123   // Update value of action just executed
00124   *currentq += alpha * (r + gamma * (*max) - *currentq);
00125 
00126   return getBestAction(s);
00127 
00128 }
00129 
00130 
00131 
00132 
00133 void Dyna::last_action(float r) {
00134 
00135   if (ACTDEBUG){
00136     cout << "Last: got reward " << r << endl;
00137   }
00138 
00139   addExperience(r,NULL,true);
00140 
00141   *currentq += alpha * (r - *currentq);
00142   currentq = NULL;
00143   laststate = NULL;
00144 }
00145 
00146 Dyna::state_t Dyna::canonicalize(const std::vector<float> &s) {
00147   const std::pair<std::set<std::vector<float> >::iterator, bool> result =
00148     statespace.insert(s);
00149   state_t retval = &*result.first; // Dereference iterator then get pointer 
00150   if (result.second) { // s is new, so initialize Q(s,a) for all a
00151     std::vector<float> &Q_s = Q[retval];
00152     Q_s.resize(numactions,initialvalue);
00153   }
00154   return retval; 
00155 }
00156 
00157 
00158 
00159   std::vector<float>::iterator
00160 Dyna::random_max_element(
00161                              std::vector<float>::iterator start,
00162                              std::vector<float>::iterator end) {
00163 
00164   std::vector<float>::iterator max =
00165     std::max_element(start, end);
00166   int n = std::count(max, end, *max);
00167   if (n > 1) {
00168     n = rng.uniformDiscrete(1, n);
00169     while (n > 1) {
00170       max = std::find(max + 1, end, *max);
00171       --n;
00172     }
00173   }
00174   return max;
00175 }
00176 
00177 
00178 
00179 
00180 void Dyna::setDebug(bool d){
00181   ACTDEBUG = d;
00182 }
00183 
00184 
00185 void Dyna::printState(const std::vector<float> &s){
00186   for (unsigned j = 0; j < s.size(); j++){
00187     cout << s[j] << ", ";
00188   }
00189 }
00190 
00191 
00192 
00193 void Dyna::seedExp(std::vector<experience> seeds){
00194 
00195   // for each seeding experience, update our model
00196   for (unsigned i = 0; i < seeds.size(); i++){
00197     experience e = seeds[i];
00198      
00199     laststate = canonicalize(e.s);
00200     lastact = e.act;
00201     state_t st = canonicalize(e.next);
00202     std::vector<float> &Q_s = Q[laststate];
00203     std::vector<float> &Q_next = Q[st];
00204     
00205     // add experience
00206     addExperience(e.reward,st,e.terminal);
00207 
00208     // get max value of next state
00209     const std::vector<float>::iterator max =
00210       random_max_element(Q_next.begin(), Q_next.end());
00211 
00212     // Get q value for action taken
00213     const std::vector<float>::iterator a = Q_s.begin() + e.act;
00214     currentq = &*a;
00215 
00216     // Update value of action just executed
00217     *currentq += alpha * (e.reward + gamma * (*max) - *currentq);
00218 
00219  
00220     /*
00221     cout << "Seeding with experience " << i << endl;
00222     cout << "last: " << (e.s)[0] << ", " << (e.s)[1] << ", " 
00223          << (e.s)[2] << endl;
00224     cout << "act: " << e.act << " r: " << e.reward << endl;
00225     cout << "next: " << (e.next)[0] << ", " << (e.next)[1] << ", " 
00226          << (e.next)[2] << ", " << e.terminal << endl;
00227     cout << "Q: " << *currentq << " max: " << *max << endl;
00228     */
00229 
00230   }
00231 
00232 
00233 }
00234 
00235 void Dyna::logValues(ofstream *of, int xmin, int xmax, int ymin, int ymax){
00236   std::vector<float> s;
00237   s.resize(2, 0.0);
00238   for (int i = xmin ; i < xmax; i++){
00239     for (int j = ymin; j < ymax; j++){
00240       s[0] = j;
00241       s[1] = i;
00242       std::vector<float> &Q_s = Q[canonicalize(s)];
00243       const std::vector<float>::iterator max =
00244         random_max_element(Q_s.begin(), Q_s.end());
00245       *of << (*max) << ",";
00246     }
00247   }
00248 }
00249 
00250 
00251 float Dyna::getValue(std::vector<float> state){
00252 
00253   state_t s = canonicalize(state);
00254 
00255   // Get Q values
00256   std::vector<float> &Q_s = Q[s];
00257 
00258   // Choose an action
00259   const std::vector<float>::iterator a =
00260     random_max_element(Q_s.begin(), Q_s.end()); // Choose maximum
00261 
00262   // Get avg value
00263   float valSum = 0.0;
00264   float cnt = 0;
00265   for (std::set<std::vector<float> >::iterator i = statespace.begin();
00266        i != statespace.end(); i++){
00267 
00268     state_t s = canonicalize(*i);
00269 
00270     // get state's info
00271     std::vector<float> &Q_s = Q[s];
00272       
00273     for (int j = 0; j < numactions; j++){
00274       valSum += Q_s[j];
00275       cnt++;
00276     }
00277   }
00278 
00279   cout << "Avg Value: " << (valSum / cnt) << endl;
00280 
00281   return *a;
00282 }
00283 
00284 
00285 void Dyna::savePolicy(const char* filename){
00286 
00287   ofstream policyFile(filename, ios::out | ios::binary | ios::trunc);
00288 
00289   // first part, save the vector size
00290   std::set< std::vector<float> >::iterator i = statespace.begin();
00291   int fsize = (*i).size();
00292   policyFile.write((char*)&fsize, sizeof(int));
00293 
00294   // save numactions
00295   policyFile.write((char*)&numactions, sizeof(int));
00296 
00297   // go through all states, and save Q values
00298   for (std::set< std::vector<float> >::iterator i = statespace.begin();
00299        i != statespace.end(); i++){
00300 
00301     state_t s = canonicalize(*i);
00302     std::vector<float> *Q_s = &(Q[s]);
00303 
00304     // save state
00305     policyFile.write((char*)&((*i)[0]), sizeof(float)*fsize);
00306 
00307     // save q-values
00308     policyFile.write((char*)&((*Q_s)[0]), sizeof(float)*numactions);
00309 
00310   }
00311 
00312   policyFile.close();
00313 }
00314 
00315 
00316 
00317 double Dyna::getSeconds(){
00318   struct timezone tz;
00319   timeval timeT;
00320   gettimeofday(&timeT, &tz);
00321   return  timeT.tv_sec + (timeT.tv_usec / 1000000.0);
00322 }