00001 #include <rl_agent/QLearner.hh>
00002 #include <algorithm>
00003
00004 QLearner::QLearner(int numactions, float gamma,
00005 float initialvalue, float alpha, float ep,
00006 Random rng):
00007 numactions(numactions), gamma(gamma),
00008 initialvalue(initialvalue), alpha(alpha),
00009 rng(rng), currentq(NULL)
00010 {
00011
00012 epsilon = ep;
00013 ACTDEBUG = false;
00014
00015 }
00016
// Nothing to release explicitly: all owned state lives in value-type members.
QLearner::~QLearner() {}
00018
00019 int QLearner::first_action(const std::vector<float> &s) {
00020
00021 if (ACTDEBUG){
00022 cout << "First - in state: ";
00023 printState(s);
00024 cout << endl;
00025 }
00026
00027
00028 std::vector<float> &Q_s = Q[canonicalize(s)];
00029
00030
00031 const std::vector<float>::iterator a =
00032 rng.uniform() < epsilon
00033 ? Q_s.begin() + rng.uniformDiscrete(0, numactions - 1)
00034 : random_max_element(Q_s.begin(), Q_s.end());
00035
00036
00037 currentq = &*a;
00038
00039 if (ACTDEBUG){
00040 cout << " act: " << (a-Q_s.begin()) << " val: " << *a << endl;
00041 for (int iAct = 0; iAct < numactions; iAct++){
00042 cout << " Action: " << iAct
00043 << " val: " << Q_s[iAct] << endl;
00044 }
00045 cout << "Took action " << (a-Q_s.begin()) << " from state ";
00046 printState(s);
00047 cout << endl;
00048 }
00049
00050 return a - Q_s.begin();
00051 }
00052
00053 int QLearner::next_action(float r, const std::vector<float> &s) {
00054
00055 if (ACTDEBUG){
00056 cout << "Next: got reward " << r << " in state: ";
00057 printState(s);
00058 cout << endl;
00059 }
00060
00061
00062 std::vector<float> &Q_s = Q[canonicalize(s)];
00063 const std::vector<float>::iterator max =
00064 random_max_element(Q_s.begin(), Q_s.end());
00065
00066
00067 *currentq += alpha * (r + gamma * (*max) - *currentq);
00068
00069
00070 const std::vector<float>::iterator a =
00071 rng.uniform() < epsilon
00072 ? Q_s.begin() + rng.uniformDiscrete(0, numactions - 1)
00073 : max;
00074
00075
00076 currentq = &*a;
00077
00078 if (ACTDEBUG){
00079 cout << " act: " << (a-Q_s.begin()) << " val: " << *a << endl;
00080 for (int iAct = 0; iAct < numactions; iAct++){
00081 cout << " Action: " << iAct
00082 << " val: " << Q_s[iAct] << endl;
00083 }
00084 cout << "Took action " << (a-Q_s.begin()) << " from state ";
00085 printState(s);
00086 cout << endl;
00087 }
00088
00089 return a - Q_s.begin();
00090 }
00091
00092 void QLearner::last_action(float r) {
00093
00094 if (ACTDEBUG){
00095 cout << "Last: got reward " << r << endl;
00096 }
00097
00098 *currentq += alpha * (r - *currentq);
00099 currentq = NULL;
00100 }
00101
00102 QLearner::state_t QLearner::canonicalize(const std::vector<float> &s) {
00103 const std::pair<std::set<std::vector<float> >::iterator, bool> result =
00104 statespace.insert(s);
00105 state_t retval = &*result.first;
00106 if (result.second) {
00107 std::vector<float> &Q_s = Q[retval];
00108 Q_s.resize(numactions,initialvalue);
00109 }
00110 return retval;
00111 }
00112
00113
00114
00115 std::vector<float>::iterator
00116 QLearner::random_max_element(
00117 std::vector<float>::iterator start,
00118 std::vector<float>::iterator end) {
00119
00120 std::vector<float>::iterator max =
00121 std::max_element(start, end);
00122 int n = std::count(max, end, *max);
00123 if (n > 1) {
00124 n = rng.uniformDiscrete(1, n);
00125 while (n > 1) {
00126 max = std::find(max + 1, end, *max);
00127 --n;
00128 }
00129 }
00130 return max;
00131 }
00132
00133
00134
00135
// Enable or disable verbose per-action debug output on stdout.
void QLearner::setDebug(bool d){
  ACTDEBUG = d;
}
00139
00140
00141 void QLearner::printState(const std::vector<float> &s){
00142 for (unsigned j = 0; j < s.size(); j++){
00143 cout << s[j] << ", ";
00144 }
00145 }
00146
00147
00148
00149 void QLearner::seedExp(std::vector<experience> seeds){
00150
00151
00152 for (unsigned i = 0; i < seeds.size(); i++){
00153 experience e = seeds[i];
00154
00155 std::vector<float> &Q_s = Q[canonicalize(e.s)];
00156 std::vector<float> &Q_next = Q[canonicalize(e.next)];
00157
00158
00159 const std::vector<float>::iterator max =
00160 random_max_element(Q_next.begin(), Q_next.end());
00161
00162
00163 const std::vector<float>::iterator a = Q_s.begin() + e.act;
00164 currentq = &*a;
00165
00166
00167 *currentq += alpha * (e.reward + gamma * (*max) - *currentq);
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180 }
00181
00182
00183 }
00184
00185 void QLearner::logValues(ofstream *of, int xmin, int xmax, int ymin, int ymax){
00186 std::vector<float> s;
00187 s.resize(2, 0.0);
00188 for (int i = xmin ; i < xmax; i++){
00189 for (int j = ymin; j < ymax; j++){
00190 s[0] = j;
00191 s[1] = i;
00192 std::vector<float> &Q_s = Q[canonicalize(s)];
00193 const std::vector<float>::iterator max =
00194 random_max_element(Q_s.begin(), Q_s.end());
00195 *of << (*max) << ",";
00196 }
00197 }
00198 }
00199
00200
00201 float QLearner::getValue(std::vector<float> state){
00202
00203 state_t s = canonicalize(state);
00204
00205
00206 std::vector<float> &Q_s = Q[s];
00207
00208
00209 const std::vector<float>::iterator a =
00210 random_max_element(Q_s.begin(), Q_s.end());
00211
00212
00213 float valSum = 0.0;
00214 float cnt = 0;
00215 for (std::set<std::vector<float> >::iterator i = statespace.begin();
00216 i != statespace.end(); i++){
00217
00218 state_t s = canonicalize(*i);
00219
00220
00221 std::vector<float> &Q_s = Q[s];
00222
00223 for (int j = 0; j < numactions; j++){
00224 valSum += Q_s[j];
00225 cnt++;
00226 }
00227 }
00228
00229 cout << "Avg Value: " << (valSum / cnt) << endl;
00230
00231 return *a;
00232 }
00233
00234
00235 void QLearner::savePolicy(const char* filename){
00236
00237 ofstream policyFile(filename, ios::out | ios::binary | ios::trunc);
00238
00239
00240 std::set< std::vector<float> >::iterator i = statespace.begin();
00241 int fsize = (*i).size();
00242 policyFile.write((char*)&fsize, sizeof(int));
00243
00244
00245 policyFile.write((char*)&numactions, sizeof(int));
00246
00247
00248 for (std::set< std::vector<float> >::iterator i = statespace.begin();
00249 i != statespace.end(); i++){
00250
00251 state_t s = canonicalize(*i);
00252 std::vector<float> *Q_s = &(Q[s]);
00253
00254
00255 policyFile.write((char*)&((*i)[0]), sizeof(float)*fsize);
00256
00257
00258 policyFile.write((char*)&((*Q_s)[0]), sizeof(float)*numactions);
00259
00260 }
00261
00262 policyFile.close();
00263 }
00264
00265
00266 void QLearner::loadPolicy(const char* filename){
00267 bool LOADDEBUG = false;
00268
00269 ifstream policyFile(filename, ios::in | ios::binary);
00270 if (!policyFile.is_open())
00271 return;
00272
00273
00274 int fsize;
00275 policyFile.read((char*)&fsize, sizeof(int));
00276 if (LOADDEBUG) cout << "Numfeats loaded: " << fsize << endl;
00277
00278
00279 int nact;
00280 policyFile.read((char*)&nact, sizeof(int));
00281
00282 if (nact != numactions){
00283 cout << "this policy is not valid loaded nact: " << nact
00284 << " was told: " << numactions << endl;
00285 exit(-1);
00286 }
00287
00288
00289 while(!policyFile.eof()){
00290 std::vector<float> state;
00291 state.resize(fsize, 0.0);
00292
00293
00294 policyFile.read((char*)&(state[0]), sizeof(float)*fsize);
00295 if (LOADDEBUG){
00296 cout << "load policy for state: ";
00297 printState(state);
00298 }
00299
00300 state_t s = canonicalize(state);
00301 std::vector<float> *Q_s = &(Q[s]);
00302
00303 if (policyFile.eof()) break;
00304
00305
00306 policyFile.read((char*)&((*Q_s)[0]), sizeof(float)*numactions);
00307
00308 if (LOADDEBUG){
00309 cout << "Q values: " << endl;
00310 for (int iAct = 0; iAct < numactions; iAct++){
00311 cout << " Action: " << iAct << " val: " << (*Q_s)[iAct] << endl;
00312 }
00313 }
00314 }
00315
00316 policyFile.close();
00317 cout << "Policy loaded!!!" << endl;
00318
00319 }
00320
00321