00001 #include <rl_agent/Dyna.hh>
00002 #include <algorithm>
00003
00004 #include <sys/time.h>
00005
00006
00007 Dyna::Dyna(int numactions, float gamma,
00008 float initialvalue, float alpha, int k, float ep,
00009 Random rng):
00010 numactions(numactions), gamma(gamma),
00011 initialvalue(initialvalue), alpha(alpha), k(k),
00012 rng(rng), currentq(NULL), laststate(NULL), lastact(0)
00013 {
00014
00015 epsilon = ep;
00016 ACTDEBUG = false;
00017 cout << "Dyna agent with k:" << k << endl;
00018
00019 }
00020
00021 Dyna::~Dyna() {}
00022
00023 int Dyna::first_action(const std::vector<float> &s) {
00024
00025 if (ACTDEBUG){
00026 cout << "First - in state: ";
00027 printState(s);
00028 cout << endl;
00029 }
00030
00031 return getBestAction(s);
00032
00033 }
00034
00035 int Dyna::getBestAction(const std::vector<float> &s){
00036
00037
00038
00039 int numExp = (int)experiences.size();
00040 for (int i = 0; i < k && numExp > 0; i++){
00041
00042
00043 int exp = 0;
00044 if (numExp > 1)
00045 exp = rng.uniformDiscrete(0, numExp-1);
00046
00047
00048 dynaExperience e = experiences[exp];
00049
00050 std::vector<float> &Q_s = Q[e.s];
00051 if (e.term){
00052 Q_s[e.a] += alpha * (e.r - Q_s[e.a]);
00053 } else {
00054 std::vector<float> &Q_next = Q[e.next];
00055 const std::vector<float>::iterator max =
00056 random_max_element(Q_next.begin(), Q_next.end());
00057 Q_s[e.a] += alpha * (e.r + (gamma * *max) - Q_s[e.a]);
00058 }
00059
00060 }
00061
00062
00063
00064
00065 state_t st = canonicalize(s);
00066 std::vector<float> &Q_s = Q[st];
00067
00068
00069 const std::vector<float>::iterator a =
00070 rng.uniform() < epsilon
00071 ? Q_s.begin() + rng.uniformDiscrete(0, numactions - 1)
00072 : random_max_element(Q_s.begin(), Q_s.end());
00073
00074
00075 currentq = &*a;
00076 laststate = st;
00077 lastact = a - Q_s.begin();
00078
00079 if (ACTDEBUG){
00080 cout << " act: " << (a-Q_s.begin()) << " val: " << *a << endl;
00081 for (int iAct = 0; iAct < numactions; iAct++){
00082 cout << " Action: " << iAct
00083 << " val: " << Q_s[iAct] << endl;
00084 }
00085 cout << "Took action " << (a-Q_s.begin()) << " from state ";
00086 printState(s);
00087 cout << endl;
00088 }
00089
00090 return a - Q_s.begin();
00091 }
00092
00093 void Dyna::addExperience(float r, state_t s, bool term){
00094
00095 dynaExperience e;
00096 e.s = laststate;
00097 e.a = lastact;
00098 e.next = s;
00099 e.r = r;
00100 e.term = term;
00101
00102 experiences.push_back(e);
00103
00104 }
00105
00106 int Dyna::next_action(float r, const std::vector<float> &s) {
00107
00108 if (ACTDEBUG){
00109 cout << "Next: got reward " << r << " in state: ";
00110 printState(s);
00111 cout << endl;
00112 }
00113
00114 state_t st = canonicalize(s);
00115
00116 addExperience(r,st,false);
00117
00118
00119 std::vector<float> &Q_s = Q[st];
00120 const std::vector<float>::iterator max =
00121 random_max_element(Q_s.begin(), Q_s.end());
00122
00123
00124 *currentq += alpha * (r + gamma * (*max) - *currentq);
00125
00126 return getBestAction(s);
00127
00128 }
00129
00130
00131
00132
00133 void Dyna::last_action(float r) {
00134
00135 if (ACTDEBUG){
00136 cout << "Last: got reward " << r << endl;
00137 }
00138
00139 addExperience(r,NULL,true);
00140
00141 *currentq += alpha * (r - *currentq);
00142 currentq = NULL;
00143 laststate = NULL;
00144 }
00145
00146 Dyna::state_t Dyna::canonicalize(const std::vector<float> &s) {
00147 const std::pair<std::set<std::vector<float> >::iterator, bool> result =
00148 statespace.insert(s);
00149 state_t retval = &*result.first;
00150 if (result.second) {
00151 std::vector<float> &Q_s = Q[retval];
00152 Q_s.resize(numactions,initialvalue);
00153 }
00154 return retval;
00155 }
00156
00157
00158
00159 std::vector<float>::iterator
00160 Dyna::random_max_element(
00161 std::vector<float>::iterator start,
00162 std::vector<float>::iterator end) {
00163
00164 std::vector<float>::iterator max =
00165 std::max_element(start, end);
00166 int n = std::count(max, end, *max);
00167 if (n > 1) {
00168 n = rng.uniformDiscrete(1, n);
00169 while (n > 1) {
00170 max = std::find(max + 1, end, *max);
00171 --n;
00172 }
00173 }
00174 return max;
00175 }
00176
00177
00178
00179
00180 void Dyna::setDebug(bool d){
00181 ACTDEBUG = d;
00182 }
00183
00184
00185 void Dyna::printState(const std::vector<float> &s){
00186 for (unsigned j = 0; j < s.size(); j++){
00187 cout << s[j] << ", ";
00188 }
00189 }
00190
00191
00192
00193 void Dyna::seedExp(std::vector<experience> seeds){
00194
00195
00196 for (unsigned i = 0; i < seeds.size(); i++){
00197 experience e = seeds[i];
00198
00199 laststate = canonicalize(e.s);
00200 lastact = e.act;
00201 state_t st = canonicalize(e.next);
00202 std::vector<float> &Q_s = Q[laststate];
00203 std::vector<float> &Q_next = Q[st];
00204
00205
00206 addExperience(e.reward,st,e.terminal);
00207
00208
00209 const std::vector<float>::iterator max =
00210 random_max_element(Q_next.begin(), Q_next.end());
00211
00212
00213 const std::vector<float>::iterator a = Q_s.begin() + e.act;
00214 currentq = &*a;
00215
00216
00217 *currentq += alpha * (e.reward + gamma * (*max) - *currentq);
00218
00219
00220
00221
00222
00223
00224
00225
00226
00227
00228
00229
00230 }
00231
00232
00233 }
00234
00235 void Dyna::logValues(ofstream *of, int xmin, int xmax, int ymin, int ymax){
00236 std::vector<float> s;
00237 s.resize(2, 0.0);
00238 for (int i = xmin ; i < xmax; i++){
00239 for (int j = ymin; j < ymax; j++){
00240 s[0] = j;
00241 s[1] = i;
00242 std::vector<float> &Q_s = Q[canonicalize(s)];
00243 const std::vector<float>::iterator max =
00244 random_max_element(Q_s.begin(), Q_s.end());
00245 *of << (*max) << ",";
00246 }
00247 }
00248 }
00249
00250
00251 float Dyna::getValue(std::vector<float> state){
00252
00253 state_t s = canonicalize(state);
00254
00255
00256 std::vector<float> &Q_s = Q[s];
00257
00258
00259 const std::vector<float>::iterator a =
00260 random_max_element(Q_s.begin(), Q_s.end());
00261
00262
00263 float valSum = 0.0;
00264 float cnt = 0;
00265 for (std::set<std::vector<float> >::iterator i = statespace.begin();
00266 i != statespace.end(); i++){
00267
00268 state_t s = canonicalize(*i);
00269
00270
00271 std::vector<float> &Q_s = Q[s];
00272
00273 for (int j = 0; j < numactions; j++){
00274 valSum += Q_s[j];
00275 cnt++;
00276 }
00277 }
00278
00279 cout << "Avg Value: " << (valSum / cnt) << endl;
00280
00281 return *a;
00282 }
00283
00284
00285 void Dyna::savePolicy(const char* filename){
00286
00287 ofstream policyFile(filename, ios::out | ios::binary | ios::trunc);
00288
00289
00290 std::set< std::vector<float> >::iterator i = statespace.begin();
00291 int fsize = (*i).size();
00292 policyFile.write((char*)&fsize, sizeof(int));
00293
00294
00295 policyFile.write((char*)&numactions, sizeof(int));
00296
00297
00298 for (std::set< std::vector<float> >::iterator i = statespace.begin();
00299 i != statespace.end(); i++){
00300
00301 state_t s = canonicalize(*i);
00302 std::vector<float> *Q_s = &(Q[s]);
00303
00304
00305 policyFile.write((char*)&((*i)[0]), sizeof(float)*fsize);
00306
00307
00308 policyFile.write((char*)&((*Q_s)[0]), sizeof(float)*numactions);
00309
00310 }
00311
00312 policyFile.close();
00313 }
00314
00315
00316
00317 double Dyna::getSeconds(){
00318 struct timezone tz;
00319 timeval timeT;
00320 gettimeofday(&timeT, &tz);
00321 return timeT.tv_sec + (timeT.tv_usec / 1000000.0);
00322 }