00001 #include <rl_agent/QLearner.hh>
00002 #include <algorithm>
00003
00004 QLearner::QLearner(int numactions, float gamma,
00005 float initialvalue, float alpha, float ep,
00006 Random rng):
00007 numactions(numactions), gamma(gamma),
00008 initialvalue(initialvalue), alpha(alpha),
00009 rng(rng), currentq(NULL)
00010 {
00011
00012 epsilon = ep;
00013 ACTDEBUG = false;
00014
00015 }
00016
// Nothing to release explicitly: all owned state lives in value-type members.
QLearner::~QLearner() {}
00018
00019 int QLearner::first_action(const std::vector<float> &s) {
00020
00021 if (ACTDEBUG){
00022 cout << "First - in state: ";
00023 printState(s);
00024 cout << endl;
00025 }
00026
00027
00028 std::vector<float> &Q_s = Q[canonicalize(s)];
00029
00030
00031 const std::vector<float>::iterator a =
00032 rng.uniform() < epsilon
00033 ? Q_s.begin() + rng.uniformDiscrete(0, numactions - 1)
00034 : random_max_element(Q_s.begin(), Q_s.end());
00035
00036
00037 currentq = &*a;
00038
00039 if (ACTDEBUG){
00040 cout << " act: " << (a-Q_s.begin()) << " val: " << *a << endl;
00041 for (int iAct = 0; iAct < numactions; iAct++){
00042 cout << " Action: " << iAct
00043 << " val: " << Q_s[iAct] << endl;
00044 }
00045 cout << "Took action " << (a-Q_s.begin()) << " from state ";
00046 printState(s);
00047 cout << endl;
00048 }
00049
00050 return a - Q_s.begin();
00051 }
00052
00053 int QLearner::next_action(float r, const std::vector<float> &s) {
00054
00055 if (ACTDEBUG){
00056 cout << "Next: got reward " << r << " in state: ";
00057 printState(s);
00058 cout << endl;
00059 }
00060
00061
00062 std::vector<float> &Q_s = Q[canonicalize(s)];
00063 const std::vector<float>::iterator max =
00064 random_max_element(Q_s.begin(), Q_s.end());
00065
00066
00067 *currentq += alpha * (r + gamma * (*max) - *currentq);
00068
00069
00070 const std::vector<float>::iterator a =
00071 rng.uniform() < epsilon
00072 ? Q_s.begin() + rng.uniformDiscrete(0, numactions - 1)
00073 : max;
00074
00075
00076 currentq = &*a;
00077
00078 if (ACTDEBUG){
00079 cout << " act: " << (a-Q_s.begin()) << " val: " << *a << endl;
00080 for (int iAct = 0; iAct < numactions; iAct++){
00081 cout << " Action: " << iAct
00082 << " val: " << Q_s[iAct] << endl;
00083 }
00084 cout << "Took action " << (a-Q_s.begin()) << " from state ";
00085 printState(s);
00086 cout << endl;
00087 }
00088
00089 return a - Q_s.begin();
00090 }
00091
00092 void QLearner::last_action(float r) {
00093
00094 if (ACTDEBUG){
00095 cout << "Last: got reward " << r << endl;
00096 }
00097
00098 *currentq += alpha * (r - *currentq);
00099 currentq = NULL;
00100 }
00101
00102 QLearner::state_t QLearner::canonicalize(const std::vector<float> &s) {
00103 const std::pair<std::set<std::vector<float> >::iterator, bool> result =
00104 statespace.insert(s);
00105 state_t retval = &*result.first;
00106 if (result.second) {
00107 std::vector<float> &Q_s = Q[retval];
00108 Q_s.resize(numactions,initialvalue);
00109 }
00110 return retval;
00111 }
00112
00113
00114
00115 std::vector<float>::iterator
00116 QLearner::random_max_element(
00117 std::vector<float>::iterator start,
00118 std::vector<float>::iterator end) {
00119
00120 std::vector<float>::iterator max =
00121 std::max_element(start, end);
00122 int n = std::count(max, end, *max);
00123 if (n > 1) {
00124 n = rng.uniformDiscrete(1, n);
00125 while (n > 1) {
00126 max = std::find(max + 1, end, *max);
00127 --n;
00128 }
00129 }
00130 return max;
00131 }
00132
00133
00134
00135
// Enable or disable verbose per-action debug output on stdout.
void QLearner::setDebug(bool d){
  ACTDEBUG = d;
}
00139
00140
00141 void QLearner::printState(const std::vector<float> &s){
00142 for (unsigned j = 0; j < s.size(); j++){
00143 cout << s[j] << ", ";
00144 }
00145 }
00146
00147
00148
00149 void QLearner::seedExp(std::vector<experience> seeds){
00150
00151
00152 for (unsigned i = 0; i < seeds.size(); i++){
00153 experience e = seeds[i];
00154
00155 std::vector<float> &Q_s = Q[canonicalize(e.s)];
00156 std::vector<float> &Q_next = Q[canonicalize(e.next)];
00157
00158
00159 const std::vector<float>::iterator max =
00160 random_max_element(Q_next.begin(), Q_next.end());
00161
00162
00163 const std::vector<float>::iterator a = Q_s.begin() + e.act;
00164 currentq = &*a;
00165
00166
00167 *currentq += alpha * (e.reward + gamma * (*max) - *currentq);
00168
00169
00170
00171
00172
00173
00174
00175
00176
00177
00178
00179
00180 }
00181
00182
00183 }
00184
00185 void QLearner::logValues(ofstream *of, int xmin, int xmax, int ymin, int ymax){
00186 std::vector<float> s;
00187 s.resize(2, 0.0);
00188 for (int i = xmin ; i < xmax; i++){
00189 for (int j = ymin; j < ymax; j++){
00190 s[0] = j;
00191 s[1] = i;
00192 std::vector<float> &Q_s = Q[canonicalize(s)];
00193 const std::vector<float>::iterator max =
00194 random_max_element(Q_s.begin(), Q_s.end());
00195 *of << (*max) << ",";
00196 }
00197 }
00198 }
00199
00200
00201 float QLearner::getValue(std::vector<float> state){
00202
00203 state_t s = canonicalize(state);
00204
00205
00206 std::vector<float> &Q_s = Q[s];
00207
00208
00209 const std::vector<float>::iterator a =
00210 random_max_element(Q_s.begin(), Q_s.end());
00211
00212
00213 float valSum = 0.0;
00214 float cnt = 0;
00215 for (std::set<std::vector<float> >::iterator i = statespace.begin();
00216 i != statespace.end(); i++){
00217
00218 state_t s = canonicalize(*i);
00219
00220
00221 std::vector<float> &Q_s = Q[s];
00222
00223 for (int j = 0; j < numactions; j++){
00224 valSum += Q_s[j];
00225 cnt++;
00226 }
00227 }
00228
00229 cout << "Avg Value: " << (valSum / cnt) << endl;
00230
00231 return *a;
00232 }
00233
00234
00235 void QLearner::savePolicy(const char* filename){
00236
00237 ofstream policyFile(filename, ios::out | ios::binary | ios::trunc);
00238
00239
00240 std::set< std::vector<float> >::iterator i = statespace.begin();
00241 int fsize = (*i).size();
00242 policyFile.write((char*)&fsize, sizeof(int));
00243
00244
00245 policyFile.write((char*)&numactions, sizeof(int));
00246
00247
00248 for (std::set< std::vector<float> >::iterator i = statespace.begin();
00249 i != statespace.end(); i++){
00250
00251 state_t s = canonicalize(*i);
00252 std::vector<float> *Q_s = &(Q[s]);
00253
00254
00255 policyFile.write((char*)&((*i)[0]), sizeof(float)*fsize);
00256
00257
00258 policyFile.write((char*)&((*Q_s)[0]), sizeof(float)*numactions);
00259
00260 }
00261
00262 policyFile.close();
00263 }
00264
00265
00266 void QLearner::loadPolicy(const char* filename){
00267 bool LOADDEBUG = false;
00268
00269 ifstream policyFile(filename, ios::in | ios::binary);
00270 if (!policyFile.is_open())
00271 return;
00272
00273
00274 int fsize;
00275 policyFile.read((char*)&fsize, sizeof(int));
00276 if (LOADDEBUG) cout << "Numfeats loaded: " << fsize << endl;
00277
00278
00279 int nact;
00280 policyFile.read((char*)&nact, sizeof(int));
00281
00282 if (nact != numactions){
00283 cout << "this policy is not valid loaded nact: " << nact
00284 << " was told: " << numactions << endl;
00285 exit(-1);
00286 }
00287
00288
00289 while(!policyFile.eof()){
00290 std::vector<float> state;
00291 state.resize(fsize, 0.0);
00292
00293
00294 policyFile.read((char*)&(state[0]), sizeof(float)*fsize);
00295 if (LOADDEBUG){
00296 cout << "load policy for state: ";
00297 printState(state);
00298 }
00299
00300 state_t s = canonicalize(state);
00301 std::vector<float> *Q_s = &(Q[s]);
00302
00303 if (policyFile.eof()) break;
00304
00305
00306 policyFile.read((char*)&((*Q_s)[0]), sizeof(float)*numactions);
00307
00308 if (LOADDEBUG){
00309 cout << "Q values: " << endl;
00310 for (int iAct = 0; iAct < numactions; iAct++){
00311 cout << " Action: " << iAct << " val: " << (*Q_s)[iAct] << endl;
00312 }
00313 }
00314 }
00315
00316 policyFile.close();
00317 cout << "Policy loaded!!!" << endl;
00318
00319 }
00320
00321