00001 #include "PolicyIteration.hh"
00002 #include <algorithm>
00003
00004
00005 #include <sys/time.h>
00006
00007
00008 PolicyIteration::PolicyIteration(int numactions, float gamma,
00009 int MAX_LOOPS, float MAX_TIME, int modelType,
00010 const std::vector<float> &fmax,
00011 const std::vector<float> &fmin,
00012 const std::vector<int> &n, Random r):
00013 numactions(numactions), gamma(gamma),
00014 MAX_LOOPS(MAX_LOOPS), MAX_TIME(MAX_TIME), modelType(modelType),
00015 statesPerDim(n)
00016 {
00017 rng = r;
00018
00019 nstates = 0;
00020 nactions = 0;
00021
00022 timingType = false;
00023
00024 model = NULL;
00025 planTime = getSeconds();
00026
00027
00028 MAX_STEPS = 10;
00029
00030 PLANNERDEBUG =false;
00031 POLICYDEBUG = false;
00032 ACTDEBUG = false;
00033 MODELDEBUG = false;
00034
00035 if (statesPerDim[0] > 0){
00036 cout << "Planner PI using discretization of " << statesPerDim[0] << endl;
00037 }
00038
00039 featmax = fmax;
00040 featmin = fmin;
00041
00042 }
00043
00044 PolicyIteration::~PolicyIteration() {
00045 for (std::map<state_t, state_info>::iterator i = statedata.begin();
00046 i != statedata.end(); i++){
00047
00048
00049
00050 state_info* info = &((*i).second);
00051
00052 deleteInfo(info);
00053 }
00054
00055 statedata.clear();
00056 }
00057
00058 void PolicyIteration::setModel(MDPModel* m){
00059
00060 model = m;
00061
00062 }
00063
00064
00066
00068
00069
00070 void PolicyIteration::initNewState(state_t s){
00071 if (PLANNERDEBUG) cout << "initNewState(s = " << s
00072 << ") size = " << s->size() << endl;
00073
00074 if (MODELDEBUG) cout << "New State: " << endl;
00075
00076
00077 state_info* info = &(statedata[s]);
00078 initStateInfo(info);
00079
00080
00081 for (int i = 0; i < numactions; i++){
00082 model->getStateActionInfo(*s, i, &(info->modelInfo[i]));
00083 }
00084
00085
00086 if (PLANNERDEBUG) cout << "done with initNewState()" << endl;
00087
00088 }
00089
00091 bool PolicyIteration::updateModelWithExperience(const std::vector<float> &laststate,
00092 int lastact,
00093 const std::vector<float> &currstate,
00094 float reward, bool term){
00095 if (PLANNERDEBUG) cout << "updateModelWithExperience(last = " << &laststate
00096 << ", curr = " << &currstate
00097 << ", lastact = " << lastact
00098 << ", r = " << reward
00099 << ")" << endl;
00100
00101 if (!timingType)
00102 planTime = getSeconds();
00103
00104
00105
00106 state_t last = canonicalize(laststate);
00107 state_t curr = canonicalize(currstate);
00108
00109 prevstate = laststate;
00110 prevact = lastact;
00111
00112
00113 if (curr == NULL)
00114 return false;
00115
00116
00117 state_info* info = &(statedata[last]);
00118
00119
00120 info->visits[lastact]++;
00121
00122
00123 if (model == NULL){
00124 cout << "ERROR IN MODEL OR MODEL SIZE" << endl;
00125 exit(-1);
00126 }
00127
00128 experience e;
00129 e.s = *last;
00130 e.next = *curr;
00131 e.act = lastact;
00132 e.reward = reward;
00133 e.terminal = term;
00134 bool modelChanged = model->updateWithExperience(e);
00135
00136 if (PLANNERDEBUG) cout << "Added exp: " << modelChanged << endl;
00137 if (timingType)
00138 planTime = getSeconds();
00139
00140 return modelChanged;
00141
00142 }
00143
00145 void PolicyIteration::updateStateActionFromModel(const std::vector<float> &state, int a){
00146 if (PLANNERDEBUG) cout << "updateStateActionFromModel()" << endl;
00147
00148 state_t s = canonicalize(state);
00149
00150
00151 state_info* info = &(statedata[s]);
00152
00153
00154
00155 model->getStateActionInfo(state, a, &(info->modelInfo[a]));
00156
00157 info->fresh = false;
00158
00159 }
00160
00162 void PolicyIteration::updateStatesFromModel(){
00163 if (PLANNERDEBUG) cout << "updateStatesFromModel()" << endl;
00164
00165
00166 for (std::set<std::vector<float> >::iterator i = statespace.begin();
00167 i != statespace.end(); i++){
00168
00169 if (PLANNERDEBUG) cout << "updateStatesFromModel i = " << &i << endl;
00170 if ((getSeconds() - planTime) > MAX_TIME)
00171 break;
00172
00173 state_t s = canonicalize(*i);
00174
00175
00176 state_info* info = &(statedata[s]);
00177
00178
00179
00180 for (int j = 0; j < numactions; j++){
00181 model->getStateActionInfo(*s, j, &(info->modelInfo[j]));
00182 }
00183
00184
00185
00186 info->fresh = false;
00187
00188 if (PLANNERDEBUG) cout << "updateStatesFromModel i = " << &i << " complete" << endl;
00189
00190 }
00191
00192 }
00193
00194
00196 int PolicyIteration::getBestAction(const std::vector<float> &state){
00197 if (PLANNERDEBUG) cout << "getBestAction(s = " << &state
00198 << ")" << endl;
00199
00200 state_t s = canonicalize(state);
00201
00202
00203 state_info* info = &(statedata[s]);
00204
00205 int act = info->bestAction;
00206
00207 if (ACTDEBUG){
00208 cout << endl << "chooseAction State " << (*s)[0] << "," << (*s)[1]
00209 << " act: " << act << " val: " << info->value << endl;
00210 for (int iAct = 0; iAct < numactions; iAct++){
00211 cout << " Action: " << iAct
00212 << " visits: " << info->visits[iAct]
00213 << " modelsAgree: " << info->modelInfo[iAct].known << endl;
00214 }
00215 }
00216
00217 nactions++;
00218
00219
00220 return act;
00221 }
00222
00223
00224
00225
00226
00227 void PolicyIteration::planOnNewModel(){
00228
00229
00230
00231 if (modelType == RMAX){
00232 updateStateActionFromModel(prevstate, prevact);
00233 }
00234 else {
00235 updateStatesFromModel();
00236 }
00237
00238
00239 createPolicy();
00240
00241 }
00242
00243
00245
00247
00248 PolicyIteration::state_t PolicyIteration::canonicalize(const std::vector<float> &s) {
00249 if (PLANNERDEBUG) cout << "canonicalize(s = " << s[0] << ", "
00250 << s[1] << ")" << endl;
00251
00252 std::vector<float> s2;
00253 if (statesPerDim[0] > 0){
00254 s2 = discretizeState(s);
00255 } else {
00256 s2 = s;
00257 }
00258
00259
00260 const std::pair<std::set<std::vector<float> >::iterator, bool> result =
00261 statespace.insert(s2);
00262 state_t retval = &*result.first;
00263
00264 if (PLANNERDEBUG) cout << " returns " << retval
00265 << " New: " << result.second << endl;
00266
00267
00268 if (result.second) {
00269 initNewState(retval);
00270 if (PLANNERDEBUG) cout << " New state initialized" << endl;
00271 }
00272
00273
00274 return retval;
00275 }
00276
00277
00278
00279 void PolicyIteration::initStateInfo(state_info* info){
00280 if (PLANNERDEBUG) cout << "initStateInfo()";
00281
00282 info->id = nstates++;
00283 if (PLANNERDEBUG) cout << " id = " << info->id << endl;
00284
00285 info->fresh = true;
00286 info->stepsAway = 100000;
00287
00288
00289 info->modelInfo = new StateActionInfo[numactions];
00290
00291
00292 info->visits.resize(numactions, 0);
00293
00294 info->value = 0.0;
00295 info->bestAction = rng.uniformDiscrete(0, numactions-1);
00296
00297 if (PLANNERDEBUG) cout << "done with initStateInfo()" << endl;
00298
00299 }
00300
00301
00303 void PolicyIteration::printStates(){
00304
00305 for (std::set< std::vector<float> >::iterator i = statespace.begin();
00306 i != statespace.end(); i++){
00307
00308 state_t s = canonicalize(*i);
00309
00310 state_info* info = &(statedata[s]);
00311
00312 cout << "State " << info->id << ": ";
00313 for (unsigned j = 0; j < s->size(); j++){
00314 cout << (*s)[j] << ", ";
00315 }
00316 cout << endl;
00317
00318 cout << "Value: " << info->value
00319 << " bestAction: " << info->bestAction << endl;
00320
00321 for (int act = 0; act < numactions; act++){
00322 cout << " visits[" << act << "] = " << info->visits[act]
00323 << " R: " << info->modelInfo[act].reward << endl;
00324 }
00325
00326 }
00327 }
00328
00329
00330
00331
00333 void PolicyIteration::calculateReachableStates(){
00334
00335
00336 for (std::set<std::vector<float> >::iterator i = statespace.begin();
00337 i != statespace.end(); i++){
00338
00339 state_t s = canonicalize(*i);
00340
00341
00342 state_info* info = &(statedata[s]);
00343
00344 info->stepsAway = 100000;
00345
00346
00347
00348
00349
00350
00351 for (int j = 0; j < numactions; j++){
00352 if (info->visits[j] > 0){
00353 info->stepsAway = 0;
00354 break;
00355 }
00356 }
00357 }
00358 }
00359
00361 void PolicyIteration::removeUnreachableStates(){
00362
00363 return;
00364
00365
00366
00367 for (std::set<std::vector<float> >::iterator i = statespace.begin();
00368 i != statespace.end(); i++){
00369
00370 state_t s = canonicalize(*i);
00371
00372
00373 state_info* info = &(statedata[s]);
00374
00375
00376 if (info->stepsAway > MAX_STEPS){
00377
00378 if (POLICYDEBUG){
00379 cout << "Removing unreachable state: " << (*s)[0];
00380 for (unsigned i = 1; i < s->size(); i++){
00381 cout << ", " << (*s)[i];
00382 }
00383 cout << endl;
00384 }
00385
00386
00387 deleteInfo(info);
00388
00389
00390 statespace.erase(*s);
00391
00392
00393 statedata.erase(s);
00394
00395 }
00396 }
00397 }
00398
00399
00400
00401
00402
00403
00404 void PolicyIteration::deleteInfo(state_info* info){
00405
00406 delete [] info->modelInfo;
00407
00408 }
00409
00410
00411 double PolicyIteration::getSeconds(){
00412 struct timezone tz;
00413 timeval timeT;
00414 gettimeofday(&timeT, &tz);
00415 return timeT.tv_sec + (timeT.tv_usec / 1000000.0);
00416 }
00417
00418
00420 void PolicyIteration::createPolicy(){
00421 if (PLANNERDEBUG || POLICYDEBUG) cout << endl << "createPolicy()" << endl;
00422
00423 int nloops = 0;
00424
00425 calculateReachableStates();
00426
00427
00428
00429 bool policyStable = false;
00430
00431 while (!policyStable && nloops < MAX_LOOPS){
00432
00433
00434
00435
00436 if ((getSeconds() - planTime) > MAX_TIME)
00437 break;
00438
00439
00440 policyEvaluation();
00441
00442
00443 policyStable = policyImprovement();
00444
00445 if (POLICYDEBUG)
00446 cout << "Loop " << nloops << " stable: " << policyStable << endl;
00447
00448 nloops++;
00449
00450 }
00451
00452 if (POLICYDEBUG){
00453 cout << "Finished after " << nloops << " loop and "
00454 << (getSeconds() - planTime)
00455 << " time. " << endl;
00456 }
00457
00458 }
00459
00460
00461 bool PolicyIteration::policyImprovement(){
00462
00463 if (POLICYDEBUG)
00464 cout << endl << "PolicyImprovement" << endl;
00465
00466 bool policyStable = true;
00467
00468
00469 for (std::set<std::vector<float> >::iterator i = statespace.begin();
00470 i != statespace.end(); i++){
00471
00472 if ((getSeconds() - planTime) > MAX_TIME)
00473 break;
00474
00475 state_t s = canonicalize(*i);
00476
00477
00478 state_info* info = &(statedata[s]);
00479
00480
00481 if (POLICYDEBUG){
00482 cout << endl << " State: id: " << info->id << ": " ;
00483 for (unsigned si = 0; si < s->size(); si++){
00484 cout << (*s)[si] << ",";
00485 }
00486 cout << " Value: " << info->value
00487 << " Action: " << info->bestAction << endl;
00488
00489 }
00490
00491 int prevAction = info->bestAction;
00492
00493 std::vector<float> Q(numactions);
00494
00495
00496 for (int act = 0; act < numactions; act++){
00497
00498 float val = getActionValue(s, info, act);
00499 Q[act] = val;
00500
00501 }
00502
00503
00504 const std::vector<float>::iterator a =
00505 random_max_element(Q.begin(), Q.end());
00506
00507 int act = a - Q.begin();
00508 float val = *a;
00509
00510
00511 if (val != Q[prevAction]){
00512 info->bestAction = act;
00513 info->value = val;
00514
00515 policyStable = false;
00516
00517 if (POLICYDEBUG)
00518 cout << "Action changed from " << prevAction << " to "
00519 << info->bestAction << endl;
00520 }
00521
00522 }
00523
00524 return policyStable;
00525
00526 }
00527
00528 float PolicyIteration::getActionValue(state_t s, state_info* info,
00529 int act){
00530
00531
00532
00533 StateActionInfo *modelInfo = &(info->modelInfo[act]);
00534
00535 if (POLICYDEBUG)
00536 cout << " Action: " << act
00537 << " State visits: " << info->visits[act] << endl;
00538
00539
00540
00541 float newQ = modelInfo->reward;
00542
00543 float probSum = modelInfo->termProb;
00544
00545
00546
00547 for (std::map<std::vector<float>, float>::iterator outIt
00548 = modelInfo->transitionProbs.begin();
00549 outIt != modelInfo->transitionProbs.end(); outIt++){
00550
00551 std::vector<float> nextstate = (*outIt).first;
00552
00553 if (POLICYDEBUG){
00554 cout << " Next state was: ";
00555 for (unsigned oi = 0; oi < nextstate.size(); oi++){
00556 cout << nextstate[oi] << ",";
00557 }
00558 cout << endl;
00559 }
00560
00561
00562 float transitionProb = (1.0-modelInfo->termProb)*
00563 modelInfo->transitionProbs[nextstate];
00564
00565 probSum += transitionProb;
00566
00567 if (POLICYDEBUG)
00568 cout << " prob: " << transitionProb << endl;
00569
00570 if (transitionProb < 0 || transitionProb > 1.0001){
00571 cout << "Error with transitionProb: " << transitionProb << endl;
00572 exit(-1);
00573 }
00574
00575
00576 if (transitionProb > 0.0){
00577
00578
00579 float maxval = 0.0;
00580
00581
00582 bool realState = true;
00583
00584 for (unsigned b = 0; b < nextstate.size(); b++){
00585 if (nextstate[b] < (featmin[b]-EPSILON)
00586 || nextstate[b] > (featmax[b]+EPSILON)){
00587 realState = false;
00588 if (POLICYDEBUG)
00589 cout << " Next state is not valid (feature "
00590 << b << " out of range)" << endl;
00591 break;
00592 }
00593 }
00594
00595
00596 if (info->stepsAway < MAX_STEPS && realState){
00597
00598 state_t next = canonicalize(nextstate);
00599
00600 state_info* nextinfo = &(statedata[next]);
00601
00602
00603 int newSteps = info->stepsAway + 1;
00604 if (newSteps < nextinfo->stepsAway){
00605 if (POLICYDEBUG) {
00606 cout << " Setting state to "
00607 << newSteps << " steps away." << endl;
00608 }
00609 nextinfo->stepsAway = newSteps;
00610 }
00611
00612
00613 maxval = nextinfo->value;
00614
00615 }
00616 else {
00617 maxval = 0.0;
00618 if (POLICYDEBUG){
00619 cout << "This state is too far away, state: ";
00620 for (unsigned si = 0; si < s->size(); si++){
00621 cout << (*s)[si] << ",";
00622 }
00623 cout << " Action: " << act << endl;
00624 }
00625 }
00626
00627 nextstate.clear();
00628
00629 if (POLICYDEBUG) cout << " Max value: " << maxval << endl;
00630
00631
00632 newQ += (gamma * transitionProb * maxval);
00633
00634 }
00635
00636 }
00637
00638
00639 if (probSum < 0.9999 || probSum > 1.0001){
00640 cout << "Error: transition probabilities do not add to 1: Sum: "
00641 << probSum << endl;
00642 exit(-1);
00643 }
00644
00645
00646 return newQ;
00647
00648 }
00649
00650
00651
00652 void PolicyIteration::policyEvaluation(){
00653
00654 if (POLICYDEBUG)
00655 cout << endl << "Policy Evaluation" << endl;
00656
00657 float maxError = 5000;
00658 float MIN_ERROR = 0.0001;
00659 int nloops = 0;
00660
00661
00662 while (maxError > MIN_ERROR && nloops < MAX_LOOPS){
00663
00664
00665 if (POLICYDEBUG)
00666 cout << "max error: " << maxError << " nloops: " << nloops << endl;
00667
00668 maxError = 0;
00669 nloops++;
00670
00671
00672 for (std::set<std::vector<float> >::iterator i = statespace.begin();
00673 i != statespace.end(); i++){
00674
00675 if ((getSeconds() - planTime) > MAX_TIME)
00676 break;
00677
00678 state_t s = canonicalize(*i);
00679
00680
00681 state_info* info = &(statedata[s]);
00682
00683 if (POLICYDEBUG){
00684 cout << endl << " State: id: " << info->id << ": " ;
00685 for (unsigned si = 0; si < s->size(); si++){
00686 cout << (*s)[si] << ",";
00687 }
00688 cout << " Value: " << info->value
00689 << " action: " << info->bestAction << endl;
00690 cout << " Steps: " << info->stepsAway << endl;
00691
00692 }
00693
00694
00695 if (info->stepsAway > 99999){
00696 if (POLICYDEBUG)
00697 cout << "State not reachable, ignoring." << endl;
00698 continue;
00699 }
00700
00701
00702 int act = info->bestAction;
00703
00704 float oldValue = info->value;
00705
00706 float newVal = getActionValue(s, info, act);
00707
00708 info->value = newVal;
00709
00710
00711 float tdError = fabs(info->value - oldValue);
00712 if (POLICYDEBUG) cout << " NewVal: " << info->value
00713 << " OldVal: " << oldValue << endl;
00714
00715
00716 if (tdError > maxError)
00717 maxError = tdError;
00718
00719 if (POLICYDEBUG)
00720 cout << " TD error: " << tdError
00721 << " Max error: " << maxError << endl;
00722
00723
00724 }
00725
00726 }
00727
00728 if (false || nloops >= MAX_LOOPS){
00729 cout << nactions << " Policy evaluation ended with maxError: "
00730 << maxError << " nloops: " << nloops << endl;
00731 }
00732
00733 if (POLICYDEBUG) cout << nactions
00734 << " policy eval complete: maxError: "
00735 << maxError << " nloops: " << nloops
00736 << endl;
00737
00738
00739 }
00740
00741
00742 void PolicyIteration::savePolicy(const char* filename){
00743
00744 ofstream policyFile(filename, ios::out | ios::binary | ios::trunc);
00745
00746
00747 int fsize = featmin.size();
00748 policyFile.write((char*)&fsize, sizeof(int));
00749
00750
00751 policyFile.write((char*)&numactions, sizeof(int));
00752
00753
00754 for (std::set< std::vector<float> >::iterator i = statespace.begin();
00755 i != statespace.end(); i++){
00756
00757 state_t s = canonicalize(*i);
00758 state_info* info = &(statedata[s]);
00759
00760
00761 policyFile.write((char*)&((*i)[0]), sizeof(float)*fsize);
00762
00763
00764 float badvalue = info->value - 1.0;
00765 for (int j = 0; j < numactions; j++){
00766 if (j == info->bestAction){
00767 policyFile.write((char*)&(info->value), sizeof(float));
00768 } else {
00769 policyFile.write((char*)&(badvalue), sizeof(float));
00770 }
00771 }
00772
00773 }
00774
00775 policyFile.close();
00776 }
00777
00778
00779
00780 std::vector<float> PolicyIteration::discretizeState(const std::vector<float> &s){
00781 std::vector<float> ds(s.size());
00782
00783 for (unsigned i = 0; i < s.size(); i++){
00784
00785
00786
00787
00788
00789 float factor = (featmax[i] - featmin[i]) / (float)statesPerDim[i];
00790 int bin = 0;
00791 if (s[i] > 0){
00792 bin = (int)((s[i]+factor/2) / factor);
00793 } else {
00794 bin = (int)((s[i]-factor/2) / factor);
00795 }
00796
00797 ds[i] = factor*bin;
00798
00799
00800 }
00801
00802 return ds;
00803 }