00001
00008 #include <stdlib.h>
00009
00010 #include <stdio.h>
00011 #include <assert.h>
00012
00013 #include <iostream>
00014 #include <fstream>
00015
00016 #include "MOMDP.h"
00017 #include "SARSOP.h"
00018 #include "BlindLBInitializer.h"
00019
00020 using namespace std;
00021
00022 using namespace momdp;
00023
00024 #define PRUNE_EPS (1e-10)
00025
00026
00027
00028
00029
00030
00031 namespace momdp {
00032
00033 BlindLBInitializer::BlindLBInitializer(SharedPointer<MOMDP> _pomdp, AlphaPlanePoolSet* _bound) {
00034 pomdp = (SharedPointer<MOMDP>) _pomdp;
00035 bound = _bound;
00036 }
00037
00038 void BlindLBInitializer::initialize(double targetPrecision)
00039 {
00040
00041 if(pomdp->XStates->size() != 1 && pomdp->hasPOMDPMatrices())
00042 {
00043
00044 initBlind_unfac(targetPrecision);
00045 }
00046 else
00047 {
00048 initBlind(targetPrecision);
00049 }
00050 }
00051
00052
00053 void BlindLBInitializer::initBlindWorstCase_unfac(alpha_vector& weakAlpha)
00054 {
00055 DEBUG_TRACE(cout << "BlindLBInitializer::initBlindWorstCase_unfac " << endl; );
00056
00057
00058 double worstStateVal;
00059 int safestAction = -1;
00060 double worstCaseReward = -99e+20;
00061
00062
00063 int numStatesObs = pomdp->XStates->size();
00064 int numStatesUnobs = pomdp->YStates->size();
00065 FOR (a, pomdp->getNumActions())
00066 {
00067 worstStateVal = 99e+20;
00068
00069
00070
00071 FOR (s, numStatesObs * numStatesUnobs)
00072 {
00073
00074 worstStateVal = std::min(worstStateVal, (*(pomdp->pomdpR))(s,a));
00075 }
00076 if (worstStateVal > worstCaseReward) {
00077 safestAction = a;
00078 worstCaseReward = worstStateVal;
00079 }
00080 }
00081
00082
00083
00084 DenseVector worstCaseDVector(numStatesObs*numStatesUnobs);
00085 assert(pomdp->getDiscount() < 1);
00086 double worstCaseLongTerm = worstCaseReward / (1 - pomdp->getDiscount());
00087 FOR (i, numStatesObs*numStatesUnobs)
00088 {
00089 worstCaseDVector(i) = worstCaseLongTerm;
00090 }
00091
00092
00093
00094
00095 FOR (i, numStatesObs*numStatesUnobs) {
00096
00097
00098 unsigned int x = (unsigned int) i/(numStatesUnobs);
00099 unsigned int y = i % (numStatesUnobs);
00100
00101 if (pomdp->isPOMDPTerminalState[x][y]) {
00102
00103 worstCaseDVector(i) = 0.0;
00104 }
00105 }
00106
00107 copy(weakAlpha, worstCaseDVector);
00108
00109 DEBUG_TRACE(cout << "weakAlpha" << endl; );
00110 DEBUG_TRACE( weakAlpha.write(cout) << endl; );
00111 }
00112
00113 void BlindLBInitializer::initBlind_unfac(double targetPrecision)
00114 {
00115 DEBUG_TRACE(cout << "BlindLBInitializer::initBlind_unfac " << targetPrecision << endl; );
00116
00117 int numStatesObs = pomdp->XStates->size();
00118 int numStatesUnobs = pomdp->YStates->size();
00119 alpha_vector al(numStatesObs*numStatesUnobs);
00120 alpha_vector nextAl, tmp, diff;
00121 alpha_vector weakAl;
00122 belief_vector dummy;
00123 double maxResidual;
00124
00125 std::vector<alpha_vector> als(numStatesObs);
00126
00127 initBlindWorstCase_unfac(weakAl);
00128
00129
00130
00131
00132
00133
00134
00135
00136 FOR (a, pomdp->actions->size())
00137 {
00138 DEBUG_TRACE(cout << "a " << a << endl; );
00139 al = weakAl;
00140
00141 do
00142 {
00143
00144 mult(nextAl, *(*(pomdp->pomdpT))[a], al);
00145
00146
00147
00148
00149
00150
00151
00152 nextAl *= pomdp->discount;
00153
00154
00155
00156
00157 copy_from_column(tmp, *(pomdp->pomdpR), a);
00158
00159
00160
00161
00162 nextAl += tmp;
00163
00164
00165
00166
00167
00168 diff = nextAl;
00169 diff -= al;
00170
00171
00172
00173
00174 maxResidual = diff.norm_inf();
00175
00176 al = nextAl;
00177
00178 DEBUG_TRACE(cout << "maxResidual " << maxResidual << endl; );
00179 DEBUG_TRACE(cout << "targetPrecision " << targetPrecision << endl; );
00180 } while (maxResidual > targetPrecision);
00181
00182
00183 #if USE_DEBUG_PRINT
00184 cout << "initLowerBoundBlind: a=" << a << " al=" << sparseRep(al) << endl;
00185 #endif
00186
00187
00188
00189
00190
00191
00192
00193
00194
00195
00196
00197
00198 FOR (s, numStatesObs * numStatesUnobs)
00199 {
00200
00201 unsigned int x = (unsigned int) s/(numStatesUnobs);
00202 unsigned int y = s % (numStatesUnobs);
00203
00204 if (y==0) als[x].resize(numStatesUnobs);
00205
00206 als[x](y) = al(s);
00207 }
00208
00209 FOR (sval, numStatesObs )
00210 {
00211 if (a == 0)
00212 {
00213 bound->set[sval]->planes.clear();
00214 }
00215
00216 DEBUG_TRACE(cout << "als[sval] " << sval << endl; );
00217 DEBUG_TRACE(als[sval].write(cout) << endl; );
00218
00219 SharedPointer<AlphaPlane> plane (new AlphaPlane());
00220 copy(*plane->alpha, als[sval]);
00221 plane->action = a;
00222 plane->sval = sval;
00223 plane->setTimeStamp(0);
00224
00225 SARSOPAlphaPlaneTuple *tempTuple = (SARSOPAlphaPlaneTuple *)plane->solverData;
00226 tempTuple->certed = 0;
00227
00228 bound->set[sval]->addAlphaPlane(plane);
00229
00230
00231
00232
00233
00234 }
00235
00236 }
00237
00238
00239
00240 }
00241
00242
00243
00244 void BlindLBInitializer::initBlindWorstCase(std::vector<alpha_vector>& weakAlphas)
00245 {
00246 DEBUG_TRACE(cout << "BlindLBInitializer::initBlindWorstCase " << endl; );
00247
00248
00249
00250 double worstStateVal;
00251 int safestAction = -1;
00252 double worstCaseReward;
00253
00254
00255 int numStates = pomdp->YStates->size();
00256 int numStatesObs = pomdp->XStates->size();
00257
00258 double worstCaseLongTerm;
00259 assert(pomdp->getDiscount() < 1);
00260
00261 FOR (sval, numStatesObs)
00262 {
00263 worstCaseReward = -99e+20;
00264 SharedPointer<SparseMatrix> rewardMatrix = pomdp->rewards->getMatrix(sval);
00265
00266 FOR (a, pomdp->getNumActions())
00267 {
00268 worstStateVal = 99e+20;
00269
00270 FOR (s, numStates)
00271 {
00272 worstStateVal = std::min(worstStateVal, rewardMatrix->operator ()(s,a));
00273
00274 }
00275 if (worstStateVal > worstCaseReward)
00276 {
00277 safestAction = a;
00278 worstCaseReward = worstStateVal;
00279 }
00280 }
00281
00282
00283
00284
00285 bool penultimateTerminal = true;
00286 bool isTerminal;
00287 SparseVector T_xa;
00288
00289 FOR (y, numStates)
00290 {
00291 copy_from_column( T_xa, *(pomdp->XTrans->getMatrixTr(safestAction, sval)), y);
00292
00293 FOR (nonZeroEntry,T_xa.data.size()) {
00294
00295 isTerminal = true;
00296
00297 FOR (yn, numStates)
00298 {
00299 if (!(pomdp->isPOMDPTerminalState[T_xa.data[nonZeroEntry].index][yn])) {
00300 isTerminal = false;
00301 break;
00302
00303 }
00304 }
00305
00306 if (!isTerminal) {
00307 penultimateTerminal = false;
00308 break;
00309 }
00310 }
00311 }
00312
00313 if (penultimateTerminal) {
00314 worstCaseLongTerm = worstCaseReward;
00315 } else {
00316 assert(pomdp->getDiscount() < 1);
00317 worstCaseLongTerm = worstCaseReward / (1 - pomdp->getDiscount());
00318 }
00319
00320
00321
00322
00323 weakAlphas[sval].resize(numStates);
00324 FOR (i, numStates)
00325 {
00326
00327
00328
00329 if (pomdp->isPOMDPTerminalState[sval][i])
00330 {
00331 weakAlphas[sval](i) = 0.0;
00332 }
00333 else
00334 {
00335 weakAlphas[sval](i) = worstCaseLongTerm;
00336 }
00337
00338 }
00339
00340 }
00341
00342 }
00343
00344
00345 int BlindLBInitializer::initBlindWorstCaseIntegrated(std::vector<alpha_vector>& weakAlphas)
00346 {
00347 DEBUG_TRACE(cout << "BlindLBInitializer::initBlindWorstCaseIntegrated" << endl; );
00348
00349
00350 double worstStateVal;
00351 int safestAction = -1;
00352 double worstCaseReward = -99e+20;
00353 int numStates = pomdp->YStates->size();
00354 int numStatesObs = pomdp->XStates->size();
00355
00356
00357 FOR (a, pomdp->getNumActions())
00358 {
00359 worstStateVal = 99e+20;
00360
00361 FOR (sval, numStatesObs)
00362 {
00363 SharedPointer<SparseMatrix> rewardMatrix = pomdp->rewards->getMatrix(sval);
00364 FOR (s, numStates)
00365 {
00366 worstStateVal = std::min(worstStateVal, rewardMatrix->operator ()(s,a));
00367 }
00368 }
00369 if (worstStateVal > worstCaseReward)
00370 {
00371 safestAction = a;
00372 worstCaseReward = worstStateVal;
00373 }
00374 }
00375
00376
00377 double worstCaseLongTerm;
00378
00379 assert(pomdp->getDiscount() < 1);
00380 worstCaseLongTerm = worstCaseReward / (1 - pomdp->getDiscount());
00381
00382 FOR (sval, numStatesObs)
00383 {
00384 weakAlphas[sval].resize(numStates);
00385 FOR (i, numStates)
00386 {
00387
00388
00389
00390 if (pomdp->isPOMDPTerminalState[sval][i])
00391 {
00392 weakAlphas[sval](i) = 0.0;
00393
00394
00395 }
00396 else
00397 {
00398 weakAlphas[sval](i) = worstCaseLongTerm;
00399 }
00400
00401
00402 }
00403
00404 DEBUG_TRACE(cout << "weakAlphas[sval] " << sval << endl; );
00405 DEBUG_TRACE( weakAlphas[sval].write(cout) << endl; );
00406 }
00407
00408 DEBUG_TRACE(cout << "safestAction " << safestAction << endl; );
00409
00410 return safestAction;
00411 }
00412
00413
00414 void BlindLBInitializer::initBlind(double targetPrecision)
00415 {
00416 DEBUG_TRACE(cout << "BlindLBInitializer::initBlind " << targetPrecision << endl; );
00417
00418 int numStatesObs = pomdp->XStates->size();
00419 int numStatesUnobs = pomdp->YStates->size();
00420
00421
00422 std::vector<alpha_vector> als(numStatesObs), nextAls(numStatesObs), weakAls(numStatesObs);
00423 alpha_vector tmp, diff, tmpAl, tmpNextAl, tmpNextAl1;
00424
00425 double maxResidual, maxResidualthis;
00426
00427 initBlindWorstCaseIntegrated(weakAls);
00428
00429
00430
00431
00432
00433 FOR (a, pomdp->actions->size())
00434 {
00435 als = weakAls;
00436
00437
00438
00439 do
00440 {
00441 maxResidual = 0;
00442 FOR (sval, numStatesObs ) {
00443
00444 tmpAl.resize(numStatesUnobs);
00445
00446 const vector<int>& possibleNextSvals = pomdp->XTrans->getMatrix(a, sval)->nonEmptyColumns();
00447
00448
00449
00450
00451 FOREACH (int, nextSvalIt, possibleNextSvals)
00452 {
00453 int nextSval = *nextSvalIt;
00454 mult(tmpNextAl, *pomdp->YTrans->getMatrix(a, sval, nextSval), als[nextSval]);
00455 emult_column( tmpNextAl1, *pomdp->XTrans->getMatrix(a, sval), nextSval, tmpNextAl );
00456 tmpAl += tmpNextAl1;
00457 }
00458
00459 tmpAl *= pomdp->discount;
00460
00461 copy_from_column(tmp, *(pomdp->rewards->getMatrix(sval)), a);
00462 tmpAl += tmp;
00463
00464
00465 diff = tmpAl;
00466 diff -= als[sval];
00467 maxResidualthis = diff.norm_inf();
00468 if (maxResidualthis > maxResidual)
00469 maxResidual = maxResidualthis;
00470
00471 nextAls[sval] = tmpAl;
00472 }
00473
00474 als = nextAls;
00475
00476 DEBUG_TRACE(cout << "maxResidual " << maxResidual << endl; );
00477
00478 DEBUG_TRACE(cout << "targetPrecision " << targetPrecision << endl; );
00479 } while (maxResidual > targetPrecision);
00480
00481
00482 FOR (sval, numStatesObs )
00483 {
00484 if (a == 0)
00485 {
00486 bound->set[sval]->planes.clear();
00487 }
00488
00489 DEBUG_TRACE(cout << "als[sval] " << sval << endl; );
00490 DEBUG_TRACE(als[sval].write(cout) << endl; );
00491
00492 SharedPointer<AlphaPlane> plane (new AlphaPlane());
00493 copy(*plane->alpha, als[sval]);
00494 plane->action = a;
00495 plane->sval = sval;
00496
00497
00498 plane->setTimeStamp(0);
00499
00500
00501 SARSOPAlphaPlaneTuple *tempTuple = (SARSOPAlphaPlaneTuple *)plane->solverData;
00502 tempTuple->certed = 0;
00503
00504 bound->set[sval]->addAlphaPlane(plane);
00505
00506
00507 }
00508 }
00509
00510
00511 }
00512
00513 };
00514