appl: BlindLBInitializer.cpp Source File

Go to the documentation of this file.
00001 
00008 #include <stdlib.h>
00009 //#include <unistd.h>
00010 #include <stdio.h>
00011 #include <assert.h>
00012 
00013 #include <iostream>
00014 #include <fstream>
00015 
00016 #include "MOMDP.h"
00017 #include "SARSOP.h"
00018 #include "BlindLBInitializer.h"
00019 
00020 using namespace std;
00021 
00022 using namespace momdp;
00023 
00024 #define PRUNE_EPS (1e-10)
00025 
00026 //#define DEBUGSYL_180908 1
00027 //#define DEBUGSYL_310708 1
00028 //#define DEBUG_LBVECTOR_220908 1
00029 //#define DEBUG_ITERCOMP_220908 1
00030 
00031 namespace momdp {
00032 
00033         BlindLBInitializer::BlindLBInitializer(SharedPointer<MOMDP> _pomdp, AlphaPlanePoolSet* _bound) {
00034                 pomdp = (SharedPointer<MOMDP>) _pomdp;
00035                 bound = _bound;
00036         }
00037 
00038         void BlindLBInitializer::initialize(double targetPrecision)
00039         {
00040 
00041                 if(pomdp->XStates->size() != 1 && pomdp->hasPOMDPMatrices())
00042                 {
00043                         // only does this if convert fast is called to produce pomdp version of the matrices
00044                         initBlind_unfac(targetPrecision);
00045                 }
00046                 else
00047                 {
00048                         initBlind(targetPrecision);
00049                 }
00050         }
00051 
00052 
00053         void BlindLBInitializer::initBlindWorstCase_unfac(alpha_vector& weakAlpha)
00054         {
00055                 DEBUG_TRACE(cout << "BlindLBInitializer::initBlindWorstCase_unfac " << endl; );
00056                 // set alpha to be a lower bound on the value of the best blind policy
00057 
00058                 double worstStateVal;
00059                 int safestAction = -1;
00060                 double worstCaseReward = -99e+20;
00061                 // calculate worstCaseReward = max_a min_s R(s,a)
00062                 // safestAction = argmax_a min_s R(s,a)
00063                 int numStatesObs = pomdp->XStates->size();
00064                 int numStatesUnobs = pomdp->YStates->size();
00065                 FOR (a, pomdp->getNumActions()) 
00066                 {
00067                         worstStateVal = 99e+20;
00068 
00069                         //cmatrix* pR = pomdp->pomdpR;
00070                         
00071                         FOR (s, numStatesObs * numStatesUnobs) 
00072                         {
00073                                 //     worstStateVal = std::min(worstStateVal, (*pR)(s,a));
00074                                 worstStateVal = std::min(worstStateVal, (*(pomdp->pomdpR))(s,a));
00075                         }
00076                         if (worstStateVal > worstCaseReward) {
00077                                 safestAction = a;
00078                                 worstCaseReward = worstStateVal;
00079                         }
00080                 }
00081                 //DenseVector worstCaseDVector(pomdp->getNumStateDimensions());
00082 
00083 
00084                 DenseVector worstCaseDVector(numStatesObs*numStatesUnobs);
00085                 assert(pomdp->getDiscount() < 1);
00086                 double worstCaseLongTerm = worstCaseReward / (1 - pomdp->getDiscount());
00087                 FOR (i, numStatesObs*numStatesUnobs) 
00088                 {
00089                         worstCaseDVector(i) = worstCaseLongTerm;
00090                 }
00091 
00092                 // post-process: make sure the value for all terminal states
00093                 // is exactly 0, since that is how the lbVal field of terminal
00094                 // nodes is initialized.
00095                 FOR (i, numStatesObs*numStatesUnobs) {
00096 
00097                         // convert i to x and y values
00098                         unsigned int x = (unsigned int) i/(numStatesUnobs);
00099                         unsigned int y = i % (numStatesUnobs);
00100 
00101                         if (pomdp->isPOMDPTerminalState[x][y]) {
00102 
00103                                 worstCaseDVector(i) = 0.0;
00104                         }
00105                 }
00106 
00107                 copy(weakAlpha, worstCaseDVector);
00108                 
00109                 DEBUG_TRACE(cout << "weakAlpha" << endl; );
00110                 DEBUG_TRACE( weakAlpha.write(cout) << endl; );
00111         }
00112 
00113         void BlindLBInitializer::initBlind_unfac(double targetPrecision)
00114         {
00115                 DEBUG_TRACE(cout << "BlindLBInitializer::initBlind_unfac " << targetPrecision << endl; );
00116                 
00117                 int numStatesObs = pomdp->XStates->size();
00118                 int numStatesUnobs = pomdp->YStates->size();
00119                 alpha_vector al(numStatesObs*numStatesUnobs);
00120                 alpha_vector nextAl, tmp, diff;
00121                 alpha_vector weakAl;
00122                 belief_vector dummy; // ignored
00123                 double maxResidual;
00124 
00125                 std::vector<alpha_vector> als(numStatesObs);
00126 
00127                 initBlindWorstCase_unfac(weakAl);
00128                 //bound->planes.clear();
00129 
00130 
00131                 // TODO:: bound->elapsed = bound->heurTimer.elapsed();
00132                 // TODO:: printf("%.2fs initBlindWorstCase(sval, tmpAl) done for all svals\n", bound->elapsed);
00133 
00134 
00135                 // produce one alpha vector for each fixed policy "always take action a"
00136                 FOR (a, pomdp->actions->size()) 
00137                 {
00138                     DEBUG_TRACE(cout << "a " << a << endl; );
00139                         al = weakAl;
00140 
00141                         do 
00142                         {
00143                                 // calculate nextAl
00144                                 mult(nextAl, *(*(pomdp->pomdpT))[a], al);
00145 //                              DEBUG_TRACE(cout << "al " << endl; );
00146 //                              DEBUG_TRACE(al.write(cout) << endl; );
00147                                 
00148 //                              DEBUG_TRACE(cout << "nextAl " << endl; );
00149 //                              DEBUG_TRACE(nextAl.write(cout) << endl; );
00150                                 
00151                                 //     mult(nextAl, pomdp->pomdpT[a], al);
00152                                 nextAl *= pomdp->discount;
00153                                 
00154 //                              DEBUG_TRACE(cout << "nextAl " << endl; );
00155 //                              DEBUG_TRACE(nextAl.write(cout) << endl; );
00156                                 
00157                                 copy_from_column(tmp, *(pomdp->pomdpR), a);
00158                                 //      copy_from_column(tmp, pomdp->pomdpR, a);
00159 //                              DEBUG_TRACE(cout << "tmp " << endl; );
00160 //                              DEBUG_TRACE(tmp.write(cout) << endl; );
00161                                 
00162                                 nextAl += tmp;
00163                                 
00164 //                              DEBUG_TRACE(cout << "nextAl " << endl; );
00165 //                              DEBUG_TRACE(nextAl.write(cout) << endl; );
00166 
00167                                 // calculate residual
00168                                 diff = nextAl;
00169                                 diff -= al;
00170                                 
00171 //                              DEBUG_TRACE(cout << "diff " << endl; );
00172 //                              DEBUG_TRACE(diff.write(cout) << endl; );
00173                                 
00174                                 maxResidual = diff.norm_inf();
00175 
00176                                 al = nextAl;
00177                                 
00178                                 DEBUG_TRACE(cout << "maxResidual " << maxResidual << endl; );
00179                                 DEBUG_TRACE(cout << "targetPrecision " << targetPrecision << endl; );
00180                         } while (maxResidual > targetPrecision);
00181 
00182 
00183 #if USE_DEBUG_PRINT
00184                         cout << "initLowerBoundBlind: a=" << a << " al=" << sparseRep(al) << endl;
00185 #endif
00186 
00187                         /* #if USE_MASKED_ALPHA
00188                         bound->addAlphaPlane(dummy, al, a, full_mask);
00189                         #else
00190                         SharedPointer<AlphaPlane> plane = new AlphaPlane(al, a);
00191                         plane->setTimeStamp(0); 
00192                         plane->certed = 0;//init certed count to 0
00193                         bound->addAlphaPlane(plane); 
00194                         // bound->addAlphaPlane(dummy, al, a); //!!to be added in later
00195                         #endif */
00196 
00197                         // break al up into als[sval] for each sval 
00198                         FOR (s, numStatesObs * numStatesUnobs) 
00199                         {
00200                                 // convert i to x and y values
00201                                 unsigned int x = (unsigned int) s/(numStatesUnobs);
00202                                 unsigned int y = s % (numStatesUnobs);
00203 
00204                                 if (y==0) als[x].resize(numStatesUnobs);
00205 
00206                                 als[x](y) = al(s);
00207                         }
00208 
00209                         FOR (sval, numStatesObs ) 
00210                         {
00211                                 if (a == 0) 
00212                                 {
00213                                         bound->set[sval]->planes.clear();
00214                                 }
00215 
00216                                 DEBUG_TRACE(cout << "als[sval] " << sval << endl; );
00217                                 DEBUG_TRACE(als[sval].write(cout) << endl; );
00218                                 
00219                                 SharedPointer<AlphaPlane> plane (new AlphaPlane());
00220                                 copy(*plane->alpha, als[sval]);
00221                                 plane->action = a;
00222                                 plane->sval = sval;
00223                                 plane->setTimeStamp(0); 
00224 
00225                                 SARSOPAlphaPlaneTuple *tempTuple = (SARSOPAlphaPlaneTuple *)plane->solverData;
00226                                 tempTuple->certed = 0; //init certed count to 0
00227                                 
00228                                 bound->set[sval]->addAlphaPlane(plane);
00229                                 // bound->addAlphaPlane(dummy, al, a); //!!to be added in later
00230 
00231                                 /*cout << "a " << a << " sval " << sval << endl;
00232                                 als[sval].write(std::cout);
00233                                 cout << endl;*/
00234                         }       
00235 
00236                 }
00237 
00238                 //bound->elapsed = bound->heurTimer.elapsed();
00239                 //printf("%.2fs initBlind() done\n", bound->elapsed);
00240         }
00241 
00242         // SYL260809 new
00243         // this gets the worst case initial vector separately for each sval and writes to the set of weakAlpha vectors
00244         void BlindLBInitializer::initBlindWorstCase(std::vector<alpha_vector>& weakAlphas)
00245         {
00246                 DEBUG_TRACE(cout << "BlindLBInitializer::initBlindWorstCase " << endl; );
00247                 
00248                 // set alpha to be a lower bound on the value of the best blind policy
00249 
00250                 double worstStateVal;
00251                 int safestAction = -1;
00252                 double worstCaseReward;
00253                 // calculate worstCaseReward = max_a min_s R(s,a)
00254                 // safestAction = argmax_a min_s R(s,a)
00255                 int numStates = pomdp->YStates->size();
00256                 int numStatesObs = pomdp->XStates->size();
00257 
00258                 double worstCaseLongTerm; 
00259                 assert(pomdp->getDiscount() < 1);
00260 
00261                 FOR (sval, numStatesObs) 
00262                 {       
00263                         worstCaseReward = -99e+20;
00264                         SharedPointer<SparseMatrix>  rewardMatrix = pomdp->rewards->getMatrix(sval);
00265         
00266                         FOR (a, pomdp->getNumActions()) 
00267                         {
00268                                 worstStateVal = 99e+20;
00269 
00270                                 FOR (s, numStates) 
00271                                 {
00272                                         worstStateVal = std::min(worstStateVal, rewardMatrix->operator ()(s,a));
00273                                         //worstStateVal = std::min(worstStateVal, (*(pomdp->rewards->matrix[sval]))(s,a));
00274                                 }
00275                                 if (worstStateVal > worstCaseReward) 
00276                                 {
00277                                         safestAction = a;
00278                                         worstCaseReward = worstStateVal;
00279                                 }
00280                         }
00281 
00282 // OPTION A, works better than OPTION B for some problems - gets better lowerbound vectors.
00283                         // if safestAction and sval combination transits to a terminal state
00284                         // then, do not multiply with long term discount effect
00285                         bool penultimateTerminal = true;
00286                         bool isTerminal;
00287                         SparseVector T_xa;
00288         
00289                         FOR (y, numStates) 
00290                         {
00291                                 copy_from_column( T_xa, *(pomdp->XTrans->getMatrixTr(safestAction, sval)), y);
00292         
00293                                 FOR (nonZeroEntry,T_xa.data.size()) {
00294         
00295                                         isTerminal = true;
00296                                         // need to make sure that ALL (regardless of y' value) are terminal states
00297                                         FOR (yn, numStates) 
00298                                         {
00299                                                 if (!(pomdp->isPOMDPTerminalState[T_xa.data[nonZeroEntry].index][yn])) {
00300                                                         isTerminal = false;
00301                                                         break;
00302         
00303                                                 }
00304                                         }
00305         
00306                                         if (!isTerminal) {
00307                                                 penultimateTerminal = false;
00308                                                 break;
00309                                         }
00310                                 }
00311                         }
00312         
00313                         if (penultimateTerminal) {
00314                                 worstCaseLongTerm = worstCaseReward;
00315                         } else {
00316                                 assert(pomdp->getDiscount() < 1);
00317                                 worstCaseLongTerm = worstCaseReward / (1 - pomdp->getDiscount());
00318                         }
00319 
00320 // OR, OPTION B
00321 //                      worstCaseLongTerm = worstCaseReward / (1 - pomdp->getDiscount());
00322 
00323                         weakAlphas[sval].resize(numStates);
00324                         FOR (i, numStates) 
00325                         {
00326                                 // post-process: make sure the value for all terminal states
00327                                 // is exactly 0, since that is how the lbVal field of terminal
00328                                 // nodes is initialized.
00329                                 if (pomdp->isPOMDPTerminalState[sval][i]) 
00330                                 {
00331                                         weakAlphas[sval](i) = 0.0;
00332                                 }
00333                                 else
00334                                 {
00335                                         weakAlphas[sval](i) = worstCaseLongTerm;
00336                                 }
00337 
00338                         }
00339 
00340                 }
00341 
00342         }
00343 
00344         // this gets the worst case initial vector acros all sval
00345         int BlindLBInitializer::initBlindWorstCaseIntegrated(std::vector<alpha_vector>& weakAlphas)
00346         {
00347                 DEBUG_TRACE(cout << "BlindLBInitializer::initBlindWorstCaseIntegrated" << endl; );
00348                 
00349                 // set alpha to be a lower bound on the value of the best blind policy
00350                 double worstStateVal;
00351                 int safestAction = -1;
00352                 double worstCaseReward = -99e+20;
00353                 int numStates = pomdp->YStates->size();
00354                 int numStatesObs = pomdp->XStates->size();
00355                 // calculate worstCaseReward = max_a min_s R(s,a)
00356                 // safestAction = argmax_a min_s R(s,a)
00357                 FOR (a, pomdp->getNumActions()) 
00358                 {
00359                         worstStateVal = 99e+20;
00360 
00361                         FOR (sval, numStatesObs) 
00362                         {       
00363                                 SharedPointer<SparseMatrix>  rewardMatrix = pomdp->rewards->getMatrix(sval);
00364                                 FOR (s, numStates) 
00365                                 {
00366                                         worstStateVal = std::min(worstStateVal, rewardMatrix->operator ()(s,a));
00367                                 }
00368                         }
00369                         if (worstStateVal > worstCaseReward) 
00370                         {
00371                                 safestAction = a;
00372                                 worstCaseReward = worstStateVal;
00373                         }
00374                 }
00375 
00376 
00377                 double worstCaseLongTerm; 
00378 
00379                 assert(pomdp->getDiscount() < 1);
00380                 worstCaseLongTerm = worstCaseReward / (1 - pomdp->getDiscount());
00381 
00382                 FOR (sval, numStatesObs)
00383                 {
00384                         weakAlphas[sval].resize(numStates);
00385                         FOR (i, numStates) 
00386                         {
00387                                 // post-process: make sure the value for all terminal states
00388                                 // is exactly 0, since that is how the lbVal field of terminal
00389                                 // nodes is initialized.
00390                                 if (pomdp->isPOMDPTerminalState[sval][i]) 
00391                                 {
00392                                         weakAlphas[sval](i) = 0.0;
00393                                         //weakAlphas[sval].push_back = 0.0;
00394 
00395                                 }
00396                                 else
00397                                 {
00398                                         weakAlphas[sval](i) = worstCaseLongTerm;
00399                                 }
00400                                 //weakAlphas[sval].push_back = worstCaseLongTerm;
00401 
00402                         }
00403                         
00404                         DEBUG_TRACE(cout << "weakAlphas[sval] " << sval << endl; );
00405                         DEBUG_TRACE( weakAlphas[sval].write(cout)  << endl; );
00406                 }
00407 
00408                 DEBUG_TRACE(cout << "safestAction " << safestAction << endl; );
00409 
00410                 return safestAction;
00411         }
00412 
00413 
00414         void BlindLBInitializer::initBlind(double targetPrecision)
00415         {
00416                 DEBUG_TRACE(cout << "BlindLBInitializer::initBlind " << targetPrecision << endl; );
00417                 
00418                 int numStatesObs = pomdp->XStates->size();
00419                 int numStatesUnobs = pomdp->YStates->size();
00420 
00421 
00422                 std::vector<alpha_vector> als(numStatesObs), nextAls(numStatesObs), weakAls(numStatesObs);
00423                 alpha_vector tmp, diff, tmpAl, tmpNextAl, tmpNextAl1;
00424                 //belief_vector dummy; // ignored
00425                 double maxResidual, maxResidualthis;
00426 
00427                 initBlindWorstCaseIntegrated(weakAls); // SYL280809 prevly  //initBlindWorstCase(weakAls);
00428 
00429                 //  TODO:: bound->elapsed = bound->heurTimer.elapsed();
00430                 //  TODO:: printf("%.2fs initBlindWorstCase(sval, tmpAl) done for all svals\n", bound->elapsed);
00431 
00432                 // produce one alpha vector for each fixed policy "always take action a"
00433                 FOR (a, pomdp->actions->size()) 
00434                 {
00435                         als = weakAls;
00436 
00437 
00438                         // SYLMOD use weak bounds
00439                         do 
00440                         {
00441                                 maxResidual = 0; 
00442                                 FOR (sval, numStatesObs ) {     // for each sval, get nextAl[sval]
00443 
00444                                         tmpAl.resize(numStatesUnobs);
00445 
00446                                         const vector<int>& possibleNextSvals = pomdp->XTrans->getMatrix(a, sval)->nonEmptyColumns();
00447 
00448                                         // calculate nextAl[sval]
00449                                         // if TX[a][sval] is all zero, dont do any multiplication, and dont need
00450                                         // to add anything to tmpAl
00451                                         FOREACH (int, nextSvalIt, possibleNextSvals) 
00452                                         {
00453                                             int nextSval = *nextSvalIt;
00454                                             mult(tmpNextAl, *pomdp->YTrans->getMatrix(a, sval, nextSval), als[nextSval]);
00455                                             emult_column( tmpNextAl1, *pomdp->XTrans->getMatrix(a, sval), nextSval, tmpNextAl );
00456                                             tmpAl += tmpNextAl1;
00457                                         }
00458 
00459                                         tmpAl *= pomdp->discount;
00460 
00461                                         copy_from_column(tmp, *(pomdp->rewards->getMatrix(sval)), a);
00462                                         tmpAl += tmp;
00463 
00464                                         // calculate residual
00465                                         diff = tmpAl;
00466                                         diff -= als[sval];
00467                                         maxResidualthis = diff.norm_inf();
00468                                         if (maxResidualthis > maxResidual)      
00469                                                 maxResidual = maxResidualthis;
00470 
00471                                         nextAls[sval] = tmpAl;
00472                                 }
00473 
00474                                 als = nextAls;  
00475 
00476                                 DEBUG_TRACE(cout << "maxResidual " << maxResidual << endl; );
00477                                 //cout << "maxResidual " << maxResidual << endl;
00478                                 DEBUG_TRACE(cout << "targetPrecision " << targetPrecision << endl; );
00479                         } while (maxResidual > targetPrecision);
00480 
00481 
00482                         FOR (sval, numStatesObs ) 
00483                         {
00484                                 if (a == 0) 
00485                                 {
00486                                         bound->set[sval]->planes.clear();
00487                                 }
00488 
00489                                 DEBUG_TRACE(cout << "als[sval] " << sval << endl; );
00490                                 DEBUG_TRACE(als[sval].write(cout) << endl; );
00491 
00492                                 SharedPointer<AlphaPlane> plane (new AlphaPlane());
00493                                 copy(*plane->alpha, als[sval]);
00494                                 plane->action = a;
00495                                 plane->sval = sval;
00496 
00497 
00498                                 plane->setTimeStamp(0); 
00499 
00500                                 
00501                                 SARSOPAlphaPlaneTuple *tempTuple = (SARSOPAlphaPlaneTuple *)plane->solverData;
00502                                 tempTuple->certed = 0; //init certed count to 0
00503 
00504                                 bound->set[sval]->addAlphaPlane(plane);
00505                                 // bound->addAlphaPlane(dummy, al, a); //!!to be added in later
00506 
00507                         }       
00508                 }
00509 
00510 
00511         }
00512 
00513 }; // namespace momdp
00514