SimulationEngine.cpp
#include <sstream>
#include <fstream>
#include "SimulationEngine.h"
#include "AlphaVectorPolicy.h"
#include "CPTimer.h"
#include "solverUtils.h"

using namespace std;
using namespace momdp;

namespace momdp
{
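    // Writes a factored state/action/observation tuple to the output stream as a
    // comma-separated list of its values in parentheses, e.g. "(valA,valB)".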
    void printTuple(map<string, string> tuple, ofstream* streamOut){
        *streamOut << "(";
        for(map<string, string>::iterator iter = tuple.begin() ; iter != tuple.end() ; )
        {
            *streamOut << iter->second;
            if(++iter!=tuple.end())
                *streamOut << ",";
        }
        *streamOut<<")" << endl;
    }

    SimulationEngine::SimulationEngine()
    {
    }

    SimulationEngine::~SimulationEngine(void)
    {
    }

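    // Model-specific helper: inspects the symbol strings of the observed-state factor (p)
    // and the observation (s); when the observation string starts with "bt2", it increments
    // the FH or BH counter whose index is encoded in p. The fixed substring offsets assume
    // the symbol naming convention of the particular problem this simulator was written for.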
    void SimulationEngine::checkTerminal(string p, string s, vector<int> &bhout, vector<int> &fhout) {
        if (s.substr(0,3) == "bt2") {
            if (s.substr(9,2) == "FH") {
                int ind = atoi(p.substr(4,1).c_str());
                fhout[ind]++;
            } else if (s.substr(9,2) == "BH") {
                int ind = atoi(p.substr(4,1).c_str());
                bhout[ind]++;
            }
        }
    }

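    // Returns the action whose counter (bhout entries followed by fhout entries) is
    // currently largest, offset by 2 since this code treats action index 2 (BHL) as the
    // first candidate. Ties are broken in favour of the earlier counter.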
    int SimulationEngine::getGreedyAction(vector<int> &bhout, vector<int> &fhout) {
        int greedyAction = 2; //start with BHL
        int currBest = bhout[0];

        vector<int> temp;
        for (int i=0;i<(int)bhout.size();i++){
            temp.push_back(bhout[i]);
        }
        for (int i=0;i<(int)fhout.size();i++){
            temp.push_back(fhout[i]);
        }

        for (int i=1; i<(int)temp.size();i++){
            if (temp[i]>currBest) {
                currBest = temp[i];
                greedyAction = 2+i;
            }
        }

        return greedyAction;
    }

    void SimulationEngine::setup(SharedPointer<MOMDP> problem, SharedPointer<AlphaVectorPolicy> policy, SolverParams * solverParams)
    {
        this->policy = policy;
        this->problem = problem;
        this->solverParams = solverParams;
    }

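    // Computes the updated distribution over the observed variable X after taking 'action'
    // from the complete state belSt: a sparse vector-matrix product with the X transition
    // matrix. When there is only one X value, the result is trivially a point mass.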
    void SimulationEngine::performActionObs(belief_vector& outBelObs, int action, const BeliefWithState& belSt) const
    {
        // DEBUG_SIMSPEED_270409 skip calculating outprobs for x when there is only one possible x value
        if (problem->XStates->size() == 1)
        {
            // clear out the entries
            outBelObs.resize(1);
            outBelObs.push_back(0,1.0);
        }
        else
        {
            //problem->getTransitionMatrixX(action, belSt.sval);
            const SharedPointer<SparseMatrix> transMatX = problem->XTrans->getMatrix(action, belSt.sval);
            mult(outBelObs, *belSt.bvec, *transMatX);
        }
    }

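    // Computes the updated distribution over the unobserved variable Y after taking 'action',
    // conditioned on the observed state value (currObsState) that was actually reached.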
    void SimulationEngine::performActionUnobs(belief_vector& outBelUnobs, int action, const BeliefWithState& belSt, int currObsState) const
    {
        const SharedPointer<SparseMatrix> transMatY = problem->YTrans->getMatrix(action, belSt.sval, currObsState);
        mult(outBelUnobs, *belSt.bvec, *transMatY);
    }

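    // Computes the distribution over observations induced by the action and the resulting
    // complete state (observed value plus belief over the unobserved variable).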
    void SimulationEngine::getPossibleObservations(belief_vector& possObs, int action, const BeliefWithState& belSt) const
    {
        //const SparseMatrix obsMat = problem->getObservationMatrix(action, belSt.sval);
        const SharedPointer<SparseMatrix> obsMat = problem->obsProb->getMatrix(action, belSt.sval);
        mult(possObs, *belSt.bvec, *obsMat);
    }

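    // Expected immediate reward of taking 'action' in belief-state belst: the inner product
    // of the belief over Y with the action's column of the reward matrix for the current
    // observed state value.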
    double SimulationEngine::getReward(const BeliefWithState& belst, int action)
    {
        //const SparseMatrix rewMat = problem->getRewardMatrix(belst.sval);
        const SharedPointer<SparseMatrix> rewMat = problem->rewards->getMatrix(belst.sval);
        return inner_prod_column(*rewMat, action, *belst.bvec);
    }

    string SimulationEngine::toString()
    {
        std::ostringstream mystrm;
        mystrm << "action selector: (replaced by Policy) ";
        return mystrm.str();
    }

    void SimulationEngine::display(belief_vector& b, ostream& s)
    {
        for(unsigned int i = 0; i < b.filled(); i++)
        {
            s << b.data[i].index << " -> " << b.data[i].value << endl;
        }
    }

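    // Runs a single simulation trace of up to 'iters' steps. The "actual state" system
    // tracks the true (sampled) X and Y values, while the "policy follower" system tracks
    // the belief used to select actions. Accumulates the undiscounted return in 'reward'
    // and the discounted return in 'expReward', and returns the first action taken.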
    int SimulationEngine::runFor(int iters, ofstream* streamOut, double& reward, double& expReward)
    {
        DEBUG_TRACE(cout << "runFor" << endl; );
        DEBUG_TRACE(cout << "iters " << iters << endl; );
        // DEBUG_TRACE(cout << "startVec sval " << startVec.sval << endl; );
        // DEBUG_TRACE(startVec.bvec->write(cout) << endl;);
        DEBUG_TRACE(cout << "startBeliefX" << endl; );
        DEBUG_TRACE(startBeliefX.write(cout) << endl;);

        //    double reward = 0, expReward = 0;
        // write the per-step trace only if an output stream was supplied
        bool enableFiling = (streamOut != NULL);

        // actual system state
        SharedPointer<BeliefWithState> actStateCompl (new BeliefWithState());
        SharedPointer<BeliefWithState> actNewStateCompl (new BeliefWithState());

        // policy follower state
        // belief with state
        SharedPointer<BeliefWithState> nextBelSt;
        SharedPointer<BeliefWithState> currBelSt (new BeliefWithState());// for policy follower based on known x value
                                                                         // set sval to -1 if x value is not known

        // belief over x. May not be used depending on model type and command-line flags, but declared here anyway.
        DenseVector currBelX; // belief over x

        // get starting actStateCompl, the complete state (X and Y values)
        if (problem->initialBeliefStval->sval == -1) // check whether the initial starting state for X is fixed
        {
            // random starting state for X
            const SharedPointer<DenseVector>& startBeliefX = problem->initialBeliefX;
            actStateCompl->sval = chooseFromDistribution(*startBeliefX);
            copy(currBelX, *startBeliefX);
        }
        else
        {
            // initial starting state for X is fixed
            actStateCompl->sval = problem->initialBeliefStval->sval;
        }

        // now choose a starting unobserved state for the actual system
        SharedPointer<SparseVector> startBeliefVec;
        if (problem->initialBeliefStval->bvec)
            startBeliefVec = problem->initialBeliefStval->bvec;
        else
            startBeliefVec = problem->initialBeliefYByX[actStateCompl->sval];
        int currUnobsState = chooseFromDistribution(*startBeliefVec);
        int belSize = startBeliefVec->size();

        actStateCompl->bvec->resize(belSize);
        actStateCompl->bvec->push_back(currUnobsState, 1.0);

        DEBUG_TRACE( cout << "actStateCompl sval " << actStateCompl->sval << endl; );
        DEBUG_TRACE( actStateCompl->bvec->write(cout) << endl; );

        currBelSt->sval = actStateCompl->sval;
        copy(*currBelSt->bvec, *startBeliefVec);

        DEBUG_TRACE( cout << "currBelSt sval " << currBelSt->sval << endl; );
        DEBUG_TRACE( currBelSt->bvec->write(cout) << endl; );

        double mult=1;
        CPTimer lapTimer;

        // We now have actStateCompl (starting stateUnobs & stateObs) for the "actual state" system.
        // The "policy follower" system has currBelSt (starting beliefUnobs & stateObs), OR
        // currBelSt (starting beliefUnobs & sval == -1) together with currBelX (starting beliefObs)
        // if the initial x is a belief rather than a known state.

        unsigned int firstAction = 0; // set on the first iteration; 0 if the loop never runs

        double gamma = problem->getDiscount();

        int xDim = 3;
        vector<int> bhout(xDim,0);
        vector<int> fhout(xDim,0);
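        // Main simulation loop: pick an action from the policy, sample the next actual
        // state and an observation, log the step if requested, then update the belief.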
        for(int timeIter = 0; timeIter < iters; timeIter++)
        {
            DEBUG_TRACE( cout << "timeIter " << timeIter << endl; );

            if(enableFiling && timeIter == 0)
            {
                *streamOut << ">>> begin\n";
            }

            // get action according to policy and current belief and state
            int currAction;
            //-----------------------------
            if (timeIter == 0)
            {
                if(solverParams->useLookahead)
                {
                    if (currBelSt->sval == -1) // special case for first time step where X is a distribution
                        currAction = policy->getBestActionLookAhead(currBelSt->bvec, currBelX);
                    else
                        currAction = policy->getBestActionLookAhead(*currBelSt);
                }
                else
                {
                    if (currBelSt->sval == -1) // special case for first time step where X is a distribution
                        currAction = policy->getBestAction(currBelSt->bvec, currBelX);
                    else
                        currAction = policy->getBestAction(*currBelSt);
                }
            }
            else
            {
                if(solverParams->useLookahead)
                {
                    currAction = policy->getBestActionLookAhead(*currBelSt);
                }
                else
                {
                    currAction = policy->getBestAction(*currBelSt);
                }
            }

            //if (currAction>1 && currAction<8) {
            //    currAction = getGreedyAction(bhout, fhout);
            //    //currAction = (rand()%6)+2;
            //    //cout<<"GREEDY ACTION: "<<currAction<<endl;
            //}

            if(currAction < 0 )
            {
                cout << "You are using an MDP policy. Please make sure the one-step look-ahead option is turned on when using an MDP policy." << endl;
                return -1;
            }
            if (timeIter == 0)
            {
                firstAction = currAction;
            }

            // this is the reward for the "actual state" system
            double currReward = getReward(*actStateCompl, currAction);

            DEBUG_TRACE( cout << "currAction " << currAction << endl; );
            DEBUG_TRACE( cout << "actStateCompl sval " << actStateCompl->sval << endl; );
            DEBUG_TRACE( actStateCompl->bvec->write(cout) << endl; );

            DEBUG_TRACE( cout << "currReward " << currReward << endl; );
            // expReward accumulates the discounted return; reward accumulates the undiscounted sum
            expReward += mult*currReward;
            mult *= gamma;
            reward += currReward;

            DEBUG_TRACE( cout << "expReward " << expReward << endl; );
            DEBUG_TRACE( cout << "reward " << reward << endl; );

            // actualActionUpdObs is the belief over the fully observed state
            belief_vector actualActionUpdUnobs(belSize), actualActionUpdObs(problem->XStates->size()) ;
            performActionObs(actualActionUpdObs, currAction, *actStateCompl);

            DEBUG_TRACE( cout << "actualActionUpdObs " << endl; );
            DEBUG_TRACE( actualActionUpdObs.write(cout) << endl; );

            // the actual next state for the observed variable
            actNewStateCompl->sval = (unsigned int) chooseFromDistribution(actualActionUpdObs, ((double)rand()/RAND_MAX));

            // now update actualActionUpdUnobs, which is the belief over unobserved states,
            // based on the previous belief and the current observed state
            performActionUnobs(actualActionUpdUnobs, currAction, *actStateCompl, actNewStateCompl->sval);

            DEBUG_TRACE( cout << "actualActionUpdUnobs " << endl; );
            DEBUG_TRACE( actualActionUpdUnobs.write(cout) << endl; );

            // the actual next state for the unobserved variable
            int newUnobsState = chooseFromDistribution(actualActionUpdUnobs, ((double)rand()/RAND_MAX));

            DEBUG_TRACE( cout << "newUnobsState "<< newUnobsState << endl; );

            actNewStateCompl->bvec->resize(belSize);
            actNewStateCompl->bvec->push_back(newUnobsState, 1.0);

            DEBUG_TRACE( cout << "actNewStateCompl sval "<< actNewStateCompl->sval << endl; );
            DEBUG_TRACE( actNewStateCompl->bvec->write(cout) << endl; );

            // get observations based on the actual next states for the observed and unobserved variables
            belief_vector obsPoss;
            getPossibleObservations(obsPoss, currAction, *actNewStateCompl);

            DEBUG_TRACE( cout << "obsPoss"<< endl; );
            DEBUG_TRACE( obsPoss.write(cout) << endl; );

            int currObservation = chooseFromDistribution(obsPoss, ((double)rand()/RAND_MAX));

            DEBUG_TRACE( cout << "currObservation "<< currObservation << endl; );

            //*************************************************************
            // model-specific bookkeeping for the greedy-action heuristic (output mostly disabled)
            map<string, string> aa = problem->getActionsSymbols(currAction);
            //cout<<"CURRENT ACTION INDEX: "<<currAction<<" ";
            //cout<<aa["pOneAction"]<<endl;
            //cout<<"OBS ";
            map<string, string> bb = problem->getObservationsSymbols(currObservation);
            map<string, string> cc = problem->getFactoredObservedStatesSymbols(actStateCompl->sval);
            //cout<<cc["pTwo_0"]<<" "<<bb["obs_ballDp2S"]<<endl;
            //cout<<"BEFORE";
            for (int ii=0;ii<xDim;ii++){
                //cout<<" BHOUT "<<bhout[ii];
            }
            for (int ii=0;ii<xDim;ii++){
                //cout<<" FHOUT "<<fhout[ii];
            }
            //cout<<endl;
            checkTerminal(cc["pTwo_0"],bb["obs_ballDp2S"],bhout,fhout);
            //cout<<"AFTER ";
            for (int ii=0;ii<xDim;ii++){
                //cout<<" BHOUT "<<bhout[ii];
            }
            for (int ii=0;ii<xDim;ii++){
                //cout<<" FHOUT "<<fhout[ii];
            }
            //cout<<endl;
            //*************************************************************

            if(enableFiling)
            {
                //initial states and belief, before any action
                if (timeIter == 0)
                {
                    //actual X state, X might be a distribution at first time step
                    map<string, string> obsState = problem->getFactoredObservedStatesSymbols(actStateCompl->sval);
                    if(obsState.size()>0){
                        streamOut->width(4);*streamOut<<left<<"X"<<":";
                        printTuple(obsState, streamOut);
                    }

                    //actual Y state
                    streamOut->width(4);*streamOut<<left<<"Y"<<":";
                    map<string, string> unobsState = problem->getFactoredUnobservedStatesSymbols(currUnobsState);
                    printTuple(unobsState, streamOut);

                    // if initial belief X is a distribution at first time step
                    if (currBelSt->sval == -1) {
                        SparseVector currBelXSparse;
                        copy(currBelXSparse, currBelX);
                        int mostProbX  = currBelXSparse.argmax();       //get the most probable X state
                        streamOut->width(4);*streamOut<<left<<"ML X"<<":";
                        map<string, string> mostProbXState = problem->getFactoredObservedStatesSymbols(mostProbX);
                        printTuple(mostProbXState, streamOut);
                    }

                    //initial belief Y state
                    int mostProbY  = currBelSt->bvec->argmax();         //get the most probable Y state
                    double prob = currBelSt->bvec->operator()(mostProbY);       //get its probability
                    streamOut->width(4);*streamOut<<left<<"ML Y"<<":";
                    map<string, string> mostProbYState = problem->getFactoredUnobservedStatesSymbols(mostProbY);
                    printTuple(mostProbYState, streamOut);
                }

                streamOut->width(4);*streamOut<<left<<"A"<<":";
                map<string, string> actState = problem->getActionsSymbols(currAction);
                printTuple(actState, streamOut);

                streamOut->width(4);*streamOut<<left<<"R"<<":";
                *streamOut << currReward<<endl;
            }

            // now that we have the action, the state of the observed variable, and the observation,
            // we can update the belief over the unobserved variable
            if (timeIter == 0) {  // check to see if the initial X is a distribution or a known state
                if (currBelSt->sval == -1) // special case for first time step where X is a distribution
                    nextBelSt = problem->beliefTransition->nextBelief(currBelSt->bvec, currBelX, currAction, currObservation, actNewStateCompl->sval);
                else
                    nextBelSt = problem->beliefTransition->nextBelief(currBelSt, currAction, currObservation, actNewStateCompl->sval);
            } else
                nextBelSt = problem->beliefTransition->nextBelief(currBelSt, currAction, currObservation, actNewStateCompl->sval);

            //problem->getNextBeliefStval(nextBelSt, currBelSt, currAction, currObservation, actNewStateCompl->sval);

            if(enableFiling)
            {
                if(timeIter == iters - 1)
                {
                    *streamOut << "terminated\n";
                }

                //actual X state after action
                map<string, string> obsState = problem->getFactoredObservedStatesSymbols(actNewStateCompl->sval);
                if(obsState.size()>0){
                    streamOut->width(4);*streamOut<<left<<"X"<<":";
                    printTuple(obsState, streamOut);
                }

                //actual Y state after action
                streamOut->width(4);*streamOut<<left<<"Y"<<":";
                map<string, string> unobsState = problem->getFactoredUnobservedStatesSymbols(newUnobsState);
                printTuple(unobsState, streamOut);

                //observation after action
                streamOut->width(4);*streamOut<<left<<"O"<<":";
                map<string, string> obs = problem->getObservationsSymbols(currObservation);
                printTuple(obs, streamOut);

                //get most probable Y state from belief after applying action A
                int mostProbY  = nextBelSt->bvec->argmax();     //get the most probable Y state
                double prob = nextBelSt->bvec->operator()(mostProbY);   //get its probability
                streamOut->width(4);*streamOut<<left<<"ML Y"<<":";
                map<string, string> mostProbYState = problem->getFactoredUnobservedStatesSymbols(mostProbY);
                printTuple(mostProbYState, streamOut);

                if(timeIter == iters - 1)
                {
                    //timeval timeInRunFor = getTime() - prevTime;
                    //*streamOut << "----- time: " << timevalToSeconds(timeInRunFor) <<endl;
                    double lapTime = lapTimer.elapsed();
                    *streamOut << "----- time: " << lapTime <<endl;
                }
            }
            //actual states
            currUnobsState = newUnobsState; //Y state, hidden
            actStateCompl->sval = actNewStateCompl->sval;
            copy(*actStateCompl->bvec, *actNewStateCompl->bvec);

            //belief states
            copy(*currBelSt->bvec, *nextBelSt->bvec);
            currBelSt->sval = nextBelSt->sval;

            // added to stop the simulation when a terminal state is reached
            if(problem->getIsTerminalState(*actStateCompl))
            {
                // Terminal state: reward is assumed to be all 0 and the state transitions back to itself
                // TODO Consider the case where the terminal reward is not all 0
                //cout <<"Terminal State!! timeIter " << timeIter << endl;
                if(enableFiling)
                    *streamOut << "Reached terminal state" << endl;
                break;
            }

        }

        return firstAction;
    }
};
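
// The following usage sketch is not part of the original file. It is a minimal, hedged
// illustration of how this class appears intended to be driven, assuming a MOMDP model,
// an AlphaVectorPolicy and SolverParams have already been loaded elsewhere; names such
// as `numRuns`, `simLen` and "sim.log" are hypothetical placeholders.
//
//     momdp::SimulationEngine engine;
//     engine.setup(problem, policy, &solverParams);  // SharedPointer<MOMDP>, SharedPointer<AlphaVectorPolicy>, SolverParams*
//     std::ofstream outFile("sim.log");
//     double totalReward = 0, totalDiscounted = 0;
//     for (int run = 0; run < numRuns; run++)
//         engine.runFor(simLen, &outFile, totalReward, totalDiscounted);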

