#ifndef RLCORE_H_
#define RLCORE_H_

#include "Random.h"

#include <algorithm> // std::max_element, used by Planner::random_max_element
#include <cmath>     // fabs
#include <map>
#include <string>
#include <vector>

// Codes for the available model learners. Note that RMAX shares code 0 with
// TABULAR: they select the same (tabular) model.
#define RMAX 0
#define TABULAR 0
#define SLF 1
#define C45TREE 2
#define SINGLETREE 3
#define SVM 4
#define STUMP 5
#define M5MULTI 6
#define M5SINGLE 7
#define M5ALLMULTI 8
#define M5ALLSINGLE 9
#define LSTMULTI 10
#define LSTSINGLE 11
#define ALLM5TYPES 12
#define GPREGRESS 13
#define GPTREE 14

// Human-readable names for the model types, indexed by the codes above.
// The four M5 variants all display as "M5 Tree", and both LS variants as
// "LS Tree".
const std::string modelNames[] = {
  "Tabular",
  "SLF",
  "C4.5 Tree",
  "Single Tree",
  "SVM",
  "Stump",
  "M5 Tree",
  "M5 Tree",
  "M5 Tree",
  "M5 Tree",
  "LS Tree",
  "LS Tree",
  "M5 Combo",
  "GP Regression",
  "GP Tree"
};
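
// Example (illustrative, not part of the original API): the name tables in
// this header are indexed directly by the integer codes above, so a lookup is
// a plain array index. The helper name is hypothetical; the same pattern
// applies to the combo, explore, and planner tables below.
inline std::string getModelName(int modelType) {
  return modelNames[modelType]; // e.g. getModelName(C45TREE) == "C4.5 Tree"
}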


// Codes for how to combine the predictions of multiple models (e.g. a forest).
#define AVERAGE 1
#define WEIGHTAVG 2
#define BEST 3
#define SEPARATE 4 // separate model for planning, and forest for uncertainty

// Names for the combination methods. These codes start at 1, so they line up
// with the array at [code - 1].
const std::string comboNames[] = {
  "Average",
  "Weighted Average",
  "Best",
  "Separate"
};


// Codes for the exploration strategies. NO_EXPLORE and GREEDY share code 7;
// the gaps in the numbering (10, 12, 14, 15, 17) are unused codes.
#define EXPLORE_UNKNOWN 0
#define TWO_MODE 1
#define TWO_MODE_PLUS_R 2
#define CONTINUOUS_BONUS 3
#define THRESHOLD_BONUS 4
#define CONTINUOUS_BONUS_R 5
#define THRESHOLD_BONUS_R 6
#define NO_EXPLORE 7
#define GREEDY 7
#define EPSILONGREEDY 8
#define VISITS_CONF 9
#define UNVISITED_BONUS 11
#define UNVISITED_ACT_BONUS 13
#define DIFF_AND_VISIT_BONUS 16
#define NOVEL_STATE_BONUS 18
#define DIFF_AND_NOVEL_BONUS 19

// Names for the exploration types, indexed by the codes above. "Type N"
// entries are placeholders for the unused codes.
const std::string exploreNames[] = {
  "Explore Unknowns",
  "Two Modes",
  "Two Modes +R",
  "Continuous Bonus",
  "Threshold Bonus",
  "Continuous Bonus +R",
  "Threshold Bonus +R",
  "Greedy",
  "Epsilon-Greedy",
  "Visits Confidence",
  "Type 10",
  "Unvisited State Bonus",
  "Type 12",
  "Unvisited Action Bonus",
  "Type 14",
  "Type 15",
  "Model Diff & Visit Bonus",
  "Type 17",
  "FeatDist Bonus",
  "Model Diff & FeatDist Bonus"
};


// Codes for the planner types.
#define VALUE_ITERATION 0
#define POLICY_ITERATION 1
#define PRI_SWEEPING 2
#define UCT 3
#define ET_UCT 4
#define ET_UCT_WITH_ENV 5
#define SWEEPING_UCT_HYBRID 6
#define CMAC_PLANNER 7
#define NN_PLANNER 8
#define MOD_PRI_SWEEPING 9
#define ET_UCT_L1 10
#define UCT_WITH_L 11
#define UCT_WITH_ENV 12
#define PARALLEL_ET_UCT 13
#define ET_UCT_ACTUAL 14
#define ET_UCT_CORNERS 15
#define PAR_ETUCT_ACTUAL 16
#define PAR_ETUCT_CORNERS 17
#define POMDP_ETUCT 18
#define POMDP_PAR_ETUCT 19
#define MBS_VI 20

// Names for the planner types, indexed by the codes above. Several UCT
// variants share the plain "UCT" display name.
const std::string plannerNames[] = {
  "Value Iteration",
  "Policy Iteration",
  "Prioritized Sweeping",
  "UCT",
  "UCT",
  "UCT",
  "Sweeping UCT Hybrid",
  "CMACs",
  "NN",
  "Mod. Pri Sweeping",
  "UCT L=1",
  "UCT L",
  "UCT Env",
  "Parallel UCT",
  "Real-Valued UCT",
  "Corner UCT",
  "Parallel Real-Valued UCT",
  "Parallel Corner UCT",
  "Delayed UCT",
  "Parallel Delayed UCT",
  "Model Based Simulation - VI"
};



// Small tolerance for floating-point comparisons.
#define EPSILON 1e-5

// One transition: state s, action act, one-step reward, next state, and
// whether the transition was terminal.
struct experience {
  std::vector<float> s;
  int act;
  float reward;
  std::vector<float> next;
  bool terminal;
};

// Training example for a Model: input feature vector and target output vector.
struct modelPair {
  std::vector<float> in;
  std::vector<float> out;
};

// Training example for a Classifier: input feature vector and target class.
struct classPair {
  std::vector<float> in;
  float out;
};
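
// Sketch (helper name is illustrative): packaging one environment step as the
// experience tuple the structs above expect.
inline experience makeExperience(const std::vector<float> &s, int act,
                                 float reward, const std::vector<float> &next,
                                 bool terminal) {
  experience e;
  e.s = s;
  e.act = act;
  e.reward = reward;
  e.next = next;
  e.terminal = terminal;
  return e;
}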

// Interface for an MDP environment: everything the agent sees of the world.
class Environment {
public:
  // Current sensation (state feature vector) of the environment.
  virtual const std::vector<float> &sensation() const = 0;

  // Apply the given action; returns the one-step reward.
  virtual float apply(int action) = 0;

  // True if the current state is terminal.
  virtual bool terminal() const = 0;

  // Reset the environment to a start state for a new episode.
  virtual void reset() = 0;

  // Number of discrete actions available.
  virtual int getNumActions() = 0;

  // Fill in the minimum and maximum value of each state feature.
  virtual void getMinMaxFeatures(std::vector<float> *minFeat,
                                 std::vector<float> *maxFeat) = 0;

  // Fill in the minimum and maximum one-step reward.
  virtual void getMinMaxReward(float *minR, float *maxR) = 0;

  // Whether the task is episodic (defaults to true).
  virtual bool isEpisodic() { return true; }

  // Optional seed experiences to initialize a model with (none by default).
  virtual std::vector<experience> getSeedings() {
    std::vector<experience> e;
    return e;
  }

  // Optionally set the environment to a particular state (e.g. for seeding).
  virtual void setSensation(std::vector<float> s) {}

  virtual ~Environment() {}
};
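
// Minimal sketch of a concrete Environment (hypothetical, for illustration
// only): a five-cell corridor where action 0 moves left and action 1 moves
// right; reaching the rightmost cell gives reward +1 and ends the episode.
class ExampleCorridorEnv: public Environment {
public:
  ExampleCorridorEnv(): pos(1), s(1, 1.0f) {}

  virtual const std::vector<float> &sensation() const { return s; }

  virtual float apply(int action) {
    pos += (action == 1) ? 1 : -1;
    if (pos < 0) pos = 0;
    if (pos > 4) pos = 4;
    s[0] = (float)pos;
    return (pos == 4) ? 1.0 : 0.0; // goal reward at the right end
  }

  virtual bool terminal() const { return pos == 4; }
  virtual void reset() { pos = 1; s[0] = (float)pos; }
  virtual int getNumActions() { return 2; }

  virtual void getMinMaxFeatures(std::vector<float> *minFeat,
                                 std::vector<float> *maxFeat) {
    minFeat->assign(1, 0.0);
    maxFeat->assign(1, 4.0);
  }

  virtual void getMinMaxReward(float *minR, float *maxR) {
    *minR = 0.0;
    *maxR = 1.0;
  }

private:
  int pos;
  std::vector<float> s;
};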

// Interface for an agent: produces actions given sensations and rewards.
class Agent {
public:
  // Start an episode: receive the first sensation, return the first action.
  virtual int first_action(const std::vector<float> &s) = 0;

  // Receive the reward for the previous action and the new sensation;
  // return the next action.
  virtual int next_action(float r, const std::vector<float> &s) = 0;

  // Receive the final reward of the episode; no action is returned.
  virtual void last_action(float r) = 0;

  // Turn debug output on or off.
  virtual void setDebug(bool d) = 0;

  // Optionally seed the agent with a set of experiences.
  virtual void seedExp(std::vector<experience> seeds) {}

  // Optionally save the learned policy to a file.
  virtual void savePolicy(const char* filename) {}

  virtual ~Agent() {}
};
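
// Sketch of the intended Agent/Environment control flow (the function name is
// illustrative): first_action starts the episode, next_action passes the
// previous reward along with the new sensation, and last_action delivers the
// final reward without requesting another action.
inline float runEpisode(Agent &agent, Environment &env, int maxSteps) {
  env.reset();
  float sum = 0.0;
  int act = agent.first_action(env.sensation());
  for (int step = 0; step < maxSteps; step++) {
    float r = env.apply(act);
    sum += r;
    if (env.terminal()) {
      agent.last_action(r);
      break;
    }
    act = agent.next_action(r, env.sensation());
  }
  return sum;
}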

// Interface for a supervised model mapping an input vector to an output vector.
class Model {
public:
  // Train on a batch of input/output pairs; returns whether the model changed.
  virtual bool trainInstances(std::vector<modelPair> &instances) = 0;

  // Train on a single input/output pair; returns whether the model changed.
  virtual bool trainInstance(modelPair &instance) = 0;

  // Predict the output vector for the given input.
  virtual std::vector<float> testInstance(const std::vector<float> &in) = 0;

  virtual ~Model() {}
};

// Interface for a classifier over a single discrete output.
class Classifier {
public:
  // Train on a batch of instances; returns whether the classifier changed.
  virtual bool trainInstances(std::vector<classPair> &instances) = 0;

  // Train on a single instance; returns whether the classifier changed.
  virtual bool trainInstance(classPair &instance) = 0;

  // Fill retval with a map from class value to predicted probability.
  virtual void testInstance(const std::vector<float> &in, std::map<float, float>* retval) = 0;

  // Confidence of the classifier's prediction for the given input.
  virtual float getConf(const std::vector<float> &in) = 0;

  // Return a copy of this classifier.
  virtual Classifier* getCopy() = 0;

  virtual ~Classifier() {}
};
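
// Sketch (helper name is illustrative): reading a prediction out of a
// Classifier. testInstance fills the caller-provided map from class value to
// predicted probability; this picks the most probable class.
inline float mostLikelyClass(Classifier &c, const std::vector<float> &in) {
  std::map<float, float> dist;
  c.testInstance(in, &dist);
  float best = 0.0;
  float bestProb = -1.0;
  for (std::map<float, float>::iterator it = dist.begin();
       it != dist.end(); ++it) {
    if (it->second > bestProb) {
      bestProb = it->second;
      best = it->first;
    }
  }
  return best;
}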

// Model predictions for one state-action pair: expected reward, termination
// probability, and a distribution over next states.
struct StateActionInfo {
  bool known;       // whether the model considers this state-action known
  float reward;     // predicted expected reward
  float termProb;   // predicted probability that the transition is terminal
  int frameUpdated; // frame at which this prediction was last updated

  // Map from next state to predicted transition probability.
  std::map< std::vector<float> , float> transitionProbs;

  StateActionInfo() {
    known = false;
    reward = 0.0;
    termProb = 0.0;
    frameUpdated = -1;
  }
};
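
// Sketch (helper name is illustrative; assumes the probabilities in
// transitionProbs sum to 1): sample a successor state from a filled-in
// StateActionInfo, as a planner rolling out the learned model might do.
inline std::vector<float> sampleNextState(const StateActionInfo &info,
                                          Random &rng) {
  float p = rng.uniform();
  float cum = 0.0;
  for (std::map< std::vector<float> , float>::const_iterator it =
         info.transitionProbs.begin();
       it != info.transitionProbs.end(); ++it) {
    cum += it->second;
    if (p < cum)
      return it->first;
  }
  // Guard against round-off: fall back to the last entry (or an empty state).
  return info.transitionProbs.empty() ? std::vector<float>()
                                      : info.transitionProbs.rbegin()->first;
}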


// Interface for a learned MDP model built from experiences.
class MDPModel {
public:
  // Update the model with a batch of experiences; returns whether it changed.
  virtual bool updateWithExperiences(std::vector<experience> &instances) = 0;

  // Update the model with a single experience; returns whether it changed.
  virtual bool updateWithExperience(experience &instance) = 0;

  // Fill retval with the model's predictions for the given state and action;
  // returns a confidence measure for the prediction.
  virtual float getStateActionInfo(const std::vector<float> &state, int action, StateActionInfo* retval) = 0;

  // Return a copy of this model.
  virtual MDPModel* getCopy() = 0;
  virtual ~MDPModel() {}
};

// Interface for a planner: computes a policy from an MDPModel.
class Planner {
public:
  // Set the model the planner should plan over.
  virtual void setModel(MDPModel* model) = 0;

  // Update the model with one transition; returns whether the model changed.
  virtual bool updateModelWithExperience(const std::vector<float>& last,
                                         int act,
                                         const std::vector<float>& curr,
                                         float reward, bool terminal) = 0;

  // Re-plan now that the model has changed.
  virtual void planOnNewModel() = 0;

  // Return the best action for the given state under the current plan.
  virtual int getBestAction(const std::vector<float> &s) = 0;

  // Optionally save the planner's policy to a file.
  virtual void savePolicy(const char* filename) {}

  // Tell the planner whether incoming experiences are seeding experiences.
  virtual void setSeeding(bool seeding) {}

  // Tell the planner this is the first episode.
  virtual void setFirst() {}

  // Return an iterator to the maximum element in [start, end), breaking ties
  // (values within Q_EPSILON of the max) uniformly at random.
  std::vector<float>::iterator
  random_max_element(std::vector<float>::iterator start,
                     std::vector<float>::iterator end) {
    const float Q_EPSILON = 1e-4;

    std::vector<float>::iterator max = std::max_element(start, end);

    // Count how many elements tie with the max.
    int nfound = 0;
    for (std::vector<float>::iterator it = start; it != end; it++) {
      if (fabs(*it - *max) < Q_EPSILON)
        nfound++;
    }

    // A unique max: return it directly.
    if (nfound == 1)
      return max;

    // Otherwise select uniformly among the ties: accept each tied element
    // with probability 1/(ties remaining), so each one is chosen with
    // overall probability 1/nfound.
    for (std::vector<float>::iterator it = start; it != end; it++) {
      if (fabs(*it - *max) < Q_EPSILON) {
        if (rng.uniform() < (1.0 / (float)nfound))
          return it;
        nfound--;
      }
    }

    return max;
  }

  virtual ~Planner() {}

  Random rng;
};
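
// Example use of Planner::random_max_element (names here are illustrative):
// greedy action selection over a vector of Q-values, with ties broken
// uniformly at random instead of always favoring the lowest-indexed action.
inline int greedyAction(Planner &planner, std::vector<float> &qValues) {
  // The iterator's offset from begin() is the index of the selected action.
  return (int)(planner.random_max_element(qValues.begin(), qValues.end())
               - qValues.begin());
}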


#endif // RLCORE_H_