#ifndef RLCORE_H_
#define RLCORE_H_

#include "Random.h"

#include <algorithm> // std::max_element, used by Planner::random_max_element
#include <cmath>     // fabs
#include <map>
#include <string>
#include <vector>

// Codes for the available model learners. Note that RMAX shares code 0 with
// TABULAR: they select the same (tabular) model.
#define RMAX 0
#define TABULAR 0
#define SLF 1
#define C45TREE 2
#define SINGLETREE 3
#define SVM 4
#define STUMP 5
#define M5MULTI 6
#define M5SINGLE 7
#define M5ALLMULTI 8
#define M5ALLSINGLE 9
#define LSTMULTI 10
#define LSTSINGLE 11
#define ALLM5TYPES 12
#define GPREGRESS 13
#define GPTREE 14

// Human-readable names for the model types, indexed by the codes above.
// The four M5 variants all display as "M5 Tree", and both LS variants as
// "LS Tree".
const std::string modelNames[] = {
  "Tabular",
  "SLF",
  "C4.5 Tree",
  "Single Tree",
  "SVM",
  "Stump",
  "M5 Tree",
  "M5 Tree",
  "M5 Tree",
  "M5 Tree",
  "LS Tree",
  "LS Tree",
  "M5 Combo",
  "GP Regression",
  "GP Tree"
};
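
// Example (illustrative, not part of the original API): the name tables in
// this header are indexed directly by the integer codes above, so a lookup is
// a plain array index. The helper name is hypothetical; the same pattern
// applies to the combo, explore, and planner tables below.
inline std::string getModelName(int modelType) {
  return modelNames[modelType]; // e.g. getModelName(C45TREE) == "C4.5 Tree"
}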


// Codes for how to combine the predictions of multiple models (e.g. a forest).
#define AVERAGE 1
#define WEIGHTAVG 2
#define BEST 3
#define SEPARATE 4 // separate model for planning, and forest for uncertainty

// Names for the combination methods. These codes start at 1, so they line up
// with the array at [code - 1].
const std::string comboNames[] = {
  "Average",
  "Weighted Average",
  "Best",
  "Separate"
};


// Codes for the exploration strategies. NO_EXPLORE and GREEDY share code 7;
// the gaps in the numbering (10, 12, 14, 15, 17) are unused codes.
#define EXPLORE_UNKNOWN 0
#define TWO_MODE 1
#define TWO_MODE_PLUS_R 2
#define CONTINUOUS_BONUS 3
#define THRESHOLD_BONUS 4
#define CONTINUOUS_BONUS_R 5
#define THRESHOLD_BONUS_R 6
#define NO_EXPLORE 7
#define GREEDY 7
#define EPSILONGREEDY 8
#define VISITS_CONF 9
#define UNVISITED_BONUS 11
#define UNVISITED_ACT_BONUS 13
#define DIFF_AND_VISIT_BONUS 16
#define NOVEL_STATE_BONUS 18
#define DIFF_AND_NOVEL_BONUS 19

// Names for the exploration types, indexed by the codes above. "Type N"
// entries are placeholders for the unused codes.
const std::string exploreNames[] = {
  "Explore Unknowns",
  "Two Modes",
  "Two Modes +R",
  "Continuous Bonus",
  "Threshold Bonus",
  "Continuous Bonus +R",
  "Threshold Bonus +R",
  "Greedy",
  "Epsilon-Greedy",
  "Visits Confidence",
  "Type 10",
  "Unvisited State Bonus",
  "Type 12",
  "Unvisited Action Bonus",
  "Type 14",
  "Type 15",
  "Model Diff & Visit Bonus",
  "Type 17",
  "FeatDist Bonus",
  "Model Diff & FeatDist Bonus"
};


// Codes for the planner types.
#define VALUE_ITERATION 0
#define POLICY_ITERATION 1
#define PRI_SWEEPING 2
#define UCT 3
#define ET_UCT 4
#define ET_UCT_WITH_ENV 5
#define SWEEPING_UCT_HYBRID 6
#define CMAC_PLANNER 7
#define NN_PLANNER 8
#define MOD_PRI_SWEEPING 9
#define ET_UCT_L1 10
#define UCT_WITH_L 11
#define UCT_WITH_ENV 12
#define PARALLEL_ET_UCT 13
#define ET_UCT_ACTUAL 14
#define ET_UCT_CORNERS 15
#define PAR_ETUCT_ACTUAL 16
#define PAR_ETUCT_CORNERS 17
#define POMDP_ETUCT 18
#define POMDP_PAR_ETUCT 19
#define MBS_VI 20

// Names for the planner types, indexed by the codes above. Several UCT
// variants share the plain "UCT" display name.
const std::string plannerNames[] = {
  "Value Iteration",
  "Policy Iteration",
  "Prioritized Sweeping",
  "UCT",
  "UCT",
  "UCT",
  "Sweeping UCT Hybrid",
  "CMACs",
  "NN",
  "Mod. Pri Sweeping",
  "UCT L=1",
  "UCT L",
  "UCT Env",
  "Parallel UCT",
  "Real-Valued UCT",
  "Corner UCT",
  "Parallel Real-Valued UCT",
  "Parallel Corner UCT",
  "Delayed UCT",
  "Parallel Delayed UCT",
  "Model Based Simulation - VI"
};



// Small tolerance for floating-point comparisons.
#define EPSILON 1e-5

// One transition: state s, action act, one-step reward, next state, and
// whether the transition was terminal.
struct experience {
  std::vector<float> s;
  int act;
  float reward;
  std::vector<float> next;
  bool terminal;
};

// Training example for a Model: input feature vector and target output vector.
struct modelPair {
  std::vector<float> in;
  std::vector<float> out;
};

// Training example for a Classifier: input feature vector and target class.
struct classPair {
  std::vector<float> in;
  float out;
};
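
// Sketch (helper name is illustrative): packaging one environment step as the
// experience tuple the structs above expect.
inline experience makeExperience(const std::vector<float> &s, int act,
                                 float reward, const std::vector<float> &next,
                                 bool terminal) {
  experience e;
  e.s = s;
  e.act = act;
  e.reward = reward;
  e.next = next;
  e.terminal = terminal;
  return e;
}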

// Interface for an MDP environment: everything the agent sees of the world.
class Environment {
public:
  // Current sensation (state feature vector) of the environment.
  virtual const std::vector<float> &sensation() const = 0;

  // Apply the given action; returns the one-step reward.
  virtual float apply(int action) = 0;

  // True if the current state is terminal.
  virtual bool terminal() const = 0;

  // Reset the environment to a start state for a new episode.
  virtual void reset() = 0;

  // Number of discrete actions available.
  virtual int getNumActions() = 0;

  // Fill in the minimum and maximum value of each state feature.
  virtual void getMinMaxFeatures(std::vector<float> *minFeat,
                                 std::vector<float> *maxFeat) = 0;

  // Fill in the minimum and maximum one-step reward.
  virtual void getMinMaxReward(float *minR, float *maxR) = 0;

  // Whether the task is episodic (defaults to true).
  virtual bool isEpisodic() { return true; }

  // Optional seed experiences to initialize a model with (none by default).
  virtual std::vector<experience> getSeedings() {
    std::vector<experience> e;
    return e;
  }

  // Optionally set the environment to a particular state (e.g. for seeding).
  virtual void setSensation(std::vector<float> s) {}

  virtual ~Environment() {}
};
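
// Minimal sketch of a concrete Environment (hypothetical, for illustration
// only): a five-cell corridor where action 0 moves left and action 1 moves
// right; reaching the rightmost cell gives reward +1 and ends the episode.
class ExampleCorridorEnv: public Environment {
public:
  ExampleCorridorEnv(): pos(1), s(1, 1.0f) {}

  virtual const std::vector<float> &sensation() const { return s; }

  virtual float apply(int action) {
    pos += (action == 1) ? 1 : -1;
    if (pos < 0) pos = 0;
    if (pos > 4) pos = 4;
    s[0] = (float)pos;
    return (pos == 4) ? 1.0 : 0.0; // goal reward at the right end
  }

  virtual bool terminal() const { return pos == 4; }
  virtual void reset() { pos = 1; s[0] = (float)pos; }
  virtual int getNumActions() { return 2; }

  virtual void getMinMaxFeatures(std::vector<float> *minFeat,
                                 std::vector<float> *maxFeat) {
    minFeat->assign(1, 0.0);
    maxFeat->assign(1, 4.0);
  }

  virtual void getMinMaxReward(float *minR, float *maxR) {
    *minR = 0.0;
    *maxR = 1.0;
  }

private:
  int pos;
  std::vector<float> s;
};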

// Interface for an agent: produces actions given sensations and rewards.
class Agent {
public:
  // Start an episode: receive the first sensation, return the first action.
  virtual int first_action(const std::vector<float> &s) = 0;

  // Receive the reward for the previous action and the new sensation;
  // return the next action.
  virtual int next_action(float r, const std::vector<float> &s) = 0;

  // Receive the final reward of the episode; no action is returned.
  virtual void last_action(float r) = 0;

  // Turn debug output on or off.
  virtual void setDebug(bool d) = 0;

  // Optionally seed the agent with a set of experiences.
  virtual void seedExp(std::vector<experience> seeds) {}

  // Optionally save the learned policy to a file.
  virtual void savePolicy(const char* filename) {}

  virtual ~Agent() {}
};
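
// Sketch of the intended Agent/Environment control flow (the function name is
// illustrative): first_action starts the episode, next_action passes the
// previous reward along with the new sensation, and last_action delivers the
// final reward without requesting another action.
inline float runEpisode(Agent &agent, Environment &env, int maxSteps) {
  env.reset();
  float sum = 0.0;
  int act = agent.first_action(env.sensation());
  for (int step = 0; step < maxSteps; step++) {
    float r = env.apply(act);
    sum += r;
    if (env.terminal()) {
      agent.last_action(r);
      break;
    }
    act = agent.next_action(r, env.sensation());
  }
  return sum;
}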

// Interface for a supervised model mapping an input vector to an output vector.
class Model {
public:
  // Train on a batch of input/output pairs; returns whether the model changed.
  virtual bool trainInstances(std::vector<modelPair> &instances) = 0;

  // Train on a single input/output pair; returns whether the model changed.
  virtual bool trainInstance(modelPair &instance) = 0;

  // Predict the output vector for the given input.
  virtual std::vector<float> testInstance(const std::vector<float> &in) = 0;

  virtual ~Model() {}
};

// Interface for a classifier over a single discrete output.
class Classifier {
public:
  // Train on a batch of instances; returns whether the classifier changed.
  virtual bool trainInstances(std::vector<classPair> &instances) = 0;

  // Train on a single instance; returns whether the classifier changed.
  virtual bool trainInstance(classPair &instance) = 0;

  // Fill retval with a map from class value to predicted probability.
  virtual void testInstance(const std::vector<float> &in, std::map<float, float>* retval) = 0;

  // Confidence of the classifier's prediction for the given input.
  virtual float getConf(const std::vector<float> &in) = 0;

  // Return a copy of this classifier.
  virtual Classifier* getCopy() = 0;

  virtual ~Classifier() {}
};
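
// Sketch (helper name is illustrative): reading a prediction out of a
// Classifier. testInstance fills the caller-provided map from class value to
// predicted probability; this picks the most probable class.
inline float mostLikelyClass(Classifier &c, const std::vector<float> &in) {
  std::map<float, float> dist;
  c.testInstance(in, &dist);
  float best = 0.0;
  float bestProb = -1.0;
  for (std::map<float, float>::iterator it = dist.begin();
       it != dist.end(); ++it) {
    if (it->second > bestProb) {
      bestProb = it->second;
      best = it->first;
    }
  }
  return best;
}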

// Model predictions for one state-action pair: expected reward, termination
// probability, and a distribution over next states.
struct StateActionInfo {
  bool known;       // whether the model considers this state-action known
  float reward;     // predicted expected reward
  float termProb;   // predicted probability that the transition is terminal
  int frameUpdated; // frame at which this prediction was last updated

  // Map from next state to predicted transition probability.
  std::map< std::vector<float> , float> transitionProbs;

  StateActionInfo() {
    known = false;
    reward = 0.0;
    termProb = 0.0;
    frameUpdated = -1;
  }
};
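
// Sketch (helper name is illustrative; assumes the probabilities in
// transitionProbs sum to 1): sample a successor state from a filled-in
// StateActionInfo, as a planner rolling out the learned model might do.
inline std::vector<float> sampleNextState(const StateActionInfo &info,
                                          Random &rng) {
  float p = rng.uniform();
  float cum = 0.0;
  for (std::map< std::vector<float> , float>::const_iterator it =
         info.transitionProbs.begin();
       it != info.transitionProbs.end(); ++it) {
    cum += it->second;
    if (p < cum)
      return it->first;
  }
  // Guard against round-off: fall back to the last entry (or an empty state).
  return info.transitionProbs.empty() ? std::vector<float>()
                                      : info.transitionProbs.rbegin()->first;
}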


// Interface for a learned MDP model built from experiences.
class MDPModel {
public:
  // Update the model with a batch of experiences; returns whether it changed.
  virtual bool updateWithExperiences(std::vector<experience> &instances) = 0;

  // Update the model with a single experience; returns whether it changed.
  virtual bool updateWithExperience(experience &instance) = 0;

  // Fill retval with the model's predictions for the given state and action;
  // returns a confidence measure for the prediction.
  virtual float getStateActionInfo(const std::vector<float> &state, int action, StateActionInfo* retval) = 0;

  // Return a copy of this model.
  virtual MDPModel* getCopy() = 0;
  virtual ~MDPModel() {}
};

// Interface for a planner: computes a policy from an MDPModel.
class Planner {
public:
  // Set the model the planner should plan over.
  virtual void setModel(MDPModel* model) = 0;

  // Update the model with one transition; returns whether the model changed.
  virtual bool updateModelWithExperience(const std::vector<float>& last,
                                         int act,
                                         const std::vector<float>& curr,
                                         float reward, bool terminal) = 0;

  // Re-plan now that the model has changed.
  virtual void planOnNewModel() = 0;

  // Return the best action for the given state under the current plan.
  virtual int getBestAction(const std::vector<float> &s) = 0;

  // Optionally save the planner's policy to a file.
  virtual void savePolicy(const char* filename) {}

  // Tell the planner whether incoming experiences are seeding experiences.
  virtual void setSeeding(bool seeding) {}

  // Tell the planner this is the first episode.
  virtual void setFirst() {}

  // Return an iterator to the maximum element in [start, end), breaking ties
  // (values within Q_EPSILON of the max) uniformly at random.
  std::vector<float>::iterator
  random_max_element(std::vector<float>::iterator start,
                     std::vector<float>::iterator end) {
    const float Q_EPSILON = 1e-4;

    std::vector<float>::iterator max = std::max_element(start, end);

    // Count how many elements tie with the max.
    int nfound = 0;
    for (std::vector<float>::iterator it = start; it != end; it++) {
      if (fabs(*it - *max) < Q_EPSILON)
        nfound++;
    }

    // A unique max: return it directly.
    if (nfound == 1)
      return max;

    // Otherwise select uniformly among the ties: accept each tied element
    // with probability 1/(ties remaining), so each one is chosen with
    // overall probability 1/nfound.
    for (std::vector<float>::iterator it = start; it != end; it++) {
      if (fabs(*it - *max) < Q_EPSILON) {
        if (rng.uniform() < (1.0 / (float)nfound))
          return it;
        nfound--;
      }
    }

    return max;
  }

  virtual ~Planner() {}

  Random rng;
};
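
// Example use of Planner::random_max_element (names here are illustrative):
// greedy action selection over a vector of Q-values, with ties broken
// uniformly at random instead of always favoring the lowest-indexed action.
inline int greedyAction(Planner &planner, std::vector<float> &qValues) {
  // The iterator's offset from begin() is the index of the selected action.
  return (int)(planner.random_max_element(qValues.begin(), qValues.end())
               - qValues.begin());
}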


#endif // RLCORE_H_