00001
00002
00003 #include "QLearningActionSelector.h"
00004
00005 #include "RewardFunction.h"
00006
00007 #include <cstdlib>
00008 #include <iterator>
00009 #include <algorithm>
00010
00011 #include <sstream>
00012 #include <iostream>
00013 #include <fstream>
00014
// Compile-time switch for logging: when non-zero the ROS_* macros come from
// <ros/console.h>; when zero (the setting here) they are replaced by the
// empty definitions below.
#define ROSOUTPUT 0
#if ROSOUTPUT
#include <ros/console.h>
#else
#include <iostream>




// Empty stand-ins: with these definitions every ROS_DEBUG / ROS_DEBUG_STREAM /
// ROS_INFO_STREAM statement in this file expands to nothing, so the
// expressions passed as arguments are never evaluated.
#define ROS_DEBUG( X )
#define ROS_DEBUG_STREAM( X )
#define ROS_INFO_STREAM( X )
#endif
// Probability with which choose() returns a randomly picked option instead of
// the option with the highest stored value.
#define EPSILON 0.1
00029
00030 using namespace actasp;
00031 using namespace std;
00032
00033 namespace bwi_krexec {
00034
00035 struct CompareValues {
00036
00037 CompareValues(QLearningActionSelector::ActionValueMap& value) : value(value) {}
00038
00039 bool operator()(const AspFluent& first, const AspFluent& second) {
00040 return value[first] < value[second];
00041 }
00042
00043 QLearningActionSelector::ActionValueMap& value;
00044 };
00045
00046 QLearningActionSelector::QLearningActionSelector(double alpha, RewardFunction<State> *reward,
00047 actasp::AspKR *reasoner, DefaultActionValue *defval) :
00048 reasoner(reasoner),
00049 defval(defval),
00050 alpha(alpha),
00051 reward(reward),
00052 value(),
00053 initial(),
00054 final(),
00055 previousAction("noaction(0)"),
00056 count(0) {}
00057
00058
00059 struct CompareSecond {
00060 bool operator()(const pair<AspFluent, double>& first, const pair<AspFluent, double>& second) {
00061 return first.second < second.second;
00062 }
00063 };
00064
00065 actasp::ActionSet::const_iterator QLearningActionSelector::choose(const actasp::ActionSet& options) throw() {
00066
00067 if (!(initial.empty() || final.empty())) {
00068
00069 ActionValueMap::const_iterator bestValuePair = max_element(value[final].begin(), value[final].end(),CompareSecond());
00070
00071 double bestValue = 0;
00072 if(bestValuePair != value[final].end())
00073 bestValue = bestValuePair->second;
00074
00075 double rew = reward->r(initial,previousAction,final);
00076
00077
00078 ROS_INFO_STREAM("old value: " << value[initial][previousAction]);
00079 ROS_INFO_STREAM("reward: " << rew);
00080
00081 value[initial][previousAction] = (1 - alpha) * value[initial][previousAction] + alpha * (rew + bestValue);
00082
00083 ROS_INFO_STREAM("new value: " << value[initial][previousAction]);
00084
00085 initial.clear();
00086 final.clear();
00087
00088 }
00089
00090
00091 stringstream ss;
00092 ss << "Evaluating options: ";
00093 copy(options.begin(), options.end(), ostream_iterator<string>(ss, " "));
00094 ss << endl;
00095
00096 AnswerSet currentState = reasoner->currentStateQuery(vector<AspRule>());
00097 State state(currentState.getFluents().begin(), currentState.getFluents().end());
00098
00099 ActionSet::const_iterator optIt = options.begin();
00100 for (; optIt != options.end(); ++optIt) {
00101 ActionValueMap &thisState = value[state];
00102
00103 if(thisState.find(*optIt) == thisState.end()) {
00104
00105 thisState[*optIt] = defval->value(*optIt);
00106 }
00107 ss << value[state][*optIt] << " ";
00108 }
00109
00110 ROS_INFO_STREAM(ss.str());
00111
00112 double prob = EPSILON;
00113
00114 if (rand() <= prob * RAND_MAX) {
00115 ActionSet::const_iterator chosen = options.begin();
00116 advance(chosen, rand() % options.size());
00117
00118 return chosen;
00119 }
00120
00121 actasp::ActionSet::const_iterator best = max_element(options.begin(), options.end(),CompareValues(value[state]));
00122
00123 return best;
00124
00125 }
00126
00127 void QLearningActionSelector::actionStarted(const AspFluent&) throw() {
00128 initial.clear();
00129
00130 AnswerSet currentState = reasoner->currentStateQuery(vector<AspRule>());
00131 initial.insert(currentState.getFluents().begin(), currentState.getFluents().end());
00132 }
00133
00134
00135 void QLearningActionSelector::actionTerminated(const AspFluent& action) throw() {
00136 AnswerSet currentState = reasoner->currentStateQuery(vector<AspRule>());
00137 final.clear();
00138 final.insert(currentState.getFluents().begin(), currentState.getFluents().end());
00139 previousAction = action;
00140 }
00141
00142 void QLearningActionSelector::episodeEnded() {
00143 if(initial.empty())
00144 return;
00145
00146 ROS_INFO_STREAM("old value: " << value[initial][previousAction]);
00147 value[initial][previousAction] = (1 - alpha) * value[initial][previousAction] + alpha * reward->r(initial,previousAction,final);
00148 ROS_INFO_STREAM("new value: " << value[initial][previousAction]);
00149
00150 initial.clear();
00151 final.clear();
00152 ++count;
00153 }
00154
00155
00156 void QLearningActionSelector::readFrom(std::istream & fromStream) throw() {
00157
00158 ROS_DEBUG("Loading value function");
00159
00160 const string whiteSpaces(" \t");
00161
00162 value.clear();
00163
00164 while (fromStream.good() && !fromStream.eof()) {
00165
00166 string stateLine;
00167 getline(fromStream,stateLine);
00168
00169 size_t firstChar = min(stateLine.find_first_of(whiteSpaces),static_cast<size_t>(0));
00170 size_t lastChar = min(stateLine.find_last_not_of(whiteSpaces),stateLine.size());
00171 stateLine = stateLine.substr(firstChar,lastChar-firstChar+1);
00172
00173 stringstream stateStream(stateLine);
00174
00175
00176
00177 if (stateLine.empty())
00178 return;
00179
00180 State state;
00181 copy(istream_iterator<string>(stateStream), istream_iterator<string>(), inserter(state, state.begin()));
00182
00183
00184 string actionLine;
00185 getline(fromStream,actionLine);
00186
00187 while (actionLine.find("-----") == string::npos) {
00188
00189 size_t firstChar = min(actionLine.find_first_of(whiteSpaces), static_cast<size_t>(0));
00190 size_t lastChar = min(actionLine.find_last_not_of(whiteSpaces),actionLine.size());
00191 actionLine = actionLine.substr(firstChar,lastChar-firstChar+1);
00192
00193 stringstream actionStream(actionLine);
00194
00195 double actionValue;
00196 string fluentString;
00197
00198 actionStream >> actionValue >> fluentString;
00199
00200 AspFluent action(fluentString);
00201
00202 value[state].insert(make_pair(action,actionValue));
00203
00204 getline(fromStream,actionLine);
00205 }
00206
00207 }
00208 }
00209
00210
00211 void QLearningActionSelector::writeTo(std::ostream & toStream) throw() {
00212
00213 ROS_DEBUG("Storing value function");
00214
00215 StateActionMap::const_iterator stateIt = value.begin();
00216
00217 ofstream stat("stats.txt", ios::app);
00218 AspFluent initialState("pos(2,0,0)");
00219
00220 for (; stateIt != value.end(); ++stateIt) {
00221
00222
00223 if(stateIt->first.find(initialState) != stateIt->first.end()) {
00224 ActionValueMap::const_iterator actionIt= stateIt->second.begin();
00225 for (; actionIt != stateIt->second.end(); ++actionIt) {
00226 if(stateIt->first.find(initialState) != stateIt->first.end())
00227 stat << value[stateIt->first][actionIt->first] << " ";
00228 }
00229
00230 }
00231
00232
00233
00234
00235 copy(stateIt->first.begin(), stateIt->first.end(), ostream_iterator<string>(toStream, " "));
00236
00237
00238 ActionValueMap::const_iterator actionIt = stateIt->second.begin();
00239 for (; actionIt != stateIt->second.end(); ++actionIt) {
00240
00241 toStream << endl;
00242
00243
00244
00245 toStream << actionIt->second << " " << actionIt->first.toString();
00246
00247 }
00248
00249
00250 toStream << endl << "-----" << endl;
00251 }
00252 stat << endl;
00253 stat.close();
00254
00255 }
00256 }