'''Example of Q-table learning with a simple discretized 1-pendulum environment.'''

import signal
import time

import matplotlib.pyplot as plt
import numpy as np
from dpendulum import DPendulum

RANDOM_SEED = int((time.time() % 10) * 1000)
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Hyper-parameters (not shown in this excerpt; the values below are typical choices).
NEPISODES = 500        # number of training episodes
NSTEPS = 50            # maximum number of steps per episode
LEARNING_RATE = 0.85   # step size of the Q-table update
DECAY_RATE = 0.99      # discount factor

env = DPendulum()               # discretized pendulum environment
NU = env.nu                     # number of discrete controls
Q = np.zeros([env.nx, env.nu])  # Q-table initialized to zero

def rendertrial(maxiter=100):
    '''Roll-out from random state using greedy policy.'''
    x = env.reset()
    for i in range(maxiter):
        u = np.argmax(Q[x, :])          # greedy action
        x, r = env.step(u)
        env.render()
        if r == 1: print('Reward!'); break

signal.signal(signal.SIGTSTP, lambda x, y: rendertrial())  # render a trial on CTRL-Z

h_rwd = []                              # history of episode rewards, for plotting
for episode in range(1, NEPISODES):
    x = env.reset()
    rsum = 0.0
    for steps in range(NSTEPS):
        # Greedy action perturbed by exploration noise that decays with the episode index.
        u = np.argmax(Q[x, :] + np.random.randn(1, NU) / episode)
        x2, reward = env.step(u)
        # Reference Q-value from the Bellman backup at the next state.
        Qref = reward + DECAY_RATE * np.max(Q[x2, :])
        # Move Q[x,u] toward the reference value.
        Q[x, u] += LEARNING_RATE * (Qref - Q[x, u])
        x = x2
        rsum += reward
        if reward == 1: break
    h_rwd.append(rsum)
    if not episode % 20:
        print('Episode #%d done with %d success' % (episode, sum(h_rwd[-20:])))

print("Total rate of success: %.3f" % (sum(h_rwd) / NEPISODES))
plt.plot(np.cumsum(h_rwd) / np.arange(1, NEPISODES))
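The two statements computing Qref and updating Q[x,u] are the standard tabular Q-learning backup; with \( \alpha = \) LEARNING_RATE and \( \gamma = \) DECAY_RATE it reads

\[ Q(x,u) \leftarrow Q(x,u) + \alpha \bigl( r + \gamma \max_{u'} Q(x',u') - Q(x,u) \bigr), \]

where \( x' \) is the state reached after applying control \( u \) in state \( x \). The term np.random.randn(1,NU)/episode is the exploration strategy used here: a Gaussian perturbation of the greedy action whose magnitude decays as 1/episode, so the policy becomes increasingly greedy as learning progresses.

As a minimal sketch, assuming the training loop above has run, the greedy policy and value function implied by the learned Q-table can be read off directly (these two lines are not part of the original script):

    policy = np.argmax(Q, axis=1)   # best control index for each discrete state
    value  = np.max(Q, axis=1)      # corresponding estimate of the optimal value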