'''Example of Q-table learning with a simple discretized 1-pendulum environment.'''
import numpy as np
import matplotlib.pyplot as plt
import signal
import time
from dpendulum import DPendulum

# Random seed
RANDOM_SEED = int((time.time() % 10) * 1000)
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Hyperparameters (typical values; tune as needed)
NEPISODES     = 500     # Number of training episodes
NSTEPS        = 50      # Max episode length
LEARNING_RATE = 0.85    # Step size of the Q-table update
DECAY_RATE    = 0.99    # Discount factor

env = DPendulum()                   # Discretized 1-pendulum environment
NU  = env.nu                        # Number of discrete controls
Q   = np.zeros([env.nx, env.nu])    # Q-table initialized to 0

def rendertrial(maxiter=100):
    '''Roll-out from random state using greedy policy.'''
    x = env.reset()
    for i in range(maxiter):
        u = np.argmax(Q[x, :])      # Greedy action
        x, r = env.step(u)
        env.render()
        if r == 1: print('Reward!'); break
signal.signal(signal.SIGTSTP, lambda sig, frame: rendertrial())  # Render a roll-out on CTRL-Z (Unix only)

h_rwd = []                          # History of episode rewards (for plotting)
for episode in range(1, NEPISODES):
    x    = env.reset()
    rsum = 0.0
    for steps in range(NSTEPS):
        # Greedy action with decaying exploration noise
        u = np.argmax(Q[x, :] + np.random.randn(1, NU) / episode)
        x2, reward = env.step(u)
        # TD target from the Bellman equation, then Q-table update:
        # Q[x,u] <- Q[x,u] + alpha * (r + gamma * max_u' Q[x2,u'] - Q[x,u])
        Qref = reward + DECAY_RATE * np.max(Q[x2, :])
        Q[x, u] += LEARNING_RATE * (Qref - Q[x, u])
        x     = x2
        rsum += reward
        if reward == 1: break
    h_rwd.append(rsum)
    if not episode % 20:
        print('Episode #%d done with %d successes' % (episode, sum(h_rwd[-20:])))

print("Total rate of success: %.3f" % (sum(h_rwd) / NEPISODES))
plt.plot(np.cumsum(h_rwd) / np.arange(1, NEPISODES))  # Running average of the reward
plt.show()
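The listing imports DPendulum from dpendulum.py, which is not reproduced here. Below is a minimal sketch of the interface the script relies on (the nx and nu sizes, reset(), step(u) returning the next discrete state and a 0/1 reward, and render()). The class name ToyDPendulum, the grid resolutions, dynamics, and reward rule are illustrative assumptions, not the actual dpendulum implementation.

# Minimal sketch of the environment interface assumed above (not the real dpendulum.py):
# discrete states indexed in [0, nx), discrete controls in [0, nu),
# reset() -> state, step(u) -> (next state, 0/1 reward), render().
import numpy as np

class ToyDPendulum:
    def __init__(self, nq=21, nv=21, nu=11, dt=0.1):
        self.nq, self.nv, self.nu = nq, nv, nu   # Grid sizes (illustrative)
        self.nx = nq * nv                        # Number of discrete states
        self.vmax, self.umax = 8.0, 2.0          # Velocity and torque bounds (illustrative)
        self.dt = dt
        self.x = 0

    def reset(self):
        self.x = np.random.randint(self.nx)      # Random discrete state
        return self.x

    def decode(self, x):
        iq, iv = divmod(x, self.nv)
        q = -np.pi + 2 * np.pi * iq / (self.nq - 1)           # Angle from upright, in [-pi, pi]
        v = -self.vmax + 2 * self.vmax * iv / (self.nv - 1)   # Velocity in [-vmax, vmax]
        return q, v

    def encode(self, q, v):
        iq = int(round((q + np.pi) / (2 * np.pi) * (self.nq - 1)))
        iv = int(round((v + self.vmax) / (2 * self.vmax) * (self.nv - 1)))
        iq = min(max(iq, 0), self.nq - 1)
        iv = min(max(iv, 0), self.nv - 1)
        return iq * self.nv + iv

    def step(self, iu):
        q, v = self.decode(self.x)
        u = -self.umax + 2 * self.umax * iu / (self.nu - 1)   # Control index -> torque
        v = float(np.clip(v + self.dt * (u + 9.81 * np.sin(q)), -self.vmax, self.vmax))
        q = (q + self.dt * v + np.pi) % (2 * np.pi) - np.pi   # Integrate and wrap the angle
        self.x = self.encode(q, v)
        reward = 1 if self.x == self.encode(0.0, 0.0) else 0  # 1 only in the upright, still state
        return self.x, reward

    def render(self):
        q, v = self.decode(self.x)
        print("angle = %+.2f rad, velocity = %+.2f rad/s" % (q, v))

With this stub, env = ToyDPendulum() can stand in for DPendulum() when running the listing without the tutorial's own files.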