"""Example of Q-table learning with a simple discretized 1-pendulum environment."""
import time

import matplotlib.pyplot as plt
import numpy as np

from dpendulum import DPendulum
 
RANDOM_SEED = int((time.time() % 10) * 1000)  # Quasi-random seed from the clock
print(f"Seed = {RANDOM_SEED}")
np.random.seed(RANDOM_SEED)
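
# The hyper-parameter and environment definitions are elided in this excerpt.
# A minimal sketch so the code below runs; the values are assumptions, but
# every name here is used by the training loop that follows.
NEPISODES = 500  # Number of training episodes (assumed value)
NSTEPS = 50  # Maximum number of steps per episode (assumed value)
LEARNING_RATE = 0.85  # Step size of the Q-table update (assumed value)
DECAY_RATE = 0.99  # Discount factor (assumed value)

env = DPendulum()  # Discretized single-pendulum environment
NU = env.nu  # Number of discrete controls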
 
Q = np.zeros([env.nx, env.nu])  # Q-table: one value per (state, control) pair
 
def rendertrial(maxiter=100):  # def line elided in the excerpt; name assumed
    """Roll-out from a random state using the greedy policy."""
    s = env.reset()
    for i in range(maxiter):
        a = np.argmax(Q[s, :])  # Greedy action: best Q-value in state s
        s, _ = env.step(a)
        env.render()  # Display each step (render() assumed on the environment)
 
h_rwd = []  # Learning history: one reward (success) score per episode

for episode in range(1, NEPISODES):
    x = env.reset()
    rsum = 0.0
    for steps in range(NSTEPS):
        # Greedy action, with exploration noise that decays over episodes
        u = np.argmax(Q[x, :] + np.random.randn(1, NU) / episode)
        x2, reward = env.step(u)

        # TD target: Qref = r + gamma * max_u' Q(x', u')
        Qref = reward + DECAY_RATE * np.max(Q[x2, :])

        # Move Q(x, u) towards the TD target by a step of LEARNING_RATE
        Q[x, u] += LEARNING_RATE * (Qref - Q[x, u])

        x = x2
        rsum += reward
        if reward == 1:
            break  # Goal reached: stop the episode and count a success
    h_rwd.append(rsum)
 
    if not episode % 20:  # Progress report every 20 episodes
        print(f"Episode #{episode} done with {sum(h_rwd[-20:])} successes")
 
print(f"Total rate of success: {sum(h_rwd) / NEPISODES:.3f}")
 
plt.plot(np.cumsum(h_rwd) / np.arange(1, NEPISODES))  # Running success rate
plt.show()