"""
Example of Q-table learning with a simple discretized 1-pendulum environment.
"""
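
# Background, for readers new to tabular Q-learning: Q[x, u] estimates the
# value of taking control u in state x. Each observed transition
# (x, u, reward, x2) pulls Q[x, u] toward the sampled Bellman target
#     reward + DECAY_RATE * max_u' Q[x2, u'],
# so the table approaches the fixed point of the Bellman optimality equation
# as long as every state-action pair keeps being visited.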
import time

import matplotlib.pyplot as plt
import numpy as np

from dpendulum import DPendulum
RANDOM_SEED = int((time.time() % 10) * 1000)
print(f"Seed = {RANDOM_SEED}")
np.random.seed(RANDOM_SEED)
# Hyperparameters (illustrative values).
NEPISODES = 500       # number of training episodes
NSTEPS = 50           # maximum number of steps per episode
LEARNING_RATE = 0.85  # step size of the Q-table update
DECAY_RATE = 0.99     # discount factor

# Discretized pendulum environment: env.nx states and env.nu controls.
env = DPendulum()
NU = env.nu

Q = np.zeros([env.nx, env.nu])  # Q-table, initialized to zero
32 """Roll-out from random state using greedy policy."""
34 for i
in range(maxiter):
35 a = np.argmax(Q[s, :])
h_rwd = []  # history of per-episode accumulated reward
for episode in range(1, NEPISODES):
    x = env.reset()
    rsum = 0.0
    for steps in range(NSTEPS):
        # Greedy action, with exploration noise decaying as 1/episode.
        u = np.argmax(Q[x, :] + np.random.randn(1, NU) / episode)
        x2, reward = env.step(u)
        # Sampled Bellman target: reward plus discounted best next value.
        Qref = reward + DECAY_RATE * np.max(Q[x2, :])
        # Move Q[x, u] a step of LEARNING_RATE toward the target.
        Q[x, u] += LEARNING_RATE * (Qref - Q[x, u])
        x = x2
        rsum += reward
    h_rwd.append(rsum)
    # Progress report every 20 episodes.
    if not episode % 20:
        print(f"Episode #{episode} done with {sum(h_rwd[-20:])} successes")
print(f"Total rate of success: {sum(h_rwd) / NEPISODES:.3f}")
# Plot the running average of the per-episode reward.
plt.plot(np.cumsum(h_rwd) / np.arange(1, NEPISODES))
plt.show()
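
# A minimal sketch (the names `policy` and `value` are illustrative, not from
# the original script): the greedy policy and the state-value estimates can
# be read directly off the learned table.
policy = np.argmax(Q, axis=1)  # best discrete control for each state
value = np.max(Q, axis=1)      # estimated value of each state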