"""Example of Q-table learning with a simple discretized 1-pendulum environment."""
import time

import matplotlib.pyplot as plt
import numpy as np
from dpendulum import DPendulum

# --- Random seed, derived from the current time so that each run differs
RANDOM_SEED = int((time.time() % 10) * 1000)
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)
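
# --- Hyperparameters. These constants are referenced by the training loop below
# but their definitions are missing from this excerpt; the values here are
# assumed, typical settings for tabular Q-learning and may need tuning.
NEPISODES = 500  # Number of training episodes (assumed)
NSTEPS = 50  # Maximum steps per episode (assumed)
LEARNING_RATE = 0.85  # Step size of the Q-table update (assumed)
DECAY_RATE = 0.99  # Discount factor (assumed)

# --- Environment: discretized single pendulum (instantiation assumed).
env = DPendulum()
NU = env.nu  # Number of discrete controls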

Q = np.zeros([env.nx, env.nu])  # Q-table: one value per (state, control) pair
32 """Roll-out from random state using greedy policy."""
34 for i
in range(maxiter):
35 a = np.argmax(Q[s, :])
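

h_rwd = []  # Learning history: total reward per episode (needed by the prints and plot below)
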
for episode in range(1, NEPISODES):
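    x = env.reset()  # Start each episode from a reset state (assumed bookkeeping)
    rsum = 0.0  # Accumulated reward over the episode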
    for steps in range(NSTEPS):
        # Greedy action plus exploration noise that decays with the episode index
        u = np.argmax(Q[x, :] + np.random.randn(1, NU) / episode)
        x2, reward = env.step(u)
        # Reference Q-value from the Bellman equation: immediate reward plus
        # the discounted value of the best action at the next state
        Qref = reward + DECAY_RATE * np.max(Q[x2, :])
        # Update the Q-table toward the reference value
        Q[x, u] += LEARNING_RATE * (Qref - Q[x, u])
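        x = x2  # Advance to the next state (assumed bookkeeping)
        rsum += reward

    h_rwd.append(rsum)  # Record the episode return for reporting and plotting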
    if not episode % 20:  # Reporting cadence assumed from the 20-episode window
        print("Episode #%d done with %d successes" % (episode, sum(h_rwd[-20:])))

print("Total rate of success: %.3f" % (sum(h_rwd) / NEPISODES))
# Plot the running average of episode rewards over training
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))
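plt.show()  # Display the learning curve (assumed; needed outside interactive sessions)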