"""
Example of Q-table learning with a simple discretized 1-pendulum environment using a
linear Q-network.
"""
import time

import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from dpendulum import DPendulum
# Random seed (printed so that a particular run can be reproduced)
RANDOM_SEED = int((time.time() % 10) * 1000)
print(f"Seed = {RANDOM_SEED}")
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
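# --- Hyper-parameters and environment setup (elided from this excerpt).
# --- The values below are illustrative assumptions, not the original settings;
# --- the remainder of the script only relies on the names defined here.
NEPISODES = 500        # number of training episodes (assumed)
NSTEPS = 60            # maximum number of steps per episode (assumed)
LEARNING_RATE = 0.1    # step length of the gradient-descent optimizer (assumed)
DECAY_RATE = 0.99      # discount factor of the Q-learning update (assumed)

env = DPendulum()      # discretized 1-pendulum environment
NX = env.nx            # number of discrete states
NU = env.nu            # number of discrete controls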
# Q-value network: a single linear layer mapping the one-hot state to the Q-values
# of all NU controls.  The graph is wrapped in a class so that the training loop
# below can refer to qvalue.x, qvalue.u, qvalue.qvalue, qvalue.qref and qvalue.optim.
class QValueNetwork:
    def __init__(self):
        self.x = tf.placeholder(shape=[1, NX], dtype=tf.float32)
        self.W = tf.Variable(tf.random_uniform([NX, NU], 0, 0.01, seed=100))
        self.qvalue = tf.matmul(self.x, self.W)      # Q-values of every control at state x
        self.u = tf.argmax(self.qvalue, 1)           # greedy policy at state x

        self.qref = tf.placeholder(shape=[1, NU], dtype=tf.float32)
        self.loss = tf.reduce_sum(tf.square(self.qref - self.qvalue))
        self.optim = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(self.loss)
# TensorFlow initialization
tf.reset_default_graph()
qvalue = QValueNetwork()
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
60 """Return a vector which is 0 everywhere except index <i> set to 1."""
63 [(i == ix)
for i
in range(n)],
def disturb(u, i):
    """Perturb the control <u> with integer exploration noise that decays with <i>."""
    u += int(np.random.randn() * 10 / (i / 50 + 10))
    return np.clip(u, 0, NU - 1)
def rendertrial(maxiter=100):
    """Roll out the greedy policy from a reset state for at most <maxiter> steps."""
    x = env.reset()
    for i in range(maxiter):
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})
        x, reward = env.step(u)
        env.render()
# Training loop
h_rwd = []                                     # reward history, one entry per episode

for episode in range(1, NEPISODES):
    x = env.reset()
    rsum = 0.0

    for step in range(NSTEPS - 1):
        # Greedy control from the current Q-values ...
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})[0]
        u = disturb(u, episode)                # ... perturbed by decaying exploration noise
        x2, reward = env.step(u)

        # Q-learning target: Qref[x, u] = reward + DECAY_RATE * max_u' Q(x2, u')
        Q2 = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x2)})
        Qref = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x)})
        Qref[0, u] = reward + DECAY_RATE * np.max(Q2)

        # One gradient-descent step pulling Q(x, .) toward the target
        sess.run(qvalue.optim, feed_dict={qvalue.x: onehot(x), qvalue.qref: Qref})

        rsum += reward
        x = x2

    h_rwd.append(rsum)
    if not episode % 20:
        print(f"Episode #{episode} done with {sum(h_rwd[-20:])} successes")
print(f"Total rate of success: {sum(h_rwd) / NEPISODES:.3f}")
plt.plot(np.cumsum(h_rwd) / np.arange(1, NEPISODES))   # running average of the reward per episode
plt.show()