"""
Example of Q-table learning with a simple discretized 1-pendulum environment,
using a linear Q-network in place of the table.
"""
import time

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf  # TF1-style API (placeholders, sessions)

from dpendulum import DPendulum
RANDOM_SEED = int((time.time() % 10) * 1000)
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
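# --- Environment and hyper-parameters. The numeric values below are illustrative
# placeholders (assumed, not given in this excerpt) and can be tuned.
env = DPendulum()        # discretized 1-pendulum environment
NX = env.nx              # number of discretized states
NU = env.nu              # number of discretized controls
NEPISODES = 500          # number of training episodes (assumed)
NSTEPS = 50              # maximum number of steps per episode (assumed)
LEARNING_RATE = 0.1      # SGD step length (assumed)
DECAY_RATE = 0.99        # discount factor (assumed)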
class QValueNetwork:
    """Linear Q-network: Q(x, .) = x @ W, with a greedy-action output and an SGD update step."""

    def __init__(self):
        self.x = tf.placeholder(shape=[1, NX], dtype=tf.float32)             # one-hot encoded state
        self.W = tf.Variable(tf.random_uniform([NX, NU], 0, 0.01, seed=100))
        self.qvalue = tf.matmul(self.x, self.W)                              # Q-values of all controls at x
        self.u = tf.argmax(self.qvalue, 1)                                   # greedy control
        self.qref = tf.placeholder(shape=[1, NU], dtype=tf.float32)          # target Q-values
        self.loss = tf.reduce_sum(tf.square(self.qref - self.qvalue))
        self.optim = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(self.loss)
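# Because x is a one-hot encoding of the discrete state, tf.matmul(x, W) simply selects
# one row of W: the weight matrix W is the Q-table itself (one row per state, one column
# per control), and each gradient step effectively updates the single entry W[state, control].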
tf.reset_default_graph()
qvalue = QValueNetwork()                # build the Q-network graph
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
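# Illustrative check (not part of the original script): the whole "Q-table" can be read
# back as the weight matrix, one row per discretized state.
#   W0 = sess.run(qvalue.W)   # shape (NX, NU), initialized close to zero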
59 """Return a vector which is 0 everywhere except index <i> set to 1."""
62 [(i == ix)
for i
in range(n)],
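# For example, onehot(2, 5) evaluates to array([[0., 0., 1., 0., 0.]], dtype=float32),
# matching the [1, NX] shape expected by the x placeholder.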
def disturb(u, i):
    """Add decaying exploration noise to the control index u, then clip it to a valid range."""
    u += int(np.random.randn() * 10 / (i / 50 + 10))
    return np.clip(u, 0, NU - 1)
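# The noise magnitude 10 / (i / 50 + 10) is roughly one discretization step during the
# first iterations and decays toward zero, gradually shifting from exploration to exploitation.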
def rendertrial(maxiter=100):
    """Roll out the greedy policy from a fresh initial state, rendering every step."""
    x = env.reset()
    for i in range(maxiter):
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})[0]
        x, reward = env.step(u)
        env.render()
h_rwd = []    # per-episode accumulated reward (training history)

for episode in range(1, NEPISODES):
    x = env.reset()
    rsum = 0.0

    for step in range(NSTEPS - 1):
        # Greedy control for the current state, perturbed by exploration noise.
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})[0]
        u = disturb(u, episode)
        x2, reward = env.step(u)

        # Q-learning target: keep the network's own prediction for every control,
        # except the one actually taken, whose target is reward + DECAY_RATE * max Q(x2, .).
        Q2 = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x2)})
        Qref = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x)})
        Qref[0, u] = reward + DECAY_RATE * np.max(Q2)

        # One gradient step pulling Q(x, .) toward the target.
        sess.run(qvalue.optim, feed_dict={qvalue.x: onehot(x), qvalue.qref: Qref})

        rsum += reward
        x = x2
    h_rwd.append(rsum)
    if not episode % 20:
        print("Episode #%d done with %d successes" % (episode, sum(h_rwd[-20:])))
print("Total rate of success: %.3f" % (sum(h_rwd) / NEPISODES))
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))   # running average of the episode reward
plt.show()
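# Optionally, replay the learned greedy policy in the viewer (relies on the rendertrial()
# helper above and on the pendulum display being available):
#   rendertrial()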