"""
Example of Q-table learning with a simple discretized 1-pendulum environment,
using a linear Q-network.
"""

import time

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from dpendulum import DPendulum

# Random seed, drawn from the clock; printing it allows a run to be reproduced.
RANDOM_SEED = int((time.time() % 10) * 1000)
print(f"Seed = {RANDOM_SEED}")
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
 
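# The excerpt below relies on an environment instance and a few constants defined
# in a part of the script not shown here. A minimal sketch, assuming plausible
# values; only the names are taken from the code that follows:
env = DPendulum()    # discretized single pendulum environment
NX = env.nx          # number of discrete states (attribute name assumed)
NU = env.nu          # number of discrete controls (attribute name assumed)
NEPISODES = 500      # number of training episodes (value assumed)
NSTEPS = 50          # maximum steps per episode (value assumed)
LEARNING_RATE = 0.1  # SGD step size for the Q-network (value assumed)
DECAY_RATE = 0.99    # discount factor gamma (value assumed)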
# Q-network: the one-hot encoded discrete state is mapped to one Q-value per
# discrete control by a single linear layer. In the full script these ops are
# gathered into a small network object; see the sketch further below.
x = tf.placeholder(shape=[1, NX], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([NX, NU], 0, 0.01, seed=100))
qvalue = tf.matmul(x, W)
u = tf.argmax(qvalue, 1)

# Training target and loss: squared error between the reference Q-row and the
# network output, minimized by plain gradient descent.
qref = tf.placeholder(shape=[1, NU], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(qref - qvalue))
optim = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(loss)
 
tf.reset_default_graph()
 
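# The training loop below accesses the network through attributes (qvalue.x,
# qvalue.u, qvalue.qvalue, qvalue.qref, qvalue.optim), so the ops above are
# presumably wrapped in a container object. A minimal sketch of such a wrapper;
# the class name and its exact structure are assumptions, only the attribute
# names come from the excerpt:
class QValueNetwork:
    def __init__(self):
        self.x = tf.placeholder(shape=[1, NX], dtype=tf.float32)
        self.W = tf.Variable(tf.random_uniform([NX, NU], 0, 0.01, seed=100))
        self.qvalue = tf.matmul(self.x, self.W)
        self.u = tf.argmax(self.qvalue, 1)
        self.qref = tf.placeholder(shape=[1, NU], dtype=tf.float32)
        self.loss = tf.reduce_sum(tf.square(self.qref - self.qvalue))
        self.optim = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(self.loss)

qvalue = QValueNetwork()  # instance used by the training loop below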
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
 
def onehot(ix, n=NX):
    """Return a 1 x n vector which is 0 everywhere except index <ix> set to 1."""
    return np.array(
        [
            [(i == ix) for i in range(n)],
        ],
        dtype=np.float64,
    )
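# Quick sanity check of the encoding (illustrative only, not in the original script).
assert (onehot(2, 5) == np.array([[0.0, 0.0, 1.0, 0.0, 0.0]])).all()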
 
# Exploration noise: perturb the chosen action by a random offset whose magnitude
# decays as the iteration count i grows, then clip to the valid control range.
# (Only the body is shown in the excerpt; the header below is inferred from it.)
def disturb(u, i):
    u += int(np.random.randn() * 10 / (i / 50 + 10))
    return np.clip(u, 0, NU - 1)
 
# Fragment of the roll-out helper used to replay the greedy policy (its header
# and the environment reset/step/render calls are not shown in this excerpt;
# a complete, hedged version is sketched right after):
    for i in range(maxiter):
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})
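# A complete version of such a roll-out helper might look as follows. The name
# rendertrial, the default argument, and the env.reset()/env.render() calls are
# assumptions; only the greedy action selection is taken from the excerpt.
def rendertrial(maxiter=NSTEPS):
    x = env.reset()
    for i in range(maxiter):
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})[0]
        x, reward = env.step(u)
        env.render()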
 
# Training loop. Episode initialisation, the exploration step, the state update
# x <- x2, and the per-episode reward history h_rwd are elided in this excerpt.
for episode in range(1, NEPISODES):
    # ... (episode initialisation elided)
    for step in range(NSTEPS - 1):
        # Greedy action from the current Q-network.
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})[0]
        # ... (exploration noise elided)
        x2, reward = env.step(u)

        # TD target: keep the current Q-row and replace the entry of the chosen
        # action by reward + DECAY_RATE * max_u' Q(x2, u').
        Q2 = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x2)})
        Qref = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x)})
        Qref[0, u] = reward + DECAY_RATE * np.max(Q2)

        # One gradient-descent step of the Q-network toward the target.
        sess.run(qvalue.optim, feed_dict={qvalue.x: onehot(x), qvalue.qref: Qref})
        # ... (state update and reward bookkeeping elided)

    # ... (printed every few episodes in the full script)
    print(f"Episode #{episode} done, with {sum(h_rwd[-20:])} successes in the last 20 episodes")
 
print(f"Total rate of success: {sum(h_rwd) / NEPISODES:.3f}")

# Running average of the per-episode reward, plotted against the episode index.
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))
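# Possible closing of the script (not part of the excerpt): label and display the
# learning curve, then replay the greedy policy with the roll-out helper sketched
# above. plt.show() and the rendertrial() call are assumptions.
plt.xlabel("Episode")
plt.ylabel("Running average of episode reward")
plt.show()

rendertrial()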