'''
Example of Q-table learning with a simple discretized 1-pendulum environment
using a linear Q network.
'''

import time
import signal
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from dpendulum import DPendulum
# Random seed, derived from the wall clock so that every run is different.
RANDOM_SEED = int((time.time() % 10) * 1000)
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
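
# Hyperparameters and environment setup. This block is missing from the listing;
# the numeric values below are plausible placeholders, not the original settings,
# and DPendulum is assumed to expose the sizes of its discretized state and
# control spaces as nx and nu.
NEPISODES     = 500     # Number of training episodes (assumed)
NSTEPS        = 50      # Maximum length of an episode (assumed)
LEARNING_RATE = 0.1     # Gradient-descent step length (assumed)
DECAY_RATE    = 0.99    # Discount factor (assumed)

env = DPendulum()       # Discretized 1-pendulum environment
NX  = env.nx            # Number of discrete states
NU  = env.nu            # Number of discrete controls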
# Linear Q network: Q(x, .) = x' W, with x a one-hot encoding of the state.
# The tensors are stored as attributes because the rest of the script accesses
# them as qvalue.x, qvalue.u, qvalue.qvalue, qvalue.qref and qvalue.optim.
class QValueNetwork:
    def __init__(self):
        self.x      = tf.placeholder(shape=[1, NX], dtype=tf.float32)             # Network input (one-hot state)
        self.W      = tf.Variable(tf.random_uniform([NX, NU], 0, 0.01, seed=100))
        self.qvalue = tf.matmul(self.x, self.W)                                   # Q-values of all controls at x
        self.u      = tf.argmax(self.qvalue, 1)                                   # Greedy policy at x
        self.qref   = tf.placeholder(shape=[1, NU], dtype=tf.float32)             # Reference Q-values (training target)
        self.loss   = tf.reduce_sum(tf.square(self.qref - self.qvalue))
        self.optim  = tf.train.GradientDescentOptimizer(LEARNING_RATE).minimize(self.loss)
# TensorFlow initialization.
tf.reset_default_graph()
qvalue = QValueNetwork()
sess   = tf.InteractiveSession()
tf.global_variables_initializer().run()
def onehot(ix, n=NX):
    '''Return a 1 x n vector that is 0 everywhere except index <ix>, set to 1.'''
    return np.array([[(i == ix) for i in range(n)]], np.float64)
def disturb(u, i):
    '''Add exploration noise (decaying with the episode index i) to the control.'''
    u += int(np.random.randn() * 10 / (i / 50 + 10))
    return np.clip(u, 0, NU - 1)
def rendertrial(maxiter=100):
    '''Roll out the greedy policy from a fresh start and render it.'''
    x = env.reset()
    for i in range(maxiter):
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})
        x, r = env.step(u)
        env.render()
        if r == 1:
            print('Reward!')
            break

# Render a roll-out of the current policy whenever CTRL-Z (SIGTSTP) is pressed.
signal.signal(signal.SIGTSTP, lambda sig, frame: rendertrial())
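
# Training. The initialization of the reward history is not shown in the listing;
# it is reconstructed here because h_rwd is used below for the progress reports
# and the final plot.
h_rwd = []    # Accumulated reward of each episode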
for episode in range(1, NEPISODES):
    x    = env.reset()
    rsum = 0.0    # Reward accumulated over the episode
    for step in range(NSTEPS - 1):
        u = sess.run(qvalue.u, feed_dict={qvalue.x: onehot(x)})[0]   # Greedy control ...
        u = disturb(u, episode)                                      # ... with exploration noise
        x2, reward = env.step(u)
        # Bellman target: Q(x,u) should match reward + DECAY_RATE * max_u' Q(x2,u').
        Q2   = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x2)})
        Qref = sess.run(qvalue.qvalue, feed_dict={qvalue.x: onehot(x)})
        Qref[0, u] = reward + DECAY_RATE * np.max(Q2)
        # One gradient step pulling Q(x, .) toward the reference values.
        sess.run(qvalue.optim, feed_dict={qvalue.x: onehot(x),
                                          qvalue.qref: Qref})
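
        # End-of-step bookkeeping, not shown in the listing; reconstructed so the
        # loop actually advances the state and records the episode's reward.
        rsum += reward
        x = x2
        if reward == 1:
            break

    h_rwd.append(rsum)    # Record the accumulated reward of the episode (reconstructed)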
    if not episode % 20:
        print('Episode #%d done with %d successes' % (episode, sum(h_rwd[-20:])))
print("Total rate of success: %.3f" % (sum(h_rwd) / NEPISODES))
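# A final demonstration roll-out of the learned policy; the listing does not show
# this call, so it is an optional addition.
rendertrial()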
plt.plot(np.cumsum(h_rwd) / np.arange(1, NEPISODES))   # Running average of the success rate
plt.show()