'''
Deep actor-critic network,
from "Continuous control with deep reinforcement learning", by Lillicrap et al., arXiv:1509.02971.
'''

from pendulum import Pendulum
import tensorflow as tf            # TensorFlow 1.x API (tf.placeholder, tf.InteractiveSession, ...)
import tflearn
import numpy as np
import random
import time
import signal
from collections import deque
import matplotlib.pyplot as plt

### Random seed, shared by numpy, tensorflow and random.
RANDOM_SEED = int((time.time() % 10) * 1000)
print("Seed = %d" % RANDOM_SEED)
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

### Weight initializers: truncated normal for the hidden layers, small uniform
### values for the output layers.
n_init = tflearn.initializations.truncated_normal(seed=RANDOM_SEED)
u_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003,
                                         seed=RANDOM_SEED)   # (continuation restored)

### Learning rates of the two Adam optimizers.
QVALUE_LEARNING_RATE = 0.001
POLICY_LEARNING_RATE = 0.0001
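# --- Assumed context ---------------------------------------------------------
# The excerpt elides the remaining hyperparameters and the environment setup that
# the rest of the listing relies on.  The block below is a minimal sketch: the
# names match the later code, but the values are placeholders in the spirit of
# the DDPG paper, not the numbers of the original listing, and the Pendulum
# constructor and attributes (nobs, nu) are assumptions.
NEPISODES   = 100        # number of training episodes (placeholder)
NSTEPS      = 100        # maximum number of steps per episode (placeholder)
BATCH_SIZE  = 64         # mini-batch size sampled from the replay buffer (placeholder)
REPLAY_SIZE = 10000      # maximum number of transitions kept in the buffer (placeholder)
DECAY_RATE  = 0.99       # discount factor gamma (placeholder)
UPDATE_RATE = 0.01       # soft target-update rate tau (placeholder)
NH1 = NH2   = 250        # width of the hidden layers (placeholder)

env = Pendulum(1)        # single-pendulum environment (assumed constructor)
NX  = env.nobs           # dimension of the observation (assumed attribute)
NU  = env.nu             # dimension of the control (assumed attribute)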

### --- Q-value (critic) network ------------------------------------------------
# (The class and constructor headers are elided in the excerpt; the statements
#  below build the critic inside its constructor.)
nvars = len(tf.trainable_variables())

x = tflearn.input_data(shape=[None, NX])
u = tflearn.input_data(shape=[None, NU])

netx1  = tflearn.fully_connected(x,     NH1, weights_init=n_init, activation='relu')
netx2  = tflearn.fully_connected(netx1, NH2, weights_init=n_init)
netu1  = tflearn.fully_connected(u,     NH1, weights_init=n_init, activation='linear')
netu2  = tflearn.fully_connected(netu1, NH2, weights_init=n_init)
net    = tflearn.activation(netx2 + netu2, activation='relu')
qvalue = tflearn.fully_connected(net, 1, weights_init=u_init)
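# Architecture note: the state x and the control u are first processed by
# separate layers; their second-layer outputs are summed and passed through a
# ReLU before the final one-unit layer that outputs Q(x,u).  The output layer
# uses the small uniform initializer u_init, as recommended in the DDPG paper.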
# (Elided here: assignments storing x, u, qvalue and the freshly created
#  trainable variables as attributes -- self.x, self.u, self.qvalue,
#  self.variables -- which the later code relies on.)
self.hidens = [netx1, netx2, netu1, netu2]            # Keep a handle on the hidden layers

# (Method header elided; the lines below set up the critic's loss, optimizer and
#  the gradient dQ/du used later for the actor update.)
qref     = tf.placeholder(tf.float32, [None, 1])
loss     = tflearn.mean_square(qref, self.qvalue)
optim    = tf.train.AdamOptimizer(QVALUE_LEARNING_RATE).minimize(loss)
gradient = tf.gradients(self.qvalue, self.u)[0] / float(BATCH_SIZE)   # dQ/du, averaged over the batch

# (Method header elided; per the signature setupTargetAssign(self, nominalNet,
#  tau=UPDATE_RATE) given with the listing, the comprehension below builds one
#  update op per network variable.)
[ target.assign( tau*ref + (1-tau)*target )
  for target, ref in zip(self.variables, nominalNet.variables) ]
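# The comprehension above implements the "soft" target update of DDPG,
#     theta_target  <-  tau * theta  +  (1 - tau) * theta_target,
# one assign op per trainable variable.  The resulting list is presumably stored
# in self.update_variables, the attribute run by the training loop below.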

### --- Policy (actor) network ---------------------------------------------------
# (Class and constructor headers elided, as for the critic.)
nvars = len(tf.trainable_variables())

x      = tflearn.input_data(shape=[None, NX])
net    = tflearn.fully_connected(x,   NH1, activation='relu', weights_init=n_init)
net    = tflearn.fully_connected(net, NH2, activation='relu', weights_init=n_init)
policy = tflearn.fully_connected(net, NU,  activation='tanh', weights_init=u_init) * env.umax
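# Architecture note: the actor is a plain two-hidden-layer MLP; the tanh output
# lies in [-1, 1] and is rescaled by env.umax so that the emitted control always
# respects the torque limit of the pendulum.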

# (Method header elided; the lines below presumably set up the actor's update.)
qgradient = tf.placeholder(tf.float32, [None, NU])    # dQ/du, provided by the critic at run time
# (Elided: the variable 'grad' used below, presumably
#  tf.gradients(self.policy, self.variables, -qgradient).)
optim = tf.train.AdamOptimizer(POLICY_LEARNING_RATE).\
    apply_gradients(zip(grad, self.variables))

# (Target-update ops of the actor, same construction as for the critic.)
[ target.assign( tau*ref + (1-tau)*target )
  for target, ref in zip(self.variables, nominalNet.variables) ]

### --- Replay buffer, networks and session --------------------------------------
replayDeque = deque()                                  # Replay memory (FIFO)
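# --- Assumed context ---------------------------------------------------------
# The transitions stored in replayDeque expose the fields .x, .u, .reward, .done
# and .x2 used below; their constructor signature, __init__(self, x, u, r, d, x2),
# is given with the listing but its class is elided.  The four network objects
# (policy, policyTarget, qvalue, qvalueTarget) are likewise created in elided
# lines.  The sketch below is one plausible reconstruction; the class names
# ReplayItem/PolicyNetwork/QValueNetwork, the method name setupOptim and the
# chained "return self" style are assumptions, not the original code.
class ReplayItem:
    def __init__(self, x, u, r, d, x2):
        self.x      = x         # state
        self.u      = u         # control
        self.reward = r         # instantaneous reward
        self.done   = d         # termination flag
        self.x2     = x2        # next state

policy       = PolicyNetwork().setupOptim()                # actor
policyTarget = PolicyNetwork().setupTargetAssign(policy)   # slowly-tracking copy of the actor
qvalue       = QValueNetwork().setupOptim()                # critic
qvalueTarget = QValueNetwork().setupTargetAssign(qvalue)   # slowly-tracking copy of the critic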
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

### --- Evaluation helper ---------------------------------------------------------
def rendertrial(maxiter=NSTEPS, verbose=True):         # (header restored from the signature given with the listing)
    # (Elided: reset of the environment into x and of the cumulated reward rsum.)
    for i in range(maxiter):
        u = sess.run(policy.policy, feed_dict={ policy.x: x.T })
        x, reward = env.step(u)
        # (Elided: rendering of the pendulum and accumulation of rsum.)
    if verbose:
        print('Lasted ', i, ' timestep -- total reward:', rsum)

signal.signal(signal.SIGTSTP, lambda x, y: rendertrial())   # Roll out a trial when CTRL-Z is pressed

### --- Training loop ---------------------------------------------------------------
# (Elided: history buffers, e.g. the list h_rwd of per-episode rewards plotted at
#  the end of the script.)
for episode in range(1, NEPISODES):
    # (Elided: reset of the environment into x and of the cumulated reward rsum.)
    for step in range(NSTEPS):
        u = sess.run(policy.policy, feed_dict={ policy.x: x })   # Greedy action from the actor ...
        u += 1. / (1. + episode + step)                          # ... plus decaying exploration noise
        # (Elided: environment step returning the next state x2 and the reward,
        #  and storage of the transition in the replay buffer.)
        if len(replayDeque) > REPLAY_SIZE: replayDeque.popleft() # Bound the replay memory (FIFO)
        # (Elided: a few bookkeeping lines.)
        if done or np.linalg.norm(x - x2) < 1e-3: break          # Stop the episode if the state is steady

        if len(replayDeque) > BATCH_SIZE:                        # Learn only once the buffer is full enough
            batch = random.sample(replayDeque, BATCH_SIZE)
            x_batch  = np.vstack([b.x      for b in batch])
            u_batch  = np.vstack([b.u      for b in batch])
            r_batch  = np.vstack([b.reward for b in batch])
            d_batch  = np.vstack([b.done   for b in batch])
            x2_batch = np.vstack([b.x2     for b in batch])

            # Compute the TD target with the *target* networks:
            #     qref = r + DECAY_RATE * Q'(x2, pi'(x2))   for non-terminal transitions.
            u2_batch   = sess.run(policyTarget.policy, feed_dict={ policyTarget.x: x2_batch })
            q2_batch   = sess.run(qvalueTarget.qvalue, feed_dict={ qvalueTarget.x: x2_batch,
                                                                   qvalueTarget.u: u2_batch })
            qref_batch = r_batch + (d_batch == False) * (DECAY_RATE * q2_batch)

            # Update the critic by regressing Q(x,u) toward the TD target.
            sess.run(qvalue.optim, feed_dict={ qvalue.x:    x_batch,
                                               qvalue.u:    u_batch,     # (feed elided in the excerpt; u_batch assumed)
                                               qvalue.qref: qref_batch })

            # Update the actor with the deterministic policy gradient.
            u_targ = sess.run(policy.policy,   feed_dict={ policy.x: x_batch })
            qgrad  = sess.run(qvalue.gradient, feed_dict={ qvalue.x: x_batch,
                                                           qvalue.u: u_targ })   # (feed elided in the excerpt; u_targ assumed)
            sess.run(policy.optim, feed_dict={ policy.x:         x_batch,
                                               policy.qgradient: qgrad })
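            # Together, the three runs above implement the deterministic policy
            # gradient: the critic provides dQ/du at the actions proposed by the
            # current actor (u_targ), and apply_gradients moves the actor's weights
            # in the direction that increases Q (the gradient fed to the optimizer
            # is presumably negated for that purpose in the elided definition of
            # 'grad').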

            # Softly track the nominal networks with the target networks.
            sess.run(policyTarget.update_variables)
            sess.run(qvalueTarget.update_variables)

    # (Elided: per-episode logging, e.g. h_rwd.append(rsum).)
    maxq = np.max(sess.run(qvalue.qvalue, feed_dict={ qvalue.x: x_batch,
                                                      qvalue.u: u_batch })) \
        if 'x_batch' in locals() else 0
    print('Ep#{:3d}: lasted {:d} steps, reward={:3.0f}, max qvalue={:2.3f}'
          .format(episode, step, rsum, maxq))

# Final statistics and learning curve.
print("Average reward during trials: %.3f" % (sum(h_rwd) / NEPISODES))
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))
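# Assumption: depending on the matplotlib backend, an explicit blocking call may
# be needed to actually display the learning curve.
plt.show()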

# [Listing: Q-value and policy networks.  Signatures given with the original
#  listing for definitions elided from the excerpt:
#      def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE)
#      def rendertrial(maxiter=NSTEPS, verbose=True)
#      def __init__(self, x, u, r, d, x2)   (presumably the replay-buffer item)]