'''
Deep actor-critic network,
from "Continuous control with deep reinforcement learning",
by Lillicrap et al., arXiv:1509.02971.
'''
import random
import time
from collections import deque

import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import tflearn

from pendulum import Pendulum
RANDOM_SEED = int((time.time() % 10) * 1000)
print(f"Seed = {RANDOM_SEED}")
np.random.seed(RANDOM_SEED)
tf.set_random_seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
n_init = tflearn.initializations.truncated_normal(seed=RANDOM_SEED)
u_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003, seed=RANDOM_SEED)
QVALUE_LEARNING_RATE = 0.001   # Learning rate of the Q-value network
POLICY_LEARNING_RATE = 0.0001  # Learning rate of the policy network
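# The constants below are used further down but are not defined in the lines above;
# the values are assumptions chosen for illustration, only the names are taken from
# how they are used in the rest of the script.
NEPISODES = 100      # Number of training episodes (assumed)
NSTEPS = 100         # Maximum number of steps per episode (assumed)
DECAY_RATE = 0.99    # Discount factor gamma (assumed)
UPDATE_RATE = 0.01   # Homotopy rate tau of the target-network updates (assumed)
REPLAY_SIZE = 10000  # Capacity of the replay memory (assumed)
BATCH_SIZE = 64      # Mini-batch size of the stochastic-gradient steps (assumed)
NH1 = NH2 = 250      # Sizes of the two hidden layers (assumed)

env = Pendulum()     # Pendulum environment; constructor arguments assumed
NX = 3               # State dimension, assumed to match the environment observation
NU = 1               # Control dimension, assumed to match the environment input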
class QValueNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        u = tflearn.input_data(shape=[None, NU])

        netx1 = tflearn.fully_connected(x, NH1, weights_init=n_init, activation="relu")
        netx2 = tflearn.fully_connected(netx1, NH2, weights_init=n_init)
        netu1 = tflearn.fully_connected(u, NH1, weights_init=n_init, activation="linear")
        netu2 = tflearn.fully_connected(netu1, NH2, weights_init=n_init)
        net = tflearn.activation(netx2 + netu2, activation="relu")
        qvalue = tflearn.fully_connected(net, 1, weights_init=u_init)

        self.x = x                                          # Network input <x> of Q(x, u)
        self.u = u                                          # Network input <u> of Q(x, u)
        self.qvalue = qvalue                                # Network output Q(x, u)
        self.variables = tf.trainable_variables()[nvars:]   # Variables to be trained
        self.hidens = [netx1, netx2, netu1, netu2]          # Hidden layers, kept for debug

    def setupOptim(self):
        qref = tf.placeholder(tf.float32, [None, 1])
        loss = tflearn.mean_square(qref, self.qvalue)
        optim = tf.train.AdamOptimizer(QVALUE_LEARNING_RATE).minimize(loss)
        gradient = tf.gradients(self.qvalue, self.u)[0] / float(BATCH_SIZE)

        self.qref = qref          # Target Q-values for the regression loss
        self.optim = optim        # Optimizer step on the critic weights
        self.gradient = gradient  # dQ/du, consumed by the policy update
        return self
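    # The loss above is the mean-squared Bellman error ||Q(x, u) - qref||^2, with qref
    # computed from the target networks in the training loop below.  The <gradient>
    # tensor is dQ/du for each sample, scaled by 1/BATCH_SIZE so that the policy update
    # averages over the batch; it is the term that the deterministic policy gradient
    # chains with dpi/dtheta to improve the actor (see Lillicrap et al., Algorithm 1).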
    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self
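# Both the critic and the actor keep a "target" copy that slowly tracks the learned
# network through the soft update above: theta_target <- tau*theta + (1-tau)*theta_target,
# with tau = UPDATE_RATE << 1.  Keeping the bootstrap targets nearly constant between
# updates is what stabilizes the temporal-difference regression (Lillicrap et al.).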
class PolicyNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        net = tflearn.fully_connected(x, NH1, activation="relu", weights_init=n_init)
        net = tflearn.fully_connected(net, NH2, activation="relu", weights_init=n_init)
        policy = tflearn.fully_connected(net, NU, activation="tanh", weights_init=u_init)

        self.x = x                                          # Network input <x> of pi(x)
        self.policy = policy                                # Network output pi(x)
        self.variables = tf.trainable_variables()[nvars:]   # Variables to be trained

    def setupOptim(self):
        qgradient = tf.placeholder(tf.float32, [None, NU])
        grad = tf.gradients(self.policy, self.variables, -qgradient)
        optim = tf.train.AdamOptimizer(POLICY_LEARNING_RATE).apply_gradients(
            zip(grad, self.variables)
        )

        self.qgradient = qgradient  # dQ/du placeholder, fed from the critic
        self.optim = optim          # Optimizer step on the actor weights
        return self
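    # Deterministic policy gradient: the training loop feeds <qgradient> with dQ/du
    # evaluated at u = pi(x), and tf.gradients(..., -qgradient) back-propagates it
    # through the actor, so that Adam effectively performs gradient ascent on Q(x, pi(x)).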
    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self
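# The training loop stores transitions as ReplayItem(x, u, r, done, x2) and reads back
# the fields .x, .u, .reward, .done and .x2.  The original definition is not shown here,
# so this is a minimal container consistent with that usage.
class ReplayItem:
    def __init__(self, x, u, r, d, x2):
        self.x = x        # State
        self.u = u        # Control
        self.reward = r   # Instantaneous reward
        self.done = d     # Termination flag
        self.x2 = x2      # Next state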
replayDeque = deque()
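# The actor, the critic and their target copies are built next.  Their exact
# construction is not shown above, so this is a sketch that matches how policy,
# policyTarget, qvalue and qvalueTarget are used in the training loop below.
policy = PolicyNetwork().setupOptim()
policyTarget = PolicyNetwork().setupTargetAssign(policy)
qvalue = QValueNetwork().setupOptim()
qvalueTarget = QValueNetwork().setupTargetAssign(qvalue)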
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

h_rwd = []  # History of the cumulated reward of each episode (plotted at the end)
def rendertrial(maxiter=NSTEPS, verbose=True):
    # Roll out the current policy greedily (no exploration noise) and display it.
    x = env.reset()
    rsum = 0.0
    for i in range(maxiter):
        u = sess.run(policy.policy, feed_dict={policy.x: x.T})
        x, reward = env.step(u)
        env.render()
        rsum += reward
    if verbose:
        print("Lasted ", i, " timestep -- total reward:", rsum)
for episode in range(1, NEPISODES):
    x = env.reset().T
    rsum = 0.0

    for step in range(NSTEPS):
        u = sess.run(policy.policy, feed_dict={policy.x: x})  # Greedy action from the actor ...
        u += 1.0 / (1.0 + episode + step)  # ... plus a decaying exploration perturbation
        # (the paper uses Ornstein-Uhlenbeck noise; this script uses a simple 1/t term)
        x2, r = env.step(u)
        x2 = x2.T
        done = False  # The pendulum task has no terminal state
        replayDeque.append(ReplayItem(x, u, r, done, x2))  # Feed the replay memory ...
        if len(replayDeque) > REPLAY_SIZE:
            replayDeque.popleft()  # ... with first-in-first-out forgetting
        rsum += r
        if done or np.linalg.norm(x - x2) < 1e-3:
            break  # End the episode once the pendulum is (almost) at rest
        x = x2
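        # One DDPG update per environment step (Lillicrap et al., Algorithm 1):
        #   1. sample a mini-batch of transitions (x, u, r, x2) from the replay memory;
        #   2. form the TD target qref = r + DECAY_RATE * Q'(x2, pi'(x2)) with the target nets;
        #   3. regress the critic Q toward qref;
        #   4. push the actor along dQ/du (deterministic policy gradient);
        #   5. softly update the target networks.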
        if len(replayDeque) > BATCH_SIZE:  # Do not optimize until the memory holds a full batch
            batch = random.sample(replayDeque, BATCH_SIZE)  # Uniform sampling decorrelates transitions
            x_batch = np.vstack([b.x for b in batch])
            u_batch = np.vstack([b.u for b in batch])
            r_batch = np.vstack([b.reward for b in batch])
            d_batch = np.vstack([b.done for b in batch])
            x2_batch = np.vstack([b.x2 for b in batch])

            # Value of the next state, estimated with the target actor and target critic.
            u2_batch = sess.run(policyTarget.policy, feed_dict={policyTarget.x: x2_batch})
            q2_batch = sess.run(
                qvalueTarget.qvalue,
                feed_dict={qvalueTarget.x: x2_batch, qvalueTarget.u: u2_batch},
            )
            # TD target; the bootstrap term is masked out for terminal transitions.
            qref_batch = r_batch + (1 - d_batch) * (DECAY_RATE * q2_batch)

            # Regress the critic toward the TD targets.
            sess.run(
                qvalue.optim,
                feed_dict={
                    qvalue.x: x_batch,
                    qvalue.u: u_batch,
                    qvalue.qref: qref_batch,
                },
            )

            # Evaluate dQ/du at u = pi(x) ...
            u_targ = sess.run(policy.policy, feed_dict={policy.x: x_batch})
            qgrad = sess.run(
                qvalue.gradient, feed_dict={qvalue.x: x_batch, qvalue.u: u_targ}
            )
            # ... and take one ascent step on Q(x, pi(x)) through the actor weights.
            sess.run(
                policy.optim, feed_dict={policy.x: x_batch, policy.qgradient: qgrad}
            )

            # Soft update of the target networks.
            sess.run(policyTarget.update_variables)
            sess.run(qvalueTarget.update_variables)
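            # Note that dQ/du is evaluated at the current policy action u_targ = pi(x),
            # not at the replayed action u_batch: the actor is improved against the
            # critic's current estimate, as in Algorithm 1 of the paper.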
    # End-of-episode logging.
    maxq = (
        np.max(sess.run(qvalue.qvalue, feed_dict={qvalue.x: x_batch, qvalue.u: u_batch}))
        if "x_batch" in locals()
        else 0.0
    )
    print(
        f"Ep#{episode:3d}: lasted {step:d} steps, "
        f"reward={rsum:3.0f}, max qvalue={maxq:2.3f}"
    )
    h_rwd.append(rsum)
    if not (episode + 1) % 20:
        rendertrial()  # Show a roll-out of the current policy every 20 episodes

print(f"Average reward during trials: {sum(h_rwd) / NEPISODES:.3f}")
# Plot the running average of the per-episode reward.
plt.plot(np.cumsum(h_rwd) / np.arange(1, NEPISODES))
plt.show()