continuous.py
1 """
2 Deep actor-critic network,
3 From "Continuous control with deep reinforcement learning",
4 by Lillicrap et al, arXiv:1509.02971
5 """
6 
7 import random
8 import signal
9 import time
10 from collections import deque
11 
12 import matplotlib.pyplot as plt
13 import numpy as np
14 import tensorflow as tf
15 import tflearn
16 from pendulum import Pendulum
17 
18 # --- Random seed
19 RANDOM_SEED = int((time.time() % 10) * 1000)
20 print(f"Seed = {RANDOM_SEED}")
21 np.random.seed(RANDOM_SEED)
22 tf.set_random_seed(RANDOM_SEED)
23 random.seed(RANDOM_SEED)
24 n_init = tflearn.initializations.truncated_normal(seed=RANDOM_SEED)
25 u_init = tflearn.initializations.uniform(minval=-0.003, maxval=0.003, seed=RANDOM_SEED)
26 
# --- Hyper parameters
NEPISODES = 100  # Max training episodes
NSTEPS = 100  # Max episode length
QVALUE_LEARNING_RATE = 0.001  # Base learning rate for the Q-value network
POLICY_LEARNING_RATE = 0.0001  # Base learning rate for the policy network
DECAY_RATE = 0.99  # Discount factor
UPDATE_RATE = 0.01  # Homotopy rate to update the target networks
REPLAY_SIZE = 10000  # Size of the replay buffer
BATCH_SIZE = 64  # Number of samples per stochastic-gradient minibatch
NH1 = NH2 = 250  # Hidden layer size

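# In the notation of the DDPG paper: DECAY_RATE is the discount factor gamma,
# UPDATE_RATE is the soft target-update coefficient tau, and each gradient step
# uses a minibatch of BATCH_SIZE transitions drawn from a replay buffer holding
# at most REPLAY_SIZE elements.
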
# --- Environment
env = Pendulum(1)  # Continuous pendulum
env.withSinCos = True  # State is dim-3: (cos q, sin q, qdot) ...
NX = env.nobs  # ... training converges with q,qdot with 2x more neurons.
NU = env.nu  # Control is dim-1: joint torque

# --- Q-value and policy networks


class QValueNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        u = tflearn.input_data(shape=[None, NU])

        netx1 = tflearn.fully_connected(x, NH1, weights_init=n_init, activation="relu")
        netx2 = tflearn.fully_connected(netx1, NH2, weights_init=n_init)
        netu1 = tflearn.fully_connected(
            u, NH1, weights_init=n_init, activation="linear"
        )
        netu2 = tflearn.fully_connected(netu1, NH2, weights_init=n_init)
        net = tflearn.activation(netx2 + netu2, activation="relu")
        qvalue = tflearn.fully_connected(net, 1, weights_init=u_init)

        self.x = x  # Network state <x> input in Q(x,u)
        self.u = u  # Network control <u> input in Q(x,u)
        self.qvalue = qvalue  # Network output <Q>
        # Variables to be trained
        self.variables = tf.trainable_variables()[nvars:]
        self.hidens = [netx1, netx2, netu1, netu2]  # Hidden layers for debug

    def setupOptim(self):
        qref = tf.placeholder(tf.float32, [None, 1])
        loss = tflearn.mean_square(qref, self.qvalue)
        optim = tf.train.AdamOptimizer(QVALUE_LEARNING_RATE).minimize(loss)
        gradient = tf.gradients(self.qvalue, self.u)[0] / float(BATCH_SIZE)

        self.qref = qref  # Reference Q-values
        self.optim = optim  # Optimizer
        # Gradient of Q wrt the control dQ/du (for policy training)
        self.gradient = gradient
        return self

    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self


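# The critic above approximates Q(x, u): state and control are embedded
# separately into NH2-dimensional features, summed, passed through a ReLU, and
# mapped to a scalar by a last layer initialized with small uniform weights
# (u_init). setupTargetAssign builds the soft update used for the target
# critic,
#     theta_target <- tau * theta + (1 - tau) * theta_target,
# with tau = UPDATE_RATE.

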
class PolicyNetwork:
    def __init__(self):
        nvars = len(tf.trainable_variables())

        x = tflearn.input_data(shape=[None, NX])
        net = tflearn.fully_connected(x, NH1, activation="relu", weights_init=n_init)
        net = tflearn.fully_connected(net, NH2, activation="relu", weights_init=n_init)
        policy = (
            tflearn.fully_connected(net, NU, activation="tanh", weights_init=u_init)
            * env.umax
        )

        self.x = x  # Network input <x> in Pi(x)
        self.policy = policy  # Network output <Pi>
        # Variables to be trained
        self.variables = tf.trainable_variables()[nvars:]

    def setupOptim(self):
        qgradient = tf.placeholder(tf.float32, [None, NU])
        grad = tf.gradients(self.policy, self.variables, -qgradient)
        optim = tf.train.AdamOptimizer(POLICY_LEARNING_RATE).apply_gradients(
            zip(grad, self.variables)
        )

        # Q-value gradient wrt control (input value)
        self.qgradient = qgradient
        self.optim = optim  # Optimizer
        return self

    def setupTargetAssign(self, nominalNet, tau=UPDATE_RATE):
        self.update_variables = [
            target.assign(tau * ref + (1 - tau) * target)
            for target, ref in zip(self.variables, nominalNet.variables)
        ]
        return self


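# The actor above approximates pi(x): two ReLU layers and a tanh output scaled
# by env.umax, so the commanded torque stays within the actuation bounds.
# setupOptim implements the deterministic policy gradient: tf.gradients
# back-propagates -dQ/du (fed through the qgradient placeholder) into the
# policy variables, so the Adam step ascends the batch-averaged Q(x, pi(x)).

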
# --- Replay memory
class ReplayItem:
    def __init__(self, x, u, r, d, x2):
        self.x = x
        self.u = u
        self.reward = r
        self.done = d
        self.x2 = x2


replayDeque = deque()

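# Each ReplayItem stores one transition (x, u, r, done, x2). Minibatches are
# resampled uniformly from this deque during training, which decorrelates
# consecutive simulator steps before they reach the gradient updates.
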
# --- TensorFlow initialization

policy = PolicyNetwork().setupOptim()
policyTarget = PolicyNetwork().setupTargetAssign(policy)

qvalue = QValueNetwork().setupOptim()
qvalueTarget = QValueNetwork().setupTargetAssign(qvalue)

sess = tf.InteractiveSession()
tf.global_variables_initializer().run()

# Uncomment to save or restore networks
# tf.train.Saver().restore(sess, "netvalues/actorcritic.pre.ckpt")
# tf.train.Saver().save(sess, "netvalues/actorcritic.full.ckpt")


def rendertrial(maxiter=NSTEPS, verbose=True):
    x = env.reset()
    rsum = 0.0
    for i in range(maxiter):
        u = sess.run(policy.policy, feed_dict={policy.x: x.T})
        x, reward = env.step(u)
        env.render()
        time.sleep(1e-2)
        rsum += reward
    if verbose:
        print("Lasted ", i, " timestep -- total reward:", rsum)

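# rendertrial rolls out the current greedy policy (no exploration noise) for at
# most maxiter steps while rendering; env.reset() returns a column vector,
# hence the transpose x.T when feeding the network.
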

signal.signal(
    signal.SIGTSTP, lambda x, y: rendertrial()
)  # Roll-out when CTRL-Z is pressed

# History of search
h_rwd = []
h_qva = []
h_ste = []

# --- Training
for episode in range(1, NEPISODES):
    x = env.reset().T
    rsum = 0.0

    for step in range(NSTEPS):
        # Greedy policy ...
        u = sess.run(policy.policy, feed_dict={policy.x: x})
        u += 1.0 / (1.0 + episode + step)  # ... with noise
        x2, r = env.step(u)
        x2 = x2.T
        done = False  # pendulum scenario is endless.

        # Feed replay memory ...
        replayDeque.append(ReplayItem(x, u, r, done, x2))
        if len(replayDeque) > REPLAY_SIZE:
            replayDeque.popleft()  # ... with FIFO forgetting.

        rsum += r
        if done or np.linalg.norm(x - x2) < 1e-3:
            break  # Break when pendulum is still.
        x = x2

        # Start optimizing networks when memory size > batch size.
        if len(replayDeque) > BATCH_SIZE:
            batch = random.sample(
                replayDeque, BATCH_SIZE
            )  # Random batch from replay memory.
            x_batch = np.vstack([b.x for b in batch])
            u_batch = np.vstack([b.u for b in batch])
            r_batch = np.vstack([b.reward for b in batch])
            d_batch = np.vstack([b.done for b in batch])
            x2_batch = np.vstack([b.x2 for b in batch])

            # Compute Q(x,u) from target network
            u2_batch = sess.run(
                policyTarget.policy, feed_dict={policyTarget.x: x2_batch}
            )
            q2_batch = sess.run(
                qvalueTarget.qvalue,
                feed_dict={qvalueTarget.x: x2_batch, qvalueTarget.u: u2_batch},
            )
            # ~d_batch zeroes the bootstrap term on terminal samples
            # (element-wise negation; `not` does not apply to a numpy array).
            qref_batch = r_batch + (~d_batch) * (DECAY_RATE * q2_batch)

            # Update qvalue to solve HJB constraint: q = r + q'
            sess.run(
                qvalue.optim,
                feed_dict={
                    qvalue.x: x_batch,
                    qvalue.u: u_batch,
                    qvalue.qref: qref_batch,
                },
            )

            # Compute approximate policy gradient ...
            u_targ = sess.run(policy.policy, feed_dict={policy.x: x_batch})
            qgrad = sess.run(
                qvalue.gradient, feed_dict={qvalue.x: x_batch, qvalue.u: u_targ}
            )
            # ... and take an optimization step along this gradient.
            sess.run(
                policy.optim, feed_dict={policy.x: x_batch, policy.qgradient: qgrad}
            )

            # Update target networks by homotopy.
            sess.run(policyTarget.update_variables)
            sess.run(qvalueTarget.update_variables)

    # \\\END_FOR step in range(NSTEPS)
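    # Schematically, one pass of the block above performs the DDPG updates of
    # Lillicrap et al.:
    #   critic : regress Q(x, u) toward r + gamma * Q'(x2, pi'(x2))
    #   actor  : ascend Q along mean_batch( dQ/du |u=pi(x) * dpi/dtheta )
    #   targets: soft update of Q' and pi' at rate UPDATE_RATE.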

    # Display and logging (not mandatory).
    maxq = (
        np.max(
            sess.run(qvalue.qvalue, feed_dict={qvalue.x: x_batch, qvalue.u: u_batch})
        )
        if "x_batch" in locals()
        else 0
    )
    print(
        f"Ep#{episode:3d}: lasted {step:d} steps, "
        f"reward={rsum:3.0f}, max qvalue={maxq:2.3f}"
    )
    h_rwd.append(rsum)
    h_qva.append(maxq)
    h_ste.append(step)
    if not (episode + 1) % 20:
        rendertrial(100)

# \\\END_FOR episode in range(NEPISODES)

print(f"Average reward during trials: {sum(h_rwd) / NEPISODES:.3f}")
rendertrial()
plt.plot(np.cumsum(h_rwd) / range(1, NEPISODES))
plt.show()
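
# h_qva and h_ste are recorded above but not plotted; an optional extra figure
# could display them, e.g. (sketch):
# plt.figure()
# plt.plot(h_qva)
# plt.xlabel("episode")
# plt.ylabel("max Q-value on the last minibatch")
# plt.show()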