cpu_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # Software License Agreement (BSD License)
4 #
5 # Copyright (c) 2009, Willow Garage, Inc.
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions
10 # are met:
11 #
12 # * Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # * Redistributions in binary form must reproduce the above
15 # copyright notice, this list of conditions and the following
16 # disclaimer in the documentation and/or other materials provided
17 # with the distribution.
18 # * Neither the name of the Willow Garage nor the names of its
19 # contributors may be used to endorse or promote products derived
20 # from this software without specific prior written permission.
21 #
22 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 # POSSIBILITY OF SUCH DAMAGE.
34 
35 
36 
37 from __future__ import with_statement, print_function
38 import roslib
39 roslib.load_manifest('pr2_computer_monitor')
40 
41 import rospy
42 
43 import traceback
44 import threading
45 from threading import Timer
46 import sys, os, time
47 from time import sleep
48 import subprocess
49 import string
50 
51 import socket
52 
53 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
54 
55 
57 if sys.version_info[:3] == (2, 7, 3):
58  import threading
59  threading._DummyThread._Thread__stop = lambda x: 42
60 
61 
62 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
63 
64 # Output entire IPMI data set
65 def check_ipmi():
66  diag_vals = []
67  diag_msgs = []
68  diag_level = DiagnosticStatus.OK
69 
70  try:
71  p = subprocess.Popen('sudo ipmitool sdr',
72  stdout = subprocess.PIPE,
73  stderr = subprocess.PIPE, shell = True)
74  stdout, stderr = p.communicate()
75  stdout = stdout.decode()
76  stderr = stderr.decode()
77  retcode = p.returncode
78 
79  if retcode != 0:
80  diag_level = DiagnosticStatus.ERROR
81  diag_msgs = [ 'ipmitool Error' ]
82  diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
83  return diag_vals, diag_msgs, diag_level
84 
85  lines = stdout.split('\n')
86  if len(lines) < 2:
87  diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
88 
89  diag_msgs = [ 'No ipmitool response' ]
90  diag_level = DiagnosticStatus.ERROR
91 
92  return diag_vals, diag_msgs, diag_level
93 
94  for ln in lines:
95  if len(ln) < 3:
96  continue
97 
98  words = ln.split('|')
99  if len(words) < 3:
100  continue
101 
102  name = words[0].strip()
103  ipmi_val = words[1].strip()
104  stat_byte = words[2].strip()
105 
106  # CPU temps
107  if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
108  if words[1].strip().endswith('degrees C'):
109  tmp = ipmi_val.rstrip(' degrees C').lstrip()
110  if tmp.isnumeric():
111  temperature = float(tmp)
112  diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
113 
114  cpu_name = name.split()[0]
115  if temperature >= 80 and temperature < 89:
116  diag_level = max(diag_level, DiagnosticStatus.WARN)
117  if diag_msgs.count('CPU Hot') == 0:
118  diag_msgs.append('CPU Warm')
119 
120  if temperature >= 89: # CPU should shut down here
121  diag_level = max(diag_level, DiagnosticStatus.ERROR)
122  diag_msgs.append('CPU Hot')
123  # Don't keep CPU Warm in list if CPU is hot
124  if diag_msgs.count('CPU Warm') > 0:
125  idx = diag_msgs.index('CPU Warm')
126  diag_msgs.pop(idx)
127  else:
128  diag_vals.append(KeyValue(key = name, value = words[1]))
129 
130 
131  # MP, BP, FP temps
132  if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
133  if ipmi_val.endswith('degrees C'):
134  tmp = ipmi_val.rstrip(' degrees C').lstrip()
135  diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
136  # Give temp warning
137  dev_name = name.split()[0]
138  if tmp.isnumeric():
139  temperature = float(tmp)
140 
141  if temperature >= 60 and temperature < 75:
142  diag_level = max(diag_level, DiagnosticStatus.WARN)
143  diag_msgs.append('%s Warm' % dev_name)
144 
145  if temperature >= 75:
146  diag_level = max(diag_level, DiagnosticStatus.ERROR)
147  diag_msgs.append('%s Hot' % dev_name)
148  else:
149  diag_level = max(diag_level, DiagnosticStatus.ERROR)
150  diag_msgs.append('%s Error' % dev_name)
151  else:
152  diag_vals.append(KeyValue(key = name, value = ipmi_val))
153 
154  # CPU fan speeds
155  if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
156  if ipmi_val.endswith('RPM'):
157  rpm = ipmi_val.rstrip(' RPM').lstrip()
158  if rpm.isnumeric():
159  if int(rpm) == 0:
160  diag_level = max(diag_level, DiagnosticStatus.ERROR)
161  diag_msgs.append('CPU Fan Off')
162 
163  diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
164  else:
165  diag_vals.append(KeyValue(key = name, value = ipmi_val))
166 
167  # If CPU is hot we get an alarm from ipmitool, report that too
168  # CPU should shut down if we get a hot alarm, so report as error
169  if name.startswith('CPU') and name.endswith('hot'):
170  if ipmi_val == '0x01':
171  diag_vals.append(KeyValue(key = name, value = 'OK'))
172  else:
173  diag_vals.append(KeyValue(key = name, value = 'Hot'))
174  diag_level = max(diag_level, DiagnosticStatus.ERROR)
175  diag_msgs.append('CPU Hot Alarm')
176 
177  except Exception as e:
178  diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
179  diag_level = DiagnosticStatus.ERROR
180  diag_msgs.append('Exception')
181 
182  return diag_vals, diag_msgs, diag_level
183 
184 
185 
189 def check_core_temps(sys_temp_strings):
190  diag_vals = []
191  diag_level = 0
192  diag_msgs = []
193 
194  for index, temp_str in enumerate(sys_temp_strings):
195  if len(temp_str) < 5:
196  continue
197 
198  cmd = 'cat %s' % temp_str
199  p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
200  stderr = subprocess.PIPE, shell = True)
201  stdout, stderr = p.communicate()
202  stdout = stdout.decode()
203  stderr = stderr.decode()
204  retcode = p.returncode
205 
206  if retcode != 0:
207  diag_level = DiagnosticStatus.ERROR
208  diag_msg = [ 'Core Temp Error' ]
209  diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
210  KeyValue(key = 'Output', value = stdout) ]
211  return diag_vals, diag_msgs, diag_level
212 
213  tmp = stdout.strip()
214  if tmp.isnumeric():
215  temp = float(tmp) / 1000
216  diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
217 
218  if temp >= 85 and temp < 90:
219  diag_level = max(diag_level, DiagnosticStatus.WARN)
220  diag_msgs.append('Warm')
221  if temp >= 90:
222  diag_level = max(diag_level, DiagnosticStatus.ERROR)
223  diag_msgs.append('Hot')
224  else:
225  diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
226  diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
227 
228  return diag_vals, diag_msgs, diag_level
229 
230 
231 def check_clock_speed(enforce_speed):
232  vals = []
233  msgs = []
234  lvl = DiagnosticStatus.OK
235 
236  try:
237  p = subprocess.Popen('cat /proc/cpuinfo | grep MHz',
238  stdout = subprocess.PIPE,
239  stderr = subprocess.PIPE, shell = True)
240  stdout, stderr = p.communicate()
241  stdout = stdout.decode()
242  stderr = stderr.decode()
243  retcode = p.returncode
244 
245  if retcode != 0:
246  lvl = DiagnosticStatus.ERROR
247  msgs = [ 'Clock speed error' ]
248  vals = [ KeyValue(key = 'Clock speed error', value = stderr),
249  KeyValue(key = 'Output', value = stdout) ]
250 
251  return (vals, msgs, lvl)
252 
253  for index, ln in enumerate(stdout.split('\n')):
254  words = ln.split(':')
255  if len(words) < 2:
256  continue
257 
258  speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal
259  vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
260  if speed.isnumeric():
261  mhz = float(speed)
262 
263  if mhz < 2240 and mhz > 2150:
264  lvl = max(lvl, DiagnosticStatus.WARN)
265  if mhz <= 2150:
266  lvl = max(lvl, DiagnosticStatus.ERROR)
267  else:
268  # Automatically give error if speed isn't a number
269  lvl = max(lvl, DiagnosticStatus.ERROR)
270 
271  if not enforce_speed:
272  lvl = DiagnosticStatus.OK
273 
274  if lvl == DiagnosticStatus.WARN and enforce_speed:
275  msgs = [ 'Core slowing' ]
276  elif lvl == DiagnosticStatus.ERROR and enforce_speed:
277  msgs = [ 'Core throttled' ]
278 
279  except Exception as e:
280  rospy.logerr(traceback.format_exc())
281  lvl = DiagnosticStatus.ERROR
282  msgs.append('Exception')
283  vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
284 
285  return vals, msgs, lvl
286 
287 
288 # Add msgs output, too
289 
290 def check_uptime(load1_threshold, load5_threshold):
291  level = DiagnosticStatus.OK
292  vals = []
293 
294  load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
295 
296  try:
297  p = subprocess.Popen('uptime', stdout = subprocess.PIPE,
298  stderr = subprocess.PIPE, shell = True)
299  stdout, stderr = p.communicate()
300  stdout = stdout.decode()
301  stderr = stderr.decode()
302  retcode = p.returncode
303 
304  if retcode != 0:
305  vals.append(KeyValue(key = 'uptime Failed', value = stderr))
306  return DiagnosticStatus.ERROR, vals
307 
308  upvals = stdout.split()
309  load1 = upvals[-3].rstrip(',')
310  load5 = upvals[-2].rstrip(',')
311  load15 = upvals[-1]
312  num_users = upvals[-7]
313 
314  # Give warning if we go over load limit
315  if float(load1) > load1_threshold or float(load5) > load5_threshold:
316  level = DiagnosticStatus.WARN
317 
318  vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
319  vals.append(KeyValue(key = '1 min Load Average', value = load1))
320  vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
321  vals.append(KeyValue(key = '5 min Load Average', value = load5))
322  vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
323  vals.append(KeyValue(key = '15 min Load Average', value = load15))
324  vals.append(KeyValue(key = 'Number of Users', value = num_users))
325 
326  except Exception as e:
327  rospy.logerr(traceback.format_exc())
328  level = DiagnosticStatus.ERROR
329  vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
330 
331  return level, load_dict[level], vals
332 
333 # Add msgs output
334 
336  values = []
337  level = DiagnosticStatus.OK
338  msg = ''
339 
340  mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
341 
342  try:
343  p = subprocess.Popen('free -m',
344  stdout = subprocess.PIPE,
345  stderr = subprocess.PIPE, shell = True)
346  stdout, stderr = p.communicate()
347  stdout = stdout.decode()
348  retcode = p.returncode
349 
350  if retcode != 0:
351  values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
352  return DiagnosticStatus.ERROR, values
353 
354  rows = stdout.split('\n')
355  data = rows[1].split()
356  total_mem = data[1]
357  used_mem = data[2]
358  free_mem = data[3]
359 
360  level = DiagnosticStatus.OK
361  if float(free_mem) < 25:
362  level = DiagnosticStatus.WARN
363  if float(free_mem) < 1:
364  level = DiagnosticStatus.ERROR
365 
366  values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
367  values.append(KeyValue(key = 'Total Memory', value = total_mem))
368  values.append(KeyValue(key = 'Used Memory', value = used_mem))
369  values.append(KeyValue(key = 'Free Memory', value = free_mem))
370 
371  msg = mem_dict[level]
372  except Exception as e:
373  rospy.logerr(traceback.format_exc())
374  msg = 'Memory Usage Check Error'
375  values.append(KeyValue(key = msg, value = str(e)))
376  level = DiagnosticStatus.ERROR
377 
378  return level, mem_dict[level], values
379 
380 
381 
382 
384 usage_old = 0
385 has_warned_mpstat = False
386 has_error_core_count = False
387 def check_mpstat(core_count = -1):
388  vals = []
389  mp_level = DiagnosticStatus.OK
390 
391  load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
392 
393  try:
394  p = subprocess.Popen('mpstat -P ALL 1 1',
395  stdout = subprocess.PIPE,
396  stderr = subprocess.PIPE, shell = True)
397  stdout, stderr = p.communicate()
398  stdout = stdout.decode()
399  stderr = stderr.decode()
400  retcode = p.returncode
401 
402  if retcode != 0:
403  global has_warned_mpstat
404  if not has_warned_mpstat:
405  rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
406  has_warned_mpstat = True
407 
408  mp_level = DiagnosticStatus.ERROR
409  vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
410  return mp_level, 'Unable to Check CPU Usage', vals
411 
412  # Check which column '%idle' is, #4539
413  # mpstat output changed between 8.06 and 8.1
414  rows = stdout.split('\n')
415  col_names = rows[2].split()
416  idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
417 
418  num_cores = 0
419  cores_loaded = 0
420  for index, row in enumerate(stdout.split('\n')):
421  if index < 3:
422  continue
423 
424  # Skip row containing 'all' data
425  if row.find('all') > -1:
426  continue
427 
428  lst = row.split()
429  if len(lst) < 8:
430  continue
431 
432 
433  if lst[0].startswith('Average'):
434  continue
435 
436  cpu_name = '%d' % (num_cores)
437  idle = lst[idle_col].replace(',', '.')
438  user = lst[3].replace(',', '.')
439  nice = lst[4].replace(',', '.')
440  system = lst[5].replace(',', '.')
441 
442  core_level = 0
443  usage = float(user) + float(nice)
444  if usage > 1000: # wrong reading, use old reading instead
445  rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
446  usage = usage_old
447  usage_old = usage
448 
449  if usage > 90.0:
450  cores_loaded += 1
451  core_level = DiagnosticStatus.WARN
452  if usage > 110.0:
453  core_level = DiagnosticStatus.ERROR
454 
455  vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
456  vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
457  vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
458  vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
459  vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
460 
461  num_cores += 1
462 
463  # Warn for high load only if we have <= 2 cores that aren't loaded
464  if num_cores - cores_loaded <= 2 and num_cores > 2:
465  mp_level = DiagnosticStatus.WARN
466 
467  # Check the number of cores if core_count > 0, #4850
468  if core_count > 0 and core_count != num_cores:
469  mp_level = DiagnosticStatus.ERROR
470  global has_error_core_count
471  if not has_error_core_count:
472  rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
473  core_count, num_cores)
474  has_error_core_count = True
475  return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
476 
477  except Exception as e:
478  mp_level = DiagnosticStatus.ERROR
479  vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
480 
481  return mp_level, load_dict[mp_level], vals
482 
483 
486  temp_vals = []
487  try:
488  p = subprocess.Popen('find /sys/devices -name temp1_input',
489  stdout = subprocess.PIPE,
490  stderr = subprocess.PIPE, shell = True)
491  stdout, stderr = p.communicate()
492  stdout = stdout.decode()
493  stderr = stderr.decode()
494  retcode = p.returncode
495 
496  if retcode != 0:
497  rospy.logerr('Error find core temp locations: %s' % stderr)
498  return []
499 
500  for ln in stdout.split('\n'):
501  temp_vals.append(ln.strip())
502 
503  return temp_vals
504  except:
505  rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
506  return []
507 
508 def update_status_stale(stat, last_update_time):
509  time_since_update = rospy.get_time() - last_update_time
510 
511  stale_status = 'OK'
512  if time_since_update > 20 and time_since_update <= 35:
513  stale_status = 'Lagging'
514  if stat.level == DiagnosticStatus.OK:
515  stat.message = stale_status
516  elif stat.message.find(stale_status) < 0:
517  stat.message = ', '.join([stat.message, stale_status])
518  stat.level = max(stat.level, DiagnosticStatus.WARN)
519  if time_since_update > 35:
520  stale_status = 'Stale'
521  if stat.level == DiagnosticStatus.OK:
522  stat.message = stale_status
523  elif stat.message.find(stale_status) < 0:
524  stat.message = ', '.join([stat.message, stale_status])
525  stat.level = max(stat.level, DiagnosticStatus.ERROR)
526 
527 
528  stat.values.pop(0)
529  stat.values.pop(0)
530  stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
531  stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
532 
533 
534 class CPUMonitor():
535  def __init__(self, hostname, diag_hostname):
536  self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
537 
538  self._mutex = threading.Lock()
539 
540  self._check_ipmi = rospy.get_param('~check_ipmi_tool', True)
541  self._enforce_speed = rospy.get_param('~enforce_clock_speed', True)
542 
543  self._check_core_temps = rospy.get_param('~check_core_temps', False)
544  if self._check_core_temps:
545  rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
546  self._check_nfs = rospy.get_param('~check_nfs', False)
547  if self._check_nfs:
548  rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
549 
550  self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
551  self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
552 
553  self._num_cores = rospy.get_param('~num_cores', 8.0)
554 
555  self._temps_timer = None
556  self._usage_timer = None
557  self._nfs_timer = None
558 
559  # Get temp_input files
561 
562  # CPU stats
563  self._temp_stat = DiagnosticStatus()
564  self._temp_stat.name = '%s CPU Temperature' % diag_hostname
565  self._temp_stat.level = 1
566  self._temp_stat.hardware_id = hostname
567  self._temp_stat.message = 'No Data'
568  self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
569  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
570 
571  self._usage_stat = DiagnosticStatus()
572  self._usage_stat.name = '%s CPU Usage' % diag_hostname
573  self._usage_stat.level = 1
574  self._usage_stat.hardware_id = hostname
575  self._usage_stat.message = 'No Data'
576  self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
577  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
578 
579  self._nfs_stat = DiagnosticStatus()
580  self._nfs_stat.name = '%s NFS IO' % diag_hostname
581  self._nfs_stat.level = 1
582  self._nfs_stat.hardware_id = hostname
583  self._nfs_stat.message = 'No Data'
584  self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
585  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
586 
589  self._last_nfs_time = 0
591 
592  # Start checking everything
593  self.check_temps()
594  if self._check_nfs:
595  self.check_nfs_stat()
596  self.check_usage()
597 
598  # Restart temperature checking
600  rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen')
601  try:
602  with self._mutex:
603  if self._temps_timer:
604  self._temps_timer.cancel()
605 
606  self.check_temps()
607  except Exception as e:
608  rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc())
609 
610 
611 
612  def cancel_timers(self):
613  if self._temps_timer:
614  self._temps_timer.cancel()
615 
616  if self._nfs_timer:
617  self._nfs_timer.cancel()
618 
619  if self._usage_timer:
620  self._usage_timer.cancel()
621 
622  def check_nfs_stat(self):
623  if rospy.is_shutdown():
624  with self._mutex:
625  self.cancel_timers()
626  return
627 
628  nfs_level = 0
629  msg = 'OK'
630  vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
631  KeyValue(key = 'Time Since Last Update', value = str(0) )]
632 
633  try:
634  p = subprocess.Popen('iostat -n',
635  stdout = subprocess.PIPE,
636  stderr = subprocess.PIPE, shell = True)
637  stdout, stderr = p.communicate()
638  stdout = stdout.decode()
639  stderr = stderr.decode()
640  retcode = p.returncode
641 
642  if retcode != 0:
643  nfs_level = DiagnosticStatus.ERROR
644  msg = 'iostat Error'
645  vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e)))
646  stdout = ''
647 
648 
649  for index, row in enumerate(stdout.split('\n')):
650  if index < 3:
651  continue
652 
653  lst = row.split()
654  if len(lst) < 7:
655  continue
656 
657  file_sys = lst[0]
658  read_blk = lst[1]
659  write_blk = lst[2]
660  read_blk_dir = lst[3]
661  write_blk_dir = lst[4]
662  r_blk_srv = lst[5]
663  w_blk_srv = lst[6]
664 
665  vals.append(KeyValue(
666  key = '%s Read Blks/s' % file_sys, value=read_blk))
667  vals.append(KeyValue(
668  key = '%s Write Blks/s' % file_sys, value=write_blk))
669  vals.append(KeyValue(
670  key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir))
671  vals.append(KeyValue(
672  key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir))
673  vals.append(KeyValue(
674  key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv))
675  vals.append(KeyValue(
676  key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv))
677 
678  except Exception as e:
679  rospy.logerr(traceback.format_exc())
680  nfs_level = DiagnosticStatus.ERROR
681  msg = 'Exception'
682  vals.append(KeyValue(key = 'Exception', value = str(e)))
683 
684  with self._mutex:
685  self._nfs_stat.level = nfs_level
686  self._nfs_stat.message = msg
687  self._nfs_stat.values = vals
688 
689  self._last_nfs_time = rospy.get_time()
690 
691  if not rospy.is_shutdown():
692  self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat)
693  self._nfs_timer.start()
694  else:
695  self.cancel_timers()
696 
697 
698 
699  def check_temps(self):
700  if rospy.is_shutdown():
701  with self._mutex:
702  self.cancel_timers()
703  return
704 
705  diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
706  KeyValue(key = 'Time Since Last Update', value = str(0) ) ]
707  diag_msgs = []
708  diag_level = 0
709 
710  if self._check_ipmi:
711  ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi()
712  diag_vals.extend(ipmi_vals)
713  diag_msgs.extend(ipmi_msgs)
714  diag_level = max(diag_level, ipmi_level)
715 
716  if self._check_core_temps:
717  core_vals, core_msgs, core_level = check_core_temps(self._temp_vals)
718  diag_vals.extend(core_vals)
719  diag_msgs.extend(core_msgs)
720  diag_level = max(diag_level, core_level)
721 
722  clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed)
723  diag_vals.extend(clock_vals)
724  diag_msgs.extend(clock_msgs)
725  diag_level = max(diag_level, clock_level)
726 
727  diag_log = set(diag_msgs)
728  if len(diag_log) > 0:
729  message = ', '.join(diag_log)
730  else:
731  message = stat_dict[diag_level]
732 
733  with self._mutex:
734  self._last_temp_time = rospy.get_time()
735 
736  self._temp_stat.level = diag_level
737  self._temp_stat.message = message
738  self._temp_stat.values = diag_vals
739 
740  if not rospy.is_shutdown():
741  self._temps_timer = threading.Timer(5.0, self.check_temps)
742  self._temps_timer.start()
743  else:
744  self.cancel_timers()
745 
746  def check_usage(self):
747  if rospy.is_shutdown():
748  with self._mutex:
749  self.cancel_timers()
750  return
751 
752  diag_level = 0
753  diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
754  KeyValue(key = 'Time Since Last Update', value = 0 )]
755  diag_msgs = []
756 
757  # Check mpstat
758  mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores)
759  diag_vals.extend(mp_vals)
760  if mp_level > 0:
761  diag_msgs.append(mp_msg)
762  diag_level = max(diag_level, mp_level)
763 
764  # Check uptime
765  uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold)
766  diag_vals.extend(up_vals)
767  if uptime_level > 0:
768  diag_msgs.append(up_msg)
769  diag_level = max(diag_level, uptime_level)
770 
771  # Check memory
772  mem_level, mem_msg, mem_vals = check_memory()
773  diag_vals.extend(mem_vals)
774  if mem_level > 0:
775  diag_msgs.append(mem_msg)
776  diag_level = max(diag_level, mem_level)
777 
778  if diag_msgs and diag_level > 0:
779  usage_msg = ', '.join(set(diag_msgs))
780  else:
781  usage_msg = stat_dict[diag_level]
782 
783  # Update status
784  with self._mutex:
785  self._last_usage_time = rospy.get_time()
786  self._usage_stat.level = diag_level
787  self._usage_stat.values = diag_vals
788 
789  self._usage_stat.message = usage_msg
790 
791  if not rospy.is_shutdown():
792  self._usage_timer = threading.Timer(5.0, self.check_usage)
793  self._usage_timer.start()
794  else:
795  self.cancel_timers()
796 
797  def publish_stats(self):
798  with self._mutex:
799  # Update everything with last update times
802  if self._check_nfs:
804 
805  msg = DiagnosticArray()
806  msg.header.stamp = rospy.get_rostime()
807  msg.status.append(self._temp_stat)
808  msg.status.append(self._usage_stat)
809  if self._check_nfs:
810  msg.status.append(self._nfs_stat)
811 
812  if rospy.get_time() - self._last_publish_time > 0.5:
813  self._diag_pub.publish(msg)
814  self._last_publish_time = rospy.get_time()
815 
816 
817  # Restart temperature checking if it goes stale, #4171
818  # Need to run this without mutex
819  if rospy.get_time() - self._last_temp_time > 90:
820  self._restart_temp_check()
821 
822 
823 if __name__ == '__main__':
824  hostname = socket.gethostname()
825 
826  import optparse
827  parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
828  parser.add_option("--diag-hostname", dest="diag_hostname",
829  help="Computer name in diagnostics output (ex: 'c1')",
830  metavar="DIAG_HOSTNAME",
831  action="store", default = hostname)
832  options, args = parser.parse_args(rospy.myargv())
833 
834  try:
835  rospy.init_node('cpu_monitor_%s' % hostname)
836  except rospy.exceptions.ROSInitException:
837  print('CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr)
838  sys.exit(0)
839 
840  cpu_node = CPUMonitor(hostname, options.diag_hostname)
841 
842  rate = rospy.Rate(1.0)
843  try:
844  while not rospy.is_shutdown():
845  rate.sleep()
846  cpu_node.publish_stats()
847  except KeyboardInterrupt:
848  pass
849  except Exception as e:
850  traceback.print_exc()
851  rospy.logerr(traceback.format_exc())
852 
853  cpu_node.cancel_timers()
854  sys.exit(0)
855 
856 
857 
858 
859 
860 
861 
cpu_monitor.check_core_temps
def check_core_temps(sys_temp_strings)
Check CPU core temps.
Definition: cpu_monitor.py:189
cpu_monitor.update_status_stale
def update_status_stale(stat, last_update_time)
Definition: cpu_monitor.py:508
cpu_monitor.CPUMonitor._diag_pub
_diag_pub
Definition: cpu_monitor.py:536
cpu_monitor.CPUMonitor._temp_stat
_temp_stat
Definition: cpu_monitor.py:563
cpu_monitor.CPUMonitor._nfs_stat
_nfs_stat
Definition: cpu_monitor.py:579
cpu_monitor.check_memory
def check_memory()
Uses 'free -m' to check free memory.
Definition: cpu_monitor.py:335
cpu_monitor.CPUMonitor.__init__
def __init__(self, hostname, diag_hostname)
Definition: cpu_monitor.py:535
cpu_monitor.CPUMonitor._last_usage_time
_last_usage_time
Definition: cpu_monitor.py:588
cpu_monitor.check_ipmi
def check_ipmi()
Definition: cpu_monitor.py:65
cpu_monitor.CPUMonitor._last_publish_time
_last_publish_time
Definition: cpu_monitor.py:590
cpu_monitor.CPUMonitor._last_temp_time
_last_temp_time
Definition: cpu_monitor.py:587
cpu_monitor.CPUMonitor._usage_stat
_usage_stat
Definition: cpu_monitor.py:571
cpu_monitor.CPUMonitor._mutex
_mutex
Definition: cpu_monitor.py:538
cpu_monitor.CPUMonitor._temp_vals
_temp_vals
Definition: cpu_monitor.py:560
cpu_monitor.check_mpstat
def check_mpstat(core_count=-1)
Definition: cpu_monitor.py:387
cpu_monitor.CPUMonitor._nfs_timer
_nfs_timer
Definition: cpu_monitor.py:557
cpu_monitor.get_core_temp_names
def get_core_temp_names()
Returns names for core temperature files Returns list of names, each name can be read like file.
Definition: cpu_monitor.py:485
cpu_monitor.CPUMonitor._last_nfs_time
_last_nfs_time
Definition: cpu_monitor.py:589
cpu_monitor.CPUMonitor
Definition: cpu_monitor.py:534
cpu_monitor.CPUMonitor._check_core_temps
_check_core_temps
Definition: cpu_monitor.py:543
cpu_monitor.CPUMonitor._enforce_speed
_enforce_speed
Definition: cpu_monitor.py:541
cpu_monitor.CPUMonitor.check_usage
def check_usage(self)
Definition: cpu_monitor.py:746
cpu_monitor.CPUMonitor._restart_temp_check
def _restart_temp_check(self)
Definition: cpu_monitor.py:599
cpu_monitor.CPUMonitor._check_nfs
_check_nfs
Definition: cpu_monitor.py:546
cpu_monitor.CPUMonitor.check_nfs_stat
def check_nfs_stat(self)
Definition: cpu_monitor.py:622
cpu_monitor.CPUMonitor._load5_threshold
_load5_threshold
Definition: cpu_monitor.py:551
cpu_monitor.check_uptime
def check_uptime(load1_threshold, load5_threshold)
Uses 'uptime' to see load average.
Definition: cpu_monitor.py:290
cpu_monitor.CPUMonitor._num_cores
_num_cores
Definition: cpu_monitor.py:553
cpu_monitor.CPUMonitor._load1_threshold
_load1_threshold
Definition: cpu_monitor.py:550
cpu_monitor.CPUMonitor._usage_timer
_usage_timer
Definition: cpu_monitor.py:556
cpu_monitor.CPUMonitor.check_temps
def check_temps(self)
Call every 10sec at minimum.
Definition: cpu_monitor.py:699
cpu_monitor.CPUMonitor._temps_timer
_temps_timer
Definition: cpu_monitor.py:555
cpu_monitor.CPUMonitor.cancel_timers
def cancel_timers(self)
Must have the lock to cancel everything.
Definition: cpu_monitor.py:612
cpu_monitor.check_clock_speed
def check_clock_speed(enforce_speed)
Checks clock speed from reading from CPU info.
Definition: cpu_monitor.py:231
cpu_monitor.CPUMonitor.publish_stats
def publish_stats(self)
Definition: cpu_monitor.py:797
cpu_monitor.CPUMonitor._check_ipmi
_check_ipmi
Definition: cpu_monitor.py:540


pr2_computer_monitor
Author(s): Kevin Watts (watts@willowgarage.com)
autogenerated on Thu Sep 26 2024 02:53:24