cpu_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # Software License Agreement (BSD License)
4 #
5 # Copyright (c) 2009, Willow Garage, Inc.
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions
10 # are met:
11 #
12 # * Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # * Redistributions in binary form must reproduce the above
15 # copyright notice, this list of conditions and the following
16 # disclaimer in the documentation and/or other materials provided
17 # with the distribution.
18 # * Neither the name of the Willow Garage nor the names of its
19 # contributors may be used to endorse or promote products derived
20 # from this software without specific prior written permission.
21 #
22 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 # POSSIBILITY OF SUCH DAMAGE.
34 
35 ##\author Kevin Watts
36 
37 from __future__ import with_statement
38 import roslib
39 roslib.load_manifest('pr2_computer_monitor')
40 
41 import rospy
42 
43 import traceback
44 import threading
45 from threading import Timer
46 import sys, os, time
47 from time import sleep
48 import subprocess
49 import string
50 
51 import socket
52 
53 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
54 
55 ##### monkey-patch to suppress threading error message in python 2.7.3
56 ##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug
57 if sys.version_info[:3] == (2, 7, 3):
58  import threading
59  threading._DummyThread._Thread__stop = lambda x: 42
60 #####
61 
62 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
63 
64 # Output entire IPMI data set
65 def check_ipmi():
66  diag_vals = []
67  diag_msgs = []
68  diag_level = DiagnosticStatus.OK
69 
70  try:
71  p = subprocess.Popen('sudo ipmitool sdr',
72  stdout = subprocess.PIPE,
73  stderr = subprocess.PIPE, shell = True)
74  stdout, stderr = p.communicate()
75  retcode = p.returncode
76 
77  if retcode != 0:
78  diag_level = DiagnosticStatus.ERROR
79  diag_msgs = [ 'ipmitool Error' ]
80  diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
81  return diag_vals, diag_msgs, diag_level
82 
83  lines = stdout.split('\n')
84  if len(lines) < 2:
85  diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
86 
87  diag_msgs = [ 'No ipmitool response' ]
88  diag_level = DiagnosticStatus.ERROR
89 
90  return diag_vals, diag_msgs, diag_level
91 
92  for ln in lines:
93  if len(ln) < 3:
94  continue
95 
96  words = ln.split('|')
97  if len(words) < 3:
98  continue
99 
100  name = words[0].strip()
101  ipmi_val = words[1].strip()
102  stat_byte = words[2].strip()
103 
104  # CPU temps
105  if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
106  if words[1].strip().endswith('degrees C'):
107  tmp = ipmi_val.rstrip(' degrees C').lstrip()
108  if unicode(tmp).isnumeric():
109  temperature = float(tmp)
110  diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
111 
112  cpu_name = name.split()[0]
113  if temperature >= 80 and temperature < 89:
114  diag_level = max(diag_level, DiagnosticStatus.WARN)
115  if diag_msgs.count('CPU Hot') == 0:
116  diag_msgs.append('CPU Warm')
117 
118  if temperature >= 89: # CPU should shut down here
119  diag_level = max(diag_level, DiagnosticStatus.ERROR)
120  diag_msgs.append('CPU Hot')
121  # Don't keep CPU Warm in list if CPU is hot
122  if diag_msgs.count('CPU Warm') > 0:
123  idx = diag_msgs.index('CPU Warm')
124  diag_msgs.pop(idx)
125  else:
126  diag_vals.append(KeyValue(key = name, value = words[1]))
127 
128 
129  # MP, BP, FP temps
130  if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
131  if ipmi_val.endswith('degrees C'):
132  tmp = ipmi_val.rstrip(' degrees C').lstrip()
133  diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
134  # Give temp warning
135  dev_name = name.split()[0]
136  if unicode(tmp).isnumeric():
137  temperature = float(tmp)
138 
139  if temperature >= 60 and temperature < 75:
140  diag_level = max(diag_level, DiagnosticStatus.WARN)
141  diag_msgs.append('%s Warm' % dev_name)
142 
143  if temperature >= 75:
144  diag_level = max(diag_level, DiagnosticStatus.ERROR)
145  diag_msgs.append('%s Hot' % dev_name)
146  else:
147  diag_level = max(diag_level, DiagnosticStatus.ERROR)
148  diag_msgs.append('%s Error' % dev_name)
149  else:
150  diag_vals.append(KeyValue(key = name, value = ipmi_val))
151 
152  # CPU fan speeds
153  if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
154  if ipmi_val.endswith('RPM'):
155  rpm = ipmi_val.rstrip(' RPM').lstrip()
156  if unicode(rpm).isnumeric():
157  if int(rpm) == 0:
158  diag_level = max(diag_level, DiagnosticStatus.ERROR)
159  diag_msgs.append('CPU Fan Off')
160 
161  diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
162  else:
163  diag_vals.append(KeyValue(key = name, value = ipmi_val))
164 
165  # If CPU is hot we get an alarm from ipmitool, report that too
166  # CPU should shut down if we get a hot alarm, so report as error
167  if name.startswith('CPU') and name.endswith('hot'):
168  if ipmi_val == '0x01':
169  diag_vals.append(KeyValue(key = name, value = 'OK'))
170  else:
171  diag_vals.append(KeyValue(key = name, value = 'Hot'))
172  diag_level = max(diag_level, DiagnosticStatus.ERROR)
173  diag_msgs.append('CPU Hot Alarm')
174 
175  except Exception as e:
176  diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
177  diag_level = DiagnosticStatus.ERROR
178  diag_msgs.append('Exception')
179 
180  return diag_vals, diag_msgs, diag_level
181 
182 
183 ##\brief Check CPU core temps
184 
187 def check_core_temps(sys_temp_strings):
188  diag_vals = []
189  diag_level = 0
190  diag_msgs = []
191 
192  for index, temp_str in enumerate(sys_temp_strings):
193  if len(temp_str) < 5:
194  continue
195 
196  cmd = 'cat %s' % temp_str
197  p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
198  stderr = subprocess.PIPE, shell = True)
199  stdout, stderr = p.communicate()
200  retcode = p.returncode
201 
202  if retcode != 0:
203  diag_level = DiagnosticStatus.ERROR
204  diag_msg = [ 'Core Temp Error' ]
205  diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
206  KeyValue(key = 'Output', value = stdout) ]
207  return diag_vals, diag_msgs, diag_level
208 
209  tmp = stdout.strip()
210  if unicode(tmp).isnumeric():
211  temp = float(tmp) / 1000
212  diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
213 
214  if temp >= 85 and temp < 90:
215  diag_level = max(diag_level, DiagnosticStatus.WARN)
216  diag_msgs.append('Warm')
217  if temp >= 90:
218  diag_level = max(diag_level, DiagnosticStatus.ERROR)
219  diag_msgs.append('Hot')
220  else:
221  diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
222  diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
223 
224  return diag_vals, diag_msgs, diag_level
225 
226 ## Checks clock speed from reading from CPU info
227 def check_clock_speed(enforce_speed):
228  vals = []
229  msgs = []
230  lvl = DiagnosticStatus.OK
231 
232  try:
233  p = subprocess.Popen('cat /proc/cpuinfo | grep MHz',
234  stdout = subprocess.PIPE,
235  stderr = subprocess.PIPE, shell = True)
236  stdout, stderr = p.communicate()
237  retcode = p.returncode
238 
239  if retcode != 0:
240  lvl = DiagnosticStatus.ERROR
241  msgs = [ 'Clock speed error' ]
242  vals = [ KeyValue(key = 'Clock speed error', value = stderr),
243  KeyValue(key = 'Output', value = stdout) ]
244 
245  return (vals, msgs, lvl)
246 
247  for index, ln in enumerate(stdout.split('\n')):
248  words = ln.split(':')
249  if len(words) < 2:
250  continue
251 
252  speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal
253  vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
254  if unicode(speed).isnumeric():
255  mhz = float(speed)
256 
257  if mhz < 2240 and mhz > 2150:
258  lvl = max(lvl, DiagnosticStatus.WARN)
259  if mhz <= 2150:
260  lvl = max(lvl, DiagnosticStatus.ERROR)
261  else:
262  # Automatically give error if speed isn't a number
263  lvl = max(lvl, DiagnosticStatus.ERROR)
264 
265  if not enforce_speed:
266  lvl = DiagnosticStatus.OK
267 
268  if lvl == DiagnosticStatus.WARN and enforce_speed:
269  msgs = [ 'Core slowing' ]
270  elif lvl == DiagnosticStatus.ERROR and enforce_speed:
271  msgs = [ 'Core throttled' ]
272 
273  except Exception as e:
274  rospy.logerr(traceback.format_exc())
275  lvl = DiagnosticStatus.ERROR
276  msgs.append('Exception')
277  vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
278 
279  return vals, msgs, lvl
280 
281 
282 # Add msgs output, too
283 ##\brief Uses 'uptime' to see load average
284 def check_uptime(load1_threshold, load5_threshold):
285  level = DiagnosticStatus.OK
286  vals = []
287 
288  load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
289 
290  try:
291  p = subprocess.Popen('uptime', stdout = subprocess.PIPE,
292  stderr = subprocess.PIPE, shell = True)
293  stdout, stderr = p.communicate()
294  retcode = p.returncode
295 
296  if retcode != 0:
297  vals.append(KeyValue(key = 'uptime Failed', value = stderr))
298  return DiagnosticStatus.ERROR, vals
299 
300  upvals = stdout.split()
301  load1 = upvals[-3].rstrip(',')
302  load5 = upvals[-2].rstrip(',')
303  load15 = upvals[-1]
304  num_users = upvals[-7]
305 
306  # Give warning if we go over load limit
307  if float(load1) > load1_threshold or float(load5) > load5_threshold:
308  level = DiagnosticStatus.WARN
309 
310  vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
311  vals.append(KeyValue(key = '1 min Load Average', value = load1))
312  vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
313  vals.append(KeyValue(key = '5 min Load Average', value = load5))
314  vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
315  vals.append(KeyValue(key = '15 min Load Average', value = load15))
316  vals.append(KeyValue(key = 'Number of Users', value = num_users))
317 
318  except Exception as e:
319  rospy.logerr(traceback.format_exc())
320  level = DiagnosticStatus.ERROR
321  vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
322 
323  return level, load_dict[level], vals
324 
325 # Add msgs output
326 ##\brief Uses 'free -m' to check free memory
328  values = []
329  level = DiagnosticStatus.OK
330  msg = ''
331 
332  mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
333 
334  try:
335  p = subprocess.Popen('free -m',
336  stdout = subprocess.PIPE,
337  stderr = subprocess.PIPE, shell = True)
338  stdout, stderr = p.communicate()
339  retcode = p.returncode
340 
341  if retcode != 0:
342  values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
343  return DiagnosticStatus.ERROR, values
344 
345  rows = stdout.split('\n')
346  data = rows[1].split()
347  total_mem = data[1]
348  used_mem = data[2]
349  free_mem = data[3]
350 
351  level = DiagnosticStatus.OK
352  if float(free_mem) < 25:
353  level = DiagnosticStatus.WARN
354  if float(free_mem) < 1:
355  level = DiagnosticStatus.ERROR
356 
357  values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
358  values.append(KeyValue(key = 'Total Memory', value = total_mem))
359  values.append(KeyValue(key = 'Used Memory', value = used_mem))
360  values.append(KeyValue(key = 'Free Memory', value = free_mem))
361 
362  msg = mem_dict[level]
363  except Exception as e:
364  rospy.logerr(traceback.format_exc())
365  msg = 'Memory Usage Check Error'
366  values.append(KeyValue(key = msg, value = str(e)))
367  level = DiagnosticStatus.ERROR
368 
369  return level, mem_dict[level], values
370 
371 
372 
373 ##\brief Use mpstat to find CPU usage
374 
375 usage_old = 0
376 has_warned_mpstat = False
377 has_error_core_count = False
378 def check_mpstat(core_count = -1):
379  vals = []
380  mp_level = DiagnosticStatus.OK
381 
382  load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
383 
384  try:
385  p = subprocess.Popen('mpstat -P ALL 1 1',
386  stdout = subprocess.PIPE,
387  stderr = subprocess.PIPE, shell = True)
388  stdout, stderr = p.communicate()
389  retcode = p.returncode
390 
391  if retcode != 0:
392  global has_warned_mpstat
393  if not has_warned_mpstat:
394  rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
395  has_warned_mpstat = True
396 
397  mp_level = DiagnosticStatus.ERROR
398  vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
399  return mp_level, 'Unable to Check CPU Usage', vals
400 
401  # Check which column '%idle' is, #4539
402  # mpstat output changed between 8.06 and 8.1
403  rows = stdout.split('\n')
404  col_names = rows[2].split()
405  idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
406 
407  num_cores = 0
408  cores_loaded = 0
409  for index, row in enumerate(stdout.split('\n')):
410  if index < 3:
411  continue
412 
413  # Skip row containing 'all' data
414  if row.find('all') > -1:
415  continue
416 
417  lst = row.split()
418  if len(lst) < 8:
419  continue
420 
421  ## Ignore 'Average: ...' data
422  if lst[0].startswith('Average'):
423  continue
424 
425  cpu_name = '%d' % (num_cores)
426  idle = lst[idle_col].replace(',', '.')
427  user = lst[3].replace(',', '.')
428  nice = lst[4].replace(',', '.')
429  system = lst[5].replace(',', '.')
430 
431  core_level = 0
432  usage = float(user) + float(nice)
433  if usage > 1000: # wrong reading, use old reading instead
434  rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
435  usage = usage_old
436  usage_old = usage
437 
438  if usage > 90.0:
439  cores_loaded += 1
440  core_level = DiagnosticStatus.WARN
441  if usage > 110.0:
442  core_level = DiagnosticStatus.ERROR
443 
444  vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
445  vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
446  vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
447  vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
448  vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
449 
450  num_cores += 1
451 
452  # Warn for high load only if we have <= 2 cores that aren't loaded
453  if num_cores - cores_loaded <= 2 and num_cores > 2:
454  mp_level = DiagnosticStatus.WARN
455 
456  # Check the number of cores if core_count > 0, #4850
457  if core_count > 0 and core_count != num_cores:
458  mp_level = DiagnosticStatus.ERROR
459  global has_error_core_count
460  if not has_error_core_count:
461  rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
462  core_count, num_cores)
463  has_error_core_count = True
464  return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
465 
466  except Exception as e:
467  mp_level = DiagnosticStatus.ERROR
468  vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
469 
470  return mp_level, load_dict[mp_level], vals
471 
472 ## Returns names for core temperature files
473 ## Returns list of names, each name can be read like file
475  temp_vals = []
476  try:
477  p = subprocess.Popen('find /sys/devices -name temp1_input',
478  stdout = subprocess.PIPE,
479  stderr = subprocess.PIPE, shell = True)
480  stdout, stderr = p.communicate()
481  retcode = p.returncode
482 
483  if retcode != 0:
484  rospy.logerr('Error find core temp locations: %s' % stderr)
485  return []
486 
487  for ln in stdout.split('\n'):
488  temp_vals.append(ln.strip())
489 
490  return temp_vals
491  except:
492  rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
493  return []
494 
495 def update_status_stale(stat, last_update_time):
496  time_since_update = rospy.get_time() - last_update_time
497 
498  stale_status = 'OK'
499  if time_since_update > 20 and time_since_update <= 35:
500  stale_status = 'Lagging'
501  if stat.level == DiagnosticStatus.OK:
502  stat.message = stale_status
503  elif stat.message.find(stale_status) < 0:
504  stat.message = ', '.join([stat.message, stale_status])
505  stat.level = max(stat.level, DiagnosticStatus.WARN)
506  if time_since_update > 35:
507  stale_status = 'Stale'
508  if stat.level == DiagnosticStatus.OK:
509  stat.message = stale_status
510  elif stat.message.find(stale_status) < 0:
511  stat.message = ', '.join([stat.message, stale_status])
512  stat.level = max(stat.level, DiagnosticStatus.ERROR)
513 
514 
515  stat.values.pop(0)
516  stat.values.pop(0)
517  stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
518  stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
519 
520 
521 class CPUMonitor():
522  def __init__(self, hostname, diag_hostname):
523  self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
524 
525  self._mutex = threading.Lock()
526 
527  self._check_ipmi = rospy.get_param('~check_ipmi_tool', True)
528  self._enforce_speed = rospy.get_param('~enforce_clock_speed', True)
529 
530  self._check_core_temps = rospy.get_param('~check_core_temps', False)
531  if self._check_core_temps:
532  rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
533  self._check_nfs = rospy.get_param('~check_nfs', False)
534  if self._check_nfs:
535  rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
536 
537  self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
538  self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
539 
540  self._num_cores = rospy.get_param('~num_cores', 8.0)
541 
542  self._temps_timer = None
543  self._usage_timer = None
544  self._nfs_timer = None
545 
546  # Get temp_input files
548 
549  # CPU stats
550  self._temp_stat = DiagnosticStatus()
551  self._temp_stat.name = '%s CPU Temperature' % diag_hostname
552  self._temp_stat.level = 1
553  self._temp_stat.hardware_id = hostname
554  self._temp_stat.message = 'No Data'
555  self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
556  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
557 
558  self._usage_stat = DiagnosticStatus()
559  self._usage_stat.name = '%s CPU Usage' % diag_hostname
560  self._usage_stat.level = 1
561  self._usage_stat.hardware_id = hostname
562  self._usage_stat.message = 'No Data'
563  self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
564  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
565 
566  self._nfs_stat = DiagnosticStatus()
567  self._nfs_stat.name = '%s NFS IO' % diag_hostname
568  self._nfs_stat.level = 1
569  self._nfs_stat.hardware_id = hostname
570  self._nfs_stat.message = 'No Data'
571  self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
572  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
573 
576  self._last_nfs_time = 0
578 
579  # Start checking everything
580  self.check_temps()
581  if self._check_nfs:
582  self.check_nfs_stat()
583  self.check_usage()
584 
585  # Restart temperature checking
587  rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen')
588  try:
589  with self._mutex:
590  if self._temps_timer:
591  self._temps_timer.cancel()
592 
593  self.check_temps()
594  except Exception as e:
595  rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc())
596 
597 
598  ## Must have the lock to cancel everything
599  def cancel_timers(self):
600  if self._temps_timer:
601  self._temps_timer.cancel()
602 
603  if self._nfs_timer:
604  self._nfs_timer.cancel()
605 
606  if self._usage_timer:
607  self._usage_timer.cancel()
608 
609  def check_nfs_stat(self):
610  if rospy.is_shutdown():
611  with self._mutex:
612  self.cancel_timers()
613  return
614 
615  nfs_level = 0
616  msg = 'OK'
617  vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
618  KeyValue(key = 'Time Since Last Update', value = str(0) )]
619 
620  try:
621  p = subprocess.Popen('iostat -n',
622  stdout = subprocess.PIPE,
623  stderr = subprocess.PIPE, shell = True)
624  stdout, stderr = p.communicate()
625  retcode = p.returncode
626 
627  if retcode != 0:
628  nfs_level = DiagnosticStatus.ERROR
629  msg = 'iostat Error'
630  vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e)))
631  stdout = ''
632 
633 
634  for index, row in enumerate(stdout.split('\n')):
635  if index < 3:
636  continue
637 
638  lst = row.split()
639  if len(lst) < 7:
640  continue
641 
642  file_sys = lst[0]
643  read_blk = lst[1]
644  write_blk = lst[2]
645  read_blk_dir = lst[3]
646  write_blk_dir = lst[4]
647  r_blk_srv = lst[5]
648  w_blk_srv = lst[6]
649 
650  vals.append(KeyValue(
651  key = '%s Read Blks/s' % file_sys, value=read_blk))
652  vals.append(KeyValue(
653  key = '%s Write Blks/s' % file_sys, value=write_blk))
654  vals.append(KeyValue(
655  key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir))
656  vals.append(KeyValue(
657  key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir))
658  vals.append(KeyValue(
659  key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv))
660  vals.append(KeyValue(
661  key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv))
662 
663  except Exception as e:
664  rospy.logerr(traceback.format_exc())
665  nfs_level = DiagnosticStatus.ERROR
666  msg = 'Exception'
667  vals.append(KeyValue(key = 'Exception', value = str(e)))
668 
669  with self._mutex:
670  self._nfs_stat.level = nfs_level
671  self._nfs_stat.message = msg
672  self._nfs_stat.values = vals
673 
674  self._last_nfs_time = rospy.get_time()
675 
676  if not rospy.is_shutdown():
677  self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat)
678  self._nfs_timer.start()
679  else:
680  self.cancel_timers()
681 
682 
683  ## Call every 10sec at minimum
684  def check_temps(self):
685  if rospy.is_shutdown():
686  with self._mutex:
687  self.cancel_timers()
688  return
689 
690  diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
691  KeyValue(key = 'Time Since Last Update', value = str(0) ) ]
692  diag_msgs = []
693  diag_level = 0
694 
695  if self._check_ipmi:
696  ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi()
697  diag_vals.extend(ipmi_vals)
698  diag_msgs.extend(ipmi_msgs)
699  diag_level = max(diag_level, ipmi_level)
700 
701  if self._check_core_temps:
702  core_vals, core_msgs, core_level = check_core_temps(self._temp_vals)
703  diag_vals.extend(core_vals)
704  diag_msgs.extend(core_msgs)
705  diag_level = max(diag_level, core_level)
706 
707  clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed)
708  diag_vals.extend(clock_vals)
709  diag_msgs.extend(clock_msgs)
710  diag_level = max(diag_level, clock_level)
711 
712  diag_log = set(diag_msgs)
713  if len(diag_log) > 0:
714  message = ', '.join(diag_log)
715  else:
716  message = stat_dict[diag_level]
717 
718  with self._mutex:
719  self._last_temp_time = rospy.get_time()
720 
721  self._temp_stat.level = diag_level
722  self._temp_stat.message = message
723  self._temp_stat.values = diag_vals
724 
725  if not rospy.is_shutdown():
726  self._temps_timer = threading.Timer(5.0, self.check_temps)
727  self._temps_timer.start()
728  else:
729  self.cancel_timers()
730 
731  def check_usage(self):
732  if rospy.is_shutdown():
733  with self._mutex:
734  self.cancel_timers()
735  return
736 
737  diag_level = 0
738  diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
739  KeyValue(key = 'Time Since Last Update', value = 0 )]
740  diag_msgs = []
741 
742  # Check mpstat
743  mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores)
744  diag_vals.extend(mp_vals)
745  if mp_level > 0:
746  diag_msgs.append(mp_msg)
747  diag_level = max(diag_level, mp_level)
748 
749  # Check uptime
750  uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold)
751  diag_vals.extend(up_vals)
752  if uptime_level > 0:
753  diag_msgs.append(up_msg)
754  diag_level = max(diag_level, uptime_level)
755 
756  # Check memory
757  mem_level, mem_msg, mem_vals = check_memory()
758  diag_vals.extend(mem_vals)
759  if mem_level > 0:
760  diag_msgs.append(mem_msg)
761  diag_level = max(diag_level, mem_level)
762 
763  if diag_msgs and diag_level > 0:
764  usage_msg = ', '.join(set(diag_msgs))
765  else:
766  usage_msg = stat_dict[diag_level]
767 
768  # Update status
769  with self._mutex:
770  self._last_usage_time = rospy.get_time()
771  self._usage_stat.level = diag_level
772  self._usage_stat.values = diag_vals
773 
774  self._usage_stat.message = usage_msg
775 
776  if not rospy.is_shutdown():
777  self._usage_timer = threading.Timer(5.0, self.check_usage)
778  self._usage_timer.start()
779  else:
780  self.cancel_timers()
781 
782  def publish_stats(self):
783  with self._mutex:
784  # Update everything with last update times
787  if self._check_nfs:
789 
790  msg = DiagnosticArray()
791  msg.header.stamp = rospy.get_rostime()
792  msg.status.append(self._temp_stat)
793  msg.status.append(self._usage_stat)
794  if self._check_nfs:
795  msg.status.append(self._nfs_stat)
796 
797  if rospy.get_time() - self._last_publish_time > 0.5:
798  self._diag_pub.publish(msg)
799  self._last_publish_time = rospy.get_time()
800 
801 
802  # Restart temperature checking if it goes stale, #4171
803  # Need to run this without mutex
804  if rospy.get_time() - self._last_temp_time > 90:
805  self._restart_temp_check()
806 
807 
808 if __name__ == '__main__':
809  hostname = socket.gethostname()
810 
811  import optparse
812  parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
813  parser.add_option("--diag-hostname", dest="diag_hostname",
814  help="Computer name in diagnostics output (ex: 'c1')",
815  metavar="DIAG_HOSTNAME",
816  action="store", default = hostname)
817  options, args = parser.parse_args(rospy.myargv())
818 
819  try:
820  rospy.init_node('cpu_monitor_%s' % hostname)
821  except rospy.exceptions.ROSInitException:
822  print('CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr)
823  sys.exit(0)
824 
825  cpu_node = CPUMonitor(hostname, options.diag_hostname)
826 
827  rate = rospy.Rate(1.0)
828  try:
829  while not rospy.is_shutdown():
830  rate.sleep()
831  cpu_node.publish_stats()
832  except KeyboardInterrupt:
833  pass
834  except Exception as e:
835  traceback.print_exc()
836  rospy.logerr(traceback.format_exc())
837 
838  cpu_node.cancel_timers()
839  sys.exit(0)
840 
841 
842 
843 
844 
845 
846 
def get_core_temp_names()
Returns names for core temperature files Returns list of names, each name can be read like file...
Definition: cpu_monitor.py:474
def check_uptime(load1_threshold, load5_threshold)
Uses &#39;uptime&#39; to see load average.
Definition: cpu_monitor.py:284
def check_temps(self)
Call every 10sec at minimum.
Definition: cpu_monitor.py:684
def check_core_temps(sys_temp_strings)
Check CPU core temps.
Definition: cpu_monitor.py:187
def check_clock_speed(enforce_speed)
Checks clock speed from reading from CPU info.
Definition: cpu_monitor.py:227
def update_status_stale(stat, last_update_time)
Definition: cpu_monitor.py:495
def check_memory()
Uses &#39;free -m&#39; to check free memory.
Definition: cpu_monitor.py:327
def __init__(self, hostname, diag_hostname)
Definition: cpu_monitor.py:522
def _restart_temp_check(self)
Definition: cpu_monitor.py:586
def check_ipmi()
Definition: cpu_monitor.py:65
def check_mpstat(core_count=-1)
Definition: cpu_monitor.py:378
def cancel_timers(self)
Must have the lock to cancel everything.
Definition: cpu_monitor.py:599


pr2_computer_monitor
Author(s): Kevin Watts (watts@willowgarage.com)
autogenerated on Tue Jun 1 2021 02:50:51