00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 from __future__ import with_statement
00038 import roslib
00039 roslib.load_manifest('pr2_computer_monitor')
00040
00041 import rospy
00042
00043 import traceback
00044 import threading
00045 from threading import Timer
00046 import sys, os, time
00047 from time import sleep
00048 import subprocess
00049 import string
00050 import multiprocessing
00051 import socket
00052
00053 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00054
00055
00056
00057 if sys.version_info[:3] == (2, 7, 3):
00058 import threading
00059 threading._DummyThread._Thread__stop = lambda x: 42
00060
00061
00062 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00063
00064
00065 def check_ipmi():
00066 diag_vals = []
00067 diag_msgs = []
00068 diag_level = DiagnosticStatus.OK
00069
00070 try:
00071 p = subprocess.Popen('sudo ipmitool sdr',
00072 stdout = subprocess.PIPE,
00073 stderr = subprocess.PIPE, shell = True)
00074 stdout, stderr = p.communicate()
00075 retcode = p.returncode
00076
00077 if retcode != 0:
00078 diag_level = DiagnosticStatus.ERROR
00079 diag_msgs = [ 'ipmitool Error' ]
00080 diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
00081 return diag_vals, diag_msgs, diag_level
00082
00083 lines = stdout.split('\n')
00084 if len(lines) < 2:
00085 diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
00086
00087 diag_msgs = [ 'No ipmitool response' ]
00088 diag_level = DiagnosticStatus.ERROR
00089
00090 return diag_vals, diag_msgs, diag_level
00091
00092 for ln in lines:
00093 if len(ln) < 3:
00094 continue
00095
00096 words = ln.split('|')
00097 if len(words) < 3:
00098 continue
00099
00100 name = words[0].strip()
00101 ipmi_val = words[1].strip()
00102 stat_byte = words[2].strip()
00103
00104
00105 if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
00106 if words[1].strip().endswith('degrees C'):
00107 tmp = ipmi_val.rstrip(' degrees C').lstrip()
00108 if unicode(tmp).isnumeric():
00109 temperature = float(tmp)
00110 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00111
00112 cpu_name = name.split()[0]
00113 if temperature >= 80 and temperature < 89:
00114 diag_level = max(diag_level, DiagnosticStatus.WARN)
00115 if diag_msgs.count('CPU Hot') == 0:
00116 diag_msgs.append('CPU Warm')
00117
00118 if temperature >= 89:
00119 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00120 diag_msgs.append('CPU Hot')
00121
00122 if diag_msgs.count('CPU Warm') > 0:
00123 idx = diag_msgs.index('CPU Warm')
00124 diag_msgs.pop(idx)
00125 else:
00126 diag_vals.append(KeyValue(key = name, value = words[1]))
00127
00128
00129
00130 if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
00131 if ipmi_val.endswith('degrees C'):
00132 tmp = ipmi_val.rstrip(' degrees C').lstrip()
00133 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00134
00135 dev_name = name.split()[0]
00136 if unicode(tmp).isnumeric():
00137 temperature = float(tmp)
00138
00139 if temperature >= 60 and temperature < 75:
00140 diag_level = max(diag_level, DiagnosticStatus.WARN)
00141 diag_msgs.append('%s Warm' % dev_name)
00142
00143 if temperature >= 75:
00144 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00145 diag_msgs.append('%s Hot' % dev_name)
00146 else:
00147 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00148 diag_msgs.append('%s Error' % dev_name)
00149 else:
00150 diag_vals.append(KeyValue(key = name, value = ipmi_val))
00151
00152
00153 if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
00154 if ipmi_val.endswith('RPM'):
00155 rpm = ipmi_val.rstrip(' RPM').lstrip()
00156 if unicode(rpm).isnumeric():
00157 if int(rpm) == 0:
00158 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00159 diag_msgs.append('CPU Fan Off')
00160
00161 diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
00162 else:
00163 diag_vals.append(KeyValue(key = name, value = ipmi_val))
00164
00165
00166
00167 if name.startswith('CPU') and name.endswith('hot'):
00168 if ipmi_val == '0x01':
00169 diag_vals.append(KeyValue(key = name, value = 'OK'))
00170 else:
00171 diag_vals.append(KeyValue(key = name, value = 'Hot'))
00172 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00173 diag_msgs.append('CPU Hot Alarm')
00174
00175 except Exception, e:
00176 diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00177 diag_level = DiagnosticStatus.ERROR
00178 diag_msgs.append('Exception')
00179
00180 return diag_vals, diag_msgs, diag_level
00181
00182
00183
00184
00185
00186
00187 def check_core_temps(sys_temp_strings):
00188 diag_vals = []
00189 diag_level = 0
00190 diag_msgs = []
00191
00192 for index, temp_str in enumerate(sys_temp_strings):
00193 if len(temp_str) < 5:
00194 continue
00195
00196 cmd = 'cat %s' % temp_str
00197 p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
00198 stderr = subprocess.PIPE, shell = True)
00199 stdout, stderr = p.communicate()
00200 retcode = p.returncode
00201
00202 if retcode != 0:
00203 diag_level = DiagnosticStatus.ERROR
00204 diag_msg = [ 'Core Temp Error' ]
00205 diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
00206 KeyValue(key = 'Output', value = stdout) ]
00207 return diag_vals, diag_msgs, diag_level
00208
00209 tmp = stdout.strip()
00210 if unicode(tmp).isnumeric():
00211 temp = float(tmp) / 1000
00212 diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
00213
00214 if temp >= 85 and temp < 90:
00215 diag_level = max(diag_level, DiagnosticStatus.WARN)
00216 diag_msgs.append('Warm')
00217 if temp >= 90:
00218 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00219 diag_msgs.append('Hot')
00220 else:
00221 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00222 diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
00223
00224 return diag_vals, diag_msgs, diag_level
00225
00226
00227 def check_clock_speed(enforce_speed):
00228 vals = []
00229 msgs = []
00230 lvl = DiagnosticStatus.OK
00231
00232 try:
00233 p = subprocess.Popen('cat /proc/cpuinfo | grep MHz',
00234 stdout = subprocess.PIPE,
00235 stderr = subprocess.PIPE, shell = True)
00236 stdout, stderr = p.communicate()
00237 retcode = p.returncode
00238
00239 if retcode != 0:
00240 lvl = DiagnosticStatus.ERROR
00241 msgs = [ 'Clock speed error' ]
00242 vals = [ KeyValue(key = 'Clock speed error', value = stderr),
00243 KeyValue(key = 'Output', value = stdout) ]
00244
00245 return (vals, msgs, lvl)
00246
00247 for index, ln in enumerate(stdout.split('\n')):
00248 words = ln.split(':')
00249 if len(words) < 2:
00250 continue
00251
00252 speed = words[1].strip().split('.')[0]
00253 vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
00254 if unicode(speed).isnumeric():
00255 mhz = float(speed)
00256
00257 if mhz < 2240 and mhz > 2150:
00258 lvl = max(lvl, DiagnosticStatus.WARN)
00259 if mhz <= 2150:
00260 lvl = max(lvl, DiagnosticStatus.ERROR)
00261 else:
00262
00263 lvl = max(lvl, DiagnosticStatus.ERROR)
00264
00265 if not enforce_speed:
00266 lvl = DiagnosticStatus.OK
00267
00268 if lvl == DiagnosticStatus.WARN and enforce_speed:
00269 msgs = [ 'Core slowing' ]
00270 elif lvl == DiagnosticStatus.ERROR and enforce_speed:
00271 msgs = [ 'Core throttled' ]
00272
00273 except Exception, e:
00274 rospy.logerr(traceback.format_exc())
00275 lvl = DiagnosticStatus.ERROR
00276 msgs.append('Exception')
00277 vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00278
00279 return vals, msgs, lvl
00280
00281
00282
00283
00284 def check_uptime(load1_threshold, load5_threshold):
00285 level = DiagnosticStatus.OK
00286 vals = []
00287
00288 load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
00289
00290 try:
00291 p = subprocess.Popen('uptime', stdout = subprocess.PIPE,
00292 stderr = subprocess.PIPE, shell = True)
00293 stdout, stderr = p.communicate()
00294 retcode = p.returncode
00295
00296 if retcode != 0:
00297 vals.append(KeyValue(key = 'uptime Failed', value = stderr))
00298 return DiagnosticStatus.ERROR, vals
00299
00300 upvals = stdout.split()
00301 load1 = upvals[-3].rstrip(',')
00302 load5 = upvals[-2].rstrip(',')
00303 load15 = upvals[-1]
00304 num_users = upvals[-7]
00305
00306
00307 if float(load1) > load1_threshold or float(load5) > load5_threshold:
00308 level = DiagnosticStatus.WARN
00309
00310 vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
00311 vals.append(KeyValue(key = '1 min Load Average', value = load1))
00312 vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
00313 vals.append(KeyValue(key = '5 min Load Average', value = load5))
00314 vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
00315 vals.append(KeyValue(key = '15 min Load Average', value = load15))
00316 vals.append(KeyValue(key = 'Number of Users', value = num_users))
00317
00318 except Exception, e:
00319 rospy.logerr(traceback.format_exc())
00320 level = DiagnosticStatus.ERROR
00321 vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
00322
00323 return level, load_dict[level], vals
00324
00325
00326
00327 def check_memory():
00328 values = []
00329 level = DiagnosticStatus.OK
00330 msg = ''
00331
00332 mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
00333
00334 try:
00335 p = subprocess.Popen('free -m',
00336 stdout = subprocess.PIPE,
00337 stderr = subprocess.PIPE, shell = True)
00338 stdout, stderr = p.communicate()
00339 retcode = p.returncode
00340
00341 if retcode != 0:
00342 values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
00343 return DiagnosticStatus.ERROR, values
00344
00345 rows = stdout.split('\n')
00346 data = rows[1].split()
00347 total_mem = data[1]
00348 used_mem = data[2]
00349 free_mem = data[3]
00350
00351 level = DiagnosticStatus.OK
00352 if float(free_mem) < 25:
00353 level = DiagnosticStatus.WARN
00354 if float(free_mem) < 1:
00355 level = DiagnosticStatus.ERROR
00356
00357 values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
00358 values.append(KeyValue(key = 'Total Memory', value = total_mem))
00359 values.append(KeyValue(key = 'Used Memory', value = used_mem))
00360 values.append(KeyValue(key = 'Free Memory', value = free_mem))
00361
00362 msg = mem_dict[level]
00363 except Exception, e:
00364 rospy.logerr(traceback.format_exc())
00365 msg = 'Memory Usage Check Error'
00366 values.append(KeyValue(key = msg, value = str(e)))
00367 level = DiagnosticStatus.ERROR
00368
00369 return level, mem_dict[level], values
00370
00371
00372
00373
00374
00375 usage_old = 0
00376 has_warned_mpstat = False
00377 has_error_core_count = False
00378 def check_mpstat(core_count = -1):
00379 vals = []
00380 mp_level = DiagnosticStatus.OK
00381
00382 load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
00383
00384 try:
00385 p = subprocess.Popen('mpstat -P ALL 1 1',
00386 stdout = subprocess.PIPE,
00387 stderr = subprocess.PIPE, shell = True)
00388 stdout, stderr = p.communicate()
00389 retcode = p.returncode
00390
00391 if retcode != 0:
00392 global has_warned_mpstat
00393 if not has_warned_mpstat:
00394 rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
00395 has_warned_mpstat = True
00396
00397 mp_level = DiagnosticStatus.ERROR
00398 vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
00399 return mp_level, 'Unable to Check CPU Usage', vals
00400
00401
00402
00403 rows = stdout.split('\n')
00404 col_names = rows[2].split()
00405 idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
00406
00407 num_cores = 0
00408 cores_loaded = 0
00409 for index, row in enumerate(stdout.split('\n')):
00410 if index < 3:
00411 continue
00412
00413
00414 if row.find('all') > -1:
00415 continue
00416
00417 lst = row.split()
00418 if len(lst) < 8:
00419 continue
00420
00421
00422 if lst[0].startswith('Average'):
00423 continue
00424
00425 cpu_name = '%d' % (num_cores)
00426 idle = lst[idle_col].replace(',', '.')
00427 user = lst[3].replace(',', '.')
00428 nice = lst[4].replace(',', '.')
00429 system = lst[5].replace(',', '.')
00430
00431 core_level = 0
00432 usage = float(user) + float(nice)
00433 if usage > 1000:
00434 rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
00435 usage = usage_old
00436 usage_old = usage
00437
00438 if usage > 90.0:
00439 cores_loaded += 1
00440 core_level = DiagnosticStatus.WARN
00441 if usage > 110.0:
00442 core_level = DiagnosticStatus.ERROR
00443
00444 vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
00445 vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
00446 vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
00447 vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
00448 vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
00449
00450 num_cores += 1
00451
00452
00453 if num_cores - cores_loaded <= 2 and num_cores > 2:
00454 mp_level = DiagnosticStatus.WARN
00455
00456
00457 if core_count > 0 and core_count != num_cores:
00458 mp_level = DiagnosticStatus.ERROR
00459 global has_error_core_count
00460 if not has_error_core_count:
00461 rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
00462 core_count, num_cores)
00463 has_error_core_count = True
00464 return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
00465
00466 except Exception, e:
00467 mp_level = DiagnosticStatus.ERROR
00468 vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
00469
00470 return mp_level, load_dict[mp_level], vals
00471
00472
00473
00474 def get_core_temp_names():
00475 temp_vals = []
00476 try:
00477 p = subprocess.Popen('find /sys/devices -name temp1_input',
00478 stdout = subprocess.PIPE,
00479 stderr = subprocess.PIPE, shell = True)
00480 stdout, stderr = p.communicate()
00481 retcode = p.returncode
00482
00483 if retcode != 0:
00484 rospy.logerr('Error find core temp locations: %s' % stderr)
00485 return []
00486
00487 for ln in stdout.split('\n'):
00488 temp_vals.append(ln.strip())
00489
00490 return temp_vals
00491 except:
00492 rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
00493 return []
00494
00495 def update_status_stale(stat, last_update_time):
00496 time_since_update = rospy.get_time() - last_update_time
00497
00498 stale_status = 'OK'
00499 if time_since_update > 20 and time_since_update <= 35:
00500 stale_status = 'Lagging'
00501 if stat.level == DiagnosticStatus.OK:
00502 stat.message = stale_status
00503 elif stat.message.find(stale_status) < 0:
00504 stat.message = ', '.join([stat.message, stale_status])
00505 stat.level = max(stat.level, DiagnosticStatus.WARN)
00506 if time_since_update > 35:
00507 stale_status = 'Stale'
00508 if stat.level == DiagnosticStatus.OK:
00509 stat.message = stale_status
00510 elif stat.message.find(stale_status) < 0:
00511 stat.message = ', '.join([stat.message, stale_status])
00512 stat.level = max(stat.level, DiagnosticStatus.ERROR)
00513
00514
00515 stat.values.pop(0)
00516 stat.values.pop(0)
00517 stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00518 stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00519
00520
00521 class CPUMonitor():
00522 def __init__(self, hostname, diag_hostname):
00523 self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00524
00525 self._mutex = threading.Lock()
00526
00527 self._check_ipmi = rospy.get_param('~check_ipmi_tool', True)
00528 self._enforce_speed = rospy.get_param('~enforce_clock_speed', True)
00529
00530 self._check_core_temps = rospy.get_param('~check_core_temps', False)
00531 if self._check_core_temps:
00532 rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
00533 self._check_nfs = rospy.get_param('~check_nfs', False)
00534 if self._check_nfs:
00535 rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
00536
00537 self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
00538 self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
00539
00540 self._num_cores = rospy.get_param('~num_cores', 8.0)
00541 if self._num_cores == -1:
00542 self._num_cores = multiprocessing.cpu_count()
00543 self._temps_timer = None
00544 self._usage_timer = None
00545 self._nfs_timer = None
00546
00547
00548 self._temp_vals = get_core_temp_names()
00549
00550
00551 self._temp_stat = DiagnosticStatus()
00552 self._temp_stat.name = '%s CPU Temperature' % diag_hostname
00553 self._temp_stat.level = 1
00554 self._temp_stat.hardware_id = hostname
00555 self._temp_stat.message = 'No Data'
00556 self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00557 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00558
00559 self._usage_stat = DiagnosticStatus()
00560 self._usage_stat.name = '%s CPU Usage' % diag_hostname
00561 self._usage_stat.level = 1
00562 self._usage_stat.hardware_id = hostname
00563 self._usage_stat.message = 'No Data'
00564 self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00565 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00566
00567 self._nfs_stat = DiagnosticStatus()
00568 self._nfs_stat.name = '%s NFS IO' % diag_hostname
00569 self._nfs_stat.level = 1
00570 self._nfs_stat.hardware_id = hostname
00571 self._nfs_stat.message = 'No Data'
00572 self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00573 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00574
00575 self._last_temp_time = 0
00576 self._last_usage_time = 0
00577 self._last_nfs_time = 0
00578 self._last_publish_time = 0
00579
00580
00581 self.check_temps()
00582 if self._check_nfs:
00583 self.check_nfs_stat()
00584 self.check_usage()
00585
00586
00587 def _restart_temp_check(self):
00588 rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen')
00589 try:
00590 with self._mutex:
00591 if self._temps_timer:
00592 self._temps_timer.cancel()
00593
00594 self.check_temps()
00595 except Exception, e:
00596 rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc())
00597
00598
00599
00600 def cancel_timers(self):
00601 if self._temps_timer:
00602 self._temps_timer.cancel()
00603
00604 if self._nfs_timer:
00605 self._nfs_timer.cancel()
00606
00607 if self._usage_timer:
00608 self._usage_timer.cancel()
00609
00610 def check_nfs_stat(self):
00611 if rospy.is_shutdown():
00612 with self._mutex:
00613 self.cancel_timers()
00614 return
00615
00616 nfs_level = 0
00617 msg = 'OK'
00618 vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00619 KeyValue(key = 'Time Since Last Update', value = str(0) )]
00620
00621 try:
00622 p = subprocess.Popen('iostat -n',
00623 stdout = subprocess.PIPE,
00624 stderr = subprocess.PIPE, shell = True)
00625 stdout, stderr = p.communicate()
00626 retcode = p.returncode
00627
00628 if retcode != 0:
00629 nfs_level = DiagnosticStatus.ERROR
00630 msg = 'iostat Error'
00631 vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e)))
00632 stdout = ''
00633
00634
00635 for index, row in enumerate(stdout.split('\n')):
00636 if index < 3:
00637 continue
00638
00639 lst = row.split()
00640 if len(lst) < 7:
00641 continue
00642
00643 file_sys = lst[0]
00644 read_blk = lst[1]
00645 write_blk = lst[2]
00646 read_blk_dir = lst[3]
00647 write_blk_dir = lst[4]
00648 r_blk_srv = lst[5]
00649 w_blk_srv = lst[6]
00650
00651 vals.append(KeyValue(
00652 key = '%s Read Blks/s' % file_sys, value=read_blk))
00653 vals.append(KeyValue(
00654 key = '%s Write Blks/s' % file_sys, value=write_blk))
00655 vals.append(KeyValue(
00656 key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir))
00657 vals.append(KeyValue(
00658 key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir))
00659 vals.append(KeyValue(
00660 key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv))
00661 vals.append(KeyValue(
00662 key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv))
00663
00664 except Exception, e:
00665 rospy.logerr(traceback.format_exc())
00666 nfs_level = DiagnosticStatus.ERROR
00667 msg = 'Exception'
00668 vals.append(KeyValue(key = 'Exception', value = str(e)))
00669
00670 with self._mutex:
00671 self._nfs_stat.level = nfs_level
00672 self._nfs_stat.message = msg
00673 self._nfs_stat.values = vals
00674
00675 self._last_nfs_time = rospy.get_time()
00676
00677 if not rospy.is_shutdown():
00678 self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat)
00679 self._nfs_timer.start()
00680 else:
00681 self.cancel_timers()
00682
00683
00684
00685 def check_temps(self):
00686 if rospy.is_shutdown():
00687 with self._mutex:
00688 self.cancel_timers()
00689 return
00690
00691 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00692 KeyValue(key = 'Time Since Last Update', value = str(0) ) ]
00693 diag_msgs = []
00694 diag_level = 0
00695
00696 if self._check_ipmi:
00697 ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi()
00698 diag_vals.extend(ipmi_vals)
00699 diag_msgs.extend(ipmi_msgs)
00700 diag_level = max(diag_level, ipmi_level)
00701
00702 if self._check_core_temps:
00703 core_vals, core_msgs, core_level = check_core_temps(self._temp_vals)
00704 diag_vals.extend(core_vals)
00705 diag_msgs.extend(core_msgs)
00706 diag_level = max(diag_level, core_level)
00707
00708 clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed)
00709 diag_vals.extend(clock_vals)
00710 diag_msgs.extend(clock_msgs)
00711 diag_level = max(diag_level, clock_level)
00712
00713 diag_log = set(diag_msgs)
00714 if len(diag_log) > 0:
00715 message = ', '.join(diag_log)
00716 else:
00717 message = stat_dict[diag_level]
00718
00719 with self._mutex:
00720 self._last_temp_time = rospy.get_time()
00721
00722 self._temp_stat.level = diag_level
00723 self._temp_stat.message = message
00724 self._temp_stat.values = diag_vals
00725
00726 if not rospy.is_shutdown():
00727 self._temps_timer = threading.Timer(5.0, self.check_temps)
00728 self._temps_timer.start()
00729 else:
00730 self.cancel_timers()
00731
00732 def check_usage(self):
00733 if rospy.is_shutdown():
00734 with self._mutex:
00735 self.cancel_timers()
00736 return
00737
00738 diag_level = 0
00739 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00740 KeyValue(key = 'Time Since Last Update', value = 0 )]
00741 diag_msgs = []
00742
00743
00744 mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores)
00745 diag_vals.extend(mp_vals)
00746 if mp_level > 0:
00747 diag_msgs.append(mp_msg)
00748 diag_level = max(diag_level, mp_level)
00749
00750
00751 uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold)
00752 diag_vals.extend(up_vals)
00753 if uptime_level > 0:
00754 diag_msgs.append(up_msg)
00755 diag_level = max(diag_level, uptime_level)
00756
00757
00758 mem_level, mem_msg, mem_vals = check_memory()
00759 diag_vals.extend(mem_vals)
00760 if mem_level > 0:
00761 diag_msgs.append(mem_msg)
00762 diag_level = max(diag_level, mem_level)
00763
00764 if diag_msgs and diag_level > 0:
00765 usage_msg = ', '.join(set(diag_msgs))
00766 else:
00767 usage_msg = stat_dict[diag_level]
00768
00769
00770 with self._mutex:
00771 self._last_usage_time = rospy.get_time()
00772 self._usage_stat.level = diag_level
00773 self._usage_stat.values = diag_vals
00774
00775 self._usage_stat.message = usage_msg
00776
00777 if not rospy.is_shutdown():
00778 self._usage_timer = threading.Timer(5.0, self.check_usage)
00779 self._usage_timer.start()
00780 else:
00781 self.cancel_timers()
00782
00783 def publish_stats(self):
00784 with self._mutex:
00785
00786 update_status_stale(self._temp_stat, self._last_temp_time)
00787 update_status_stale(self._usage_stat, self._last_usage_time)
00788 if self._check_nfs:
00789 update_status_stale(self._nfs_stat, self._last_nfs_time)
00790
00791 msg = DiagnosticArray()
00792 msg.header.stamp = rospy.get_rostime()
00793 msg.status.append(self._temp_stat)
00794 msg.status.append(self._usage_stat)
00795 if self._check_nfs:
00796 msg.status.append(self._nfs_stat)
00797
00798 if rospy.get_time() - self._last_publish_time > 0.5:
00799 self._diag_pub.publish(msg)
00800 self._last_publish_time = rospy.get_time()
00801
00802
00803
00804
00805 if rospy.get_time() - self._last_temp_time > 90:
00806 self._restart_temp_check()
00807
00808
00809 if __name__ == '__main__':
00810 hostname = socket.gethostname()
00811
00812 import optparse
00813 parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
00814 parser.add_option("--diag-hostname", dest="diag_hostname",
00815 help="Computer name in diagnostics output (ex: 'c1')",
00816 metavar="DIAG_HOSTNAME",
00817 action="store", default = hostname)
00818 options, args = parser.parse_args(rospy.myargv())
00819
00820 try:
00821 rospy.init_node('cpu_monitor_%s' % hostname)
00822 except rospy.exceptions.ROSInitException:
00823 print >> sys.stderr, 'CPU monitor is unable to initialize node. Master may not be running.'
00824 sys.exit(0)
00825
00826 cpu_node = CPUMonitor(hostname, options.diag_hostname)
00827
00828 rate = rospy.Rate(1.0)
00829 try:
00830 while not rospy.is_shutdown():
00831 rate.sleep()
00832 cpu_node.publish_stats()
00833 except KeyboardInterrupt:
00834 pass
00835 except Exception, e:
00836 traceback.print_exc()
00837 rospy.logerr(traceback.format_exc())
00838
00839 cpu_node.cancel_timers()
00840 sys.exit(0)
00841
00842
00843
00844
00845
00846
00847