00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 from __future__ import with_statement
00038 import roslib
00039 roslib.load_manifest('pr2_computer_monitor')
00040
00041 import rospy
00042
00043 import traceback
00044 import threading
00045 from threading import Timer
00046 import sys, os, time
00047 from time import sleep
00048 import subprocess
00049 import string
00050
00051 import socket
00052
00053 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00054
00055
00056
00057 if sys.version_info[:3] == (2, 7, 3):
00058 import threading
00059 threading._DummyThread._Thread__stop = lambda x: 42
00060
00061
00062 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00063
00064
00065 def check_ipmi():
00066 diag_vals = []
00067 diag_msgs = []
00068 diag_level = DiagnosticStatus.OK
00069
00070 try:
00071 p = subprocess.Popen('sudo ipmitool sdr',
00072 stdout = subprocess.PIPE,
00073 stderr = subprocess.PIPE, shell = True)
00074 stdout, stderr = p.communicate()
00075 retcode = p.returncode
00076
00077 if retcode != 0:
00078 diag_level = DiagnosticStatus.ERROR
00079 diag_msgs = [ 'ipmitool Error' ]
00080 diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
00081 return diag_vals, diag_msgs, diag_level
00082
00083 lines = stdout.split('\n')
00084 if len(lines) < 2:
00085 diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
00086
00087 diag_msgs = [ 'No ipmitool response' ]
00088 diag_level = DiagnosticStatus.ERROR
00089
00090 return diag_vals, diag_msgs, diag_level
00091
00092 for ln in lines:
00093 if len(ln) < 3:
00094 continue
00095
00096 words = ln.split('|')
00097 if len(words) < 3:
00098 continue
00099
00100 name = words[0].strip()
00101 ipmi_val = words[1].strip()
00102 stat_byte = words[2].strip()
00103
00104
00105 if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
00106 if words[1].strip().endswith('degrees C'):
00107 tmp = ipmi_val.rstrip(' degrees C').lstrip()
00108 if unicode(tmp).isnumeric():
00109 temperature = float(tmp)
00110 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00111
00112 cpu_name = name.split()[0]
00113 if temperature >= 80 and temperature < 89:
00114 diag_level = max(diag_level, DiagnosticStatus.WARN)
00115 if diag_msgs.count('CPU Hot') == 0:
00116 diag_msgs.append('CPU Warm')
00117
00118 if temperature >= 89:
00119 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00120 diag_msgs.append('CPU Hot')
00121
00122 if diag_msgs.count('CPU Warm') > 0:
00123 idx = diag_msgs.index('CPU Warm')
00124 diag_msgs.pop(idx)
00125 else:
00126 diag_vals.append(KeyValue(key = name, value = words[1]))
00127
00128
00129
00130 if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
00131 if ipmi_val.endswith('degrees C'):
00132 tmp = ipmi_val.rstrip(' degrees C').lstrip()
00133 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00134
00135 dev_name = name.split()[0]
00136 if unicode(tmp).isnumeric():
00137 temperature = float(tmp)
00138
00139 if temperature >= 60 and temperature < 75:
00140 diag_level = max(diag_level, DiagnosticStatus.WARN)
00141 diag_msgs.append('%s Warm' % dev_name)
00142
00143 if temperature >= 75:
00144 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00145 diag_msgs.append('%s Hot' % dev_name)
00146 else:
00147 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00148 diag_msgs.append('%s Error' % dev_name)
00149 else:
00150 diag_vals.append(KeyValue(key = name, value = ipmi_val))
00151
00152
00153 if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
00154 if ipmi_val.endswith('RPM'):
00155 rpm = ipmi_val.rstrip(' RPM').lstrip()
00156 if unicode(rpm).isnumeric():
00157 if int(rpm) == 0:
00158 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00159 diag_msgs.append('CPU Fan Off')
00160
00161 diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
00162 else:
00163 diag_vals.append(KeyValue(key = name, value = ipmi_val))
00164
00165
00166
00167 if name.startswith('CPU') and name.endswith('hot'):
00168 if ipmi_val == '0x01':
00169 diag_vals.append(KeyValue(key = name, value = 'OK'))
00170 else:
00171 diag_vals.append(KeyValue(key = name, value = 'Hot'))
00172 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00173 diag_msgs.append('CPU Hot Alarm')
00174
00175 except Exception, e:
00176 diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00177 diag_level = DiagnosticStatus.ERROR
00178 diag_msgs.append('Exception')
00179
00180 return diag_vals, diag_msgs, diag_level
00181
00182
00183
00184
00185
00186
00187 def check_core_temps(sys_temp_strings):
00188 diag_vals = []
00189 diag_level = 0
00190 diag_msgs = []
00191
00192 for index, temp_str in enumerate(sys_temp_strings):
00193 if len(temp_str) < 5:
00194 continue
00195
00196 cmd = 'cat %s' % temp_str
00197 p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
00198 stderr = subprocess.PIPE, shell = True)
00199 stdout, stderr = p.communicate()
00200 retcode = p.returncode
00201
00202 if retcode != 0:
00203 diag_level = DiagnosticStatus.ERROR
00204 diag_msg = [ 'Core Temp Error' ]
00205 diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
00206 KeyValue(key = 'Output', value = stdout) ]
00207 return diag_vals, diag_msgs, diag_level
00208
00209 tmp = stdout.strip()
00210 if unicode(tmp).isnumeric():
00211 temp = float(tmp) / 1000
00212 diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
00213
00214 if temp >= 85 and temp < 90:
00215 diag_level = max(diag_level, DiagnosticStatus.WARN)
00216 diag_msgs.append('Warm')
00217 if temp >= 90:
00218 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00219 diag_msgs.append('Hot')
00220 else:
00221 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00222 diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
00223
00224 return diag_vals, diag_msgs, diag_level
00225
00226
00227 def check_clock_speed(enforce_speed):
00228 vals = []
00229 msgs = []
00230 lvl = DiagnosticStatus.OK
00231
00232 try:
00233 p = subprocess.Popen('cat /proc/cpuinfo | grep MHz',
00234 stdout = subprocess.PIPE,
00235 stderr = subprocess.PIPE, shell = True)
00236 stdout, stderr = p.communicate()
00237 retcode = p.returncode
00238
00239 if retcode != 0:
00240 lvl = DiagnosticStatus.ERROR
00241 msgs = [ 'Clock speed error' ]
00242 vals = [ KeyValue(key = 'Clock speed error', value = stderr),
00243 KeyValue(key = 'Output', value = stdout) ]
00244
00245 return (vals, msgs, lvl)
00246
00247 for index, ln in enumerate(stdout.split('\n')):
00248 words = ln.split(':')
00249 if len(words) < 2:
00250 continue
00251
00252 speed = words[1].strip().split('.')[0]
00253 vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
00254 if unicode(speed).isnumeric():
00255 mhz = float(speed)
00256
00257 if mhz < 2240 and mhz > 2150:
00258 lvl = max(lvl, DiagnosticStatus.WARN)
00259 if mhz <= 2150:
00260 lvl = max(lvl, DiagnosticStatus.ERROR)
00261 else:
00262
00263 lvl = max(lvl, DiagnosticStatus.ERROR)
00264
00265 if not enforce_speed:
00266 lvl = DiagnosticStatus.OK
00267
00268 if lvl == DiagnosticStatus.WARN and enforce_speed:
00269 msgs = [ 'Core slowing' ]
00270 elif lvl == DiagnosticStatus.ERROR and enforce_speed:
00271 msgs = [ 'Core throttled' ]
00272
00273 except Exception, e:
00274 rospy.logerr(traceback.format_exc())
00275 lvl = DiagnosticStatus.ERROR
00276 msgs.append('Exception')
00277 vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00278
00279 return vals, msgs, lvl
00280
00281
00282
00283
00284 def check_uptime(load1_threshold, load5_threshold):
00285 level = DiagnosticStatus.OK
00286 vals = []
00287
00288 load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
00289
00290 try:
00291 p = subprocess.Popen('uptime', stdout = subprocess.PIPE,
00292 stderr = subprocess.PIPE, shell = True)
00293 stdout, stderr = p.communicate()
00294 retcode = p.returncode
00295
00296 if retcode != 0:
00297 vals.append(KeyValue(key = 'uptime Failed', value = stderr))
00298 return DiagnosticStatus.ERROR, vals
00299
00300 upvals = stdout.split()
00301 load1 = upvals[-3].rstrip(',')
00302 load5 = upvals[-2].rstrip(',')
00303 load15 = upvals[-1]
00304 num_users = upvals[-7]
00305
00306
00307 if float(load1) > load1_threshold or float(load5) > load5_threshold:
00308 level = DiagnosticStatus.WARN
00309
00310 vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
00311 vals.append(KeyValue(key = '1 min Load Average', value = load1))
00312 vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
00313 vals.append(KeyValue(key = '5 min Load Average', value = load5))
00314 vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
00315 vals.append(KeyValue(key = '15 min Load Average', value = load15))
00316 vals.append(KeyValue(key = 'Number of Users', value = num_users))
00317
00318 except Exception, e:
00319 rospy.logerr(traceback.format_exc())
00320 level = DiagnosticStatus.ERROR
00321 vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
00322
00323 return level, load_dict[level], vals
00324
00325
00326
00327 def check_memory():
00328 values = []
00329 level = DiagnosticStatus.OK
00330 msg = ''
00331
00332 mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
00333
00334 try:
00335 p = subprocess.Popen('free -m',
00336 stdout = subprocess.PIPE,
00337 stderr = subprocess.PIPE, shell = True)
00338 stdout, stderr = p.communicate()
00339 retcode = p.returncode
00340
00341 if retcode != 0:
00342 values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
00343 return DiagnosticStatus.ERROR, values
00344
00345 rows = stdout.split('\n')
00346 data = rows[1].split()
00347 total_mem = data[1]
00348 used_mem = data[2]
00349 free_mem = data[3]
00350
00351 level = DiagnosticStatus.OK
00352 if float(free_mem) < 25:
00353 level = DiagnosticStatus.WARN
00354 if float(free_mem) < 1:
00355 level = DiagnosticStatus.ERROR
00356
00357 values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
00358 values.append(KeyValue(key = 'Total Memory', value = total_mem))
00359 values.append(KeyValue(key = 'Used Memory', value = used_mem))
00360 values.append(KeyValue(key = 'Free Memory', value = free_mem))
00361
00362 msg = mem_dict[level]
00363 except Exception, e:
00364 rospy.logerr(traceback.format_exc())
00365 msg = 'Memory Usage Check Error'
00366 values.append(KeyValue(key = msg, value = str(e)))
00367 level = DiagnosticStatus.ERROR
00368
00369 return level, mem_dict[level], values
00370
00371
00372
00373
00374
00375 usage_old = 0
00376 has_warned_mpstat = False
00377 has_error_core_count = False
00378 def check_mpstat(core_count = -1):
00379 vals = []
00380 mp_level = DiagnosticStatus.OK
00381
00382 load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
00383
00384 try:
00385 p = subprocess.Popen('mpstat -P ALL 1 1',
00386 stdout = subprocess.PIPE,
00387 stderr = subprocess.PIPE, shell = True)
00388 stdout, stderr = p.communicate()
00389 retcode = p.returncode
00390
00391 if retcode != 0:
00392 global has_warned_mpstat
00393 if not has_warned_mpstat:
00394 rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
00395 has_warned_mpstat = True
00396
00397 mp_level = DiagnosticStatus.ERROR
00398 vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
00399 return mp_level, 'Unable to Check CPU Usage', vals
00400
00401
00402
00403 rows = stdout.split('\n')
00404 col_names = rows[2].split()
00405 idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
00406
00407 num_cores = 0
00408 cores_loaded = 0
00409 for index, row in enumerate(stdout.split('\n')):
00410 if index < 3:
00411 continue
00412
00413
00414 if row.find('all') > -1:
00415 continue
00416
00417 lst = row.split()
00418 if len(lst) < 8:
00419 continue
00420
00421
00422 if lst[0].startswith('Average'):
00423 continue
00424
00425 cpu_name = '%d' % (num_cores)
00426 idle = lst[idle_col]
00427 user = lst[3]
00428 nice = lst[4]
00429 system = lst[5]
00430
00431 core_level = 0
00432 usage = float(user) + float(nice)
00433 if usage > 1000:
00434 rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
00435 usage = usage_old
00436 usage_old = usage
00437
00438 if usage > 90.0:
00439 cores_loaded += 1
00440 core_level = DiagnosticStatus.WARN
00441 if usage > 110.0:
00442 core_level = DiagnosticStatus.ERROR
00443
00444 vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
00445 vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
00446 vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
00447 vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
00448 vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
00449
00450 num_cores += 1
00451
00452
00453 if num_cores - cores_loaded <= 2 and num_cores > 2:
00454 mp_level = DiagnosticStatus.WARN
00455
00456
00457 if core_count > 0 and core_count != num_cores:
00458 mp_level = DiagnosticStatus.ERROR
00459 global has_error_core_count
00460 if not has_error_core_count:
00461 rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
00462 core_count, num_cores)
00463 has_error_core_count = True
00464 return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
00465
00466 except Exception, e:
00467 mp_level = DiagnosticStatus.ERROR
00468 vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
00469
00470 return mp_level, load_dict[mp_level], vals
00471
00472
00473
00474 def get_core_temp_names():
00475 temp_vals = []
00476 try:
00477 p = subprocess.Popen('find /sys/devices -name temp1_input',
00478 stdout = subprocess.PIPE,
00479 stderr = subprocess.PIPE, shell = True)
00480 stdout, stderr = p.communicate()
00481 retcode = p.returncode
00482
00483 if retcode != 0:
00484 rospy.logerr('Error find core temp locations: %s' % stderr)
00485 return []
00486
00487 for ln in stdout.split('\n'):
00488 temp_vals.append(ln.strip())
00489
00490 return temp_vals
00491 except:
00492 rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
00493 return []
00494
00495 def update_status_stale(stat, last_update_time):
00496 time_since_update = rospy.get_time() - last_update_time
00497
00498 stale_status = 'OK'
00499 if time_since_update > 20 and time_since_update <= 35:
00500 stale_status = 'Lagging'
00501 if stat.level == DiagnosticStatus.OK:
00502 stat.message = stale_status
00503 elif stat.message.find(stale_status) < 0:
00504 stat.message = ', '.join([stat.message, stale_status])
00505 stat.level = max(stat.level, DiagnosticStatus.WARN)
00506 if time_since_update > 35:
00507 stale_status = 'Stale'
00508 if stat.level == DiagnosticStatus.OK:
00509 stat.message = stale_status
00510 elif stat.message.find(stale_status) < 0:
00511 stat.message = ', '.join([stat.message, stale_status])
00512 stat.level = max(stat.level, DiagnosticStatus.ERROR)
00513
00514
00515 stat.values.pop(0)
00516 stat.values.pop(0)
00517 stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00518 stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00519
00520
00521 class CPUMonitor():
00522 def __init__(self, hostname, diag_hostname):
00523 self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00524
00525 self._mutex = threading.Lock()
00526
00527 self._check_ipmi = rospy.get_param('~check_ipmi_tool', True)
00528 self._enforce_speed = rospy.get_param('~enforce_clock_speed', True)
00529
00530 self._check_core_temps = rospy.get_param('~check_core_temps', False)
00531 if self._check_core_temps:
00532 rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
00533 self._check_nfs = rospy.get_param('~check_nfs', False)
00534 if self._check_nfs:
00535 rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
00536
00537 self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
00538 self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
00539
00540 self._num_cores = rospy.get_param('~num_cores', 8.0)
00541
00542 self._temps_timer = None
00543 self._usage_timer = None
00544 self._nfs_timer = None
00545
00546
00547 self._temp_vals = get_core_temp_names()
00548
00549
00550 self._temp_stat = DiagnosticStatus()
00551 self._temp_stat.name = '%s CPU Temperature' % diag_hostname
00552 self._temp_stat.level = 1
00553 self._temp_stat.hardware_id = hostname
00554 self._temp_stat.message = 'No Data'
00555 self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00556 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00557
00558 self._usage_stat = DiagnosticStatus()
00559 self._usage_stat.name = '%s CPU Usage' % diag_hostname
00560 self._usage_stat.level = 1
00561 self._usage_stat.hardware_id = hostname
00562 self._usage_stat.message = 'No Data'
00563 self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00564 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00565
00566 self._nfs_stat = DiagnosticStatus()
00567 self._nfs_stat.name = '%s NFS IO' % diag_hostname
00568 self._nfs_stat.level = 1
00569 self._nfs_stat.hardware_id = hostname
00570 self._nfs_stat.message = 'No Data'
00571 self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00572 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00573
00574 self._last_temp_time = 0
00575 self._last_usage_time = 0
00576 self._last_nfs_time = 0
00577 self._last_publish_time = 0
00578
00579
00580 self.check_temps()
00581 if self._check_nfs:
00582 self.check_nfs_stat()
00583 self.check_usage()
00584
00585
00586 def _restart_temp_check(self):
00587 rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen')
00588 try:
00589 with self._mutex:
00590 if self._temps_timer:
00591 self._temps_timer.cancel()
00592
00593 self.check_temps()
00594 except Exception, e:
00595 rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc())
00596
00597
00598
00599 def cancel_timers(self):
00600 if self._temps_timer:
00601 self._temps_timer.cancel()
00602
00603 if self._nfs_timer:
00604 self._nfs_timer.cancel()
00605
00606 if self._usage_timer:
00607 self._usage_timer.cancel()
00608
00609 def check_nfs_stat(self):
00610 if rospy.is_shutdown():
00611 with self._mutex:
00612 self.cancel_timers()
00613 return
00614
00615 nfs_level = 0
00616 msg = 'OK'
00617 vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00618 KeyValue(key = 'Time Since Last Update', value = str(0) )]
00619
00620 try:
00621 p = subprocess.Popen('iostat -n',
00622 stdout = subprocess.PIPE,
00623 stderr = subprocess.PIPE, shell = True)
00624 stdout, stderr = p.communicate()
00625 retcode = p.returncode
00626
00627 if retcode != 0:
00628 nfs_level = DiagnosticStatus.ERROR
00629 msg = 'iostat Error'
00630 vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e)))
00631 stdout = ''
00632
00633
00634 for index, row in enumerate(stdout.split('\n')):
00635 if index < 3:
00636 continue
00637
00638 lst = row.split()
00639 if len(lst) < 7:
00640 continue
00641
00642 file_sys = lst[0]
00643 read_blk = lst[1]
00644 write_blk = lst[2]
00645 read_blk_dir = lst[3]
00646 write_blk_dir = lst[4]
00647 r_blk_srv = lst[5]
00648 w_blk_srv = lst[6]
00649
00650 vals.append(KeyValue(
00651 key = '%s Read Blks/s' % file_sys, value=read_blk))
00652 vals.append(KeyValue(
00653 key = '%s Write Blks/s' % file_sys, value=write_blk))
00654 vals.append(KeyValue(
00655 key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir))
00656 vals.append(KeyValue(
00657 key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir))
00658 vals.append(KeyValue(
00659 key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv))
00660 vals.append(KeyValue(
00661 key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv))
00662
00663 except Exception, e:
00664 rospy.logerr(traceback.format_exc())
00665 nfs_level = DiagnosticStatus.ERROR
00666 msg = 'Exception'
00667 vals.append(KeyValue(key = 'Exception', value = str(e)))
00668
00669 with self._mutex:
00670 self._nfs_stat.level = nfs_level
00671 self._nfs_stat.message = msg
00672 self._nfs_stat.values = vals
00673
00674 self._last_nfs_time = rospy.get_time()
00675
00676 if not rospy.is_shutdown():
00677 self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat)
00678 self._nfs_timer.start()
00679 else:
00680 self.cancel_timers()
00681
00682
00683
00684 def check_temps(self):
00685 if rospy.is_shutdown():
00686 with self._mutex:
00687 self.cancel_timers()
00688 return
00689
00690 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00691 KeyValue(key = 'Time Since Last Update', value = str(0) ) ]
00692 diag_msgs = []
00693 diag_level = 0
00694
00695 if self._check_ipmi:
00696 ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi()
00697 diag_vals.extend(ipmi_vals)
00698 diag_msgs.extend(ipmi_msgs)
00699 diag_level = max(diag_level, ipmi_level)
00700
00701 if self._check_core_temps:
00702 core_vals, core_msgs, core_level = check_core_temps(self._temp_vals)
00703 diag_vals.extend(core_vals)
00704 diag_msgs.extend(core_msgs)
00705 diag_level = max(diag_level, core_level)
00706
00707 clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed)
00708 diag_vals.extend(clock_vals)
00709 diag_msgs.extend(clock_msgs)
00710 diag_level = max(diag_level, clock_level)
00711
00712 diag_log = set(diag_msgs)
00713 if len(diag_log) > 0:
00714 message = ', '.join(diag_log)
00715 else:
00716 message = stat_dict[diag_level]
00717
00718 with self._mutex:
00719 self._last_temp_time = rospy.get_time()
00720
00721 self._temp_stat.level = diag_level
00722 self._temp_stat.message = message
00723 self._temp_stat.values = diag_vals
00724
00725 if not rospy.is_shutdown():
00726 self._temps_timer = threading.Timer(5.0, self.check_temps)
00727 self._temps_timer.start()
00728 else:
00729 self.cancel_timers()
00730
00731 def check_usage(self):
00732 if rospy.is_shutdown():
00733 with self._mutex:
00734 self.cancel_timers()
00735 return
00736
00737 diag_level = 0
00738 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00739 KeyValue(key = 'Time Since Last Update', value = 0 )]
00740 diag_msgs = []
00741
00742
00743 mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores)
00744 diag_vals.extend(mp_vals)
00745 if mp_level > 0:
00746 diag_msgs.append(mp_msg)
00747 diag_level = max(diag_level, mp_level)
00748
00749
00750 uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold)
00751 diag_vals.extend(up_vals)
00752 if uptime_level > 0:
00753 diag_msgs.append(up_msg)
00754 diag_level = max(diag_level, uptime_level)
00755
00756
00757 mem_level, mem_msg, mem_vals = check_memory()
00758 diag_vals.extend(mem_vals)
00759 if mem_level > 0:
00760 diag_msgs.append(mem_msg)
00761 diag_level = max(diag_level, mem_level)
00762
00763 if diag_msgs and diag_level > 0:
00764 usage_msg = ', '.join(set(diag_msgs))
00765 else:
00766 usage_msg = stat_dict[diag_level]
00767
00768
00769 with self._mutex:
00770 self._last_usage_time = rospy.get_time()
00771 self._usage_stat.level = diag_level
00772 self._usage_stat.values = diag_vals
00773
00774 self._usage_stat.message = usage_msg
00775
00776 if not rospy.is_shutdown():
00777 self._usage_timer = threading.Timer(5.0, self.check_usage)
00778 self._usage_timer.start()
00779 else:
00780 self.cancel_timers()
00781
00782 def publish_stats(self):
00783 with self._mutex:
00784
00785 update_status_stale(self._temp_stat, self._last_temp_time)
00786 update_status_stale(self._usage_stat, self._last_usage_time)
00787 if self._check_nfs:
00788 update_status_stale(self._nfs_stat, self._last_nfs_time)
00789
00790 msg = DiagnosticArray()
00791 msg.header.stamp = rospy.get_rostime()
00792 msg.status.append(self._temp_stat)
00793 msg.status.append(self._usage_stat)
00794 if self._check_nfs:
00795 msg.status.append(self._nfs_stat)
00796
00797 if rospy.get_time() - self._last_publish_time > 0.5:
00798 self._diag_pub.publish(msg)
00799 self._last_publish_time = rospy.get_time()
00800
00801
00802
00803
00804 if rospy.get_time() - self._last_temp_time > 90:
00805 self._restart_temp_check()
00806
00807
00808 if __name__ == '__main__':
00809 hostname = socket.gethostname()
00810
00811 import optparse
00812 parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
00813 parser.add_option("--diag-hostname", dest="diag_hostname",
00814 help="Computer name in diagnostics output (ex: 'c1')",
00815 metavar="DIAG_HOSTNAME",
00816 action="store", default = hostname)
00817 options, args = parser.parse_args(rospy.myargv())
00818
00819 try:
00820 rospy.init_node('cpu_monitor_%s' % hostname)
00821 except rospy.exceptions.ROSInitException:
00822 print >> sys.stderr, 'CPU monitor is unable to initialize node. Master may not be running.'
00823 sys.exit(0)
00824
00825 cpu_node = CPUMonitor(hostname, options.diag_hostname)
00826
00827 rate = rospy.Rate(1.0)
00828 try:
00829 while not rospy.is_shutdown():
00830 rate.sleep()
00831 cpu_node.publish_stats()
00832 except KeyboardInterrupt:
00833 pass
00834 except Exception, e:
00835 traceback.print_exc()
00836 rospy.logerr(traceback.format_exc())
00837
00838 cpu_node.cancel_timers()
00839 sys.exit(0)
00840
00841
00842
00843
00844
00845
00846