00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 from __future__ import with_statement
00038 import roslib
00039 roslib.load_manifest('pr2_computer_monitor')
00040
00041 import rospy
00042
00043 import traceback
00044 import threading
00045 from threading import Timer
00046 import sys, os, time
00047 from time import sleep
00048 import subprocess
00049 import string
00050
00051 import socket
00052
00053 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00054
00055 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00056
00057
00058 def check_ipmi():
00059 diag_vals = []
00060 diag_msgs = []
00061 diag_level = DiagnosticStatus.OK
00062
00063 try:
00064 p = subprocess.Popen('sudo ipmitool sdr',
00065 stdout = subprocess.PIPE,
00066 stderr = subprocess.PIPE, shell = True)
00067 stdout, stderr = p.communicate()
00068 retcode = p.returncode
00069
00070 if retcode != 0:
00071 diag_level = DiagnosticStatus.ERROR
00072 diag_msgs = [ 'ipmitool Error' ]
00073 diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
00074 return diag_vals, diag_msgs, diag_level
00075
00076 lines = stdout.split('\n')
00077 if len(lines) < 2:
00078 diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
00079
00080 diag_msgs = [ 'No ipmitool response' ]
00081 diag_level = DiagnosticStatus.ERROR
00082
00083 return diag_vals, diag_msgs, diag_level
00084
00085 for ln in lines:
00086 if len(ln) < 3:
00087 continue
00088
00089 words = ln.split('|')
00090 if len(words) < 3:
00091 continue
00092
00093 name = words[0].strip()
00094 ipmi_val = words[1].strip()
00095 stat_byte = words[2].strip()
00096
00097
00098 if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
00099 if words[1].strip().endswith('degrees C'):
00100 tmp = ipmi_val.rstrip(' degrees C').lstrip()
00101 if unicode(tmp).isnumeric():
00102 temperature = float(tmp)
00103 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00104
00105 cpu_name = name.split()[0]
00106 if temperature >= 80 and temperature < 89:
00107 diag_level = max(diag_level, DiagnosticStatus.WARN)
00108 if diag_msgs.count('CPU Hot') == 0:
00109 diag_msgs.append('CPU Warm')
00110
00111 if temperature >= 89:
00112 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00113 diag_msgs.append('CPU Hot')
00114
00115 if diag_msgs.count('CPU Warm') > 0:
00116 idx = diag_msgs.index('CPU Warm')
00117 diag_msgs.pop(idx)
00118 else:
00119 diag_vals.append(KeyValue(key = name, value = words[1]))
00120
00121
00122
00123 if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
00124 if ipmi_val.endswith('degrees C'):
00125 tmp = ipmi_val.rstrip(' degrees C').lstrip()
00126 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00127
00128 dev_name = name.split()[0]
00129 if unicode(tmp).isnumeric():
00130 temperature = float(tmp)
00131
00132 if temperature >= 60 and temperature < 75:
00133 diag_level = max(diag_level, DiagnosticStatus.WARN)
00134 diag_msgs.append('%s Warm' % dev_name)
00135
00136 if temperature >= 75:
00137 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00138 diag_msgs.append('%s Hot' % dev_name)
00139 else:
00140 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00141 diag_msgs.append('%s Error' % dev_name)
00142 else:
00143 diag_vals.append(KeyValue(key = name, value = ipmi_val))
00144
00145
00146 if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
00147 if ipmi_val.endswith('RPM'):
00148 rpm = ipmi_val.rstrip(' RPM').lstrip()
00149 if unicode(rpm).isnumeric():
00150 if int(rpm) == 0:
00151 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00152 diag_msgs.append('CPU Fan Off')
00153
00154 diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
00155 else:
00156 diag_vals.append(KeyValue(key = name, value = ipmi_val))
00157
00158
00159
00160 if name.startswith('CPU') and name.endswith('hot'):
00161 if ipmi_val == '0x01':
00162 diag_vals.append(KeyValue(key = name, value = 'OK'))
00163 else:
00164 diag_vals.append(KeyValue(key = name, value = 'Hot'))
00165 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00166 diag_msgs.append('CPU Hot Alarm')
00167
00168 except Exception, e:
00169 diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00170 diag_level = DiagnosticStatus.ERROR
00171 diag_msgs.append('Exception')
00172
00173 return diag_vals, diag_msgs, diag_level
00174
00175
00176
00177
00178
00179
00180 def check_core_temps(sys_temp_strings):
00181 diag_vals = []
00182 diag_level = 0
00183 diag_msgs = []
00184
00185 for index, temp_str in enumerate(sys_temp_strings):
00186 if len(temp_str) < 5:
00187 continue
00188
00189 cmd = 'cat %s' % temp_str
00190 p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
00191 stderr = subprocess.PIPE, shell = True)
00192 stdout, stderr = p.communicate()
00193 retcode = p.returncode
00194
00195 if retcode != 0:
00196 diag_level = DiagnosticStatus.ERROR
00197 diag_msg = [ 'Core Temp Error' ]
00198 diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
00199 KeyValue(key = 'Output', value = stdout) ]
00200 return diag_vals, diag_msgs, diag_level
00201
00202 tmp = stdout.strip()
00203 if unicode(tmp).isnumeric():
00204 temp = float(tmp) / 1000
00205 diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
00206
00207 if temp >= 85 and temp < 90:
00208 diag_level = max(diag_level, DiagnosticStatus.WARN)
00209 diag_msgs.append('Warm')
00210 if temp >= 90:
00211 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00212 diag_msgs.append('Hot')
00213 else:
00214 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00215 diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
00216
00217 return diag_vals, diag_msgs, diag_level
00218
00219
00220 def check_clock_speed(enforce_speed):
00221 vals = []
00222 msgs = []
00223 lvl = DiagnosticStatus.OK
00224
00225 try:
00226 p = subprocess.Popen('cat /proc/cpuinfo | grep MHz',
00227 stdout = subprocess.PIPE,
00228 stderr = subprocess.PIPE, shell = True)
00229 stdout, stderr = p.communicate()
00230 retcode = p.returncode
00231
00232 if retcode != 0:
00233 lvl = DiagnosticStatus.ERROR
00234 msgs = [ 'Clock speed error' ]
00235 vals = [ KeyValue(key = 'Clock speed error', value = stderr),
00236 KeyValue(key = 'Output', value = stdout) ]
00237
00238 return (vals, msgs, lvl)
00239
00240 for index, ln in enumerate(stdout.split('\n')):
00241 words = ln.split(':')
00242 if len(words) < 2:
00243 continue
00244
00245 speed = words[1].strip().split('.')[0]
00246 vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
00247 if unicode(speed).isnumeric():
00248 mhz = float(speed)
00249
00250 if mhz < 2240 and mhz > 2150:
00251 lvl = max(lvl, DiagnosticStatus.WARN)
00252 if mhz <= 2150:
00253 lvl = max(lvl, DiagnosticStatus.ERROR)
00254 else:
00255
00256 lvl = max(lvl, DiagnosticStatus.ERROR)
00257
00258 if not enforce_speed:
00259 lvl = DiagnosticStatus.OK
00260
00261 if lvl == DiagnosticStatus.WARN and enforce_speed:
00262 msgs = [ 'Core slowing' ]
00263 elif lvl == DiagnosticStatus.ERROR and enforce_speed:
00264 msgs = [ 'Core throttled' ]
00265
00266 except Exception, e:
00267 rospy.logerr(traceback.format_exc())
00268 lvl = DiagnosticStatus.ERROR
00269 msgs.append('Exception')
00270 vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00271
00272 return vals, msgs, lvl
00273
00274
00275
00276
00277 def check_uptime(load1_threshold, load5_threshold):
00278 level = DiagnosticStatus.OK
00279 vals = []
00280
00281 load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
00282
00283 try:
00284 p = subprocess.Popen('uptime', stdout = subprocess.PIPE,
00285 stderr = subprocess.PIPE, shell = True)
00286 stdout, stderr = p.communicate()
00287 retcode = p.returncode
00288
00289 if retcode != 0:
00290 vals.append(KeyValue(key = 'uptime Failed', value = stderr))
00291 return DiagnosticStatus.ERROR, vals
00292
00293 upvals = stdout.split()
00294 load1 = upvals[-3].rstrip(',')
00295 load5 = upvals[-2].rstrip(',')
00296 load15 = upvals[-1]
00297 num_users = upvals[-7]
00298
00299
00300 if float(load1) > load1_threshold or float(load5) > load5_threshold:
00301 level = DiagnosticStatus.WARN
00302
00303 vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
00304 vals.append(KeyValue(key = '1 min Load Average', value = load1))
00305 vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
00306 vals.append(KeyValue(key = '5 min Load Average', value = load5))
00307 vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
00308 vals.append(KeyValue(key = '15 min Load Average', value = load15))
00309 vals.append(KeyValue(key = 'Number of Users', value = num_users))
00310
00311 except Exception, e:
00312 rospy.logerr(traceback.format_exc())
00313 level = DiagnosticStatus.ERROR
00314 vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
00315
00316 return level, load_dict[level], vals
00317
00318
00319
00320 def check_memory():
00321 values = []
00322 level = DiagnosticStatus.OK
00323 msg = ''
00324
00325 mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
00326
00327 try:
00328 p = subprocess.Popen('free -m',
00329 stdout = subprocess.PIPE,
00330 stderr = subprocess.PIPE, shell = True)
00331 stdout, stderr = p.communicate()
00332 retcode = p.returncode
00333
00334 if retcode != 0:
00335 values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
00336 return DiagnosticStatus.ERROR, values
00337
00338 rows = stdout.split('\n')
00339 data = rows[1].split()
00340 total_mem = data[1]
00341 used_mem = data[2]
00342 free_mem = data[3]
00343
00344 level = DiagnosticStatus.OK
00345 if float(free_mem) < 25:
00346 level = DiagnosticStatus.WARN
00347 if float(free_mem) < 1:
00348 level = DiagnosticStatus.ERROR
00349
00350 values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
00351 values.append(KeyValue(key = 'Total Memory', value = total_mem))
00352 values.append(KeyValue(key = 'Used Memory', value = used_mem))
00353 values.append(KeyValue(key = 'Free Memory', value = free_mem))
00354
00355 msg = mem_dict[level]
00356 except Exception, e:
00357 rospy.logerr(traceback.format_exc())
00358 msg = 'Memory Usage Check Error'
00359 values.append(KeyValue(key = msg, value = str(e)))
00360 level = DiagnosticStatus.ERROR
00361
00362 return level, mem_dict[level], values
00363
00364
00365
00366
00367
00368 usage_old = 0
00369 has_warned_mpstat = False
00370 has_error_core_count = False
00371 def check_mpstat(core_count = -1):
00372 vals = []
00373 mp_level = DiagnosticStatus.OK
00374
00375 load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
00376
00377 try:
00378 p = subprocess.Popen('mpstat -P ALL 1 1',
00379 stdout = subprocess.PIPE,
00380 stderr = subprocess.PIPE, shell = True)
00381 stdout, stderr = p.communicate()
00382 retcode = p.returncode
00383
00384 if retcode != 0:
00385 global has_warned_mpstat
00386 if not has_warned_mpstat:
00387 rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
00388 has_warned_mpstat = True
00389
00390 mp_level = DiagnosticStatus.ERROR
00391 vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
00392 return mp_level, 'Unable to Check CPU Usage', vals
00393
00394
00395
00396 rows = stdout.split('\n')
00397 col_names = rows[2].split()
00398 idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
00399
00400 num_cores = 0
00401 cores_loaded = 0
00402 for index, row in enumerate(stdout.split('\n')):
00403 if index < 3:
00404 continue
00405
00406
00407 if row.find('all') > -1:
00408 continue
00409
00410 lst = row.split()
00411 if len(lst) < 8:
00412 continue
00413
00414
00415 if lst[0].startswith('Average'):
00416 continue
00417
00418 cpu_name = '%d' % (num_cores)
00419 idle = lst[idle_col]
00420 user = lst[3]
00421 nice = lst[4]
00422 system = lst[5]
00423
00424 core_level = 0
00425 usage = float(user) + float(nice)
00426 if usage > 1000:
00427 rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
00428 usage = usage_old
00429 usage_old = usage
00430
00431 if usage > 90.0:
00432 cores_loaded += 1
00433 core_level = DiagnosticStatus.WARN
00434 if usage > 110.0:
00435 core_level = DiagnosticStatus.ERROR
00436
00437 vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
00438 vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
00439 vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
00440 vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
00441 vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
00442
00443 num_cores += 1
00444
00445
00446 if num_cores - cores_loaded <= 2 and num_cores > 2:
00447 mp_level = DiagnosticStatus.WARN
00448
00449
00450 if core_count > 0 and core_count != num_cores:
00451 mp_level = DiagnosticStatus.ERROR
00452 global has_error_core_count
00453 if not has_error_core_count:
00454 rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
00455 core_count, num_cores)
00456 has_error_core_count = True
00457 return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
00458
00459 except Exception, e:
00460 mp_level = DiagnosticStatus.ERROR
00461 vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
00462
00463 return mp_level, load_dict[mp_level], vals
00464
00465
00466
00467 def get_core_temp_names():
00468 temp_vals = []
00469 try:
00470 p = subprocess.Popen('find /sys/devices -name temp1_input',
00471 stdout = subprocess.PIPE,
00472 stderr = subprocess.PIPE, shell = True)
00473 stdout, stderr = p.communicate()
00474 retcode = p.returncode
00475
00476 if retcode != 0:
00477 rospy.logerr('Error find core temp locations: %s' % stderr)
00478 return []
00479
00480 for ln in stdout.split('\n'):
00481 temp_vals.append(ln.strip())
00482
00483 return temp_vals
00484 except:
00485 rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
00486 return []
00487
00488 def update_status_stale(stat, last_update_time):
00489 time_since_update = rospy.get_time() - last_update_time
00490
00491 stale_status = 'OK'
00492 if time_since_update > 20 and time_since_update <= 35:
00493 stale_status = 'Lagging'
00494 if stat.level == DiagnosticStatus.OK:
00495 stat.message = stale_status
00496 elif stat.message.find(stale_status) < 0:
00497 stat.message = ', '.join([stat.message, stale_status])
00498 stat.level = max(stat.level, DiagnosticStatus.WARN)
00499 if time_since_update > 35:
00500 stale_status = 'Stale'
00501 if stat.level == DiagnosticStatus.OK:
00502 stat.message = stale_status
00503 elif stat.message.find(stale_status) < 0:
00504 stat.message = ', '.join([stat.message, stale_status])
00505 stat.level = max(stat.level, DiagnosticStatus.ERROR)
00506
00507
00508 stat.values.pop(0)
00509 stat.values.pop(0)
00510 stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00511 stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00512
00513
00514 class CPUMonitor():
00515 def __init__(self, hostname, diag_hostname):
00516 self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00517
00518 self._mutex = threading.Lock()
00519
00520 self._check_ipmi = rospy.get_param('~check_ipmi_tool', True)
00521 self._enforce_speed = rospy.get_param('~enforce_clock_speed', True)
00522
00523 self._check_core_temps = rospy.get_param('~check_core_temps', False)
00524 if self._check_core_temps:
00525 rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
00526 self._check_nfs = rospy.get_param('~check_nfs', False)
00527 if self._check_nfs:
00528 rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
00529
00530 self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
00531 self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
00532
00533 self._num_cores = rospy.get_param('~num_cores', 8.0)
00534
00535 self._temps_timer = None
00536 self._usage_timer = None
00537 self._nfs_timer = None
00538
00539
00540 self._temp_vals = get_core_temp_names()
00541
00542
00543 self._temp_stat = DiagnosticStatus()
00544 self._temp_stat.name = '%s CPU Temperature' % diag_hostname
00545 self._temp_stat.level = 1
00546 self._temp_stat.hardware_id = hostname
00547 self._temp_stat.message = 'No Data'
00548 self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00549 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00550
00551 self._usage_stat = DiagnosticStatus()
00552 self._usage_stat.name = '%s CPU Usage' % diag_hostname
00553 self._usage_stat.level = 1
00554 self._usage_stat.hardware_id = hostname
00555 self._usage_stat.message = 'No Data'
00556 self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00557 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00558
00559 self._nfs_stat = DiagnosticStatus()
00560 self._nfs_stat.name = '%s NFS IO' % diag_hostname
00561 self._nfs_stat.level = 1
00562 self._nfs_stat.hardware_id = hostname
00563 self._nfs_stat.message = 'No Data'
00564 self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00565 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00566
00567 self._last_temp_time = 0
00568 self._last_usage_time = 0
00569 self._last_nfs_time = 0
00570 self._last_publish_time = 0
00571
00572
00573 self.check_temps()
00574 if self._check_nfs:
00575 self.check_nfs_stat()
00576 self.check_usage()
00577
00578
00579 def _restart_temp_check(self):
00580 rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen')
00581 try:
00582 with self._mutex:
00583 if self._temps_timer:
00584 self._temps_timer.cancel()
00585
00586 self.check_temps()
00587 except Exception, e:
00588 rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc())
00589
00590
00591
00592 def cancel_timers(self):
00593 if self._temps_timer:
00594 self._temps_timer.cancel()
00595
00596 if self._nfs_timer:
00597 self._nfs_timer.cancel()
00598
00599 if self._usage_timer:
00600 self._usage_timer.cancel()
00601
00602 def check_nfs_stat(self):
00603 if rospy.is_shutdown():
00604 with self._mutex:
00605 self.cancel_timers()
00606 return
00607
00608 nfs_level = 0
00609 msg = 'OK'
00610 vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00611 KeyValue(key = 'Time Since Last Update', value = str(0) )]
00612
00613 try:
00614 p = subprocess.Popen('iostat -n',
00615 stdout = subprocess.PIPE,
00616 stderr = subprocess.PIPE, shell = True)
00617 stdout, stderr = p.communicate()
00618 retcode = p.returncode
00619
00620 if retcode != 0:
00621 nfs_level = DiagnosticStatus.ERROR
00622 msg = 'iostat Error'
00623 vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e)))
00624 stdout = ''
00625
00626
00627 for index, row in enumerate(stdout.split('\n')):
00628 if index < 3:
00629 continue
00630
00631 lst = row.split()
00632 if len(lst) < 7:
00633 continue
00634
00635 file_sys = lst[0]
00636 read_blk = lst[1]
00637 write_blk = lst[2]
00638 read_blk_dir = lst[3]
00639 write_blk_dir = lst[4]
00640 r_blk_srv = lst[5]
00641 w_blk_srv = lst[6]
00642
00643 vals.append(KeyValue(
00644 key = '%s Read Blks/s' % file_sys, value=read_blk))
00645 vals.append(KeyValue(
00646 key = '%s Write Blks/s' % file_sys, value=write_blk))
00647 vals.append(KeyValue(
00648 key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir))
00649 vals.append(KeyValue(
00650 key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir))
00651 vals.append(KeyValue(
00652 key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv))
00653 vals.append(KeyValue(
00654 key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv))
00655
00656 except Exception, e:
00657 rospy.logerr(traceback.format_exc())
00658 nfs_level = DiagnosticStatus.ERROR
00659 msg = 'Exception'
00660 vals.append(KeyValue(key = 'Exception', value = str(e)))
00661
00662 with self._mutex:
00663 self._nfs_stat.level = nfs_level
00664 self._nfs_stat.message = msg
00665 self._nfs_stat.values = vals
00666
00667 self._last_nfs_time = rospy.get_time()
00668
00669 if not rospy.is_shutdown():
00670 self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat)
00671 self._nfs_timer.start()
00672 else:
00673 self.cancel_timers()
00674
00675
00676
00677 def check_temps(self):
00678 if rospy.is_shutdown():
00679 with self._mutex:
00680 self.cancel_timers()
00681 return
00682
00683 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00684 KeyValue(key = 'Time Since Last Update', value = str(0) ) ]
00685 diag_msgs = []
00686 diag_level = 0
00687
00688 if self._check_ipmi:
00689 ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi()
00690 diag_vals.extend(ipmi_vals)
00691 diag_msgs.extend(ipmi_msgs)
00692 diag_level = max(diag_level, ipmi_level)
00693
00694 if self._check_core_temps:
00695 core_vals, core_msgs, core_level = check_core_temps(self._temp_vals)
00696 diag_vals.extend(core_vals)
00697 diag_msgs.extend(core_msgs)
00698 diag_level = max(diag_level, core_level)
00699
00700 clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed)
00701 diag_vals.extend(clock_vals)
00702 diag_msgs.extend(clock_msgs)
00703 diag_level = max(diag_level, clock_level)
00704
00705 diag_log = set(diag_msgs)
00706 if len(diag_log) > 0:
00707 message = ', '.join(diag_log)
00708 else:
00709 message = stat_dict[diag_level]
00710
00711 with self._mutex:
00712 self._last_temp_time = rospy.get_time()
00713
00714 self._temp_stat.level = diag_level
00715 self._temp_stat.message = message
00716 self._temp_stat.values = diag_vals
00717
00718 if not rospy.is_shutdown():
00719 self._temps_timer = threading.Timer(5.0, self.check_temps)
00720 self._temps_timer.start()
00721 else:
00722 self.cancel_timers()
00723
00724 def check_usage(self):
00725 if rospy.is_shutdown():
00726 with self._mutex:
00727 self.cancel_timers()
00728 return
00729
00730 diag_level = 0
00731 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00732 KeyValue(key = 'Time Since Last Update', value = 0 )]
00733 diag_msgs = []
00734
00735
00736 mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores)
00737 diag_vals.extend(mp_vals)
00738 if mp_level > 0:
00739 diag_msgs.append(mp_msg)
00740 diag_level = max(diag_level, mp_level)
00741
00742
00743 uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold)
00744 diag_vals.extend(up_vals)
00745 if uptime_level > 0:
00746 diag_msgs.append(up_msg)
00747 diag_level = max(diag_level, uptime_level)
00748
00749
00750 mem_level, mem_msg, mem_vals = check_memory()
00751 diag_vals.extend(mem_vals)
00752 if mem_level > 0:
00753 diag_msgs.append(mem_msg)
00754 diag_level = max(diag_level, mem_level)
00755
00756 if diag_msgs and diag_level > 0:
00757 usage_msg = ', '.join(set(diag_msgs))
00758 else:
00759 usage_msg = stat_dict[diag_level]
00760
00761
00762 with self._mutex:
00763 self._last_usage_time = rospy.get_time()
00764 self._usage_stat.level = diag_level
00765 self._usage_stat.values = diag_vals
00766
00767 self._usage_stat.message = usage_msg
00768
00769 if not rospy.is_shutdown():
00770 self._usage_timer = threading.Timer(5.0, self.check_usage)
00771 self._usage_timer.start()
00772 else:
00773 self.cancel_timers()
00774
00775 def publish_stats(self):
00776 with self._mutex:
00777
00778 update_status_stale(self._temp_stat, self._last_temp_time)
00779 update_status_stale(self._usage_stat, self._last_usage_time)
00780 if self._check_nfs:
00781 update_status_stale(self._nfs_stat, self._last_nfs_time)
00782
00783 msg = DiagnosticArray()
00784 msg.header.stamp = rospy.get_rostime()
00785 msg.status.append(self._temp_stat)
00786 msg.status.append(self._usage_stat)
00787 if self._check_nfs:
00788 msg.status.append(self._nfs_stat)
00789
00790 if rospy.get_time() - self._last_publish_time > 0.5:
00791 self._diag_pub.publish(msg)
00792 self._last_publish_time = rospy.get_time()
00793
00794
00795
00796
00797 if rospy.get_time() - self._last_temp_time > 90:
00798 self._restart_temp_check()
00799
00800
00801 if __name__ == '__main__':
00802 hostname = socket.gethostname()
00803
00804 import optparse
00805 parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
00806 parser.add_option("--diag-hostname", dest="diag_hostname",
00807 help="Computer name in diagnostics output (ex: 'c1')",
00808 metavar="DIAG_HOSTNAME",
00809 action="store", default = hostname)
00810 options, args = parser.parse_args(rospy.myargv())
00811
00812 try:
00813 rospy.init_node('cpu_monitor_%s' % hostname)
00814 except rospy.exceptions.ROSInitException:
00815 print >> sys.stderr, 'CPU monitor is unable to initialize node. Master may not be running.'
00816 sys.exit(0)
00817
00818 cpu_node = CPUMonitor(hostname, options.diag_hostname)
00819
00820 rate = rospy.Rate(1.0)
00821 try:
00822 while not rospy.is_shutdown():
00823 rate.sleep()
00824 cpu_node.publish_stats()
00825 except KeyboardInterrupt:
00826 pass
00827 except Exception, e:
00828 traceback.print_exc()
00829 rospy.logerr(traceback.format_exc())
00830
00831 cpu_node.cancel_timers()
00832 sys.exit(0)
00833
00834
00835
00836
00837
00838
00839