$search
00001 #!/usr/bin/env python 00002 # 00003 # Software License Agreement (BSD License) 00004 # 00005 # Copyright (c) 2009, Willow Garage, Inc. 00006 # All rights reserved. 00007 # 00008 # Redistribution and use in source and binary forms, with or without 00009 # modification, are permitted provided that the following conditions 00010 # are met: 00011 # 00012 # * Redistributions of source code must retain the above copyright 00013 # notice, this list of conditions and the following disclaimer. 00014 # * Redistributions in binary form must reproduce the above 00015 # copyright notice, this list of conditions and the following 00016 # disclaimer in the documentation and/or other materials provided 00017 # with the distribution. 00018 # * Neither the name of the Willow Garage nor the names of its 00019 # contributors may be used to endorse or promote products derived 00020 # from this software without specific prior written permission. 00021 # 00022 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00023 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00024 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00025 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00026 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00027 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00028 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00029 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00030 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00031 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00032 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00033 # POSSIBILITY OF SUCH DAMAGE. 00034 00035 ##\author Kevin Watts 00036 00037 from __future__ import with_statement 00038 import roslib 00039 roslib.load_manifest('pr2_computer_monitor') 00040 00041 import rospy 00042 00043 import traceback 00044 import threading 00045 from threading import Timer 00046 import sys, os, time 00047 from time import sleep 00048 import subprocess 00049 import string 00050 00051 import socket 00052 00053 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue 00054 00055 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' } 00056 00057 # Output entire IPMI data set 00058 def check_ipmi(): 00059 diag_vals = [] 00060 diag_msgs = [] 00061 diag_level = DiagnosticStatus.OK 00062 00063 try: 00064 p = subprocess.Popen('sudo ipmitool sdr', 00065 stdout = subprocess.PIPE, 00066 stderr = subprocess.PIPE, shell = True) 00067 stdout, stderr = p.communicate() 00068 retcode = p.returncode 00069 00070 if retcode != 0: 00071 diag_level = DiagnosticStatus.ERROR 00072 diag_msgs = [ 'ipmitool Error' ] 00073 diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ] 00074 return diag_vals, diag_msgs, diag_level 00075 00076 lines = stdout.split('\n') 00077 if len(lines) < 2: 00078 diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ] 00079 00080 diag_msgs = [ 'No ipmitool response' ] 00081 diag_level = DiagnosticStatus.ERROR 00082 00083 return diag_vals, diag_msgs, diag_level 00084 00085 for ln in lines: 00086 if len(ln) < 3: 00087 continue 00088 00089 words = ln.split('|') 00090 if len(words) < 3: 00091 continue 00092 00093 name = words[0].strip() 00094 ipmi_val = words[1].strip() 00095 stat_byte = words[2].strip() 00096 00097 # CPU temps 00098 if words[0].startswith('CPU') and words[0].strip().endswith('Temp'): 00099 if words[1].strip().endswith('degrees C'): 00100 tmp = ipmi_val.rstrip(' degrees C').lstrip() 00101 if unicode(tmp).isnumeric(): 00102 temperature = float(tmp) 00103 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp)) 00104 00105 cpu_name = name.split()[0] 00106 if temperature >= 80 and temperature < 89: 00107 diag_level = max(diag_level, DiagnosticStatus.WARN) 00108 if diag_msgs.count('CPU Hot') == 0: 00109 diag_msgs.append('CPU Warm') 00110 00111 if temperature >= 89: # CPU should shut down here 00112 diag_level = max(diag_level, DiagnosticStatus.ERROR) 00113 diag_msgs.append('CPU Hot') 00114 # Don't keep CPU Warm in list if CPU is hot 00115 if diag_msgs.count('CPU Warm') > 0: 00116 idx = diag_msgs.index('CPU Warm') 00117 diag_msgs.pop(idx) 00118 else: 00119 diag_vals.append(KeyValue(key = name, value = words[1])) 00120 00121 00122 # MP, BP, FP temps 00123 if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp': 00124 if ipmi_val.endswith('degrees C'): 00125 tmp = ipmi_val.rstrip(' degrees C').lstrip() 00126 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp)) 00127 # Give temp warning 00128 dev_name = name.split()[0] 00129 if unicode(tmp).isnumeric(): 00130 temperature = float(tmp) 00131 00132 if temperature >= 60 and temperature < 75: 00133 diag_level = max(diag_level, DiagnosticStatus.WARN) 00134 diag_msgs.append('%s Warm' % dev_name) 00135 00136 if temperature >= 75: 00137 diag_level = max(diag_level, DiagnosticStatus.ERROR) 00138 diag_msgs.append('%s Hot' % dev_name) 00139 else: 00140 diag_level = max(diag_level, DiagnosticStatus.ERROR) 00141 diag_msgs.append('%s Error' % dev_name) 00142 else: 00143 diag_vals.append(KeyValue(key = name, value = ipmi_val)) 00144 00145 # CPU fan speeds 00146 if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan': 00147 if ipmi_val.endswith('RPM'): 00148 rpm = ipmi_val.rstrip(' RPM').lstrip() 00149 if unicode(rpm).isnumeric(): 00150 if int(rpm) == 0: 00151 diag_level = max(diag_level, DiagnosticStatus.ERROR) 00152 diag_msgs.append('CPU Fan Off') 00153 00154 diag_vals.append(KeyValue(key = name + ' RPM', value = rpm)) 00155 else: 00156 diag_vals.append(KeyValue(key = name, value = ipmi_val)) 00157 00158 # If CPU is hot we get an alarm from ipmitool, report that too 00159 # CPU should shut down if we get a hot alarm, so report as error 00160 if name.startswith('CPU') and name.endswith('hot'): 00161 if ipmi_val == '0x01': 00162 diag_vals.append(KeyValue(key = name, value = 'OK')) 00163 else: 00164 diag_vals.append(KeyValue(key = name, value = 'Hot')) 00165 diag_level = max(diag_level, DiagnosticStatus.ERROR) 00166 diag_msgs.append('CPU Hot Alarm') 00167 00168 except Exception, e: 00169 diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc())) 00170 diag_level = DiagnosticStatus.ERROR 00171 diag_msgs.append('Exception') 00172 00173 return diag_vals, diag_msgs, diag_level 00174 00175 00176 ##\brief Check CPU core temps 00177 ## 00178 ## Use 'find /sys -name temp1_input' to find cores 00179 ## Read from every core, divide by 1000 00180 def check_core_temps(sys_temp_strings): 00181 diag_vals = [] 00182 diag_level = 0 00183 diag_msgs = [] 00184 00185 for index, temp_str in enumerate(sys_temp_strings): 00186 if len(temp_str) < 5: 00187 continue 00188 00189 cmd = 'cat %s' % temp_str 00190 p = subprocess.Popen(cmd, stdout = subprocess.PIPE, 00191 stderr = subprocess.PIPE, shell = True) 00192 stdout, stderr = p.communicate() 00193 retcode = p.returncode 00194 00195 if retcode != 0: 00196 diag_level = DiagnosticStatus.ERROR 00197 diag_msg = [ 'Core Temp Error' ] 00198 diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr), 00199 KeyValue(key = 'Output', value = stdout) ] 00200 return diag_vals, diag_msgs, diag_level 00201 00202 tmp = stdout.strip() 00203 if unicode(tmp).isnumeric(): 00204 temp = float(tmp) / 1000 00205 diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp))) 00206 00207 if temp >= 85 and temp < 90: 00208 diag_level = max(diag_level, DiagnosticStatus.WARN) 00209 diag_msgs.append('Warm') 00210 if temp >= 90: 00211 diag_level = max(diag_level, DiagnosticStatus.ERROR) 00212 diag_msgs.append('Hot') 00213 else: 00214 diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value 00215 diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp)) 00216 00217 return diag_vals, diag_msgs, diag_level 00218 00219 ## Checks clock speed from reading from CPU info 00220 def check_clock_speed(enforce_speed): 00221 vals = [] 00222 msgs = [] 00223 lvl = DiagnosticStatus.OK 00224 00225 try: 00226 p = subprocess.Popen('cat /proc/cpuinfo | grep MHz', 00227 stdout = subprocess.PIPE, 00228 stderr = subprocess.PIPE, shell = True) 00229 stdout, stderr = p.communicate() 00230 retcode = p.returncode 00231 00232 if retcode != 0: 00233 lvl = DiagnosticStatus.ERROR 00234 msgs = [ 'Clock speed error' ] 00235 vals = [ KeyValue(key = 'Clock speed error', value = stderr), 00236 KeyValue(key = 'Output', value = stdout) ] 00237 00238 return (vals, msgs, lvl) 00239 00240 for index, ln in enumerate(stdout.split('\n')): 00241 words = ln.split(':') 00242 if len(words) < 2: 00243 continue 00244 00245 speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal 00246 vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed)) 00247 if unicode(speed).isnumeric(): 00248 mhz = float(speed) 00249 00250 if mhz < 2240 and mhz > 2150: 00251 lvl = max(lvl, DiagnosticStatus.WARN) 00252 if mhz <= 2150: 00253 lvl = max(lvl, DiagnosticStatus.ERROR) 00254 else: 00255 # Automatically give error if speed isn't a number 00256 lvl = max(lvl, DiagnosticStatus.ERROR) 00257 00258 if not enforce_speed: 00259 lvl = DiagnosticStatus.OK 00260 00261 if lvl == DiagnosticStatus.WARN and enforce_speed: 00262 msgs = [ 'Core slowing' ] 00263 elif lvl == DiagnosticStatus.ERROR and enforce_speed: 00264 msgs = [ 'Core throttled' ] 00265 00266 except Exception, e: 00267 rospy.logerr(traceback.format_exc()) 00268 lvl = DiagnosticStatus.ERROR 00269 msgs.append('Exception') 00270 vals.append(KeyValue(key = 'Exception', value = traceback.format_exc())) 00271 00272 return vals, msgs, lvl 00273 00274 00275 # Add msgs output, too 00276 ##\brief Uses 'uptime' to see load average 00277 def check_uptime(load1_threshold, load5_threshold): 00278 level = DiagnosticStatus.OK 00279 vals = [] 00280 00281 load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' } 00282 00283 try: 00284 p = subprocess.Popen('uptime', stdout = subprocess.PIPE, 00285 stderr = subprocess.PIPE, shell = True) 00286 stdout, stderr = p.communicate() 00287 retcode = p.returncode 00288 00289 if retcode != 0: 00290 vals.append(KeyValue(key = 'uptime Failed', value = stderr)) 00291 return DiagnosticStatus.ERROR, vals 00292 00293 upvals = stdout.split() 00294 load1 = upvals[-3].rstrip(',') 00295 load5 = upvals[-2].rstrip(',') 00296 load15 = upvals[-1] 00297 num_users = upvals[-7] 00298 00299 # Give warning if we go over load limit 00300 if float(load1) > load1_threshold or float(load5) > load5_threshold: 00301 level = DiagnosticStatus.WARN 00302 00303 vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level])) 00304 vals.append(KeyValue(key = '1 min Load Average', value = load1)) 00305 vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold))) 00306 vals.append(KeyValue(key = '5 min Load Average', value = load5)) 00307 vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold))) 00308 vals.append(KeyValue(key = '15 min Load Average', value = load15)) 00309 vals.append(KeyValue(key = 'Number of Users', value = num_users)) 00310 00311 except Exception, e: 00312 rospy.logerr(traceback.format_exc()) 00313 level = DiagnosticStatus.ERROR 00314 vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc())) 00315 00316 return level, load_dict[level], vals 00317 00318 # Add msgs output 00319 ##\brief Uses 'free -m' to check free memory 00320 def check_memory(): 00321 values = [] 00322 level = DiagnosticStatus.OK 00323 msg = '' 00324 00325 mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' } 00326 00327 try: 00328 p = subprocess.Popen('free -m', 00329 stdout = subprocess.PIPE, 00330 stderr = subprocess.PIPE, shell = True) 00331 stdout, stderr = p.communicate() 00332 retcode = p.returncode 00333 00334 if retcode != 0: 00335 values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode))) 00336 return DiagnosticStatus.ERROR, values 00337 00338 rows = stdout.split('\n') 00339 data = rows[1].split() 00340 total_mem = data[1] 00341 used_mem = data[2] 00342 free_mem = data[3] 00343 00344 level = DiagnosticStatus.OK 00345 if float(free_mem) < 25: 00346 level = DiagnosticStatus.WARN 00347 if float(free_mem) < 1: 00348 level = DiagnosticStatus.ERROR 00349 00350 values.append(KeyValue(key = 'Memory Status', value = mem_dict[level])) 00351 values.append(KeyValue(key = 'Total Memory', value = total_mem)) 00352 values.append(KeyValue(key = 'Used Memory', value = used_mem)) 00353 values.append(KeyValue(key = 'Free Memory', value = free_mem)) 00354 00355 msg = mem_dict[level] 00356 except Exception, e: 00357 rospy.logerr(traceback.format_exc()) 00358 msg = 'Memory Usage Check Error' 00359 values.append(KeyValue(key = msg, value = str(e))) 00360 level = DiagnosticStatus.ERROR 00361 00362 return level, mem_dict[level], values 00363 00364 00365 00366 ##\brief Use mpstat to find CPU usage 00367 ## 00368 usage_old = 0 00369 has_warned_mpstat = False 00370 has_error_core_count = False 00371 def check_mpstat(core_count = -1): 00372 vals = [] 00373 mp_level = DiagnosticStatus.OK 00374 00375 load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' } 00376 00377 try: 00378 p = subprocess.Popen('mpstat -P ALL 1 1', 00379 stdout = subprocess.PIPE, 00380 stderr = subprocess.PIPE, shell = True) 00381 stdout, stderr = p.communicate() 00382 retcode = p.returncode 00383 00384 if retcode != 0: 00385 global has_warned_mpstat 00386 if not has_warned_mpstat: 00387 rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode) 00388 has_warned_mpstat = True 00389 00390 mp_level = DiagnosticStatus.ERROR 00391 vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode))) 00392 return mp_level, 'Unable to Check CPU Usage', vals 00393 00394 # Check which column '%idle' is, #4539 00395 # mpstat output changed between 8.06 and 8.1 00396 rows = stdout.split('\n') 00397 col_names = rows[2].split() 00398 idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2 00399 00400 num_cores = 0 00401 cores_loaded = 0 00402 for index, row in enumerate(stdout.split('\n')): 00403 if index < 3: 00404 continue 00405 00406 # Skip row containing 'all' data 00407 if row.find('all') > -1: 00408 continue 00409 00410 lst = row.split() 00411 if len(lst) < 8: 00412 continue 00413 00414 ## Ignore 'Average: ...' data 00415 if lst[0].startswith('Average'): 00416 continue 00417 00418 cpu_name = '%d' % (num_cores) 00419 idle = lst[idle_col] 00420 user = lst[3] 00421 nice = lst[4] 00422 system = lst[5] 00423 00424 core_level = 0 00425 usage = float(user) + float(nice) 00426 if usage > 1000: # wrong reading, use old reading instead 00427 rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old)) 00428 usage = usage_old 00429 usage_old = usage 00430 00431 if usage > 90.0: 00432 cores_loaded += 1 00433 core_level = DiagnosticStatus.WARN 00434 if usage > 110.0: 00435 core_level = DiagnosticStatus.ERROR 00436 00437 vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level])) 00438 vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user)) 00439 vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice)) 00440 vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system)) 00441 vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle)) 00442 00443 num_cores += 1 00444 00445 # Warn for high load only if we have <= 2 cores that aren't loaded 00446 if num_cores - cores_loaded <= 2 and num_cores > 2: 00447 mp_level = DiagnosticStatus.WARN 00448 00449 # Check the number of cores if core_count > 0, #4850 00450 if core_count > 0 and core_count != num_cores: 00451 mp_level = DiagnosticStatus.ERROR 00452 global has_error_core_count 00453 if not has_error_core_count: 00454 rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.', 00455 core_count, num_cores) 00456 has_error_core_count = True 00457 return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals 00458 00459 except Exception, e: 00460 mp_level = DiagnosticStatus.ERROR 00461 vals.append(KeyValue(key = 'mpstat Exception', value = str(e))) 00462 00463 return mp_level, load_dict[mp_level], vals 00464 00465 ## Returns names for core temperature files 00466 ## Returns list of names, each name can be read like file 00467 def get_core_temp_names(): 00468 temp_vals = [] 00469 try: 00470 p = subprocess.Popen('find /sys/devices -name temp1_input', 00471 stdout = subprocess.PIPE, 00472 stderr = subprocess.PIPE, shell = True) 00473 stdout, stderr = p.communicate() 00474 retcode = p.returncode 00475 00476 if retcode != 0: 00477 rospy.logerr('Error find core temp locations: %s' % stderr) 00478 return [] 00479 00480 for ln in stdout.split('\n'): 00481 temp_vals.append(ln.strip()) 00482 00483 return temp_vals 00484 except: 00485 rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc()) 00486 return [] 00487 00488 def update_status_stale(stat, last_update_time): 00489 time_since_update = rospy.get_time() - last_update_time 00490 00491 stale_status = 'OK' 00492 if time_since_update > 20 and time_since_update <= 35: 00493 stale_status = 'Lagging' 00494 if stat.level == DiagnosticStatus.OK: 00495 stat.message = stale_status 00496 elif stat.message.find(stale_status) < 0: 00497 stat.message = ', '.join([stat.message, stale_status]) 00498 stat.level = max(stat.level, DiagnosticStatus.WARN) 00499 if time_since_update > 35: 00500 stale_status = 'Stale' 00501 if stat.level == DiagnosticStatus.OK: 00502 stat.message = stale_status 00503 elif stat.message.find(stale_status) < 0: 00504 stat.message = ', '.join([stat.message, stale_status]) 00505 stat.level = max(stat.level, DiagnosticStatus.ERROR) 00506 00507 00508 stat.values.pop(0) 00509 stat.values.pop(0) 00510 stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status)) 00511 stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update))) 00512 00513 00514 class CPUMonitor(): 00515 def __init__(self, hostname, diag_hostname): 00516 self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray) 00517 00518 self._mutex = threading.Lock() 00519 00520 self._check_ipmi = rospy.get_param('~check_ipmi_tool', True) 00521 self._enforce_speed = rospy.get_param('~enforce_clock_speed', True) 00522 00523 self._check_core_temps = rospy.get_param('~check_core_temps', False) 00524 if self._check_core_temps: 00525 rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle') 00526 self._check_nfs = rospy.get_param('~check_nfs', False) 00527 if self._check_nfs: 00528 rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle') 00529 00530 self._load1_threshold = rospy.get_param('~load1_threshold', 5.0) 00531 self._load5_threshold = rospy.get_param('~load5_threshold', 3.0) 00532 00533 self._num_cores = rospy.get_param('~num_cores', 8.0) 00534 00535 self._temps_timer = None 00536 self._usage_timer = None 00537 self._nfs_timer = None 00538 00539 # Get temp_input files 00540 self._temp_vals = get_core_temp_names() 00541 00542 # CPU stats 00543 self._temp_stat = DiagnosticStatus() 00544 self._temp_stat.name = '%s CPU Temperature' % diag_hostname 00545 self._temp_stat.level = 1 00546 self._temp_stat.hardware_id = hostname 00547 self._temp_stat.message = 'No Data' 00548 self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ), 00549 KeyValue(key = 'Time Since Last Update', value = 'N/A') ] 00550 00551 self._usage_stat = DiagnosticStatus() 00552 self._usage_stat.name = '%s CPU Usage' % diag_hostname 00553 self._usage_stat.level = 1 00554 self._usage_stat.hardware_id = hostname 00555 self._usage_stat.message = 'No Data' 00556 self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ), 00557 KeyValue(key = 'Time Since Last Update', value = 'N/A') ] 00558 00559 self._nfs_stat = DiagnosticStatus() 00560 self._nfs_stat.name = '%s NFS IO' % diag_hostname 00561 self._nfs_stat.level = 1 00562 self._nfs_stat.hardware_id = hostname 00563 self._nfs_stat.message = 'No Data' 00564 self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ), 00565 KeyValue(key = 'Time Since Last Update', value = 'N/A') ] 00566 00567 self._last_temp_time = 0 00568 self._last_usage_time = 0 00569 self._last_nfs_time = 0 00570 self._last_publish_time = 0 00571 00572 # Start checking everything 00573 self.check_temps() 00574 if self._check_nfs: 00575 self.check_nfs_stat() 00576 self.check_usage() 00577 00578 # Restart temperature checking 00579 def _restart_temp_check(self): 00580 rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen') 00581 try: 00582 with self._mutex: 00583 if self._temps_timer: 00584 self._temps_timer.cancel() 00585 00586 self.check_temps() 00587 except Exception, e: 00588 rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc()) 00589 00590 00591 ## Must have the lock to cancel everything 00592 def cancel_timers(self): 00593 if self._temps_timer: 00594 self._temps_timer.cancel() 00595 00596 if self._nfs_timer: 00597 self._nfs_timer.cancel() 00598 00599 if self._usage_timer: 00600 self._usage_timer.cancel() 00601 00602 def check_nfs_stat(self): 00603 if rospy.is_shutdown(): 00604 with self._mutex: 00605 self.cancel_timers() 00606 return 00607 00608 nfs_level = 0 00609 msg = 'OK' 00610 vals = [ KeyValue(key = 'Update Status', value = 'OK' ), 00611 KeyValue(key = 'Time Since Last Update', value = str(0) )] 00612 00613 try: 00614 p = subprocess.Popen('iostat -n', 00615 stdout = subprocess.PIPE, 00616 stderr = subprocess.PIPE, shell = True) 00617 stdout, stderr = p.communicate() 00618 retcode = p.returncode 00619 00620 if retcode != 0: 00621 nfs_level = DiagnosticStatus.ERROR 00622 msg = 'iostat Error' 00623 vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e))) 00624 stdout = '' 00625 00626 00627 for index, row in enumerate(stdout.split('\n')): 00628 if index < 3: 00629 continue 00630 00631 lst = row.split() 00632 if len(lst) < 7: 00633 continue 00634 00635 file_sys = lst[0] 00636 read_blk = lst[1] 00637 write_blk = lst[2] 00638 read_blk_dir = lst[3] 00639 write_blk_dir = lst[4] 00640 r_blk_srv = lst[5] 00641 w_blk_srv = lst[6] 00642 00643 vals.append(KeyValue( 00644 key = '%s Read Blks/s' % file_sys, value=read_blk)) 00645 vals.append(KeyValue( 00646 key = '%s Write Blks/s' % file_sys, value=write_blk)) 00647 vals.append(KeyValue( 00648 key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir)) 00649 vals.append(KeyValue( 00650 key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir)) 00651 vals.append(KeyValue( 00652 key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv)) 00653 vals.append(KeyValue( 00654 key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv)) 00655 00656 except Exception, e: 00657 rospy.logerr(traceback.format_exc()) 00658 nfs_level = DiagnosticStatus.ERROR 00659 msg = 'Exception' 00660 vals.append(KeyValue(key = 'Exception', value = str(e))) 00661 00662 with self._mutex: 00663 self._nfs_stat.level = nfs_level 00664 self._nfs_stat.message = msg 00665 self._nfs_stat.values = vals 00666 00667 self._last_nfs_time = rospy.get_time() 00668 00669 if not rospy.is_shutdown(): 00670 self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat) 00671 self._nfs_timer.start() 00672 else: 00673 self.cancel_timers() 00674 00675 00676 ## Call every 10sec at minimum 00677 def check_temps(self): 00678 if rospy.is_shutdown(): 00679 with self._mutex: 00680 self.cancel_timers() 00681 return 00682 00683 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ), 00684 KeyValue(key = 'Time Since Last Update', value = str(0) ) ] 00685 diag_msgs = [] 00686 diag_level = 0 00687 00688 if self._check_ipmi: 00689 ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi() 00690 diag_vals.extend(ipmi_vals) 00691 diag_msgs.extend(ipmi_msgs) 00692 diag_level = max(diag_level, ipmi_level) 00693 00694 if self._check_core_temps: 00695 core_vals, core_msgs, core_level = check_core_temps(self._temp_vals) 00696 diag_vals.extend(core_vals) 00697 diag_msgs.extend(core_msgs) 00698 diag_level = max(diag_level, core_level) 00699 00700 clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed) 00701 diag_vals.extend(clock_vals) 00702 diag_msgs.extend(clock_msgs) 00703 diag_level = max(diag_level, clock_level) 00704 00705 diag_log = set(diag_msgs) 00706 if len(diag_log) > 0: 00707 message = ', '.join(diag_log) 00708 else: 00709 message = stat_dict[diag_level] 00710 00711 with self._mutex: 00712 self._last_temp_time = rospy.get_time() 00713 00714 self._temp_stat.level = diag_level 00715 self._temp_stat.message = message 00716 self._temp_stat.values = diag_vals 00717 00718 if not rospy.is_shutdown(): 00719 self._temps_timer = threading.Timer(5.0, self.check_temps) 00720 self._temps_timer.start() 00721 else: 00722 self.cancel_timers() 00723 00724 def check_usage(self): 00725 if rospy.is_shutdown(): 00726 with self._mutex: 00727 self.cancel_timers() 00728 return 00729 00730 diag_level = 0 00731 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ), 00732 KeyValue(key = 'Time Since Last Update', value = 0 )] 00733 diag_msgs = [] 00734 00735 # Check mpstat 00736 mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores) 00737 diag_vals.extend(mp_vals) 00738 if mp_level > 0: 00739 diag_msgs.append(mp_msg) 00740 diag_level = max(diag_level, mp_level) 00741 00742 # Check uptime 00743 uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold) 00744 diag_vals.extend(up_vals) 00745 if uptime_level > 0: 00746 diag_msgs.append(up_msg) 00747 diag_level = max(diag_level, uptime_level) 00748 00749 # Check memory 00750 mem_level, mem_msg, mem_vals = check_memory() 00751 diag_vals.extend(mem_vals) 00752 if mem_level > 0: 00753 diag_msgs.append(mem_msg) 00754 diag_level = max(diag_level, mem_level) 00755 00756 if diag_msgs and diag_level > 0: 00757 usage_msg = ', '.join(set(diag_msgs)) 00758 else: 00759 usage_msg = stat_dict[diag_level] 00760 00761 # Update status 00762 with self._mutex: 00763 self._last_usage_time = rospy.get_time() 00764 self._usage_stat.level = diag_level 00765 self._usage_stat.values = diag_vals 00766 00767 self._usage_stat.message = usage_msg 00768 00769 if not rospy.is_shutdown(): 00770 self._usage_timer = threading.Timer(5.0, self.check_usage) 00771 self._usage_timer.start() 00772 else: 00773 self.cancel_timers() 00774 00775 def publish_stats(self): 00776 with self._mutex: 00777 # Update everything with last update times 00778 update_status_stale(self._temp_stat, self._last_temp_time) 00779 update_status_stale(self._usage_stat, self._last_usage_time) 00780 if self._check_nfs: 00781 update_status_stale(self._nfs_stat, self._last_nfs_time) 00782 00783 msg = DiagnosticArray() 00784 msg.header.stamp = rospy.get_rostime() 00785 msg.status.append(self._temp_stat) 00786 msg.status.append(self._usage_stat) 00787 if self._check_nfs: 00788 msg.status.append(self._nfs_stat) 00789 00790 if rospy.get_time() - self._last_publish_time > 0.5: 00791 self._diag_pub.publish(msg) 00792 self._last_publish_time = rospy.get_time() 00793 00794 00795 # Restart temperature checking if it goes stale, #4171 00796 # Need to run this without mutex 00797 if rospy.get_time() - self._last_temp_time > 90: 00798 self._restart_temp_check() 00799 00800 00801 if __name__ == '__main__': 00802 hostname = socket.gethostname() 00803 00804 import optparse 00805 parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]") 00806 parser.add_option("--diag-hostname", dest="diag_hostname", 00807 help="Computer name in diagnostics output (ex: 'c1')", 00808 metavar="DIAG_HOSTNAME", 00809 action="store", default = hostname) 00810 options, args = parser.parse_args(rospy.myargv()) 00811 00812 try: 00813 rospy.init_node('cpu_monitor_%s' % hostname) 00814 except rospy.exceptions.ROSInitException: 00815 print >> sys.stderr, 'CPU monitor is unable to initialize node. Master may not be running.' 00816 sys.exit(0) 00817 00818 cpu_node = CPUMonitor(hostname, options.diag_hostname) 00819 00820 rate = rospy.Rate(1.0) 00821 try: 00822 while not rospy.is_shutdown(): 00823 rate.sleep() 00824 cpu_node.publish_stats() 00825 except KeyboardInterrupt: 00826 pass 00827 except Exception, e: 00828 traceback.print_exc() 00829 rospy.logerr(traceback.format_exc()) 00830 00831 cpu_node.cancel_timers() 00832 sys.exit(0) 00833 00834 00835 00836 00837 00838 00839