cpu_monitor.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Software License Agreement (BSD License)
00004 #
00005 # Copyright (c) 2009, Willow Garage, Inc.
00006 # All rights reserved.
00007 #
00008 # Redistribution and use in source and binary forms, with or without
00009 # modification, are permitted provided that the following conditions
00010 # are met:
00011 #
00012 #  * Redistributions of source code must retain the above copyright
00013 #    notice, this list of conditions and the following disclaimer.
00014 #  * Redistributions in binary form must reproduce the above
00015 #    copyright notice, this list of conditions and the following
00016 #    disclaimer in the documentation and/or other materials provided
00017 #    with the distribution.
00018 #  * Neither the name of the Willow Garage nor the names of its
00019 #    contributors may be used to endorse or promote products derived
00020 #    from this software without specific prior written permission.
00021 #
00022 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00023 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00024 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00025 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
00026 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00027 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00028 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00029 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00030 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00031 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00032 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00033 # POSSIBILITY OF SUCH DAMAGE.
00034 
00035 ##\author Kevin Watts
00036 
00037 from __future__ import with_statement
00038 import roslib
00039 roslib.load_manifest('pr2_computer_monitor')
00040 
00041 import rospy
00042 
00043 import traceback
00044 import threading
00045 from threading import Timer
00046 import sys, os, time
00047 from time import sleep
00048 import subprocess
00049 import string
00050 
00051 import socket
00052 
00053 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00054 
00055 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00056 
00057 # Output entire IPMI data set
00058 def check_ipmi():
00059     diag_vals = []
00060     diag_msgs = []
00061     diag_level = DiagnosticStatus.OK
00062 
00063     try:
00064         p = subprocess.Popen('sudo ipmitool sdr',
00065                              stdout = subprocess.PIPE,
00066                              stderr = subprocess.PIPE, shell = True)
00067         stdout, stderr = p.communicate()
00068         retcode = p.returncode
00069                         
00070         if retcode != 0:
00071             diag_level = DiagnosticStatus.ERROR
00072             diag_msgs = [ 'ipmitool Error' ]
00073             diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
00074             return diag_vals, diag_msgs, diag_level
00075 
00076         lines = stdout.split('\n')
00077         if len(lines) < 2:
00078             diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
00079 
00080             diag_msgs = [ 'No ipmitool response' ]
00081             diag_level = DiagnosticStatus.ERROR
00082 
00083             return diag_vals, diag_msgs, diag_level
00084 
00085         for ln in lines:
00086             if len(ln) < 3:
00087                 continue
00088 
00089             words = ln.split('|')
00090             if len(words) < 3:
00091                 continue 
00092 
00093             name = words[0].strip()
00094             ipmi_val = words[1].strip()
00095             stat_byte = words[2].strip()
00096 
00097             # CPU temps
00098             if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
00099                 if words[1].strip().endswith('degrees C'):
00100                     tmp = ipmi_val.rstrip(' degrees C').lstrip()
00101                     if unicode(tmp).isnumeric():
00102                         temperature = float(tmp)
00103                         diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00104 
00105                         cpu_name = name.split()[0]
00106                         if temperature >= 80 and temperature < 89:
00107                             diag_level = max(diag_level, DiagnosticStatus.WARN)
00108                             if diag_msgs.count('CPU Hot') == 0:
00109                                 diag_msgs.append('CPU Warm')
00110 
00111                         if temperature >= 89: # CPU should shut down here
00112                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00113                             diag_msgs.append('CPU Hot')                                
00114                             # Don't keep CPU Warm in list if CPU is hot
00115                             if diag_msgs.count('CPU Warm') > 0:
00116                                 idx = diag_msgs.index('CPU Warm')
00117                                 diag_msgs.pop(idx)
00118                 else:
00119                     diag_vals.append(KeyValue(key = name, value = words[1]))
00120 
00121 
00122             # MP, BP, FP temps
00123             if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
00124                 if ipmi_val.endswith('degrees C'):
00125                     tmp = ipmi_val.rstrip(' degrees C').lstrip()
00126                     diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00127                     # Give temp warning
00128                     dev_name = name.split()[0]
00129                     if unicode(tmp).isnumeric():
00130                         temperature = float(tmp)
00131 
00132                         if temperature >= 60 and temperature < 75:
00133                             diag_level = max(diag_level, DiagnosticStatus.WARN)
00134                             diag_msgs.append('%s Warm' % dev_name)
00135 
00136                         if temperature >= 75:
00137                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00138                             diag_msgs.append('%s Hot' % dev_name)
00139                     else:
00140                         diag_level = max(diag_level, DiagnosticStatus.ERROR)
00141                         diag_msgs.append('%s Error' % dev_name)
00142                 else:
00143                     diag_vals.append(KeyValue(key = name, value = ipmi_val))
00144         
00145             # CPU fan speeds
00146             if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
00147                 if ipmi_val.endswith('RPM'):
00148                     rpm = ipmi_val.rstrip(' RPM').lstrip()
00149                     if unicode(rpm).isnumeric():
00150                         if int(rpm) == 0:
00151                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00152                             diag_msgs.append('CPU Fan Off')
00153                             
00154                         diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
00155                     else:
00156                         diag_vals.append(KeyValue(key = name, value = ipmi_val))
00157 
00158             # If CPU is hot we get an alarm from ipmitool, report that too
00159             # CPU should shut down if we get a hot alarm, so report as error
00160             if name.startswith('CPU') and name.endswith('hot'):
00161                 if ipmi_val == '0x01':
00162                     diag_vals.append(KeyValue(key = name, value = 'OK'))
00163                 else:
00164                     diag_vals.append(KeyValue(key = name, value = 'Hot'))
00165                     diag_level = max(diag_level, DiagnosticStatus.ERROR)
00166                     diag_msgs.append('CPU Hot Alarm')
00167 
00168     except Exception, e:
00169         diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00170         diag_level = DiagnosticStatus.ERROR
00171         diag_msgs.append('Exception')
00172 
00173     return diag_vals, diag_msgs, diag_level
00174         
00175 
00176 ##\brief Check CPU core temps 
00177 ##
00178 ## Use 'find /sys -name temp1_input' to find cores
00179 ## Read from every core, divide by 1000
00180 def check_core_temps(sys_temp_strings):
00181     diag_vals = []
00182     diag_level = 0
00183     diag_msgs = []
00184     
00185     for index, temp_str in enumerate(sys_temp_strings):
00186         if len(temp_str) < 5:
00187             continue
00188         
00189         cmd = 'cat %s' % temp_str
00190         p = subprocess.Popen(cmd, stdout = subprocess.PIPE, 
00191                              stderr = subprocess.PIPE, shell = True)
00192         stdout, stderr = p.communicate()
00193         retcode = p.returncode
00194 
00195         if retcode != 0:
00196             diag_level = DiagnosticStatus.ERROR
00197             diag_msg = [ 'Core Temp Error' ]
00198             diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr), 
00199                           KeyValue(key = 'Output', value = stdout) ]
00200             return diag_vals, diag_msgs, diag_level
00201   
00202         tmp = stdout.strip()
00203         if unicode(tmp).isnumeric():
00204             temp = float(tmp) / 1000
00205             diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
00206 
00207             if temp >= 85 and temp < 90:
00208                 diag_level = max(diag_level, DiagnosticStatus.WARN)
00209                 diag_msgs.append('Warm')
00210             if temp >= 90:
00211                 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00212                 diag_msgs.append('Hot')
00213         else:
00214             diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
00215             diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
00216 
00217     return diag_vals, diag_msgs, diag_level
00218 
00219 ## Checks clock speed from reading from CPU info
00220 def check_clock_speed(enforce_speed):
00221     vals = []
00222     msgs = []
00223     lvl = DiagnosticStatus.OK
00224 
00225     try:
00226         p = subprocess.Popen('cat /proc/cpuinfo | grep MHz', 
00227                              stdout = subprocess.PIPE,
00228                              stderr = subprocess.PIPE, shell = True)
00229         stdout, stderr = p.communicate()
00230         retcode = p.returncode
00231 
00232         if retcode != 0:
00233             lvl = DiagnosticStatus.ERROR
00234             msgs = [ 'Clock speed error' ]
00235             vals = [ KeyValue(key = 'Clock speed error', value = stderr), 
00236                      KeyValue(key = 'Output', value = stdout) ]
00237             
00238             return (vals, msgs, lvl)
00239 
00240         for index, ln in enumerate(stdout.split('\n')):
00241             words = ln.split(':')
00242             if len(words) < 2:
00243                 continue
00244 
00245             speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal
00246             vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
00247             if unicode(speed).isnumeric():
00248                 mhz = float(speed)
00249                 
00250                 if mhz < 2240 and mhz > 2150:
00251                     lvl = max(lvl, DiagnosticStatus.WARN)
00252                 if mhz <= 2150:
00253                     lvl = max(lvl, DiagnosticStatus.ERROR)
00254             else:
00255                 # Automatically give error if speed isn't a number
00256                 lvl = max(lvl, DiagnosticStatus.ERROR)
00257 
00258         if not enforce_speed:
00259             lvl = DiagnosticStatus.OK
00260 
00261         if lvl == DiagnosticStatus.WARN and enforce_speed:
00262             msgs = [ 'Core slowing' ]
00263         elif lvl == DiagnosticStatus.ERROR and enforce_speed:
00264             msgs = [ 'Core throttled' ]
00265 
00266     except Exception, e:
00267         rospy.logerr(traceback.format_exc())
00268         lvl = DiagnosticStatus.ERROR
00269         msgs.append('Exception')
00270         vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00271 
00272     return vals, msgs, lvl
00273                     
00274 
00275 # Add msgs output, too
00276 ##\brief Uses 'uptime' to see load average
00277 def check_uptime(load1_threshold, load5_threshold):
00278     level = DiagnosticStatus.OK
00279     vals = []
00280     
00281     load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
00282 
00283     try:
00284         p = subprocess.Popen('uptime', stdout = subprocess.PIPE, 
00285                              stderr = subprocess.PIPE, shell = True)
00286         stdout, stderr = p.communicate()
00287         retcode = p.returncode
00288 
00289         if retcode != 0:
00290             vals.append(KeyValue(key = 'uptime Failed', value = stderr))
00291             return DiagnosticStatus.ERROR, vals
00292 
00293         upvals = stdout.split()
00294         load1 = upvals[-3].rstrip(',')
00295         load5 = upvals[-2].rstrip(',')
00296         load15 = upvals[-1]
00297         num_users = upvals[-7]
00298 
00299         # Give warning if we go over load limit 
00300         if float(load1) > load1_threshold or float(load5) > load5_threshold:
00301             level = DiagnosticStatus.WARN
00302 
00303         vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
00304         vals.append(KeyValue(key = '1 min Load Average', value = load1))
00305         vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
00306         vals.append(KeyValue(key = '5 min Load Average', value = load5))
00307         vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
00308         vals.append(KeyValue(key = '15 min Load Average', value = load15))
00309         vals.append(KeyValue(key = 'Number of Users', value = num_users))
00310 
00311     except Exception, e:
00312         rospy.logerr(traceback.format_exc())
00313         level = DiagnosticStatus.ERROR
00314         vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
00315         
00316     return level, load_dict[level], vals
00317 
00318 # Add msgs output
00319 ##\brief Uses 'free -m' to check free memory
00320 def check_memory():
00321     values = []
00322     level = DiagnosticStatus.OK
00323     msg = ''
00324 
00325     mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
00326 
00327     try:
00328         p = subprocess.Popen('free -m',
00329                              stdout = subprocess.PIPE,
00330                              stderr = subprocess.PIPE, shell = True)
00331         stdout, stderr = p.communicate()
00332         retcode = p.returncode
00333                
00334         if retcode != 0:
00335             values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
00336             return DiagnosticStatus.ERROR, values
00337  
00338         rows = stdout.split('\n')
00339         data = rows[1].split()
00340         total_mem = data[1]
00341         used_mem = data[2]
00342         free_mem = data[3]
00343 
00344         level = DiagnosticStatus.OK
00345         if float(free_mem) < 25:
00346             level = DiagnosticStatus.WARN
00347         if float(free_mem) < 1:
00348             level = DiagnosticStatus.ERROR
00349 
00350         values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
00351         values.append(KeyValue(key = 'Total Memory', value = total_mem))
00352         values.append(KeyValue(key = 'Used Memory', value = used_mem))
00353         values.append(KeyValue(key = 'Free Memory', value = free_mem))
00354 
00355         msg = mem_dict[level]
00356     except Exception, e:
00357         rospy.logerr(traceback.format_exc())
00358         msg = 'Memory Usage Check Error'
00359         values.append(KeyValue(key = msg, value = str(e)))
00360         level = DiagnosticStatus.ERROR
00361     
00362     return level, mem_dict[level], values
00363 
00364 
00365 
00366 ##\brief Use mpstat to find CPU usage
00367 ##
00368 usage_old = 0
00369 has_warned_mpstat = False
00370 has_error_core_count = False
00371 def check_mpstat(core_count = -1):
00372     vals = []
00373     mp_level = DiagnosticStatus.OK
00374     
00375     load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
00376 
00377     try:
00378         p = subprocess.Popen('mpstat -P ALL 1 1',
00379                              stdout = subprocess.PIPE,
00380                              stderr = subprocess.PIPE, shell = True)
00381         stdout, stderr = p.communicate()
00382         retcode = p.returncode
00383 
00384         if retcode != 0:
00385             global has_warned_mpstat
00386             if not has_warned_mpstat:
00387                 rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
00388                 has_warned_mpstat = True
00389 
00390             mp_level = DiagnosticStatus.ERROR
00391             vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
00392             return mp_level, 'Unable to Check CPU Usage', vals
00393 
00394         # Check which column '%idle' is, #4539
00395         # mpstat output changed between 8.06 and 8.1
00396         rows = stdout.split('\n')
00397         col_names = rows[2].split()
00398         idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
00399 
00400         num_cores = 0
00401         cores_loaded = 0
00402         for index, row in enumerate(stdout.split('\n')):
00403             if index < 3:
00404                 continue
00405             
00406             # Skip row containing 'all' data
00407             if row.find('all') > -1:
00408                 continue
00409 
00410             lst = row.split()
00411             if len(lst) < 8:
00412                 continue
00413 
00414             ## Ignore 'Average: ...' data
00415             if lst[0].startswith('Average'):
00416                 continue
00417 
00418             cpu_name = '%d' % (num_cores)
00419             idle = lst[idle_col]
00420             user = lst[3]
00421             nice = lst[4]
00422             system = lst[5]
00423             
00424             core_level = 0
00425             usage = float(user) + float(nice)
00426             if usage > 1000: # wrong reading, use old reading instead
00427                 rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
00428                 usage = usage_old
00429             usage_old = usage
00430 
00431             if usage > 90.0:
00432                 cores_loaded += 1
00433                 core_level = DiagnosticStatus.WARN
00434             if usage > 110.0:
00435                 core_level = DiagnosticStatus.ERROR
00436 
00437             vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
00438             vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
00439             vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
00440             vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
00441             vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
00442 
00443             num_cores += 1
00444         
00445         # Warn for high load only if we have <= 2 cores that aren't loaded
00446         if num_cores - cores_loaded <= 2 and num_cores > 2:
00447             mp_level = DiagnosticStatus.WARN
00448 
00449         # Check the number of cores if core_count > 0, #4850
00450         if core_count > 0 and core_count != num_cores:
00451             mp_level = DiagnosticStatus.ERROR
00452             global has_error_core_count
00453             if not has_error_core_count:
00454                 rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
00455                               core_count, num_cores)
00456                 has_error_core_count = True
00457             return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
00458             
00459     except Exception, e:
00460         mp_level = DiagnosticStatus.ERROR
00461         vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
00462 
00463     return mp_level, load_dict[mp_level], vals
00464 
00465 ## Returns names for core temperature files
00466 ## Returns list of names, each name can be read like file
00467 def get_core_temp_names():
00468     temp_vals = []
00469     try:
00470         p = subprocess.Popen('find /sys/devices -name temp1_input', 
00471                              stdout = subprocess.PIPE,
00472                              stderr = subprocess.PIPE, shell = True)
00473         stdout, stderr = p.communicate()
00474         retcode = p.returncode
00475 
00476         if retcode != 0:
00477             rospy.logerr('Error find core temp locations: %s' % stderr)
00478             return []
00479         
00480         for ln in stdout.split('\n'):
00481             temp_vals.append(ln.strip())
00482         
00483         return temp_vals
00484     except:
00485         rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
00486         return []
00487 
00488 def update_status_stale(stat, last_update_time):
00489     time_since_update = rospy.get_time() - last_update_time
00490 
00491     stale_status = 'OK'
00492     if time_since_update > 20 and time_since_update <= 35:
00493         stale_status = 'Lagging'
00494         if stat.level == DiagnosticStatus.OK:
00495             stat.message = stale_status
00496         elif stat.message.find(stale_status) < 0:
00497             stat.message = ', '.join([stat.message, stale_status])
00498         stat.level = max(stat.level, DiagnosticStatus.WARN)
00499     if time_since_update > 35:
00500         stale_status = 'Stale'
00501         if stat.level == DiagnosticStatus.OK:
00502             stat.message = stale_status
00503         elif stat.message.find(stale_status) < 0:
00504             stat.message = ', '.join([stat.message, stale_status])
00505         stat.level = max(stat.level, DiagnosticStatus.ERROR)
00506 
00507 
00508     stat.values.pop(0)
00509     stat.values.pop(0)
00510     stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00511     stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00512     
00513 
00514 class CPUMonitor():
00515     def __init__(self, hostname, diag_hostname):
00516         self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00517 
00518         self._mutex = threading.Lock()
00519 
00520         self._check_ipmi = rospy.get_param('~check_ipmi_tool', True)
00521         self._enforce_speed = rospy.get_param('~enforce_clock_speed', True)
00522 
00523         self._check_core_temps = rospy.get_param('~check_core_temps', False)
00524         if self._check_core_temps:
00525             rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
00526         self._check_nfs = rospy.get_param('~check_nfs', False)
00527         if self._check_nfs:
00528             rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
00529 
00530         self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
00531         self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
00532 
00533         self._num_cores = rospy.get_param('~num_cores', 8.0)
00534 
00535         self._temps_timer = None
00536         self._usage_timer = None
00537         self._nfs_timer = None
00538         
00539         # Get temp_input files
00540         self._temp_vals = get_core_temp_names()
00541 
00542         # CPU stats
00543         self._temp_stat = DiagnosticStatus()
00544         self._temp_stat.name = '%s CPU Temperature' % diag_hostname
00545         self._temp_stat.level = 1
00546         self._temp_stat.hardware_id = hostname
00547         self._temp_stat.message = 'No Data'
00548         self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00549                                    KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00550 
00551         self._usage_stat = DiagnosticStatus()
00552         self._usage_stat.name = '%s CPU Usage' % diag_hostname
00553         self._usage_stat.level = 1
00554         self._usage_stat.hardware_id = hostname
00555         self._usage_stat.message = 'No Data'
00556         self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00557                                     KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00558 
00559         self._nfs_stat = DiagnosticStatus()
00560         self._nfs_stat.name = '%s NFS IO' % diag_hostname
00561         self._nfs_stat.level = 1
00562         self._nfs_stat.hardware_id = hostname
00563         self._nfs_stat.message = 'No Data'
00564         self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00565                                   KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00566 
00567         self._last_temp_time = 0
00568         self._last_usage_time = 0
00569         self._last_nfs_time = 0
00570         self._last_publish_time = 0
00571 
00572         # Start checking everything
00573         self.check_temps()
00574         if self._check_nfs:
00575             self.check_nfs_stat()
00576         self.check_usage()
00577 
00578     # Restart temperature checking 
00579     def _restart_temp_check(self):
00580         rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen')
00581         try:
00582             with self._mutex:
00583                 if self._temps_timer:
00584                     self._temps_timer.cancel()
00585                 
00586             self.check_temps()
00587         except Exception, e:
00588             rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc())
00589             
00590 
00591     ## Must have the lock to cancel everything
00592     def cancel_timers(self):
00593         if self._temps_timer:
00594             self._temps_timer.cancel()
00595 
00596         if self._nfs_timer:
00597             self._nfs_timer.cancel()
00598 
00599         if self._usage_timer:
00600             self._usage_timer.cancel()
00601 
00602     def check_nfs_stat(self):
00603         if rospy.is_shutdown():
00604             with self._mutex:
00605                 self.cancel_timers()
00606                 return
00607 
00608         nfs_level = 0
00609         msg = 'OK'
00610         vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00611                  KeyValue(key = 'Time Since Last Update', value = str(0) )]
00612 
00613         try:
00614             p = subprocess.Popen('iostat -n',
00615                                  stdout = subprocess.PIPE,
00616                                  stderr = subprocess.PIPE, shell = True)
00617             stdout, stderr = p.communicate()
00618             retcode = p.returncode
00619             
00620             if retcode != 0:
00621                 nfs_level = DiagnosticStatus.ERROR
00622                 msg = 'iostat Error'
00623                 vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e)))
00624                 stdout = ''
00625                 
00626 
00627             for index, row in enumerate(stdout.split('\n')):
00628                 if index < 3:
00629                     continue
00630                 
00631                 lst = row.split()
00632                 if len(lst) < 7:
00633                     continue
00634                 
00635                 file_sys = lst[0]
00636                 read_blk = lst[1]
00637                 write_blk = lst[2]
00638                 read_blk_dir = lst[3]
00639                 write_blk_dir = lst[4]
00640                 r_blk_srv = lst[5]
00641                 w_blk_srv = lst[6]
00642                 
00643                 vals.append(KeyValue(
00644                         key = '%s Read Blks/s' % file_sys, value=read_blk))
00645                 vals.append(KeyValue(
00646                         key = '%s Write Blks/s' % file_sys, value=write_blk))
00647                 vals.append(KeyValue(
00648                         key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir))
00649                 vals.append(KeyValue(
00650                         key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir))
00651                 vals.append(KeyValue(
00652                         key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv))
00653                 vals.append(KeyValue(
00654                         key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv))
00655                 
00656         except Exception, e:
00657             rospy.logerr(traceback.format_exc())
00658             nfs_level = DiagnosticStatus.ERROR
00659             msg = 'Exception'
00660             vals.append(KeyValue(key = 'Exception', value = str(e)))
00661           
00662         with self._mutex:
00663             self._nfs_stat.level = nfs_level
00664             self._nfs_stat.message = msg
00665             self._nfs_stat.values = vals
00666             
00667             self._last_nfs_time = rospy.get_time()
00668             
00669             if not rospy.is_shutdown():
00670                 self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat)
00671                 self._nfs_timer.start()
00672             else:
00673                 self.cancel_timers()
00674 
00675 
00676     ## Call every 10sec at minimum
00677     def check_temps(self):
00678         if rospy.is_shutdown():
00679             with self._mutex:
00680                 self.cancel_timers()
00681             return
00682 
00683         diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00684                       KeyValue(key = 'Time Since Last Update', value = str(0) ) ]
00685         diag_msgs = []
00686         diag_level = 0
00687 
00688         if self._check_ipmi:
00689             ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi()
00690             diag_vals.extend(ipmi_vals)
00691             diag_msgs.extend(ipmi_msgs)
00692             diag_level = max(diag_level, ipmi_level)
00693 
00694         if self._check_core_temps:
00695             core_vals, core_msgs, core_level = check_core_temps(self._temp_vals)
00696             diag_vals.extend(core_vals)
00697             diag_msgs.extend(core_msgs)
00698             diag_level = max(diag_level, core_level)
00699 
00700         clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed)
00701         diag_vals.extend(clock_vals)
00702         diag_msgs.extend(clock_msgs)
00703         diag_level = max(diag_level, clock_level)
00704 
00705         diag_log = set(diag_msgs)
00706         if len(diag_log) > 0:
00707             message = ', '.join(diag_log)
00708         else:
00709             message = stat_dict[diag_level]
00710 
00711         with self._mutex:
00712             self._last_temp_time = rospy.get_time()
00713             
00714             self._temp_stat.level = diag_level
00715             self._temp_stat.message = message
00716             self._temp_stat.values = diag_vals
00717             
00718             if not rospy.is_shutdown():
00719                 self._temps_timer = threading.Timer(5.0, self.check_temps)
00720                 self._temps_timer.start()
00721             else:
00722                 self.cancel_timers()
00723 
00724     def check_usage(self):
00725         if rospy.is_shutdown():
00726             with self._mutex:
00727                 self.cancel_timers()
00728             return 
00729 
00730         diag_level = 0
00731         diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00732                       KeyValue(key = 'Time Since Last Update', value = 0 )]
00733         diag_msgs = []
00734 
00735         # Check mpstat
00736         mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores)
00737         diag_vals.extend(mp_vals)
00738         if mp_level > 0:
00739             diag_msgs.append(mp_msg)
00740         diag_level = max(diag_level, mp_level)
00741             
00742         # Check uptime
00743         uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold)
00744         diag_vals.extend(up_vals)
00745         if uptime_level > 0:
00746             diag_msgs.append(up_msg)
00747         diag_level = max(diag_level, uptime_level)
00748         
00749         # Check memory
00750         mem_level, mem_msg, mem_vals = check_memory()
00751         diag_vals.extend(mem_vals)
00752         if mem_level > 0:
00753             diag_msgs.append(mem_msg)
00754         diag_level = max(diag_level, mem_level)
00755 
00756         if diag_msgs and diag_level > 0:
00757             usage_msg = ', '.join(set(diag_msgs))
00758         else:
00759             usage_msg = stat_dict[diag_level]
00760 
00761         # Update status
00762         with self._mutex:
00763             self._last_usage_time = rospy.get_time()
00764             self._usage_stat.level = diag_level
00765             self._usage_stat.values = diag_vals
00766             
00767             self._usage_stat.message = usage_msg
00768             
00769             if not rospy.is_shutdown():
00770                 self._usage_timer = threading.Timer(5.0, self.check_usage)
00771                 self._usage_timer.start()
00772             else:
00773                 self.cancel_timers()
00774 
00775     def publish_stats(self):
00776         with self._mutex:
00777             # Update everything with last update times
00778             update_status_stale(self._temp_stat, self._last_temp_time)
00779             update_status_stale(self._usage_stat, self._last_usage_time)
00780             if self._check_nfs:
00781                 update_status_stale(self._nfs_stat, self._last_nfs_time)
00782 
00783             msg = DiagnosticArray()
00784             msg.header.stamp = rospy.get_rostime()
00785             msg.status.append(self._temp_stat)
00786             msg.status.append(self._usage_stat)
00787             if self._check_nfs:
00788                 msg.status.append(self._nfs_stat)
00789 
00790             if rospy.get_time() - self._last_publish_time > 0.5:
00791                 self._diag_pub.publish(msg)
00792                 self._last_publish_time = rospy.get_time()
00793 
00794         
00795         # Restart temperature checking if it goes stale, #4171
00796         # Need to run this without mutex
00797         if rospy.get_time() - self._last_temp_time > 90: 
00798             self._restart_temp_check()
00799 
00800 
00801 if __name__ == '__main__':
00802     hostname = socket.gethostname()
00803 
00804     import optparse
00805     parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
00806     parser.add_option("--diag-hostname", dest="diag_hostname",
00807                       help="Computer name in diagnostics output (ex: 'c1')",
00808                       metavar="DIAG_HOSTNAME",
00809                       action="store", default = hostname)
00810     options, args = parser.parse_args(rospy.myargv())
00811 
00812     try:
00813         rospy.init_node('cpu_monitor_%s' % hostname)
00814     except rospy.exceptions.ROSInitException:
00815         print >> sys.stderr, 'CPU monitor is unable to initialize node. Master may not be running.'
00816         sys.exit(0)
00817 
00818     cpu_node = CPUMonitor(hostname, options.diag_hostname)
00819 
00820     rate = rospy.Rate(1.0)
00821     try:
00822         while not rospy.is_shutdown():
00823             rate.sleep()
00824             cpu_node.publish_stats()
00825     except KeyboardInterrupt:
00826         pass
00827     except Exception, e:
00828         traceback.print_exc()
00829         rospy.logerr(traceback.format_exc())
00830 
00831     cpu_node.cancel_timers()
00832     sys.exit(0)
00833     
00834 
00835 
00836     
00837 
00838             
00839 


pr2_computer_monitor
Author(s): Kevin Watts (watts@willowgarage.com)
autogenerated on Fri Dec 6 2013 20:23:59