cpu_monitor.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Software License Agreement (BSD License)
00004 #
00005 # Copyright (c) 2009, Willow Garage, Inc.
00006 # All rights reserved.
00007 #
00008 # Redistribution and use in source and binary forms, with or without
00009 # modification, are permitted provided that the following conditions
00010 # are met:
00011 #
00012 #  * Redistributions of source code must retain the above copyright
00013 #    notice, this list of conditions and the following disclaimer.
00014 #  * Redistributions in binary form must reproduce the above
00015 #    copyright notice, this list of conditions and the following
00016 #    disclaimer in the documentation and/or other materials provided
00017 #    with the distribution.
00018 #  * Neither the name of the Willow Garage nor the names of its
00019 #    contributors may be used to endorse or promote products derived
00020 #    from this software without specific prior written permission.
00021 #
00022 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00023 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00024 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00025 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
00026 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00027 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00028 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00029 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00030 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00031 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00032 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00033 # POSSIBILITY OF SUCH DAMAGE.
00034 
00035 ##\author Kevin Watts
00036 
00037 from __future__ import with_statement
00038 import roslib
00039 #roslib.load_manifest('pr2_computer_monitor')
00040 roslib.load_manifest('labust_diagnostics')
00041 import rospy
00042 
00043 import traceback
00044 import threading
00045 from threading import Timer
00046 import sys, os, time
00047 from time import sleep
00048 import subprocess
00049 import string
00050 
00051 import socket
00052 
00053 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00054 
00055 ##### monkey-patch to suppress threading error message in python 2.7.3
00056 ##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug
00057 if sys.version_info[:3] == (2, 7, 3):
00058     import threading
00059     threading._DummyThread._Thread__stop = lambda x: 42
00060 #####
00061 
00062 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00063 
00064 # Output entire IPMI data set
00065 def check_ipmi():
00066     diag_vals = []
00067     diag_msgs = []
00068     diag_level = DiagnosticStatus.OK
00069 
00070     try:
00071         p = subprocess.Popen('sudo ipmitool sdr',
00072                              stdout = subprocess.PIPE,
00073                              stderr = subprocess.PIPE, shell = True)
00074         stdout, stderr = p.communicate()
00075         retcode = p.returncode
00076                         
00077         if retcode != 0:
00078             diag_level = DiagnosticStatus.ERROR
00079             diag_msgs = [ 'ipmitool Error' ]
00080             diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
00081             return diag_vals, diag_msgs, diag_level
00082 
00083         lines = stdout.split('\n')
00084         if len(lines) < 2:
00085             diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
00086 
00087             diag_msgs = [ 'No ipmitool response' ]
00088             diag_level = DiagnosticStatus.ERROR
00089 
00090             return diag_vals, diag_msgs, diag_level
00091 
00092         for ln in lines:
00093             if len(ln) < 3:
00094                 continue
00095 
00096             words = ln.split('|')
00097             if len(words) < 3:
00098                 continue 
00099 
00100             name = words[0].strip()
00101             ipmi_val = words[1].strip()
00102             stat_byte = words[2].strip()
00103 
00104             # CPU temps
00105             if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
00106                 if words[1].strip().endswith('degrees C'):
00107                     tmp = ipmi_val.rstrip(' degrees C').lstrip()
00108                     if unicode(tmp).isnumeric():
00109                         temperature = float(tmp)
00110                         diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00111 
00112                         cpu_name = name.split()[0]
00113                         if temperature >= 80 and temperature < 89:
00114                             diag_level = max(diag_level, DiagnosticStatus.WARN)
00115                             if diag_msgs.count('CPU Hot') == 0:
00116                                 diag_msgs.append('CPU Warm')
00117 
00118                         if temperature >= 89: # CPU should shut down here
00119                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00120                             diag_msgs.append('CPU Hot')                                
00121                             # Don't keep CPU Warm in list if CPU is hot
00122                             if diag_msgs.count('CPU Warm') > 0:
00123                                 idx = diag_msgs.index('CPU Warm')
00124                                 diag_msgs.pop(idx)
00125                 else:
00126                     diag_vals.append(KeyValue(key = name, value = words[1]))
00127 
00128 
00129             # MP, BP, FP temps
00130             if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
00131                 if ipmi_val.endswith('degrees C'):
00132                     tmp = ipmi_val.rstrip(' degrees C').lstrip()
00133                     diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00134                     # Give temp warning
00135                     dev_name = name.split()[0]
00136                     if unicode(tmp).isnumeric():
00137                         temperature = float(tmp)
00138 
00139                         if temperature >= 60 and temperature < 75:
00140                             diag_level = max(diag_level, DiagnosticStatus.WARN)
00141                             diag_msgs.append('%s Warm' % dev_name)
00142 
00143                         if temperature >= 75:
00144                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00145                             diag_msgs.append('%s Hot' % dev_name)
00146                     else:
00147                         diag_level = max(diag_level, DiagnosticStatus.ERROR)
00148                         diag_msgs.append('%s Error' % dev_name)
00149                 else:
00150                     diag_vals.append(KeyValue(key = name, value = ipmi_val))
00151         
00152             # CPU fan speeds
00153             if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
00154                 if ipmi_val.endswith('RPM'):
00155                     rpm = ipmi_val.rstrip(' RPM').lstrip()
00156                     if unicode(rpm).isnumeric():
00157                         if int(rpm) == 0:
00158                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00159                             diag_msgs.append('CPU Fan Off')
00160                             
00161                         diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
00162                     else:
00163                         diag_vals.append(KeyValue(key = name, value = ipmi_val))
00164 
00165             # If CPU is hot we get an alarm from ipmitool, report that too
00166             # CPU should shut down if we get a hot alarm, so report as error
00167             if name.startswith('CPU') and name.endswith('hot'):
00168                 if ipmi_val == '0x01':
00169                     diag_vals.append(KeyValue(key = name, value = 'OK'))
00170                 else:
00171                     diag_vals.append(KeyValue(key = name, value = 'Hot'))
00172                     diag_level = max(diag_level, DiagnosticStatus.ERROR)
00173                     diag_msgs.append('CPU Hot Alarm')
00174 
00175     except Exception, e:
00176         diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00177         diag_level = DiagnosticStatus.ERROR
00178         diag_msgs.append('Exception')
00179 
00180     return diag_vals, diag_msgs, diag_level
00181         
00182 
00183 ##\brief Check CPU core temps 
00184 ##
00185 ## Use 'find /sys -name temp1_input' to find cores
00186 ## Read from every core, divide by 1000
00187 def check_core_temps(sys_temp_strings):
00188     diag_vals = []
00189     diag_level = 0
00190     diag_msgs = []
00191     
00192     for index, temp_str in enumerate(sys_temp_strings):
00193         if len(temp_str) < 5:
00194             continue
00195         
00196         cmd = 'cat %s' % temp_str
00197         p = subprocess.Popen(cmd, stdout = subprocess.PIPE, 
00198                              stderr = subprocess.PIPE, shell = True)
00199         stdout, stderr = p.communicate()
00200         retcode = p.returncode
00201 
00202         if retcode != 0:
00203             diag_level = DiagnosticStatus.ERROR
00204             diag_msg = [ 'Core Temp Error' ]
00205             diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr), 
00206                           KeyValue(key = 'Output', value = stdout) ]
00207             return diag_vals, diag_msgs, diag_level
00208   
00209         tmp = stdout.strip()
00210         if unicode(tmp).isnumeric():
00211             temp = float(tmp) / 1000
00212             diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
00213 
00214             if temp >= 85 and temp < 90:
00215                 diag_level = max(diag_level, DiagnosticStatus.WARN)
00216                 diag_msgs.append('Warm')
00217             if temp >= 90:
00218                 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00219                 diag_msgs.append('Hot')
00220         else:
00221             diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
00222             diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
00223 
00224     return diag_vals, diag_msgs, diag_level
00225 
00226 ## Checks clock speed from reading from CPU info
00227 def check_clock_speed(enforce_speed):
00228     vals = []
00229     msgs = []
00230     lvl = DiagnosticStatus.OK
00231 
00232     try:
00233         p = subprocess.Popen('cat /proc/cpuinfo | grep MHz', 
00234                              stdout = subprocess.PIPE,
00235                              stderr = subprocess.PIPE, shell = True)
00236         stdout, stderr = p.communicate()
00237         retcode = p.returncode
00238 
00239         if retcode != 0:
00240             lvl = DiagnosticStatus.ERROR
00241             msgs = [ 'Clock speed error' ]
00242             vals = [ KeyValue(key = 'Clock speed error', value = stderr), 
00243                      KeyValue(key = 'Output', value = stdout) ]
00244             
00245             return (vals, msgs, lvl)
00246 
00247         for index, ln in enumerate(stdout.split('\n')):
00248             words = ln.split(':')
00249             if len(words) < 2:
00250                 continue
00251 
00252             speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal
00253             vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
00254             if unicode(speed).isnumeric():
00255                 mhz = float(speed)
00256                 
00257                 if mhz < 2240 and mhz > 2150:
00258                     lvl = max(lvl, DiagnosticStatus.WARN)
00259                 if mhz <= 2150:
00260                     lvl = max(lvl, DiagnosticStatus.ERROR)
00261             else:
00262                 # Automatically give error if speed isn't a number
00263                 lvl = max(lvl, DiagnosticStatus.ERROR)
00264 
00265         if not enforce_speed:
00266             lvl = DiagnosticStatus.OK
00267 
00268         if lvl == DiagnosticStatus.WARN and enforce_speed:
00269             msgs = [ 'Core slowing' ]
00270         elif lvl == DiagnosticStatus.ERROR and enforce_speed:
00271             msgs = [ 'Core throttled' ]
00272 
00273     except Exception, e:
00274         rospy.logerr(traceback.format_exc())
00275         lvl = DiagnosticStatus.ERROR
00276         msgs.append('Exception')
00277         vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00278 
00279     return vals, msgs, lvl
00280                     
00281 
00282 # Add msgs output, too
00283 ##\brief Uses 'uptime' to see load average
00284 def check_uptime(load1_threshold, load5_threshold):
00285     level = DiagnosticStatus.OK
00286     vals = []
00287     
00288     load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
00289 
00290     try:
00291         p = subprocess.Popen('uptime', stdout = subprocess.PIPE, 
00292                              stderr = subprocess.PIPE, shell = True)
00293         stdout, stderr = p.communicate()
00294         retcode = p.returncode
00295 
00296         if retcode != 0:
00297             vals.append(KeyValue(key = 'uptime Failed', value = stderr))
00298             return DiagnosticStatus.ERROR, vals
00299 
00300         upvals = stdout.split()
00301         load1 = upvals[-3].rstrip(',')
00302         load5 = upvals[-2].rstrip(',')
00303         load15 = upvals[-1]
00304         num_users = upvals[-7]
00305 
00306         # Give warning if we go over load limit 
00307         if float(load1) > load1_threshold or float(load5) > load5_threshold:
00308             level = DiagnosticStatus.WARN
00309 
00310         vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
00311         vals.append(KeyValue(key = '1 min Load Average', value = load1))
00312         vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
00313         vals.append(KeyValue(key = '5 min Load Average', value = load5))
00314         vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
00315         vals.append(KeyValue(key = '15 min Load Average', value = load15))
00316         vals.append(KeyValue(key = 'Number of Users', value = num_users))
00317 
00318     except Exception, e:
00319         rospy.logerr(traceback.format_exc())
00320         level = DiagnosticStatus.ERROR
00321         vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
00322         
00323     return level, load_dict[level], vals
00324 
00325 # Add msgs output
00326 ##\brief Uses 'free -m' to check free memory
00327 def check_memory():
00328     values = []
00329     level = DiagnosticStatus.OK
00330     msg = ''
00331 
00332     mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
00333 
00334     try:
00335         p = subprocess.Popen('free -m',
00336                              stdout = subprocess.PIPE,
00337                              stderr = subprocess.PIPE, shell = True)
00338         stdout, stderr = p.communicate()
00339         retcode = p.returncode
00340                
00341         if retcode != 0:
00342             values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
00343             return DiagnosticStatus.ERROR, values
00344  
00345         rows = stdout.split('\n')
00346         data = rows[1].split()
00347         total_mem = data[1]
00348         used_mem = data[2]
00349         free_mem = data[3]
00350 
00351         level = DiagnosticStatus.OK
00352         if float(free_mem) < 25:
00353             level = DiagnosticStatus.WARN
00354         if float(free_mem) < 1:
00355             level = DiagnosticStatus.ERROR
00356 
00357         values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
00358         values.append(KeyValue(key = 'Total Memory', value = total_mem))
00359         values.append(KeyValue(key = 'Used Memory', value = used_mem))
00360         values.append(KeyValue(key = 'Free Memory', value = free_mem))
00361 
00362         msg = mem_dict[level]
00363     except Exception, e:
00364         rospy.logerr(traceback.format_exc())
00365         msg = 'Memory Usage Check Error'
00366         values.append(KeyValue(key = msg, value = str(e)))
00367         level = DiagnosticStatus.ERROR
00368     
00369     return level, mem_dict[level], values
00370 
00371 
00372 
00373 ##\brief Use mpstat to find CPU usage
00374 ##
00375 usage_old = 0
00376 has_warned_mpstat = False
00377 has_error_core_count = False
00378 def check_mpstat(core_count = -1):
00379     vals = []
00380     mp_level = DiagnosticStatus.OK
00381     
00382     load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
00383 
00384     try:
00385         p = subprocess.Popen('mpstat -P ALL 1 1',
00386                              stdout = subprocess.PIPE,
00387                              stderr = subprocess.PIPE, shell = True)
00388         stdout, stderr = p.communicate()
00389         retcode = p.returncode
00390 
00391         if retcode != 0:
00392             global has_warned_mpstat
00393             if not has_warned_mpstat:
00394                 rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
00395                 has_warned_mpstat = True
00396 
00397             mp_level = DiagnosticStatus.ERROR
00398             vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
00399             return mp_level, 'Unable to Check CPU Usage', vals
00400 
00401         # Check which column '%idle' is, #4539
00402         # mpstat output changed between 8.06 and 8.1
00403         rows = stdout.split('\n')
00404         col_names = rows[2].split()
00405         idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
00406 
00407         num_cores = 0
00408         cores_loaded = 0
00409         for index, row in enumerate(stdout.split('\n')):
00410             if index < 3:
00411                 continue
00412             
00413             # Skip row containing 'all' data
00414             if row.find('all') > -1:
00415                 continue
00416 
00417             lst = row.split()
00418             if len(lst) < 8:
00419                 continue
00420 
00421             ## Ignore 'Average: ...' data
00422             if lst[0].startswith('Average'):
00423                 continue
00424 
00425             cpu_name = '%d' % (num_cores)
00426             idle = lst[idle_col].replace(',', '.')
00427             user = lst[3].replace(',', '.')
00428             nice = lst[4].replace(',', '.')
00429             system = lst[5].replace(',', '.')
00430             
00431             core_level = 0
00432             usage = float(user) + float(nice)
00433             if usage > 1000: # wrong reading, use old reading instead
00434                 rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
00435                 usage = usage_old
00436             usage_old = usage
00437 
00438             if usage > 90.0:
00439                 cores_loaded += 1
00440                 core_level = DiagnosticStatus.WARN
00441             if usage > 110.0:
00442                 core_level = DiagnosticStatus.ERROR
00443 
00444             vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
00445             vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
00446             vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
00447             vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
00448             vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
00449 
00450             num_cores += 1
00451         
00452         # Warn for high load only if we have <= 2 cores that aren't loaded
00453         if num_cores - cores_loaded <= 2 and num_cores > 2:
00454             mp_level = DiagnosticStatus.WARN
00455 
00456         # Check the number of cores if core_count > 0, #4850
00457         if core_count > 0 and core_count != num_cores:
00458             mp_level = DiagnosticStatus.ERROR
00459             global has_error_core_count
00460             if not has_error_core_count:
00461                 rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
00462                               core_count, num_cores)
00463                 has_error_core_count = True
00464             return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
00465             
00466     except Exception, e:
00467         mp_level = DiagnosticStatus.ERROR
00468         vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
00469 
00470     return mp_level, load_dict[mp_level], vals
00471 
00472 ## Returns names for core temperature files
00473 ## Returns list of names, each name can be read like file
00474 def get_core_temp_names():
00475     temp_vals = []
00476     try:
00477         p = subprocess.Popen('find /sys/devices -name temp1_input', 
00478                              stdout = subprocess.PIPE,
00479                              stderr = subprocess.PIPE, shell = True)
00480         stdout, stderr = p.communicate()
00481         retcode = p.returncode
00482 
00483         if retcode != 0:
00484             rospy.logerr('Error find core temp locations: %s' % stderr)
00485             return []
00486         
00487         for ln in stdout.split('\n'):
00488             temp_vals.append(ln.strip())
00489         
00490         return temp_vals
00491     except:
00492         rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
00493         return []
00494 
00495 def update_status_stale(stat, last_update_time):
00496     time_since_update = rospy.get_time() - last_update_time
00497 
00498     stale_status = 'OK'
00499     if time_since_update > 20 and time_since_update <= 35:
00500         stale_status = 'Lagging'
00501         if stat.level == DiagnosticStatus.OK:
00502             stat.message = stale_status
00503         elif stat.message.find(stale_status) < 0:
00504             stat.message = ', '.join([stat.message, stale_status])
00505         stat.level = max(stat.level, DiagnosticStatus.WARN)
00506     if time_since_update > 35:
00507         stale_status = 'Stale'
00508         if stat.level == DiagnosticStatus.OK:
00509             stat.message = stale_status
00510         elif stat.message.find(stale_status) < 0:
00511             stat.message = ', '.join([stat.message, stale_status])
00512         stat.level = max(stat.level, DiagnosticStatus.ERROR)
00513 
00514 
00515     stat.values.pop(0)
00516     stat.values.pop(0)
00517     stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00518     stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00519     
00520 
00521 class CPUMonitor():
00522     def __init__(self, hostname, diag_hostname):
00523         self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00524 
00525         self._mutex = threading.Lock()
00526 
00527         self._check_ipmi = rospy.get_param('~check_ipmi_tool', True)
00528         self._enforce_speed = rospy.get_param('~enforce_clock_speed', True)
00529 
00530         self._check_core_temps = rospy.get_param('~check_core_temps', False)
00531         if self._check_core_temps:
00532             rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
00533         self._check_nfs = rospy.get_param('~check_nfs', False)
00534         if self._check_nfs:
00535             rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
00536 
00537         self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
00538         self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
00539 
00540         self._num_cores = rospy.get_param('~num_cores', 8.0)
00541 
00542         self._temps_timer = None
00543         self._usage_timer = None
00544         self._nfs_timer = None
00545         
00546         # Get temp_input files
00547         self._temp_vals = get_core_temp_names()
00548 
00549         # CPU stats
00550         self._temp_stat = DiagnosticStatus()
00551         self._temp_stat.name = '%s CPU Temperature' % diag_hostname
00552         self._temp_stat.level = 1
00553         self._temp_stat.hardware_id = hostname
00554         self._temp_stat.message = 'No Data'
00555         self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00556                                    KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00557 
00558         self._usage_stat = DiagnosticStatus()
00559         self._usage_stat.name = '%s CPU Usage' % diag_hostname
00560         self._usage_stat.level = 1
00561         self._usage_stat.hardware_id = hostname
00562         self._usage_stat.message = 'No Data'
00563         self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00564                                     KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00565 
00566         self._nfs_stat = DiagnosticStatus()
00567         self._nfs_stat.name = '%s NFS IO' % diag_hostname
00568         self._nfs_stat.level = 1
00569         self._nfs_stat.hardware_id = hostname
00570         self._nfs_stat.message = 'No Data'
00571         self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00572                                   KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00573 
00574         self._last_temp_time = 0
00575         self._last_usage_time = 0
00576         self._last_nfs_time = 0
00577         self._last_publish_time = 0
00578 
00579         # Start checking everything
00580         self.check_temps()
00581         if self._check_nfs:
00582             self.check_nfs_stat()
00583         self.check_usage()
00584 
00585     # Restart temperature checking 
00586     def _restart_temp_check(self):
00587         rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen')
00588         try:
00589             with self._mutex:
00590                 if self._temps_timer:
00591                     self._temps_timer.cancel()
00592                 
00593             self.check_temps()
00594         except Exception, e:
00595             rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc())
00596             
00597 
00598     ## Must have the lock to cancel everything
00599     def cancel_timers(self):
00600         if self._temps_timer:
00601             self._temps_timer.cancel()
00602 
00603         if self._nfs_timer:
00604             self._nfs_timer.cancel()
00605 
00606         if self._usage_timer:
00607             self._usage_timer.cancel()
00608 
00609     def check_nfs_stat(self):
00610         if rospy.is_shutdown():
00611             with self._mutex:
00612                 self.cancel_timers()
00613                 return
00614 
00615         nfs_level = 0
00616         msg = 'OK'
00617         vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00618                  KeyValue(key = 'Time Since Last Update', value = str(0) )]
00619 
00620         try:
00621             p = subprocess.Popen('iostat -n',
00622                                  stdout = subprocess.PIPE,
00623                                  stderr = subprocess.PIPE, shell = True)
00624             stdout, stderr = p.communicate()
00625             retcode = p.returncode
00626             
00627             if retcode != 0:
00628                 nfs_level = DiagnosticStatus.ERROR
00629                 msg = 'iostat Error'
00630                 vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e)))
00631                 stdout = ''
00632                 
00633 
00634             for index, row in enumerate(stdout.split('\n')):
00635                 if index < 3:
00636                     continue
00637                 
00638                 lst = row.split()
00639                 if len(lst) < 7:
00640                     continue
00641                 
00642                 file_sys = lst[0]
00643                 read_blk = lst[1]
00644                 write_blk = lst[2]
00645                 read_blk_dir = lst[3]
00646                 write_blk_dir = lst[4]
00647                 r_blk_srv = lst[5]
00648                 w_blk_srv = lst[6]
00649                 
00650                 vals.append(KeyValue(
00651                         key = '%s Read Blks/s' % file_sys, value=read_blk))
00652                 vals.append(KeyValue(
00653                         key = '%s Write Blks/s' % file_sys, value=write_blk))
00654                 vals.append(KeyValue(
00655                         key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir))
00656                 vals.append(KeyValue(
00657                         key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir))
00658                 vals.append(KeyValue(
00659                         key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv))
00660                 vals.append(KeyValue(
00661                         key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv))
00662                 
00663         except Exception, e:
00664             rospy.logerr(traceback.format_exc())
00665             nfs_level = DiagnosticStatus.ERROR
00666             msg = 'Exception'
00667             vals.append(KeyValue(key = 'Exception', value = str(e)))
00668           
00669         with self._mutex:
00670             self._nfs_stat.level = nfs_level
00671             self._nfs_stat.message = msg
00672             self._nfs_stat.values = vals
00673             
00674             self._last_nfs_time = rospy.get_time()
00675             
00676             if not rospy.is_shutdown():
00677                 self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat)
00678                 self._nfs_timer.start()
00679             else:
00680                 self.cancel_timers()
00681 
00682 
00683     ## Call every 10sec at minimum
00684     def check_temps(self):
00685         if rospy.is_shutdown():
00686             with self._mutex:
00687                 self.cancel_timers()
00688             return
00689 
00690         diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00691                       KeyValue(key = 'Time Since Last Update', value = str(0) ) ]
00692         diag_msgs = []
00693         diag_level = 0
00694 
00695         if self._check_ipmi:
00696             ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi()
00697             diag_vals.extend(ipmi_vals)
00698             diag_msgs.extend(ipmi_msgs)
00699             diag_level = max(diag_level, ipmi_level)
00700 
00701         if self._check_core_temps:
00702             core_vals, core_msgs, core_level = check_core_temps(self._temp_vals)
00703             diag_vals.extend(core_vals)
00704             diag_msgs.extend(core_msgs)
00705             diag_level = max(diag_level, core_level)
00706 
00707         clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed)
00708         diag_vals.extend(clock_vals)
00709         diag_msgs.extend(clock_msgs)
00710         diag_level = max(diag_level, clock_level)
00711 
00712         diag_log = set(diag_msgs)
00713         if len(diag_log) > 0:
00714             message = ', '.join(diag_log)
00715         else:
00716             message = stat_dict[diag_level]
00717 
00718         with self._mutex:
00719             self._last_temp_time = rospy.get_time()
00720             
00721             self._temp_stat.level = diag_level
00722             self._temp_stat.message = message
00723             self._temp_stat.values = diag_vals
00724             
00725             if not rospy.is_shutdown():
00726                 self._temps_timer = threading.Timer(5.0, self.check_temps)
00727                 self._temps_timer.start()
00728             else:
00729                 self.cancel_timers()
00730 
00731     def check_usage(self):
00732         if rospy.is_shutdown():
00733             with self._mutex:
00734                 self.cancel_timers()
00735             return 
00736 
00737         diag_level = 0
00738         diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00739                       KeyValue(key = 'Time Since Last Update', value = 0 )]
00740         diag_msgs = []
00741 
00742         # Check mpstat
00743         mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores)
00744         diag_vals.extend(mp_vals)
00745         if mp_level > 0:
00746             diag_msgs.append(mp_msg)
00747         diag_level = max(diag_level, mp_level)
00748             
00749         # Check uptime
00750         uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold)
00751         diag_vals.extend(up_vals)
00752         if uptime_level > 0:
00753             diag_msgs.append(up_msg)
00754         diag_level = max(diag_level, uptime_level)
00755         
00756         # Check memory
00757         mem_level, mem_msg, mem_vals = check_memory()
00758         diag_vals.extend(mem_vals)
00759         if mem_level > 0:
00760             diag_msgs.append(mem_msg)
00761         diag_level = max(diag_level, mem_level)
00762 
00763         if diag_msgs and diag_level > 0:
00764             usage_msg = ', '.join(set(diag_msgs))
00765         else:
00766             usage_msg = stat_dict[diag_level]
00767 
00768         # Update status
00769         with self._mutex:
00770             self._last_usage_time = rospy.get_time()
00771             self._usage_stat.level = diag_level
00772             self._usage_stat.values = diag_vals
00773             
00774             self._usage_stat.message = usage_msg
00775             
00776             if not rospy.is_shutdown():
00777                 self._usage_timer = threading.Timer(5.0, self.check_usage)
00778                 self._usage_timer.start()
00779             else:
00780                 self.cancel_timers()
00781 
00782     def publish_stats(self):
00783         with self._mutex:
00784             # Update everything with last update times
00785             update_status_stale(self._temp_stat, self._last_temp_time)
00786             update_status_stale(self._usage_stat, self._last_usage_time)
00787             if self._check_nfs:
00788                 update_status_stale(self._nfs_stat, self._last_nfs_time)
00789 
00790             msg = DiagnosticArray()
00791             msg.header.stamp = rospy.get_rostime()
00792             msg.status.append(self._temp_stat)
00793             msg.status.append(self._usage_stat)
00794             if self._check_nfs:
00795                 msg.status.append(self._nfs_stat)
00796 
00797             if rospy.get_time() - self._last_publish_time > 0.5:
00798                 self._diag_pub.publish(msg)
00799                 self._last_publish_time = rospy.get_time()
00800 
00801         
00802         # Restart temperature checking if it goes stale, #4171
00803         # Need to run this without mutex
00804         if rospy.get_time() - self._last_temp_time > 90: 
00805             self._restart_temp_check()
00806 
00807 
00808 if __name__ == '__main__':
00809     hostname = socket.gethostname()
00810 
00811     import optparse
00812     parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
00813     parser.add_option("--diag-hostname", dest="diag_hostname",
00814                       help="Computer name in diagnostics output (ex: 'c1')",
00815                       metavar="DIAG_HOSTNAME",
00816                       action="store", default = hostname)
00817     options, args = parser.parse_args(rospy.myargv())
00818 
00819     try:
00820         rospy.init_node('cpu_monitor_%s' % hostname)
00821     except rospy.exceptions.ROSInitException:
00822         print >> sys.stderr, 'CPU monitor is unable to initialize node. Master may not be running.'
00823         sys.exit(0)
00824 
00825     cpu_node = CPUMonitor(hostname, options.diag_hostname)
00826 
00827     rate = rospy.Rate(1.0)
00828     try:
00829         while not rospy.is_shutdown():
00830             rate.sleep()
00831             cpu_node.publish_stats()
00832     except KeyboardInterrupt:
00833         pass
00834     except Exception, e:
00835         traceback.print_exc()
00836         rospy.logerr(traceback.format_exc())
00837 
00838     cpu_node.cancel_timers()
00839     sys.exit(0)
00840     
00841 
00842 
00843     
00844 
00845             
00846 


labust_diagnostics
Author(s): Ivor Rendulic
autogenerated on Fri Aug 28 2015 11:23:40