cpu_monitor.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Software License Agreement (BSD License)
00004 #
00005 # Copyright (c) 2009, Willow Garage, Inc.
00006 # All rights reserved.
00007 #
00008 # Redistribution and use in source and binary forms, with or without
00009 # modification, are permitted provided that the following conditions
00010 # are met:
00011 #
00012 #  * Redistributions of source code must retain the above copyright
00013 #    notice, this list of conditions and the following disclaimer.
00014 #  * Redistributions in binary form must reproduce the above
00015 #    copyright notice, this list of conditions and the following
00016 #    disclaimer in the documentation and/or other materials provided
00017 #    with the distribution.
00018 #  * Neither the name of the Willow Garage nor the names of its
00019 #    contributors may be used to endorse or promote products derived
00020 #    from this software without specific prior written permission.
00021 #
00022 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00023 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00024 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00025 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
00026 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00027 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00028 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00029 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00030 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00031 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00032 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00033 # POSSIBILITY OF SUCH DAMAGE.
00034 
00035 ##\author Kevin Watts
00036 
00037 from __future__ import with_statement
00038 import roslib
00039 roslib.load_manifest('pr2_computer_monitor')
00040 
00041 import rospy
00042 
00043 import traceback
00044 import threading
00045 from threading import Timer
00046 import sys, os, time
00047 from time import sleep
00048 import subprocess
00049 import string
00050 import multiprocessing
00051 import socket
00052 
00053 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00054 
00055 ##### monkey-patch to suppress threading error message in python 2.7.3
00056 ##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug
00057 if sys.version_info[:3] == (2, 7, 3):
00058     import threading
00059     threading._DummyThread._Thread__stop = lambda x: 42
00060 #####
00061 
00062 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00063 
00064 # Output entire IPMI data set
00065 def check_ipmi():
00066     diag_vals = []
00067     diag_msgs = []
00068     diag_level = DiagnosticStatus.OK
00069 
00070     try:
00071         p = subprocess.Popen('sudo ipmitool sdr',
00072                              stdout = subprocess.PIPE,
00073                              stderr = subprocess.PIPE, shell = True)
00074         stdout, stderr = p.communicate()
00075         retcode = p.returncode
00076                         
00077         if retcode != 0:
00078             diag_level = DiagnosticStatus.ERROR
00079             diag_msgs = [ 'ipmitool Error' ]
00080             diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ]
00081             return diag_vals, diag_msgs, diag_level
00082 
00083         lines = stdout.split('\n')
00084         if len(lines) < 2:
00085             diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
00086 
00087             diag_msgs = [ 'No ipmitool response' ]
00088             diag_level = DiagnosticStatus.ERROR
00089 
00090             return diag_vals, diag_msgs, diag_level
00091 
00092         for ln in lines:
00093             if len(ln) < 3:
00094                 continue
00095 
00096             words = ln.split('|')
00097             if len(words) < 3:
00098                 continue 
00099 
00100             name = words[0].strip()
00101             ipmi_val = words[1].strip()
00102             stat_byte = words[2].strip()
00103 
00104             # CPU temps
00105             if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
00106                 if words[1].strip().endswith('degrees C'):
00107                     tmp = ipmi_val.rstrip(' degrees C').lstrip()
00108                     if unicode(tmp).isnumeric():
00109                         temperature = float(tmp)
00110                         diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00111 
00112                         cpu_name = name.split()[0]
00113                         if temperature >= 80 and temperature < 89:
00114                             diag_level = max(diag_level, DiagnosticStatus.WARN)
00115                             if diag_msgs.count('CPU Hot') == 0:
00116                                 diag_msgs.append('CPU Warm')
00117 
00118                         if temperature >= 89: # CPU should shut down here
00119                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00120                             diag_msgs.append('CPU Hot')                                
00121                             # Don't keep CPU Warm in list if CPU is hot
00122                             if diag_msgs.count('CPU Warm') > 0:
00123                                 idx = diag_msgs.index('CPU Warm')
00124                                 diag_msgs.pop(idx)
00125                 else:
00126                     diag_vals.append(KeyValue(key = name, value = words[1]))
00127 
00128 
00129             # MP, BP, FP temps
00130             if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
00131                 if ipmi_val.endswith('degrees C'):
00132                     tmp = ipmi_val.rstrip(' degrees C').lstrip()
00133                     diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00134                     # Give temp warning
00135                     dev_name = name.split()[0]
00136                     if unicode(tmp).isnumeric():
00137                         temperature = float(tmp)
00138 
00139                         if temperature >= 60 and temperature < 75:
00140                             diag_level = max(diag_level, DiagnosticStatus.WARN)
00141                             diag_msgs.append('%s Warm' % dev_name)
00142 
00143                         if temperature >= 75:
00144                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00145                             diag_msgs.append('%s Hot' % dev_name)
00146                     else:
00147                         diag_level = max(diag_level, DiagnosticStatus.ERROR)
00148                         diag_msgs.append('%s Error' % dev_name)
00149                 else:
00150                     diag_vals.append(KeyValue(key = name, value = ipmi_val))
00151         
00152             # CPU fan speeds
00153             if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
00154                 if ipmi_val.endswith('RPM'):
00155                     rpm = ipmi_val.rstrip(' RPM').lstrip()
00156                     if unicode(rpm).isnumeric():
00157                         if int(rpm) == 0:
00158                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00159                             diag_msgs.append('CPU Fan Off')
00160                             
00161                         diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
00162                     else:
00163                         diag_vals.append(KeyValue(key = name, value = ipmi_val))
00164 
00165             # If CPU is hot we get an alarm from ipmitool, report that too
00166             # CPU should shut down if we get a hot alarm, so report as error
00167             if name.startswith('CPU') and name.endswith('hot'):
00168                 if ipmi_val == '0x01':
00169                     diag_vals.append(KeyValue(key = name, value = 'OK'))
00170                 else:
00171                     diag_vals.append(KeyValue(key = name, value = 'Hot'))
00172                     diag_level = max(diag_level, DiagnosticStatus.ERROR)
00173                     diag_msgs.append('CPU Hot Alarm')
00174 
00175     except Exception, e:
00176         diag_vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00177         diag_level = DiagnosticStatus.ERROR
00178         diag_msgs.append('Exception')
00179 
00180     return diag_vals, diag_msgs, diag_level
00181         
00182 
00183 ##\brief Check CPU core temps 
00184 ##
00185 ## Use 'find /sys -name temp1_input' to find cores
00186 ## Read from every core, divide by 1000
00187 def check_core_temps(sys_temp_strings):
00188     diag_vals = []
00189     diag_level = 0
00190     diag_msgs = []
00191     
00192     for index, temp_str in enumerate(sys_temp_strings):
00193         if len(temp_str) < 5:
00194             continue
00195         
00196         cmd = 'cat %s' % temp_str
00197         p = subprocess.Popen(cmd, stdout = subprocess.PIPE, 
00198                              stderr = subprocess.PIPE, shell = True)
00199         stdout, stderr = p.communicate()
00200         retcode = p.returncode
00201 
00202         if retcode != 0:
00203             diag_level = DiagnosticStatus.ERROR
00204             diag_msg = [ 'Core Temp Error' ]
00205             diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr), 
00206                           KeyValue(key = 'Output', value = stdout) ]
00207             return diag_vals, diag_msgs, diag_level
00208   
00209         tmp = stdout.strip()
00210         if unicode(tmp).isnumeric():
00211             temp = float(tmp) / 1000
00212             diag_vals.append(KeyValue(key = 'Core %d Temp' % index, value = str(temp)))
00213 
00214             if temp >= 85 and temp < 90:
00215                 diag_level = max(diag_level, DiagnosticStatus.WARN)
00216                 diag_msgs.append('Warm')
00217             if temp >= 90:
00218                 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00219                 diag_msgs.append('Hot')
00220         else:
00221             diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
00222             diag_vals.append(KeyValue(key = 'Core %s Temp' % index, value = tmp))
00223 
00224     return diag_vals, diag_msgs, diag_level
00225 
00226 ## Checks clock speed from reading from CPU info
00227 def check_clock_speed(enforce_speed):
00228     vals = []
00229     msgs = []
00230     lvl = DiagnosticStatus.OK
00231 
00232     try:
00233         p = subprocess.Popen('cat /proc/cpuinfo | grep MHz', 
00234                              stdout = subprocess.PIPE,
00235                              stderr = subprocess.PIPE, shell = True)
00236         stdout, stderr = p.communicate()
00237         retcode = p.returncode
00238 
00239         if retcode != 0:
00240             lvl = DiagnosticStatus.ERROR
00241             msgs = [ 'Clock speed error' ]
00242             vals = [ KeyValue(key = 'Clock speed error', value = stderr), 
00243                      KeyValue(key = 'Output', value = stdout) ]
00244             
00245             return (vals, msgs, lvl)
00246 
00247         for index, ln in enumerate(stdout.split('\n')):
00248             words = ln.split(':')
00249             if len(words) < 2:
00250                 continue
00251 
00252             speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal
00253             vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
00254             if unicode(speed).isnumeric():
00255                 mhz = float(speed)
00256                 
00257                 if mhz < 2240 and mhz > 2150:
00258                     lvl = max(lvl, DiagnosticStatus.WARN)
00259                 if mhz <= 2150:
00260                     lvl = max(lvl, DiagnosticStatus.ERROR)
00261             else:
00262                 # Automatically give error if speed isn't a number
00263                 lvl = max(lvl, DiagnosticStatus.ERROR)
00264 
00265         if not enforce_speed:
00266             lvl = DiagnosticStatus.OK
00267 
00268         if lvl == DiagnosticStatus.WARN and enforce_speed:
00269             msgs = [ 'Core slowing' ]
00270         elif lvl == DiagnosticStatus.ERROR and enforce_speed:
00271             msgs = [ 'Core throttled' ]
00272 
00273     except Exception, e:
00274         rospy.logerr(traceback.format_exc())
00275         lvl = DiagnosticStatus.ERROR
00276         msgs.append('Exception')
00277         vals.append(KeyValue(key = 'Exception', value = traceback.format_exc()))
00278 
00279     return vals, msgs, lvl
00280                     
00281 
00282 # Add msgs output, too
00283 ##\brief Uses 'uptime' to see load average
00284 def check_uptime(load1_threshold, load5_threshold):
00285     level = DiagnosticStatus.OK
00286     vals = []
00287     
00288     load_dict = { 0: 'OK', 1: 'High Load', 2: 'Very High Load' }
00289 
00290     try:
00291         p = subprocess.Popen('uptime', stdout = subprocess.PIPE, 
00292                              stderr = subprocess.PIPE, shell = True)
00293         stdout, stderr = p.communicate()
00294         retcode = p.returncode
00295 
00296         if retcode != 0:
00297             vals.append(KeyValue(key = 'uptime Failed', value = stderr))
00298             return DiagnosticStatus.ERROR, vals
00299 
00300         upvals = stdout.split()
00301         load1 = upvals[-3].rstrip(',')
00302         load5 = upvals[-2].rstrip(',')
00303         load15 = upvals[-1]
00304         num_users = upvals[-7]
00305 
00306         # Give warning if we go over load limit 
00307         if float(load1) > load1_threshold or float(load5) > load5_threshold:
00308             level = DiagnosticStatus.WARN
00309 
00310         vals.append(KeyValue(key = 'Load Average Status', value = load_dict[level]))
00311         vals.append(KeyValue(key = '1 min Load Average', value = load1))
00312         vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(load1_threshold)))
00313         vals.append(KeyValue(key = '5 min Load Average', value = load5))
00314         vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(load5_threshold)))
00315         vals.append(KeyValue(key = '15 min Load Average', value = load15))
00316         vals.append(KeyValue(key = 'Number of Users', value = num_users))
00317 
00318     except Exception, e:
00319         rospy.logerr(traceback.format_exc())
00320         level = DiagnosticStatus.ERROR
00321         vals.append(KeyValue(key = 'Load Average Status', value = traceback.format_exc()))
00322         
00323     return level, load_dict[level], vals
00324 
00325 # Add msgs output
00326 ##\brief Uses 'free -m' to check free memory
00327 def check_memory():
00328     values = []
00329     level = DiagnosticStatus.OK
00330     msg = ''
00331 
00332     mem_dict = { 0: 'OK', 1: 'Low Memory', 2: 'Very Low Memory' }
00333 
00334     try:
00335         p = subprocess.Popen('free -m',
00336                              stdout = subprocess.PIPE,
00337                              stderr = subprocess.PIPE, shell = True)
00338         stdout, stderr = p.communicate()
00339         retcode = p.returncode
00340                
00341         if retcode != 0:
00342             values.append(KeyValue(key = "\"free -m\" Call Error", value = str(retcode)))
00343             return DiagnosticStatus.ERROR, values
00344  
00345         rows = stdout.split('\n')
00346         data = rows[1].split()
00347         total_mem = data[1]
00348         used_mem = data[2]
00349         free_mem = data[3]
00350 
00351         level = DiagnosticStatus.OK
00352         if float(free_mem) < 25:
00353             level = DiagnosticStatus.WARN
00354         if float(free_mem) < 1:
00355             level = DiagnosticStatus.ERROR
00356 
00357         values.append(KeyValue(key = 'Memory Status', value = mem_dict[level]))
00358         values.append(KeyValue(key = 'Total Memory', value = total_mem))
00359         values.append(KeyValue(key = 'Used Memory', value = used_mem))
00360         values.append(KeyValue(key = 'Free Memory', value = free_mem))
00361 
00362         msg = mem_dict[level]
00363     except Exception, e:
00364         rospy.logerr(traceback.format_exc())
00365         msg = 'Memory Usage Check Error'
00366         values.append(KeyValue(key = msg, value = str(e)))
00367         level = DiagnosticStatus.ERROR
00368     
00369     return level, mem_dict[level], values
00370 
00371 
00372 
00373 ##\brief Use mpstat to find CPU usage
00374 ##
00375 usage_old = 0
00376 has_warned_mpstat = False
00377 has_error_core_count = False
00378 def check_mpstat(core_count = -1):
00379     vals = []
00380     mp_level = DiagnosticStatus.OK
00381     
00382     load_dict = { 0: 'OK', 1: 'High Load', 2: 'Error' }
00383 
00384     try:
00385         p = subprocess.Popen('mpstat -P ALL 1 1',
00386                              stdout = subprocess.PIPE,
00387                              stderr = subprocess.PIPE, shell = True)
00388         stdout, stderr = p.communicate()
00389         retcode = p.returncode
00390 
00391         if retcode != 0:
00392             global has_warned_mpstat
00393             if not has_warned_mpstat:
00394                 rospy.logerr("mpstat failed to run for cpu_monitor. Return code %d.", retcode)
00395                 has_warned_mpstat = True
00396 
00397             mp_level = DiagnosticStatus.ERROR
00398             vals.append(KeyValue(key = '\"mpstat\" Call Error', value = str(retcode)))
00399             return mp_level, 'Unable to Check CPU Usage', vals
00400 
00401         # Check which column '%idle' is, #4539
00402         # mpstat output changed between 8.06 and 8.1
00403         rows = stdout.split('\n')
00404         col_names = rows[2].split()
00405         idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
00406 
00407         num_cores = 0
00408         cores_loaded = 0
00409         for index, row in enumerate(stdout.split('\n')):
00410             if index < 3:
00411                 continue
00412             
00413             # Skip row containing 'all' data
00414             if row.find('all') > -1:
00415                 continue
00416 
00417             lst = row.split()
00418             if len(lst) < 8:
00419                 continue
00420 
00421             ## Ignore 'Average: ...' data
00422             if lst[0].startswith('Average'):
00423                 continue
00424 
00425             cpu_name = '%d' % (num_cores)
00426             idle = lst[idle_col].replace(',', '.')
00427             user = lst[3].replace(',', '.')
00428             nice = lst[4].replace(',', '.')
00429             system = lst[5].replace(',', '.')
00430             
00431             core_level = 0
00432             usage = float(user) + float(nice)
00433             if usage > 1000: # wrong reading, use old reading instead
00434                 rospy.logwarn('Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
00435                 usage = usage_old
00436             usage_old = usage
00437 
00438             if usage > 90.0:
00439                 cores_loaded += 1
00440                 core_level = DiagnosticStatus.WARN
00441             if usage > 110.0:
00442                 core_level = DiagnosticStatus.ERROR
00443 
00444             vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
00445             vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
00446             vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
00447             vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
00448             vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
00449 
00450             num_cores += 1
00451         
00452         # Warn for high load only if we have <= 2 cores that aren't loaded
00453         if num_cores - cores_loaded <= 2 and num_cores > 2:
00454             mp_level = DiagnosticStatus.WARN
00455 
00456         # Check the number of cores if core_count > 0, #4850
00457         if core_count > 0 and core_count != num_cores:
00458             mp_level = DiagnosticStatus.ERROR
00459             global has_error_core_count
00460             if not has_error_core_count:
00461                 rospy.logerr('Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
00462                               core_count, num_cores)
00463                 has_error_core_count = True
00464             return DiagnosticStatus.ERROR, 'Incorrect number of CPU cores', vals
00465             
00466     except Exception, e:
00467         mp_level = DiagnosticStatus.ERROR
00468         vals.append(KeyValue(key = 'mpstat Exception', value = str(e)))
00469 
00470     return mp_level, load_dict[mp_level], vals
00471 
00472 ## Returns names for core temperature files
00473 ## Returns list of names, each name can be read like file
00474 def get_core_temp_names():
00475     temp_vals = []
00476     try:
00477         p = subprocess.Popen('find /sys/devices -name temp1_input', 
00478                              stdout = subprocess.PIPE,
00479                              stderr = subprocess.PIPE, shell = True)
00480         stdout, stderr = p.communicate()
00481         retcode = p.returncode
00482 
00483         if retcode != 0:
00484             rospy.logerr('Error find core temp locations: %s' % stderr)
00485             return []
00486         
00487         for ln in stdout.split('\n'):
00488             temp_vals.append(ln.strip())
00489         
00490         return temp_vals
00491     except:
00492         rospy.logerr('Exception finding temp vals: %s' % traceback.format_exc())
00493         return []
00494 
00495 def update_status_stale(stat, last_update_time):
00496     time_since_update = rospy.get_time() - last_update_time
00497 
00498     stale_status = 'OK'
00499     if time_since_update > 20 and time_since_update <= 35:
00500         stale_status = 'Lagging'
00501         if stat.level == DiagnosticStatus.OK:
00502             stat.message = stale_status
00503         elif stat.message.find(stale_status) < 0:
00504             stat.message = ', '.join([stat.message, stale_status])
00505         stat.level = max(stat.level, DiagnosticStatus.WARN)
00506     if time_since_update > 35:
00507         stale_status = 'Stale'
00508         if stat.level == DiagnosticStatus.OK:
00509             stat.message = stale_status
00510         elif stat.message.find(stale_status) < 0:
00511             stat.message = ', '.join([stat.message, stale_status])
00512         stat.level = max(stat.level, DiagnosticStatus.ERROR)
00513 
00514 
00515     stat.values.pop(0)
00516     stat.values.pop(0)
00517     stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00518     stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00519     
00520 
00521 class CPUMonitor():
00522     def __init__(self, hostname, diag_hostname):
00523         self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00524 
00525         self._mutex = threading.Lock()
00526 
00527         self._check_ipmi = rospy.get_param('~check_ipmi_tool', True)
00528         self._enforce_speed = rospy.get_param('~enforce_clock_speed', True)
00529 
00530         self._check_core_temps = rospy.get_param('~check_core_temps', False)
00531         if self._check_core_temps:
00532             rospy.logwarn('Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
00533         self._check_nfs = rospy.get_param('~check_nfs', False)
00534         if self._check_nfs:
00535             rospy.logwarn('NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
00536 
00537         self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
00538         self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
00539 
00540         self._num_cores = rospy.get_param('~num_cores', 8.0)
00541         if self._num_cores == -1:
00542             self._num_cores = multiprocessing.cpu_count()
00543         self._temps_timer = None
00544         self._usage_timer = None
00545         self._nfs_timer = None
00546         
00547         # Get temp_input files
00548         self._temp_vals = get_core_temp_names()
00549 
00550         # CPU stats
00551         self._temp_stat = DiagnosticStatus()
00552         self._temp_stat.name = '%s CPU Temperature' % diag_hostname
00553         self._temp_stat.level = 1
00554         self._temp_stat.hardware_id = hostname
00555         self._temp_stat.message = 'No Data'
00556         self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00557                                    KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00558 
00559         self._usage_stat = DiagnosticStatus()
00560         self._usage_stat.name = '%s CPU Usage' % diag_hostname
00561         self._usage_stat.level = 1
00562         self._usage_stat.hardware_id = hostname
00563         self._usage_stat.message = 'No Data'
00564         self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00565                                     KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00566 
00567         self._nfs_stat = DiagnosticStatus()
00568         self._nfs_stat.name = '%s NFS IO' % diag_hostname
00569         self._nfs_stat.level = 1
00570         self._nfs_stat.hardware_id = hostname
00571         self._nfs_stat.message = 'No Data'
00572         self._nfs_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00573                                   KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00574 
00575         self._last_temp_time = 0
00576         self._last_usage_time = 0
00577         self._last_nfs_time = 0
00578         self._last_publish_time = 0
00579 
00580         # Start checking everything
00581         self.check_temps()
00582         if self._check_nfs:
00583             self.check_nfs_stat()
00584         self.check_usage()
00585 
00586     # Restart temperature checking 
00587     def _restart_temp_check(self):
00588         rospy.logerr('Restarting temperature check thread in cpu_monitor. This should not happen')
00589         try:
00590             with self._mutex:
00591                 if self._temps_timer:
00592                     self._temps_timer.cancel()
00593                 
00594             self.check_temps()
00595         except Exception, e:
00596             rospy.logerr('Unable to restart temp thread. Error: %s' % traceback.format_exc())
00597             
00598 
00599     ## Must have the lock to cancel everything
00600     def cancel_timers(self):
00601         if self._temps_timer:
00602             self._temps_timer.cancel()
00603 
00604         if self._nfs_timer:
00605             self._nfs_timer.cancel()
00606 
00607         if self._usage_timer:
00608             self._usage_timer.cancel()
00609 
00610     def check_nfs_stat(self):
00611         if rospy.is_shutdown():
00612             with self._mutex:
00613                 self.cancel_timers()
00614                 return
00615 
00616         nfs_level = 0
00617         msg = 'OK'
00618         vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00619                  KeyValue(key = 'Time Since Last Update', value = str(0) )]
00620 
00621         try:
00622             p = subprocess.Popen('iostat -n',
00623                                  stdout = subprocess.PIPE,
00624                                  stderr = subprocess.PIPE, shell = True)
00625             stdout, stderr = p.communicate()
00626             retcode = p.returncode
00627             
00628             if retcode != 0:
00629                 nfs_level = DiagnosticStatus.ERROR
00630                 msg = 'iostat Error'
00631                 vals.append(KeyValue(key = '\"iostat -n\" Call Error', value = str(e)))
00632                 stdout = ''
00633                 
00634 
00635             for index, row in enumerate(stdout.split('\n')):
00636                 if index < 3:
00637                     continue
00638                 
00639                 lst = row.split()
00640                 if len(lst) < 7:
00641                     continue
00642                 
00643                 file_sys = lst[0]
00644                 read_blk = lst[1]
00645                 write_blk = lst[2]
00646                 read_blk_dir = lst[3]
00647                 write_blk_dir = lst[4]
00648                 r_blk_srv = lst[5]
00649                 w_blk_srv = lst[6]
00650                 
00651                 vals.append(KeyValue(
00652                         key = '%s Read Blks/s' % file_sys, value=read_blk))
00653                 vals.append(KeyValue(
00654                         key = '%s Write Blks/s' % file_sys, value=write_blk))
00655                 vals.append(KeyValue(
00656                         key = '%s Read Blk dir/s' % file_sys, value=read_blk_dir))
00657                 vals.append(KeyValue(
00658                         key = '%s Write Blks dir/s' % file_sys, value=write_blk_dir))
00659                 vals.append(KeyValue(
00660                         key = '%s Read Blks srv/s' % file_sys, value=r_blk_srv))
00661                 vals.append(KeyValue(
00662                         key = '%s Write Blks srv/s' % file_sys, value=w_blk_srv))
00663                 
00664         except Exception, e:
00665             rospy.logerr(traceback.format_exc())
00666             nfs_level = DiagnosticStatus.ERROR
00667             msg = 'Exception'
00668             vals.append(KeyValue(key = 'Exception', value = str(e)))
00669           
00670         with self._mutex:
00671             self._nfs_stat.level = nfs_level
00672             self._nfs_stat.message = msg
00673             self._nfs_stat.values = vals
00674             
00675             self._last_nfs_time = rospy.get_time()
00676             
00677             if not rospy.is_shutdown():
00678                 self._nfs_timer = threading.Timer(5.0, self.check_nfs_stat)
00679                 self._nfs_timer.start()
00680             else:
00681                 self.cancel_timers()
00682 
00683 
00684     ## Call every 10sec at minimum
00685     def check_temps(self):
00686         if rospy.is_shutdown():
00687             with self._mutex:
00688                 self.cancel_timers()
00689             return
00690 
00691         diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00692                       KeyValue(key = 'Time Since Last Update', value = str(0) ) ]
00693         diag_msgs = []
00694         diag_level = 0
00695 
00696         if self._check_ipmi:
00697             ipmi_vals, ipmi_msgs, ipmi_level = check_ipmi()
00698             diag_vals.extend(ipmi_vals)
00699             diag_msgs.extend(ipmi_msgs)
00700             diag_level = max(diag_level, ipmi_level)
00701 
00702         if self._check_core_temps:
00703             core_vals, core_msgs, core_level = check_core_temps(self._temp_vals)
00704             diag_vals.extend(core_vals)
00705             diag_msgs.extend(core_msgs)
00706             diag_level = max(diag_level, core_level)
00707 
00708         clock_vals, clock_msgs, clock_level = check_clock_speed(self._enforce_speed)
00709         diag_vals.extend(clock_vals)
00710         diag_msgs.extend(clock_msgs)
00711         diag_level = max(diag_level, clock_level)
00712 
00713         diag_log = set(diag_msgs)
00714         if len(diag_log) > 0:
00715             message = ', '.join(diag_log)
00716         else:
00717             message = stat_dict[diag_level]
00718 
00719         with self._mutex:
00720             self._last_temp_time = rospy.get_time()
00721             
00722             self._temp_stat.level = diag_level
00723             self._temp_stat.message = message
00724             self._temp_stat.values = diag_vals
00725             
00726             if not rospy.is_shutdown():
00727                 self._temps_timer = threading.Timer(5.0, self.check_temps)
00728                 self._temps_timer.start()
00729             else:
00730                 self.cancel_timers()
00731 
00732     def check_usage(self):
00733         if rospy.is_shutdown():
00734             with self._mutex:
00735                 self.cancel_timers()
00736             return 
00737 
00738         diag_level = 0
00739         diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00740                       KeyValue(key = 'Time Since Last Update', value = 0 )]
00741         diag_msgs = []
00742 
00743         # Check mpstat
00744         mp_level, mp_msg, mp_vals = check_mpstat(self._num_cores)
00745         diag_vals.extend(mp_vals)
00746         if mp_level > 0:
00747             diag_msgs.append(mp_msg)
00748         diag_level = max(diag_level, mp_level)
00749             
00750         # Check uptime
00751         uptime_level, up_msg, up_vals = check_uptime(self._load1_threshold, self._load5_threshold)
00752         diag_vals.extend(up_vals)
00753         if uptime_level > 0:
00754             diag_msgs.append(up_msg)
00755         diag_level = max(diag_level, uptime_level)
00756         
00757         # Check memory
00758         mem_level, mem_msg, mem_vals = check_memory()
00759         diag_vals.extend(mem_vals)
00760         if mem_level > 0:
00761             diag_msgs.append(mem_msg)
00762         diag_level = max(diag_level, mem_level)
00763 
00764         if diag_msgs and diag_level > 0:
00765             usage_msg = ', '.join(set(diag_msgs))
00766         else:
00767             usage_msg = stat_dict[diag_level]
00768 
00769         # Update status
00770         with self._mutex:
00771             self._last_usage_time = rospy.get_time()
00772             self._usage_stat.level = diag_level
00773             self._usage_stat.values = diag_vals
00774             
00775             self._usage_stat.message = usage_msg
00776             
00777             if not rospy.is_shutdown():
00778                 self._usage_timer = threading.Timer(5.0, self.check_usage)
00779                 self._usage_timer.start()
00780             else:
00781                 self.cancel_timers()
00782 
00783     def publish_stats(self):
00784         with self._mutex:
00785             # Update everything with last update times
00786             update_status_stale(self._temp_stat, self._last_temp_time)
00787             update_status_stale(self._usage_stat, self._last_usage_time)
00788             if self._check_nfs:
00789                 update_status_stale(self._nfs_stat, self._last_nfs_time)
00790 
00791             msg = DiagnosticArray()
00792             msg.header.stamp = rospy.get_rostime()
00793             msg.status.append(self._temp_stat)
00794             msg.status.append(self._usage_stat)
00795             if self._check_nfs:
00796                 msg.status.append(self._nfs_stat)
00797 
00798             if rospy.get_time() - self._last_publish_time > 0.5:
00799                 self._diag_pub.publish(msg)
00800                 self._last_publish_time = rospy.get_time()
00801 
00802         
00803         # Restart temperature checking if it goes stale, #4171
00804         # Need to run this without mutex
00805         if rospy.get_time() - self._last_temp_time > 90: 
00806             self._restart_temp_check()
00807 
00808 
00809 if __name__ == '__main__':
00810     hostname = socket.gethostname()
00811 
00812     import optparse
00813     parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
00814     parser.add_option("--diag-hostname", dest="diag_hostname",
00815                       help="Computer name in diagnostics output (ex: 'c1')",
00816                       metavar="DIAG_HOSTNAME",
00817                       action="store", default = hostname)
00818     options, args = parser.parse_args(rospy.myargv())
00819 
00820     try:
00821         rospy.init_node('cpu_monitor_%s' % hostname)
00822     except rospy.exceptions.ROSInitException:
00823         print >> sys.stderr, 'CPU monitor is unable to initialize node. Master may not be running.'
00824         sys.exit(0)
00825 
00826     cpu_node = CPUMonitor(hostname, options.diag_hostname)
00827 
00828     rate = rospy.Rate(1.0)
00829     try:
00830         while not rospy.is_shutdown():
00831             rate.sleep()
00832             cpu_node.publish_stats()
00833     except KeyboardInterrupt:
00834         pass
00835     except Exception, e:
00836         traceback.print_exc()
00837         rospy.logerr(traceback.format_exc())
00838 
00839     cpu_node.cancel_timers()
00840     sys.exit(0)
00841     
00842 
00843 
00844     
00845 
00846             
00847 


pr2_computer_monitor
Author(s): Kevin Watts (watts@willowgarage.com)
autogenerated on Thu Jun 6 2019 20:28:51