hd_monitor.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Software License Agreement (BSD License)
00004 #
00005 # Copyright (c) 2009, Willow Garage, Inc.
00006 # All rights reserved.
00007 #
00008 # Redistribution and use in source and binary forms, with or without
00009 # modification, are permitted provided that the following conditions
00010 # are met:
00011 #
00012 #  * Redistributions of source code must retain the above copyright
00013 #    notice, this list of conditions and the following disclaimer.
00014 #  * Redistributions in binary form must reproduce the above
00015 #    copyright notice, this list of conditions and the following
00016 #    disclaimer in the documentation and/or other materials provided
00017 #    with the distribution.
00018 #  * Neither the name of the Willow Garage nor the names of its
00019 #    contributors may be used to endorse or promote products derived
00020 #    from this software without specific prior written permission.
00021 #
00022 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00023 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00024 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00025 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
00026 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00027 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00028 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00029 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00030 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00031 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00032 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00033 # POSSIBILITY OF SUCH DAMAGE.
00034 
00035 ##\author Kevin Watts
00036 
00037 from __future__ import with_statement
00038 import roslib
00039 roslib.load_manifest('pr2_computer_monitor')
00040 
00041 import rospy
00042 
00043 import traceback
00044 import threading
00045 from threading import Timer
00046 import sys, os, time
00047 from time import sleep
00048 import subprocess
00049 
00050 import socket
00051 
00052 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00053 
00054 ##### monkey-patch to suppress threading error message in python 2.7.3
00055 ##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug
00056 if sys.version_info[:3] == (2, 7, 3):
00057     import threading
00058     threading._DummyThread._Thread__stop = lambda x: 42
00059 #####
00060 
00061 low_hd_level = 5
00062 critical_hd_level = 1
00063 
00064 hd_temp_warn = 55 #3580, setting to 55C to after checking manual
00065 hd_temp_error = 70 # Above this temperature, hard drives will have serious problems
00066 
00067 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00068 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' }
00069 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' }
00070 
00071 REMOVABLE = ['/dev/sg1', '/dev/sdb'] # Store removable drives so we can ignore if removed
00072 
00073 ## Connects to hddtemp daemon to get temp, HD make.
00074 def get_hddtemp_data(hostname = 'localhost', port = 7634):
00075     try:
00076         hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
00077         hd_sock.connect((hostname, port))
00078         sock_data = ''
00079         while True:
00080             newdat = hd_sock.recv(1024)
00081             if len(newdat) == 0:
00082                 break
00083             sock_data = sock_data + newdat
00084         hd_sock.close()
00085         
00086         sock_vals = sock_data.split('|')
00087 
00088         # Format of output looks like ' | DRIVE | MAKE | TEMP | ' 
00089         idx = 0
00090         
00091         drives = []
00092         makes = []
00093         temps = []
00094         while idx + 5 < len(sock_vals):
00095             this_drive = sock_vals[idx + 1]
00096             this_make = sock_vals[idx + 2]
00097             this_temp = sock_vals[idx + 3]
00098 
00099             # Sometimes we get duplicate makes if hard drives are mounted
00100             # to two different points
00101             if this_make in makes:
00102                 idx += 5
00103                 continue
00104 
00105             drives.append(this_drive)
00106             makes.append(this_make)
00107             temps.append(this_temp)
00108                         
00109             idx += 5
00110 
00111         return True, drives, makes, temps
00112     except:
00113         rospy.logerr(traceback.format_exc())
00114         return False, [ 'Exception' ], [ traceback.format_exc() ], [ 0 ]
00115 
00116 def update_status_stale(stat, last_update_time):
00117     time_since_update = rospy.get_time() - last_update_time
00118 
00119     stale_status = 'OK'
00120     if time_since_update > 20 and time_since_update <= 35:
00121         stale_status = 'Lagging'
00122         if stat.level == DiagnosticStatus.OK:
00123             stat.message = stale_status
00124         elif stat.message.find(stale_status) < 0:
00125             stat.message = ', '.join([stat.message, stale_status])
00126         stat.level = max(stat.level, DiagnosticStatus.WARN)
00127     if time_since_update > 35:
00128         stale_status = 'Stale'
00129         if stat.level == DiagnosticStatus.OK:
00130             stat.message = stale_status
00131         elif stat.message.find(stale_status) < 0:
00132             stat.message = ', '.join([stat.message, stale_status])
00133         stat.level = max(stat.level, DiagnosticStatus.ERROR)
00134         
00135     stat.values.pop(0)
00136     stat.values.pop(0)
00137     stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00138     stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00139 
00140 class hd_monitor():
00141     def __init__(self, hostname, diag_hostname, home_dir = ''):
00142         self._mutex = threading.Lock()
00143         
00144         self._hostname = hostname
00145         self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
00146         if self._no_temp_warn:
00147             rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
00148         self._home_dir = home_dir
00149 
00150         self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00151 
00152         self._last_temp_time = 0
00153         self._last_usage_time = 0
00154         self._last_publish_time = 0
00155 
00156         self._temp_timer = None
00157         self._usage_timer = None
00158 
00159         self._temp_stat = DiagnosticStatus()
00160         self._temp_stat.name = "%s HD Temperature" % diag_hostname
00161         self._temp_stat.level = DiagnosticStatus.ERROR
00162         self._temp_stat.hardware_id = hostname
00163         self._temp_stat.message = 'No Data'
00164         self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'), 
00165                                    KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00166 
00167         if self._home_dir != '':
00168             self._usage_stat = DiagnosticStatus()
00169             self._usage_stat.level = DiagnosticStatus.ERROR
00170             self._usage_stat.hardware_id = hostname
00171             self._usage_stat.name = '%s HD Usage' % diag_hostname
00172             self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00173                                         KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00174             self.check_disk_usage()
00175 
00176         self.check_temps()
00177 
00178     ## Must have the lock to cancel everything
00179     def cancel_timers(self):
00180         if self._temp_timer:
00181             self._temp_timer.cancel()
00182             self._temp_timer = None
00183  
00184         if self._usage_timer:
00185             self._usage_timer.cancel()
00186             self._usage_timer = None
00187 
00188     def check_temps(self):
00189         if rospy.is_shutdown():
00190             with self._mutex:
00191                 self.cancel_timers()
00192             return
00193 
00194         diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
00195                       KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00196         diag_level = DiagnosticStatus.OK
00197         diag_message = 'OK'
00198                 
00199         temp_ok, drives, makes, temps = get_hddtemp_data()
00200 
00201         for index in range(0, len(drives)):
00202             temp = temps[index]
00203             
00204             if not unicode(temp).isnumeric() and drives[index] not in REMOVABLE:
00205                 temp_level = DiagnosticStatus.ERROR
00206                 temp_ok = False
00207             elif not unicode(temp).isnumeric() and drives[index] in REMOVABLE:
00208                 temp_level = DiagnosticStatus.OK
00209                 temp = "Removed"
00210             else:
00211                 temp_level = DiagnosticStatus.OK
00212                 if float(temp) > hd_temp_warn:
00213                     temp_level = DiagnosticStatus.WARN
00214                 if float(temp) > hd_temp_error:
00215                     temp_level = DiagnosticStatus.ERROR
00216             
00217             diag_level = max(diag_level, temp_level)
00218             
00219             diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level]))
00220             diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index]))
00221             diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
00222             diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp))
00223         
00224         if not temp_ok:
00225             diag_level = DiagnosticStatus.ERROR
00226 
00227         with self._mutex:
00228             self._last_temp_time = rospy.get_time()
00229             self._temp_stat.values = diag_strs
00230             self._temp_stat.level = diag_level
00231             
00232             # Give No Data message if we have no reading
00233             self._temp_stat.message = temp_dict[diag_level]
00234             if not temp_ok:
00235                 self._temp_stat.message = 'Error'
00236 
00237             if self._no_temp_warn and temp_ok:
00238                 self._temp_stat.level = DiagnosticStatus.OK
00239 
00240             if not rospy.is_shutdown():
00241                 self._temp_timer = threading.Timer(10.0, self.check_temps)
00242                 self._temp_timer.start()
00243             else:
00244                 self.cancel_timers()
00245         
00246     def check_disk_usage(self):
00247         if rospy.is_shutdown():
00248             with self._mutex:
00249                 self.cancel_timers()
00250             return
00251 
00252         diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00253                       KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00254         diag_level = DiagnosticStatus.OK
00255         diag_message = 'OK'
00256         
00257         try:
00258             p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir], 
00259                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
00260             stdout, stderr = p.communicate()
00261             retcode = p.returncode
00262             
00263             if (retcode == 0):
00264                 
00265                 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
00266                 row_count = 0
00267                 for row in stdout.split('\n'):
00268                     if len(row.split()) < 2:
00269                         continue
00270                     if not unicode(row.split()[1]).isnumeric() or float(row.split()[1]) < 10: # Ignore small drives
00271                         continue
00272                 
00273                     row_count += 1
00274                     g_available = row.split()[-3]
00275                     name = row.split()[0]
00276                     size = row.split()[1]
00277                     mount_pt = row.split()[-1]
00278                     
00279                     if (float(g_available) > low_hd_level):
00280                         level = DiagnosticStatus.OK
00281                     elif (float(g_available) > critical_hd_level):
00282                         level = DiagnosticStatus.WARN
00283                     else:
00284                         level = DiagnosticStatus.ERROR
00285                         
00286                     diag_vals.append(KeyValue(
00287                             key = 'Disk %d Name' % row_count, value = name))
00288                     diag_vals.append(KeyValue(
00289                             key = 'Disk %d Available' % row_count, value = g_available))
00290                     diag_vals.append(KeyValue(
00291                             key = 'Disk %d Size' % row_count, value = size))
00292                     diag_vals.append(KeyValue(
00293                             key = 'Disk %d Status' % row_count, value = stat_dict[level]))
00294                     diag_vals.append(KeyValue(
00295                             key = 'Disk %d Mount Point' % row_count, value = mount_pt))
00296                     
00297                     diag_level = max(diag_level, level)
00298                     diag_message = usage_dict[diag_level]
00299 
00300             else:
00301                 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
00302                 diag_level = DiagnosticStatus.ERROR
00303                 diag_message = stat_dict[diag_level]
00304             
00305                     
00306         except:
00307             rospy.logerr(traceback.format_exc())
00308             
00309             diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
00310             diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))
00311 
00312             diag_level = DiagnosticStatus.ERROR
00313             diag_message = stat_dict[diag_level]
00314             
00315         # Update status
00316         with self._mutex:
00317             self._last_usage_time = rospy.get_time()
00318             self._usage_stat.values = diag_vals
00319             self._usage_stat.message = diag_message
00320             self._usage_stat.level = diag_level
00321             
00322             if not rospy.is_shutdown():
00323                 self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
00324                 self._usage_timer.start()
00325             else:
00326                 self.cancel_timers()
00327 
00328         
00329     def publish_stats(self):
00330         with self._mutex:
00331             update_status_stale(self._temp_stat, self._last_temp_time)
00332             
00333             msg = DiagnosticArray()
00334             msg.header.stamp = rospy.get_rostime()
00335             msg.status.append(self._temp_stat)
00336             if self._home_dir != '':
00337                 update_status_stale(self._usage_stat, self._last_usage_time)
00338                 msg.status.append(self._usage_stat)
00339                 
00340             if rospy.get_time() - self._last_publish_time > 0.5:
00341                 self._diag_pub.publish(msg)
00342                 self._last_publish_time = rospy.get_time()
00343             
00344 
00345 
00346         
00347 ##\todo Need to check HD input/output too using iostat
00348 
00349 if __name__ == '__main__':
00350     hostname = socket.gethostname()
00351 
00352     import optparse
00353     parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
00354     parser.add_option("--diag-hostname", dest="diag_hostname",
00355                       help="Computer name in diagnostics output (ex: 'c1')",
00356                       metavar="DIAG_HOSTNAME",
00357                       action="store", default = hostname)
00358     options, args = parser.parse_args(rospy.myargv())
00359 
00360     home_dir = ''
00361     if len(args) > 1:
00362         home_dir = args[1]
00363 
00364     try:
00365         rospy.init_node('hd_monitor_%s' % hostname)
00366     except rospy.exceptions.ROSInitException:
00367         print 'HD monitor is unable to initialize node. Master may not be running.'
00368         sys.exit(0)
00369         
00370     hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
00371     rate = rospy.Rate(1.0)
00372 
00373     try:
00374         while not rospy.is_shutdown():
00375             rate.sleep()
00376             hd_monitor.publish_stats()
00377     except KeyboardInterrupt:
00378         pass
00379     except Exception, e:
00380         traceback.print_exc()
00381 
00382     hd_monitor.cancel_timers()
00383     sys.exit(0)
00384     
00385 
00386             


pr2_computer_monitor
Author(s): Kevin Watts (watts@willowgarage.com)
autogenerated on Thu Sep 3 2015 11:31:16