hd_monitor.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Software License Agreement (BSD License)
00004 #
00005 # Copyright (c) 2009, Willow Garage, Inc.
00006 # All rights reserved.
00007 #
00008 # Redistribution and use in source and binary forms, with or without
00009 # modification, are permitted provided that the following conditions
00010 # are met:
00011 #
00012 #  * Redistributions of source code must retain the above copyright
00013 #    notice, this list of conditions and the following disclaimer.
00014 #  * Redistributions in binary form must reproduce the above
00015 #    copyright notice, this list of conditions and the following
00016 #    disclaimer in the documentation and/or other materials provided
00017 #    with the distribution.
00018 #  * Neither the name of the Willow Garage nor the names of its
00019 #    contributors may be used to endorse or promote products derived
00020 #    from this software without specific prior written permission.
00021 #
00022 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00023 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00024 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00025 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
00026 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00027 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00028 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00029 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00030 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00031 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00032 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00033 # POSSIBILITY OF SUCH DAMAGE.
00034 
00035 ##\author Kevin Watts
00036 
00037 from __future__ import with_statement
00038 import roslib
00039 roslib.load_manifest('pr2_computer_monitor')
00040 
00041 import rospy
00042 
00043 import traceback
00044 import threading
00045 from threading import Timer
00046 import sys, os, time
00047 from time import sleep
00048 import subprocess
00049 
00050 import socket
00051 
00052 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00053 
00054 low_hd_level = 5
00055 critical_hd_level = 1
00056 
00057 hd_temp_warn = 55 #3580, setting to 55C to after checking manual
00058 hd_temp_error = 70 # Above this temperature, hard drives will have serious problems
00059 
00060 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00061 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' }
00062 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' }
00063 
00064 REMOVABLE = ['/dev/sg1', '/dev/sdb'] # Store removable drives so we can ignore if removed
00065 
00066 ## Connects to hddtemp daemon to get temp, HD make.
00067 def get_hddtemp_data(hostname = 'localhost', port = 7634):
00068     try:
00069         hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
00070         hd_sock.connect((hostname, port))
00071         sock_data = ''
00072         while True:
00073             newdat = hd_sock.recv(1024)
00074             if len(newdat) == 0:
00075                 break
00076             sock_data = sock_data + newdat
00077         hd_sock.close()
00078         
00079         sock_vals = sock_data.split('|')
00080 
00081         # Format of output looks like ' | DRIVE | MAKE | TEMP | ' 
00082         idx = 0
00083         
00084         drives = []
00085         makes = []
00086         temps = []
00087         while idx + 5 < len(sock_vals):
00088             this_drive = sock_vals[idx + 1]
00089             this_make = sock_vals[idx + 2]
00090             this_temp = sock_vals[idx + 3]
00091 
00092             # Sometimes we get duplicate makes if hard drives are mounted
00093             # to two different points
00094             if this_make in makes:
00095                 idx += 5
00096                 continue
00097 
00098             drives.append(this_drive)
00099             makes.append(this_make)
00100             temps.append(this_temp)
00101                         
00102             idx += 5
00103 
00104         return True, drives, makes, temps
00105     except:
00106         rospy.logerr(traceback.format_exc())
00107         return False, [ 'Exception' ], [ traceback.format_exc() ], [ 0 ]
00108 
00109 def update_status_stale(stat, last_update_time):
00110     time_since_update = rospy.get_time() - last_update_time
00111 
00112     stale_status = 'OK'
00113     if time_since_update > 20 and time_since_update <= 35:
00114         stale_status = 'Lagging'
00115         if stat.level == DiagnosticStatus.OK:
00116             stat.message = stale_status
00117         elif stat.message.find(stale_status) < 0:
00118             stat.message = ', '.join([stat.message, stale_status])
00119         stat.level = max(stat.level, DiagnosticStatus.WARN)
00120     if time_since_update > 35:
00121         stale_status = 'Stale'
00122         if stat.level == DiagnosticStatus.OK:
00123             stat.message = stale_status
00124         elif stat.message.find(stale_status) < 0:
00125             stat.message = ', '.join([stat.message, stale_status])
00126         stat.level = max(stat.level, DiagnosticStatus.ERROR)
00127         
00128     stat.values.pop(0)
00129     stat.values.pop(0)
00130     stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00131     stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00132 
00133 class hd_monitor():
00134     def __init__(self, hostname, diag_hostname, home_dir = ''):
00135         self._mutex = threading.Lock()
00136         
00137         self._hostname = hostname
00138         self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
00139         if self._no_temp_warn:
00140             rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
00141         self._home_dir = home_dir
00142 
00143         self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00144 
00145         self._last_temp_time = 0
00146         self._last_usage_time = 0
00147         self._last_publish_time = 0
00148 
00149         self._temp_timer = None
00150         self._usage_timer = None
00151 
00152         self._temp_stat = DiagnosticStatus()
00153         self._temp_stat.name = "%s HD Temperature" % diag_hostname
00154         self._temp_stat.level = DiagnosticStatus.ERROR
00155         self._temp_stat.hardware_id = hostname
00156         self._temp_stat.message = 'No Data'
00157         self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'), 
00158                                    KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00159 
00160         if self._home_dir != '':
00161             self._usage_stat = DiagnosticStatus()
00162             self._usage_stat.level = DiagnosticStatus.ERROR
00163             self._usage_stat.hardware_id = hostname
00164             self._usage_stat.name = '%s HD Usage' % diag_hostname
00165             self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00166                                         KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00167             self.check_disk_usage()
00168 
00169         self.check_temps()
00170 
00171     ## Must have the lock to cancel everything
00172     def cancel_timers(self):
00173         if self._temp_timer:
00174             self._temp_timer.cancel()
00175             self._temp_timer = None
00176  
00177         if self._usage_timer:
00178             self._usage_timer.cancel()
00179             self._usage_timer = None
00180 
00181     def check_temps(self):
00182         if rospy.is_shutdown():
00183             with self._mutex:
00184                 self.cancel_timers()
00185             return
00186 
00187         diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
00188                       KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00189         diag_level = DiagnosticStatus.OK
00190         diag_message = 'OK'
00191                 
00192         temp_ok, drives, makes, temps = get_hddtemp_data()
00193 
00194         for index in range(0, len(drives)):
00195             temp = temps[index]
00196             
00197             if not unicode(temp).isnumeric() and drives[index] not in REMOVABLE:
00198                 temp_level = DiagnosticStatus.ERROR
00199                 temp_ok = False
00200             elif not unicode(temp).isnumeric() and drives[index] in REMOVABLE:
00201                 temp_level = DiagnosticStatus.OK
00202                 temp = "Removed"
00203             else:
00204                 temp_level = DiagnosticStatus.OK
00205                 if float(temp) > hd_temp_warn:
00206                     temp_level = DiagnosticStatus.WARN
00207                 if float(temp) > hd_temp_error:
00208                     temp_level = DiagnosticStatus.ERROR
00209             
00210             diag_level = max(diag_level, temp_level)
00211             
00212             diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level]))
00213             diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index]))
00214             diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
00215             diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp))
00216         
00217         if not temp_ok:
00218             diag_level = DiagnosticStatus.ERROR
00219 
00220         with self._mutex:
00221             self._last_temp_time = rospy.get_time()
00222             self._temp_stat.values = diag_strs
00223             self._temp_stat.level = diag_level
00224             
00225             # Give No Data message if we have no reading
00226             self._temp_stat.message = temp_dict[diag_level]
00227             if not temp_ok:
00228                 self._temp_stat.message = 'Error'
00229 
00230             if self._no_temp_warn and temp_ok:
00231                 self._temp_stat.level = DiagnosticStatus.OK
00232 
00233             if not rospy.is_shutdown():
00234                 self._temp_timer = threading.Timer(10.0, self.check_temps)
00235                 self._temp_timer.start()
00236             else:
00237                 self.cancel_timers()
00238         
00239     def check_disk_usage(self):
00240         if rospy.is_shutdown():
00241             with self._mutex:
00242                 self.cancel_timers()
00243             return
00244 
00245         diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00246                       KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00247         diag_level = DiagnosticStatus.OK
00248         diag_message = 'OK'
00249         
00250         try:
00251             p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir], 
00252                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
00253             stdout, stderr = p.communicate()
00254             retcode = p.returncode
00255             
00256             if (retcode == 0):
00257                 
00258                 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
00259                 row_count = 0
00260                 for row in stdout.split('\n'):
00261                     if len(row.split()) < 2:
00262                         continue
00263                     if not unicode(row.split()[1]).isnumeric() or float(row.split()[1]) < 10: # Ignore small drives
00264                         continue
00265                 
00266                     row_count += 1
00267                     g_available = row.split()[-3]
00268                     name = row.split()[0]
00269                     size = row.split()[1]
00270                     mount_pt = row.split()[-1]
00271                     
00272                     if (float(g_available) > low_hd_level):
00273                         level = DiagnosticStatus.OK
00274                     elif (float(g_available) > critical_hd_level):
00275                         level = DiagnosticStatus.WARN
00276                     else:
00277                         level = DiagnosticStatus.ERROR
00278                         
00279                     diag_vals.append(KeyValue(
00280                             key = 'Disk %d Name' % row_count, value = name))
00281                     diag_vals.append(KeyValue(
00282                             key = 'Disk %d Available' % row_count, value = g_available))
00283                     diag_vals.append(KeyValue(
00284                             key = 'Disk %d Size' % row_count, value = size))
00285                     diag_vals.append(KeyValue(
00286                             key = 'Disk %d Status' % row_count, value = stat_dict[level]))
00287                     diag_vals.append(KeyValue(
00288                             key = 'Disk %d Mount Point' % row_count, value = mount_pt))
00289                     
00290                     diag_level = max(diag_level, level)
00291                     diag_message = usage_dict[diag_level]
00292 
00293             else:
00294                 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
00295                 diag_level = DiagnosticStatus.ERROR
00296                 diag_message = stat_dict[diag_level]
00297             
00298                     
00299         except:
00300             rospy.logerr(traceback.format_exc())
00301             
00302             diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
00303             diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))
00304 
00305             diag_level = DiagnosticStatus.ERROR
00306             diag_message = stat_dict[diag_level]
00307             
00308         # Update status
00309         with self._mutex:
00310             self._last_usage_time = rospy.get_time()
00311             self._usage_stat.values = diag_vals
00312             self._usage_stat.message = diag_message
00313             self._usage_stat.level = diag_level
00314             
00315             if not rospy.is_shutdown():
00316                 self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
00317                 self._usage_timer.start()
00318             else:
00319                 self.cancel_timers()
00320 
00321         
00322     def publish_stats(self):
00323         with self._mutex:
00324             update_status_stale(self._temp_stat, self._last_temp_time)
00325             
00326             msg = DiagnosticArray()
00327             msg.header.stamp = rospy.get_rostime()
00328             msg.status.append(self._temp_stat)
00329             if self._home_dir != '':
00330                 update_status_stale(self._usage_stat, self._last_usage_time)
00331                 msg.status.append(self._usage_stat)
00332                 
00333             if rospy.get_time() - self._last_publish_time > 0.5:
00334                 self._diag_pub.publish(msg)
00335                 self._last_publish_time = rospy.get_time()
00336             
00337 
00338 
00339         
00340 ##\todo Need to check HD input/output too using iostat
00341 
00342 if __name__ == '__main__':
00343     hostname = socket.gethostname()
00344 
00345     import optparse
00346     parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
00347     parser.add_option("--diag-hostname", dest="diag_hostname",
00348                       help="Computer name in diagnostics output (ex: 'c1')",
00349                       metavar="DIAG_HOSTNAME",
00350                       action="store", default = hostname)
00351     options, args = parser.parse_args(rospy.myargv())
00352 
00353     home_dir = ''
00354     if len(args) > 1:
00355         home_dir = args[1]
00356 
00357     try:
00358         rospy.init_node('hd_monitor_%s' % hostname)
00359     except rospy.exceptions.ROSInitException:
00360         print 'HD monitor is unable to initialize node. Master may not be running.'
00361         sys.exit(0)
00362         
00363     hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
00364     rate = rospy.Rate(1.0)
00365 
00366     try:
00367         while not rospy.is_shutdown():
00368             rate.sleep()
00369             hd_monitor.publish_stats()
00370     except KeyboardInterrupt:
00371         pass
00372     except Exception, e:
00373         traceback.print_exc()
00374 
00375     hd_monitor.cancel_timers()
00376     sys.exit(0)
00377     
00378 
00379             


pr2_computer_monitor
Author(s): Kevin Watts (watts@willowgarage.com)
autogenerated on Fri Dec 6 2013 20:23:59