hd_monitor.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 # -*- coding: utf-8 -*-
00003 #
00004 # Software License Agreement (BSD License)
00005 #
00006 # Copyright (c) 2009, Willow Garage, Inc.
00007 # All rights reserved.
00008 #
00009 # Redistribution and use in source and binary forms, with or without
00010 # modification, are permitted provided that the following conditions
00011 # are met:
00012 #
00013 #  * Redistributions of source code must retain the above copyright
00014 #    notice, this list of conditions and the following disclaimer.
00015 #  * Redistributions in binary form must reproduce the above
00016 #    copyright notice, this list of conditions and the following
00017 #    disclaimer in the documentation and/or other materials provided
00018 #    with the distribution.
00019 #  * Neither the name of the Willow Garage nor the names of its
00020 #    contributors may be used to endorse or promote products derived
00021 #    from this software without specific prior written permission.
00022 #
00023 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
00024 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
00025 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
00026 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
00027 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
00028 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
00029 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
00030 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
00031 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
00032 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
00033 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00034 # POSSIBILITY OF SUCH DAMAGE.
00035 
00036 ##\author Kevin Watts
00037 
00038 from __future__ import with_statement
00039 import roslib
00040 roslib.load_manifest('diagnostic_common_diagnostics')
00041 
00042 import rospy
00043 
00044 import traceback
00045 import threading
00046 from threading import Timer
00047 import sys, os, time, string
00048 from time import sleep
00049 import subprocess
00050 import socket
00051 
00052 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00053 
00054 low_hd_level = 5
00055 critical_hd_level = 1
00056 
00057 hd_temp_warn = 55 #3580, setting to 55C to after checking manual
00058 hd_temp_error = 70 # Above this temperature, hard drives will have serious problems
00059 
00060 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00061 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' }
00062 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' }
00063 
00064 REMOVABLE = ['/dev/sg1', '/dev/sdb'] # Store removable drives so we can ignore if removed
00065 
00066 ## Connects to hddtemp daemon to get temp, HD make.
00067 def get_hddtemp_data(hostname = 'localhost', port = 7634):
00068     try:
00069         hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
00070         hd_sock.connect((hostname, port))
00071         sock_data = ''
00072         while True:
00073             newdat = hd_sock.recv(1024)
00074             if len(newdat) == 0:
00075                 break
00076             sock_data = sock_data + newdat
00077         hd_sock.close()
00078 
00079         sock_vals = sock_data.split('|')
00080 
00081         # Format of output looks like ' | DRIVE | MAKE | TEMP | '
00082         idx = 0
00083 
00084         drives = []
00085         makes = []
00086         temps = []
00087         while idx + 5 < len(sock_vals):
00088             this_drive = sock_vals[idx + 1]
00089             this_make = sock_vals[idx + 2]
00090             this_temp = sock_vals[idx + 3]
00091 
00092             # Sometimes we get duplicate makes if hard drives are mounted
00093             # to two different points
00094             if this_make in makes:
00095                 idx += 5
00096                 continue
00097 
00098             drives.append(this_drive)
00099             makes.append(this_make)
00100             temps.append(this_temp)
00101 
00102             idx += 5
00103 
00104         return True, drives, makes, temps
00105     except:
00106         rospy.logerr(traceback.format_exc())
00107         return False, [ 'Exception' ], [ traceback.format_exc() ], [ '0' ]
00108 
00109 def update_status_stale(stat, last_update_time):
00110     time_since_update = rospy.get_time() - last_update_time
00111 
00112     stale_status = 'OK'
00113     if time_since_update > 20 and time_since_update <= 35:
00114         stale_status = 'Lagging'
00115         if stat.level == DiagnosticStatus.OK:
00116             stat.message = stale_status
00117         elif stat.message.find(stale_status) < 0:
00118             stat.message = ', '.join([stat.message, stale_status])
00119         stat.level = max(stat.level, DiagnosticStatus.WARN)
00120     if time_since_update > 35:
00121         stale_status = 'Stale'
00122         if stat.level == DiagnosticStatus.OK:
00123             stat.message = stale_status
00124         elif stat.message.find(stale_status) < 0:
00125             stat.message = ', '.join([stat.message, stale_status])
00126         stat.level = max(stat.level, DiagnosticStatus.ERROR)
00127 
00128     stat.values.pop(0)
00129     stat.values.pop(0)
00130     stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00131     stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00132 
00133 class hd_monitor():
00134     def __init__(self, hostname, diag_hostname, home_dir = ''):
00135         self._mutex = threading.Lock()
00136 
00137         self._hostname = hostname
00138         self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
00139         if self._no_temp_warn:
00140             rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
00141         self._home_dir = home_dir
00142 
00143         self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00144 
00145         self._last_temp_time = 0
00146         self._last_usage_time = 0
00147         self._last_publish_time = 0
00148 
00149         self._temp_timer = None
00150         self._usage_timer = None
00151 
00152         self._temp_stat = DiagnosticStatus()
00153         self._temp_stat.name = "%s HD Temperature" % diag_hostname
00154         self._temp_stat.level = DiagnosticStatus.ERROR
00155         self._temp_stat.hardware_id = hostname
00156         self._temp_stat.message = 'No Data'
00157         self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'),
00158                                    KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00159 
00160         if self._home_dir != '':
00161             self._usage_stat = DiagnosticStatus()
00162             self._usage_stat.level = DiagnosticStatus.ERROR
00163             self._usage_stat.hardware_id = hostname
00164             self._usage_stat.name = '%s HD Usage' % diag_hostname
00165             self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00166                                         KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00167             self.check_disk_usage()
00168 
00169         self.check_temps()
00170 
00171     ## Must have the lock to cancel everything
00172     def cancel_timers(self):
00173         if self._temp_timer:
00174             self._temp_timer.cancel()
00175             self._temp_timer = None
00176 
00177         if self._usage_timer:
00178             self._usage_timer.cancel()
00179             self._usage_timer = None
00180 
00181     def check_temps(self):
00182         if rospy.is_shutdown():
00183             with self._mutex:
00184                 self.cancel_timers()
00185             return
00186 
00187         diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
00188                       KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00189         diag_level = DiagnosticStatus.OK
00190         diag_message = 'OK'
00191 
00192         temp_ok, drives, makes, temps = get_hddtemp_data()
00193 
00194         for index in range(0, len(drives)):
00195             temp = temps[index]
00196 
00197             if not unicode(temp).isnumeric() and drives[index] not in REMOVABLE:
00198                 temp_level = DiagnosticStatus.ERROR
00199                 temp_ok = False
00200             elif not unicode(temp).isnumeric() and drives[index] in REMOVABLE:
00201                 temp_level = DiagnosticStatus.OK
00202                 temp = "Removed"
00203             else:
00204                 temp_level = DiagnosticStatus.OK
00205                 if float(temp) > hd_temp_warn:
00206                     temp_level = DiagnosticStatus.WARN
00207                 if float(temp) > hd_temp_error:
00208                     temp_level = DiagnosticStatus.ERROR
00209 
00210             diag_level = max(diag_level, temp_level)
00211 
00212             diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level]))
00213             diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index]))
00214             diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
00215             diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp))
00216 
00217         if not temp_ok:
00218             diag_level = DiagnosticStatus.ERROR
00219 
00220         with self._mutex:
00221             self._last_temp_time = rospy.get_time()
00222             self._temp_stat.values = diag_strs
00223             self._temp_stat.level = diag_level
00224 
00225             # Give No Data message if we have no reading
00226             self._temp_stat.message = temp_dict[diag_level]
00227             if not temp_ok:
00228                 self._temp_stat.message = 'Error'
00229 
00230             if self._no_temp_warn and temp_ok:
00231                 self._temp_stat.level = DiagnosticStatus.OK
00232 
00233             if not rospy.is_shutdown():
00234                 self._temp_timer = threading.Timer(10.0, self.check_temps)
00235                 self._temp_timer.start()
00236             else:
00237                 self.cancel_timers()
00238 
00239     def check_disk_usage(self):
00240         if rospy.is_shutdown():
00241             with self._mutex:
00242                 self.cancel_timers()
00243             return
00244 
00245         diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00246                       KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00247         diag_level = DiagnosticStatus.OK
00248         diag_message = 'OK'
00249 
00250         try:
00251             p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir],
00252                                  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
00253             stdout, stderr = p.communicate()
00254             retcode = p.returncode
00255 
00256             if (retcode == 0):
00257 
00258                 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
00259                 row_count = 0
00260                 for row in stdout.split('\n'):
00261                     if len(row.split()) < 2:
00262                         continue
00263                     if not unicode(row.split()[1]).isnumeric() or float(row.split()[1]) < 10: # Ignore small drives
00264                         continue
00265 
00266                     row_count += 1
00267                     g_available = row.split()[-3]
00268                     name = row.split()[0]
00269                     size = row.split()[1]
00270                     mount_pt = row.split()[-1]
00271 
00272                     if (float(g_available) > low_hd_level):
00273                         level = DiagnosticStatus.OK
00274                     elif (float(g_available) > critical_hd_level):
00275                         level = DiagnosticStatus.WARN
00276                     else:
00277                         level = DiagnosticStatus.ERROR
00278 
00279                     diag_vals.append(KeyValue(
00280                             key = 'Disk %d Name' % row_count, value = name))
00281                     diag_vals.append(KeyValue(
00282                             key = 'Disk %d Available' % row_count, value = g_available))
00283                     diag_vals.append(KeyValue(
00284                             key = 'Disk %d Size' % row_count, value = size))
00285                     diag_vals.append(KeyValue(
00286                             key = 'Disk %d Status' % row_count, value = stat_dict[level]))
00287                     diag_vals.append(KeyValue(
00288                             key = 'Disk %d Mount Point' % row_count, value = mount_pt))
00289 
00290                     diag_level = max(diag_level, level)
00291                     diag_message = usage_dict[diag_level]
00292 
00293             else:
00294                 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
00295                 diag_level = DiagnosticStatus.ERROR
00296                 diag_message = stat_dict[diag_level]
00297 
00298 
00299         except:
00300             rospy.logerr(traceback.format_exc())
00301 
00302             diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
00303             diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))
00304 
00305             diag_level = DiagnosticStatus.ERROR
00306             diag_message = stat_dict[diag_level]
00307 
00308         # Update status
00309         with self._mutex:
00310             self._last_usage_time = rospy.get_time()
00311             self._usage_stat.values = diag_vals
00312             self._usage_stat.message = diag_message
00313             self._usage_stat.level = diag_level
00314 
00315             if not rospy.is_shutdown():
00316                 self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
00317                 self._usage_timer.start()
00318             else:
00319                 self.cancel_timers()
00320 
00321 
00322     def publish_stats(self):
00323         with self._mutex:
00324             update_status_stale(self._temp_stat, self._last_temp_time)
00325 
00326             msg = DiagnosticArray()
00327             msg.header.stamp = rospy.get_rostime()
00328             msg.status.append(self._temp_stat)
00329             if self._home_dir != '':
00330                 update_status_stale(self._usage_stat, self._last_usage_time)
00331                 msg.status.append(self._usage_stat)
00332 
00333             #for s in msg.status:
00334                 #for v in s.values:
00335                     #print v, type(v.value)
00336 
00337             if rospy.get_time() - self._last_publish_time > 0.5:
00338                 self._diag_pub.publish(msg)
00339                 self._last_publish_time = rospy.get_time()
00340 
00341 
00342 
00343 
00344 ##\todo Need to check HD input/output too using iostat
00345 
00346 if __name__ == '__main__':
00347     hostname = socket.gethostname()
00348 
00349     import optparse
00350     parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
00351     parser.add_option("--diag-hostname", dest="diag_hostname",
00352                       help="Computer name in diagnostics output (ex: 'c1')",
00353                       metavar="DIAG_HOSTNAME",
00354                       action="store", default = hostname)
00355     options, args = parser.parse_args(rospy.myargv())
00356 
00357     home_dir = ''
00358     if len(args) > 1:
00359         home_dir = args[1]
00360 
00361     hostname_clean = string.translate(hostname, string.maketrans('-','_'))
00362     try:
00363         rospy.init_node('hd_monitor_%s' % hostname_clean)
00364     except rospy.exceptions.ROSInitException:
00365         print 'HD monitor is unable to initialize node. Master may not be running.'
00366         sys.exit(0)
00367 
00368     hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
00369     rate = rospy.Rate(1.0)
00370 
00371     try:
00372         while not rospy.is_shutdown():
00373             rate.sleep()
00374             hd_monitor.publish_stats()
00375     except KeyboardInterrupt:
00376         pass
00377     except Exception, e:
00378         traceback.print_exc()
00379 
00380     hd_monitor.cancel_timers()
00381     sys.exit(0)
00382 
00383 
00384 


diagnostic_common_diagnostics
Author(s): Brice Rebsamen
autogenerated on Sun Oct 5 2014 23:27:39