$search
00001 #!/usr/bin/env python 00002 # 00003 # Software License Agreement (BSD License) 00004 # 00005 # Copyright (c) 2009, Willow Garage, Inc. 00006 # All rights reserved. 00007 # 00008 # Redistribution and use in source and binary forms, with or without 00009 # modification, are permitted provided that the following conditions 00010 # are met: 00011 # 00012 # * Redistributions of source code must retain the above copyright 00013 # notice, this list of conditions and the following disclaimer. 00014 # * Redistributions in binary form must reproduce the above 00015 # copyright notice, this list of conditions and the following 00016 # disclaimer in the documentation and/or other materials provided 00017 # with the distribution. 00018 # * Neither the name of the Willow Garage nor the names of its 00019 # contributors may be used to endorse or promote products derived 00020 # from this software without specific prior written permission. 00021 # 00022 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 00023 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 00024 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 00025 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 00026 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 00027 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 00028 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 00029 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 00030 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 00031 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 00032 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 00033 # POSSIBILITY OF SUCH DAMAGE. 00034 00035 ##\author Kevin Watts 00036 00037 from __future__ import with_statement 00038 import roslib 00039 roslib.load_manifest('pr2_computer_monitor') 00040 00041 import rospy 00042 00043 import traceback 00044 import threading 00045 from threading import Timer 00046 import sys, os, time 00047 from time import sleep 00048 import subprocess 00049 00050 import socket 00051 00052 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue 00053 00054 low_hd_level = 5 00055 critical_hd_level = 1 00056 00057 hd_temp_warn = 55 #3580, setting to 55C to after checking manual 00058 hd_temp_error = 70 # Above this temperature, hard drives will have serious problems 00059 00060 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' } 00061 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' } 00062 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' } 00063 00064 REMOVABLE = ['/dev/sg1', '/dev/sdb'] # Store removable drives so we can ignore if removed 00065 00066 ## Connects to hddtemp daemon to get temp, HD make. 00067 def get_hddtemp_data(hostname = 'localhost', port = 7634): 00068 try: 00069 hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) 00070 hd_sock.connect((hostname, port)) 00071 sock_data = '' 00072 while True: 00073 newdat = hd_sock.recv(1024) 00074 if len(newdat) == 0: 00075 break 00076 sock_data = sock_data + newdat 00077 hd_sock.close() 00078 00079 sock_vals = sock_data.split('|') 00080 00081 # Format of output looks like ' | DRIVE | MAKE | TEMP | ' 00082 idx = 0 00083 00084 drives = [] 00085 makes = [] 00086 temps = [] 00087 while idx + 5 < len(sock_vals): 00088 this_drive = sock_vals[idx + 1] 00089 this_make = sock_vals[idx + 2] 00090 this_temp = sock_vals[idx + 3] 00091 00092 # Sometimes we get duplicate makes if hard drives are mounted 00093 # to two different points 00094 if this_make in makes: 00095 idx += 5 00096 continue 00097 00098 drives.append(this_drive) 00099 makes.append(this_make) 00100 temps.append(this_temp) 00101 00102 idx += 5 00103 00104 return True, drives, makes, temps 00105 except: 00106 rospy.logerr(traceback.format_exc()) 00107 return False, [ 'Exception' ], [ traceback.format_exc() ], [ 0 ] 00108 00109 def update_status_stale(stat, last_update_time): 00110 time_since_update = rospy.get_time() - last_update_time 00111 00112 stale_status = 'OK' 00113 if time_since_update > 20 and time_since_update <= 35: 00114 stale_status = 'Lagging' 00115 if stat.level == DiagnosticStatus.OK: 00116 stat.message = stale_status 00117 elif stat.message.find(stale_status) < 0: 00118 stat.message = ', '.join([stat.message, stale_status]) 00119 stat.level = max(stat.level, DiagnosticStatus.WARN) 00120 if time_since_update > 35: 00121 stale_status = 'Stale' 00122 if stat.level == DiagnosticStatus.OK: 00123 stat.message = stale_status 00124 elif stat.message.find(stale_status) < 0: 00125 stat.message = ', '.join([stat.message, stale_status]) 00126 stat.level = max(stat.level, DiagnosticStatus.ERROR) 00127 00128 stat.values.pop(0) 00129 stat.values.pop(0) 00130 stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status)) 00131 stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update))) 00132 00133 class hd_monitor(): 00134 def __init__(self, hostname, diag_hostname, home_dir = ''): 00135 self._mutex = threading.Lock() 00136 00137 self._hostname = hostname 00138 self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False) 00139 if self._no_temp_warn: 00140 rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle') 00141 self._home_dir = home_dir 00142 00143 self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray) 00144 00145 self._last_temp_time = 0 00146 self._last_usage_time = 0 00147 self._last_publish_time = 0 00148 00149 self._temp_timer = None 00150 self._usage_timer = None 00151 00152 self._temp_stat = DiagnosticStatus() 00153 self._temp_stat.name = "%s HD Temperature" % diag_hostname 00154 self._temp_stat.level = DiagnosticStatus.ERROR 00155 self._temp_stat.hardware_id = hostname 00156 self._temp_stat.message = 'No Data' 00157 self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'), 00158 KeyValue(key = 'Time Since Last Update', value = 'N/A') ] 00159 00160 if self._home_dir != '': 00161 self._usage_stat = DiagnosticStatus() 00162 self._usage_stat.level = DiagnosticStatus.ERROR 00163 self._usage_stat.hardware_id = hostname 00164 self._usage_stat.name = '%s HD Usage' % diag_hostname 00165 self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ), 00166 KeyValue(key = 'Time Since Last Update', value = 'N/A') ] 00167 self.check_disk_usage() 00168 00169 self.check_temps() 00170 00171 ## Must have the lock to cancel everything 00172 def cancel_timers(self): 00173 if self._temp_timer: 00174 self._temp_timer.cancel() 00175 self._temp_timer = None 00176 00177 if self._usage_timer: 00178 self._usage_timer.cancel() 00179 self._usage_timer = None 00180 00181 def check_temps(self): 00182 if rospy.is_shutdown(): 00183 with self._mutex: 00184 self.cancel_timers() 00185 return 00186 00187 diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) , 00188 KeyValue(key = 'Time Since Last Update', value = '0' ) ] 00189 diag_level = DiagnosticStatus.OK 00190 diag_message = 'OK' 00191 00192 temp_ok, drives, makes, temps = get_hddtemp_data() 00193 00194 for index in range(0, len(drives)): 00195 temp = temps[index] 00196 00197 if not unicode(temp).isnumeric() and drives[index] not in REMOVABLE: 00198 temp_level = DiagnosticStatus.ERROR 00199 temp_ok = False 00200 elif not unicode(temp).isnumeric() and drives[index] in REMOVABLE: 00201 temp_level = DiagnosticStatus.OK 00202 temp = "Removed" 00203 else: 00204 temp_level = DiagnosticStatus.OK 00205 if float(temp) > hd_temp_warn: 00206 temp_level = DiagnosticStatus.WARN 00207 if float(temp) > hd_temp_error: 00208 temp_level = DiagnosticStatus.ERROR 00209 00210 diag_level = max(diag_level, temp_level) 00211 00212 diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level])) 00213 diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index])) 00214 diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index])) 00215 diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp)) 00216 00217 if not temp_ok: 00218 diag_level = DiagnosticStatus.ERROR 00219 00220 with self._mutex: 00221 self._last_temp_time = rospy.get_time() 00222 self._temp_stat.values = diag_strs 00223 self._temp_stat.level = diag_level 00224 00225 # Give No Data message if we have no reading 00226 self._temp_stat.message = temp_dict[diag_level] 00227 if not temp_ok: 00228 self._temp_stat.message = 'Error' 00229 00230 if self._no_temp_warn and temp_ok: 00231 self._temp_stat.level = DiagnosticStatus.OK 00232 00233 if not rospy.is_shutdown(): 00234 self._temp_timer = threading.Timer(10.0, self.check_temps) 00235 self._temp_timer.start() 00236 else: 00237 self.cancel_timers() 00238 00239 def check_disk_usage(self): 00240 if rospy.is_shutdown(): 00241 with self._mutex: 00242 self.cancel_timers() 00243 return 00244 00245 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ), 00246 KeyValue(key = 'Time Since Last Update', value = '0' ) ] 00247 diag_level = DiagnosticStatus.OK 00248 diag_message = 'OK' 00249 00250 try: 00251 p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir], 00252 stdout=subprocess.PIPE, stderr=subprocess.PIPE) 00253 stdout, stderr = p.communicate() 00254 retcode = p.returncode 00255 00256 if (retcode == 0): 00257 00258 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK')) 00259 row_count = 0 00260 for row in stdout.split('\n'): 00261 if len(row.split()) < 2: 00262 continue 00263 if not unicode(row.split()[1]).isnumeric() or float(row.split()[1]) < 10: # Ignore small drives 00264 continue 00265 00266 row_count += 1 00267 g_available = row.split()[-3] 00268 name = row.split()[0] 00269 size = row.split()[1] 00270 mount_pt = row.split()[-1] 00271 00272 if (float(g_available) > low_hd_level): 00273 level = DiagnosticStatus.OK 00274 elif (float(g_available) > critical_hd_level): 00275 level = DiagnosticStatus.WARN 00276 else: 00277 level = DiagnosticStatus.ERROR 00278 00279 diag_vals.append(KeyValue( 00280 key = 'Disk %d Name' % row_count, value = name)) 00281 diag_vals.append(KeyValue( 00282 key = 'Disk %d Available' % row_count, value = g_available)) 00283 diag_vals.append(KeyValue( 00284 key = 'Disk %d Size' % row_count, value = size)) 00285 diag_vals.append(KeyValue( 00286 key = 'Disk %d Status' % row_count, value = stat_dict[level])) 00287 diag_vals.append(KeyValue( 00288 key = 'Disk %d Mount Point' % row_count, value = mount_pt)) 00289 00290 diag_level = max(diag_level, level) 00291 diag_message = usage_dict[diag_level] 00292 00293 else: 00294 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed')) 00295 diag_level = DiagnosticStatus.ERROR 00296 diag_message = stat_dict[diag_level] 00297 00298 00299 except: 00300 rospy.logerr(traceback.format_exc()) 00301 00302 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception')) 00303 diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc())) 00304 00305 diag_level = DiagnosticStatus.ERROR 00306 diag_message = stat_dict[diag_level] 00307 00308 # Update status 00309 with self._mutex: 00310 self._last_usage_time = rospy.get_time() 00311 self._usage_stat.values = diag_vals 00312 self._usage_stat.message = diag_message 00313 self._usage_stat.level = diag_level 00314 00315 if not rospy.is_shutdown(): 00316 self._usage_timer = threading.Timer(5.0, self.check_disk_usage) 00317 self._usage_timer.start() 00318 else: 00319 self.cancel_timers() 00320 00321 00322 def publish_stats(self): 00323 with self._mutex: 00324 update_status_stale(self._temp_stat, self._last_temp_time) 00325 00326 msg = DiagnosticArray() 00327 msg.header.stamp = rospy.get_rostime() 00328 msg.status.append(self._temp_stat) 00329 if self._home_dir != '': 00330 update_status_stale(self._usage_stat, self._last_usage_time) 00331 msg.status.append(self._usage_stat) 00332 00333 if rospy.get_time() - self._last_publish_time > 0.5: 00334 self._diag_pub.publish(msg) 00335 self._last_publish_time = rospy.get_time() 00336 00337 00338 00339 00340 ##\todo Need to check HD input/output too using iostat 00341 00342 if __name__ == '__main__': 00343 hostname = socket.gethostname() 00344 00345 import optparse 00346 parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]") 00347 parser.add_option("--diag-hostname", dest="diag_hostname", 00348 help="Computer name in diagnostics output (ex: 'c1')", 00349 metavar="DIAG_HOSTNAME", 00350 action="store", default = hostname) 00351 options, args = parser.parse_args(rospy.myargv()) 00352 00353 home_dir = '' 00354 if len(args) > 1: 00355 home_dir = args[1] 00356 00357 try: 00358 rospy.init_node('hd_monitor_%s' % hostname) 00359 except rospy.exceptions.ROSInitException: 00360 print 'HD monitor is unable to initialize node. Master may not be running.' 00361 sys.exit(0) 00362 00363 hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir) 00364 rate = rospy.Rate(1.0) 00365 00366 try: 00367 while not rospy.is_shutdown(): 00368 rate.sleep() 00369 hd_monitor.publish_stats() 00370 except KeyboardInterrupt: 00371 pass 00372 except Exception, e: 00373 traceback.print_exc() 00374 00375 hd_monitor.cancel_timers() 00376 sys.exit(0) 00377 00378 00379