00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037
00038 from __future__ import with_statement
00039 import roslib
00040 roslib.load_manifest('diagnostic_common_diagnostics')
00041
00042 import rospy
00043
00044 import traceback
00045 import threading
00046 from threading import Timer
00047 import sys, os, time, string
00048 from time import sleep
00049 import subprocess
00050 import socket
00051
00052 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00053
00054 low_hd_level = 5
00055 critical_hd_level = 1
00056
00057 hd_temp_warn = 55
00058 hd_temp_error = 70
00059
00060 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00061 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' }
00062 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' }
00063
00064 REMOVABLE = ['/dev/sg1', '/dev/sdb']
00065
00066
00067 def get_hddtemp_data(hostname = 'localhost', port = 7634):
00068 try:
00069 hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
00070 hd_sock.connect((hostname, port))
00071 sock_data = ''
00072 while True:
00073 newdat = hd_sock.recv(1024)
00074 if len(newdat) == 0:
00075 break
00076 sock_data = sock_data + newdat
00077 hd_sock.close()
00078
00079 sock_vals = sock_data.split('|')
00080
00081
00082 idx = 0
00083
00084 drives = []
00085 makes = []
00086 temps = []
00087 while idx + 5 < len(sock_vals):
00088 this_drive = sock_vals[idx + 1]
00089 this_make = sock_vals[idx + 2]
00090 this_temp = sock_vals[idx + 3]
00091
00092
00093
00094 if this_make in makes:
00095 idx += 5
00096 continue
00097
00098 drives.append(this_drive)
00099 makes.append(this_make)
00100 temps.append(this_temp)
00101
00102 idx += 5
00103
00104 return True, drives, makes, temps
00105 except:
00106 rospy.logerr(traceback.format_exc())
00107 return False, [ 'Exception' ], [ traceback.format_exc() ], [ '0' ]
00108
00109 def update_status_stale(stat, last_update_time):
00110 time_since_update = rospy.get_time() - last_update_time
00111
00112 stale_status = 'OK'
00113 if time_since_update > 20 and time_since_update <= 35:
00114 stale_status = 'Lagging'
00115 if stat.level == DiagnosticStatus.OK:
00116 stat.message = stale_status
00117 elif stat.message.find(stale_status) < 0:
00118 stat.message = ', '.join([stat.message, stale_status])
00119 stat.level = max(stat.level, DiagnosticStatus.WARN)
00120 if time_since_update > 35:
00121 stale_status = 'Stale'
00122 if stat.level == DiagnosticStatus.OK:
00123 stat.message = stale_status
00124 elif stat.message.find(stale_status) < 0:
00125 stat.message = ', '.join([stat.message, stale_status])
00126 stat.level = max(stat.level, DiagnosticStatus.ERROR)
00127
00128 stat.values.pop(0)
00129 stat.values.pop(0)
00130 stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00131 stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00132
00133 class hd_monitor():
00134 def __init__(self, hostname, diag_hostname, home_dir = ''):
00135 self._mutex = threading.Lock()
00136
00137 self._hostname = hostname
00138 self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
00139 if self._no_temp_warn:
00140 rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
00141 self._home_dir = home_dir
00142
00143 self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
00144
00145 self._last_temp_time = 0
00146 self._last_usage_time = 0
00147 self._last_publish_time = 0
00148
00149 self._temp_timer = None
00150 self._usage_timer = None
00151
00152 self._temp_stat = DiagnosticStatus()
00153 self._temp_stat.name = "%s HD Temperature" % diag_hostname
00154 self._temp_stat.level = DiagnosticStatus.ERROR
00155 self._temp_stat.hardware_id = hostname
00156 self._temp_stat.message = 'No Data'
00157 self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'),
00158 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00159
00160 if self._home_dir != '':
00161 self._usage_stat = DiagnosticStatus()
00162 self._usage_stat.level = DiagnosticStatus.ERROR
00163 self._usage_stat.hardware_id = hostname
00164 self._usage_stat.name = '%s HD Usage' % diag_hostname
00165 self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00166 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00167 self.check_disk_usage()
00168
00169 self.check_temps()
00170
00171
00172 def cancel_timers(self):
00173 if self._temp_timer:
00174 self._temp_timer.cancel()
00175 self._temp_timer = None
00176
00177 if self._usage_timer:
00178 self._usage_timer.cancel()
00179 self._usage_timer = None
00180
00181 def check_temps(self):
00182 if rospy.is_shutdown():
00183 with self._mutex:
00184 self.cancel_timers()
00185 return
00186
00187 diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
00188 KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00189 diag_level = DiagnosticStatus.OK
00190 diag_message = 'OK'
00191
00192 temp_ok, drives, makes, temps = get_hddtemp_data()
00193
00194 for index in range(0, len(drives)):
00195 temp = temps[index]
00196
00197 if not unicode(temp).isnumeric() and drives[index] not in REMOVABLE:
00198 temp_level = DiagnosticStatus.ERROR
00199 temp_ok = False
00200 elif not unicode(temp).isnumeric() and drives[index] in REMOVABLE:
00201 temp_level = DiagnosticStatus.OK
00202 temp = "Removed"
00203 else:
00204 temp_level = DiagnosticStatus.OK
00205 if float(temp) > hd_temp_warn:
00206 temp_level = DiagnosticStatus.WARN
00207 if float(temp) > hd_temp_error:
00208 temp_level = DiagnosticStatus.ERROR
00209
00210 diag_level = max(diag_level, temp_level)
00211
00212 diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level]))
00213 diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index]))
00214 diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
00215 diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp))
00216
00217 if not temp_ok:
00218 diag_level = DiagnosticStatus.ERROR
00219
00220 with self._mutex:
00221 self._last_temp_time = rospy.get_time()
00222 self._temp_stat.values = diag_strs
00223 self._temp_stat.level = diag_level
00224
00225
00226 self._temp_stat.message = temp_dict[diag_level]
00227 if not temp_ok:
00228 self._temp_stat.message = 'Error'
00229
00230 if self._no_temp_warn and temp_ok:
00231 self._temp_stat.level = DiagnosticStatus.OK
00232
00233 if not rospy.is_shutdown():
00234 self._temp_timer = threading.Timer(10.0, self.check_temps)
00235 self._temp_timer.start()
00236 else:
00237 self.cancel_timers()
00238
00239 def check_disk_usage(self):
00240 if rospy.is_shutdown():
00241 with self._mutex:
00242 self.cancel_timers()
00243 return
00244
00245 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00246 KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00247 diag_level = DiagnosticStatus.OK
00248 diag_message = 'OK'
00249
00250 try:
00251 p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir],
00252 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
00253 stdout, stderr = p.communicate()
00254 retcode = p.returncode
00255
00256 if (retcode == 0):
00257
00258 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
00259 row_count = 0
00260 for row in stdout.split('\n'):
00261 if len(row.split()) < 2:
00262 continue
00263 if not unicode(row.split()[1]).isnumeric() or float(row.split()[1]) < 10:
00264 continue
00265
00266 row_count += 1
00267 g_available = row.split()[-3]
00268 name = row.split()[0]
00269 size = row.split()[1]
00270 mount_pt = row.split()[-1]
00271
00272 if (float(g_available) > low_hd_level):
00273 level = DiagnosticStatus.OK
00274 elif (float(g_available) > critical_hd_level):
00275 level = DiagnosticStatus.WARN
00276 else:
00277 level = DiagnosticStatus.ERROR
00278
00279 diag_vals.append(KeyValue(
00280 key = 'Disk %d Name' % row_count, value = name))
00281 diag_vals.append(KeyValue(
00282 key = 'Disk %d Available' % row_count, value = g_available))
00283 diag_vals.append(KeyValue(
00284 key = 'Disk %d Size' % row_count, value = size))
00285 diag_vals.append(KeyValue(
00286 key = 'Disk %d Status' % row_count, value = stat_dict[level]))
00287 diag_vals.append(KeyValue(
00288 key = 'Disk %d Mount Point' % row_count, value = mount_pt))
00289
00290 diag_level = max(diag_level, level)
00291 diag_message = usage_dict[diag_level]
00292
00293 else:
00294 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
00295 diag_level = DiagnosticStatus.ERROR
00296 diag_message = stat_dict[diag_level]
00297
00298
00299 except:
00300 rospy.logerr(traceback.format_exc())
00301
00302 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
00303 diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))
00304
00305 diag_level = DiagnosticStatus.ERROR
00306 diag_message = stat_dict[diag_level]
00307
00308
00309 with self._mutex:
00310 self._last_usage_time = rospy.get_time()
00311 self._usage_stat.values = diag_vals
00312 self._usage_stat.message = diag_message
00313 self._usage_stat.level = diag_level
00314
00315 if not rospy.is_shutdown():
00316 self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
00317 self._usage_timer.start()
00318 else:
00319 self.cancel_timers()
00320
00321
00322 def publish_stats(self):
00323 with self._mutex:
00324 update_status_stale(self._temp_stat, self._last_temp_time)
00325
00326 msg = DiagnosticArray()
00327 msg.header.stamp = rospy.get_rostime()
00328 msg.status.append(self._temp_stat)
00329 if self._home_dir != '':
00330 update_status_stale(self._usage_stat, self._last_usage_time)
00331 msg.status.append(self._usage_stat)
00332
00333
00334
00335
00336
00337 if rospy.get_time() - self._last_publish_time > 0.5:
00338 self._diag_pub.publish(msg)
00339 self._last_publish_time = rospy.get_time()
00340
00341
00342
00343
00344
00345
00346 if __name__ == '__main__':
00347 hostname = socket.gethostname()
00348
00349 import optparse
00350 parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
00351 parser.add_option("--diag-hostname", dest="diag_hostname",
00352 help="Computer name in diagnostics output (ex: 'c1')",
00353 metavar="DIAG_HOSTNAME",
00354 action="store", default = hostname)
00355 options, args = parser.parse_args(rospy.myargv())
00356
00357 home_dir = ''
00358 if len(args) > 1:
00359 home_dir = args[1]
00360
00361 hostname_clean = string.translate(hostname, string.maketrans('-','_'))
00362 try:
00363 rospy.init_node('hd_monitor_%s' % hostname_clean)
00364 except rospy.exceptions.ROSInitException:
00365 print 'HD monitor is unable to initialize node. Master may not be running.'
00366 sys.exit(0)
00367
00368 hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
00369 rate = rospy.Rate(1.0)
00370
00371 try:
00372 while not rospy.is_shutdown():
00373 rate.sleep()
00374 hd_monitor.publish_stats()
00375 except KeyboardInterrupt:
00376 pass
00377 except Exception, e:
00378 traceback.print_exc()
00379
00380 hd_monitor.cancel_timers()
00381 sys.exit(0)
00382
00383
00384