00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 from __future__ import with_statement
00038 import roslib
00039 roslib.load_manifest('labust_diagnostics')
00040
00041 import rospy
00042
00043 import traceback
00044 import threading
00045 from threading import Timer
00046 import sys, os, time
00047 from time import sleep
00048 import subprocess
00049
00050 import socket
00051
00052 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00053
00054
00055
00056 if sys.version_info[:3] == (2, 7, 3):
00057 import threading
00058 threading._DummyThread._Thread__stop = lambda x: 42
00059
00060
00061 low_hd_level = 5
00062 critical_hd_level = 1
00063
00064 hd_temp_warn = 55
00065 hd_temp_error = 70
00066
00067 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
00068 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' }
00069 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' }
00070
00071 REMOVABLE = ['/dev/sg1', '/dev/sdb']
00072
00073
00074 def get_hddtemp_data(hostname = 'localhost', port = 7634):
00075 try:
00076 hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
00077 hd_sock.connect((hostname, port))
00078 sock_data = ''
00079 while True:
00080 newdat = hd_sock.recv(1024)
00081 if len(newdat) == 0:
00082 break
00083 sock_data = sock_data + newdat
00084 hd_sock.close()
00085
00086 sock_vals = sock_data.split('|')
00087
00088
00089 idx = 0
00090
00091 drives = []
00092 makes = []
00093 temps = []
00094 while idx + 5 < len(sock_vals):
00095 this_drive = sock_vals[idx + 1]
00096 this_make = sock_vals[idx + 2]
00097 this_temp = sock_vals[idx + 3]
00098
00099
00100
00101 if this_make in makes:
00102 idx += 5
00103 continue
00104
00105 drives.append(this_drive)
00106 makes.append(this_make)
00107 temps.append(this_temp)
00108
00109 idx += 5
00110
00111 return True, drives, makes, temps
00112 except:
00113 rospy.logerr(traceback.format_exc())
00114 return False, [ 'Exception' ], [ traceback.format_exc() ], [ 0 ]
00115
00116 def update_status_stale(stat, last_update_time):
00117 time_since_update = rospy.get_time() - last_update_time
00118
00119 stale_status = 'OK'
00120 if time_since_update > 20 and time_since_update <= 35:
00121 stale_status = 'Lagging'
00122 if stat.level == DiagnosticStatus.OK:
00123 stat.message = stale_status
00124 elif stat.message.find(stale_status) < 0:
00125 stat.message = ', '.join([stat.message, stale_status])
00126 stat.level = max(stat.level, DiagnosticStatus.WARN)
00127 if time_since_update > 35:
00128 stale_status = 'Stale'
00129 if stat.level == DiagnosticStatus.OK:
00130 stat.message = stale_status
00131 elif stat.message.find(stale_status) < 0:
00132 stat.message = ', '.join([stat.message, stale_status])
00133 stat.level = max(stat.level, DiagnosticStatus.ERROR)
00134
00135 stat.values.pop(0)
00136 stat.values.pop(0)
00137 stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
00138 stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
00139
00140 class hd_monitor():
00141 def __init__(self, hostname, diag_hostname, home_dir = ''):
00142 self._mutex = threading.Lock()
00143
00144 self._hostname = hostname
00145 self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
00146 if self._no_temp_warn:
00147 rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
00148 self._home_dir = home_dir
00149
00150 self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray)
00151
00152 self._last_temp_time = 0
00153 self._last_usage_time = 0
00154 self._last_publish_time = 0
00155
00156 self._temp_timer = None
00157 self._usage_timer = None
00158
00159 self._temp_stat = DiagnosticStatus()
00160 self._temp_stat.name = "%s HD Temperature" % diag_hostname
00161 self._temp_stat.level = DiagnosticStatus.ERROR
00162 self._temp_stat.hardware_id = hostname
00163 self._temp_stat.message = 'No Data'
00164 self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'),
00165 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00166
00167 if self._home_dir != '':
00168 self._usage_stat = DiagnosticStatus()
00169 self._usage_stat.level = DiagnosticStatus.ERROR
00170 self._usage_stat.hardware_id = hostname
00171 self._usage_stat.name = '%s HD Usage' % diag_hostname
00172 self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
00173 KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
00174 self.check_disk_usage()
00175
00176 self.check_temps()
00177
00178
00179 def cancel_timers(self):
00180 if self._temp_timer:
00181 self._temp_timer.cancel()
00182 self._temp_timer = None
00183
00184 if self._usage_timer:
00185 self._usage_timer.cancel()
00186 self._usage_timer = None
00187
00188 def check_temps(self):
00189 if rospy.is_shutdown():
00190 with self._mutex:
00191 self.cancel_timers()
00192 return
00193
00194 diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
00195 KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00196 diag_level = DiagnosticStatus.OK
00197 diag_message = 'OK'
00198
00199 temp_ok, drives, makes, temps = get_hddtemp_data()
00200
00201 for index in range(0, len(drives)):
00202 temp = temps[index]
00203
00204 if not unicode(temp).isnumeric() and drives[index] not in REMOVABLE:
00205 temp_level = DiagnosticStatus.ERROR
00206 temp_ok = False
00207 elif not unicode(temp).isnumeric() and drives[index] in REMOVABLE:
00208 temp_level = DiagnosticStatus.OK
00209 temp = "Removed"
00210 else:
00211 temp_level = DiagnosticStatus.OK
00212 if float(temp) > hd_temp_warn:
00213 temp_level = DiagnosticStatus.WARN
00214 if float(temp) > hd_temp_error:
00215 temp_level = DiagnosticStatus.ERROR
00216
00217 diag_level = max(diag_level, temp_level)
00218
00219 diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level]))
00220 diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index]))
00221 diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
00222 diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp))
00223
00224 if not temp_ok:
00225 diag_level = DiagnosticStatus.ERROR
00226
00227 with self._mutex:
00228 self._last_temp_time = rospy.get_time()
00229 self._temp_stat.values = diag_strs
00230 self._temp_stat.level = diag_level
00231
00232
00233 self._temp_stat.message = temp_dict[diag_level]
00234 if not temp_ok:
00235 self._temp_stat.message = 'Error'
00236
00237 if self._no_temp_warn and temp_ok:
00238 self._temp_stat.level = DiagnosticStatus.OK
00239
00240 if not rospy.is_shutdown():
00241 self._temp_timer = threading.Timer(10.0, self.check_temps)
00242 self._temp_timer.start()
00243 else:
00244 self.cancel_timers()
00245
00246 def check_disk_usage(self):
00247 if rospy.is_shutdown():
00248 with self._mutex:
00249 self.cancel_timers()
00250 return
00251
00252 diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
00253 KeyValue(key = 'Time Since Last Update', value = '0' ) ]
00254 diag_level = DiagnosticStatus.OK
00255 diag_message = 'OK'
00256
00257 try:
00258 p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir],
00259 stdout=subprocess.PIPE, stderr=subprocess.PIPE)
00260 stdout, stderr = p.communicate()
00261 retcode = p.returncode
00262
00263 if (retcode == 0):
00264
00265 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
00266 row_count = 0
00267 for row in stdout.split('\n'):
00268 if len(row.split()) < 2:
00269 continue
00270 if not unicode(row.split()[1]).isnumeric() or float(row.split()[1]) < 10:
00271 continue
00272
00273 row_count += 1
00274 g_available = row.split()[-3]
00275 name = row.split()[0]
00276 size = row.split()[1]
00277 mount_pt = row.split()[-1]
00278
00279 if (float(g_available) > low_hd_level):
00280 level = DiagnosticStatus.OK
00281 elif (float(g_available) > critical_hd_level):
00282 level = DiagnosticStatus.WARN
00283 else:
00284 level = DiagnosticStatus.ERROR
00285
00286 diag_vals.append(KeyValue(
00287 key = 'Disk %d Name' % row_count, value = name))
00288 diag_vals.append(KeyValue(
00289 key = 'Disk %d Available' % row_count, value = g_available))
00290 diag_vals.append(KeyValue(
00291 key = 'Disk %d Size' % row_count, value = size))
00292 diag_vals.append(KeyValue(
00293 key = 'Disk %d Status' % row_count, value = stat_dict[level]))
00294 diag_vals.append(KeyValue(
00295 key = 'Disk %d Mount Point' % row_count, value = mount_pt))
00296
00297 diag_level = max(diag_level, level)
00298 diag_message = usage_dict[diag_level]
00299
00300 else:
00301 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
00302 diag_level = DiagnosticStatus.ERROR
00303 diag_message = stat_dict[diag_level]
00304
00305
00306 except:
00307 rospy.logerr(traceback.format_exc())
00308
00309 diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
00310 diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))
00311
00312 diag_level = DiagnosticStatus.ERROR
00313 diag_message = stat_dict[diag_level]
00314
00315
00316 with self._mutex:
00317 self._last_usage_time = rospy.get_time()
00318 self._usage_stat.values = diag_vals
00319 self._usage_stat.message = diag_message
00320 self._usage_stat.level = diag_level
00321
00322 if not rospy.is_shutdown():
00323 self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
00324 self._usage_timer.start()
00325 else:
00326 self.cancel_timers()
00327
00328
00329 def publish_stats(self):
00330 with self._mutex:
00331 update_status_stale(self._temp_stat, self._last_temp_time)
00332
00333 msg = DiagnosticArray()
00334 msg.header.stamp = rospy.get_rostime()
00335 msg.status.append(self._temp_stat)
00336 if self._home_dir != '':
00337 update_status_stale(self._usage_stat, self._last_usage_time)
00338 msg.status.append(self._usage_stat)
00339
00340 if rospy.get_time() - self._last_publish_time > 0.5:
00341 self._diag_pub.publish(msg)
00342 self._last_publish_time = rospy.get_time()
00343
00344
00345
00346
00347
00348
00349 if __name__ == '__main__':
00350 hostname = socket.gethostname()
00351
00352 import optparse
00353 parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
00354 parser.add_option("--diag-hostname", dest="diag_hostname",
00355 help="Computer name in diagnostics output (ex: 'c1')",
00356 metavar="DIAG_HOSTNAME",
00357 action="store", default = hostname)
00358 options, args = parser.parse_args(rospy.myargv())
00359
00360 home_dir = ''
00361 if len(args) > 1:
00362 home_dir = args[1]
00363
00364 try:
00365 rospy.init_node('hd_monitor_%s' % hostname)
00366 except rospy.exceptions.ROSInitException:
00367 print 'HD monitor is unable to initialize node. Master may not be running.'
00368 sys.exit(0)
00369
00370 hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
00371 rate = rospy.Rate(1.0)
00372
00373 try:
00374 while not rospy.is_shutdown():
00375 rate.sleep()
00376 hd_monitor.publish_stats()
00377 except KeyboardInterrupt:
00378 pass
00379 except Exception, e:
00380 traceback.print_exc()
00381
00382 hd_monitor.cancel_timers()
00383 sys.exit(0)
00384
00385
00386