hd_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Software License Agreement (BSD License)
5 #
6 # Copyright (c) 2009, Willow Garage, Inc.
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 #
13 # * Redistributions of source code must retain the above copyright
14 # notice, this list of conditions and the following disclaimer.
15 # * Redistributions in binary form must reproduce the above
16 # copyright notice, this list of conditions and the following
17 # disclaimer in the documentation and/or other materials provided
18 # with the distribution.
19 # * Neither the name of the Willow Garage nor the names of its
20 # contributors may be used to endorse or promote products derived
21 # from this software without specific prior written permission.
22 #
23 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 # POSSIBILITY OF SUCH DAMAGE.
35 
36 ##\author Kevin Watts
37 
38 from __future__ import with_statement
39 import roslib
40 roslib.load_manifest('diagnostic_common_diagnostics')
41 
42 import rospy
43 
44 import traceback
45 import threading
46 from threading import Timer
47 import sys, os, time, string
48 from time import sleep
49 import subprocess
50 import socket
51 
52 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
53 
54 low_hd_level = 5
55 critical_hd_level = 1
56 
57 hd_temp_warn = 55 #3580, setting to 55C to after checking manual
58 hd_temp_error = 70 # Above this temperature, hard drives will have serious problems
59 
60 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
61 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' }
62 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' }
63 
64 REMOVABLE = ['/dev/sg1', '/dev/sdb'] # Store removable drives so we can ignore if removed
65 
66 ## Connects to hddtemp daemon to get temp, HD make.
67 def get_hddtemp_data(hostname = 'localhost', port = 7634):
68  try:
69  hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
70  hd_sock.connect((hostname, port))
71  sock_data = ''
72  while True:
73  newdat = hd_sock.recv(1024)
74  if len(newdat) == 0:
75  break
76  sock_data = sock_data + newdat
77  hd_sock.close()
78 
79  sock_vals = sock_data.split('|')
80 
81  # Format of output looks like ' | DRIVE | MAKE | TEMP | '
82  idx = 0
83 
84  drives = []
85  makes = []
86  temps = []
87  while idx + 5 < len(sock_vals):
88  this_drive = sock_vals[idx + 1]
89  this_make = sock_vals[idx + 2]
90  this_temp = sock_vals[idx + 3]
91 
92  # Sometimes we get duplicate makes if hard drives are mounted
93  # to two different points
94  if this_make in makes:
95  idx += 5
96  continue
97 
98  drives.append(this_drive)
99  makes.append(this_make)
100  temps.append(this_temp)
101 
102  idx += 5
103 
104  return True, drives, makes, temps
105  except:
106  rospy.logerr(traceback.format_exc())
107  return False, [ 'Exception' ], [ traceback.format_exc() ], [ '0' ]
108 
109 def update_status_stale(stat, last_update_time):
110  time_since_update = rospy.get_time() - last_update_time
111 
112  stale_status = 'OK'
113  if time_since_update > 20 and time_since_update <= 35:
114  stale_status = 'Lagging'
115  if stat.level == DiagnosticStatus.OK:
116  stat.message = stale_status
117  elif stat.message.find(stale_status) < 0:
118  stat.message = ', '.join([stat.message, stale_status])
119  stat.level = max(stat.level, DiagnosticStatus.WARN)
120  if time_since_update > 35:
121  stale_status = 'Stale'
122  if stat.level == DiagnosticStatus.OK:
123  stat.message = stale_status
124  elif stat.message.find(stale_status) < 0:
125  stat.message = ', '.join([stat.message, stale_status])
126  stat.level = max(stat.level, DiagnosticStatus.ERROR)
127 
128  stat.values.pop(0)
129  stat.values.pop(0)
130  stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
131  stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
132 
133 class hd_monitor():
134  def __init__(self, hostname, diag_hostname, home_dir = ''):
135  self._mutex = threading.Lock()
136 
137  self._hostname = hostname
138  self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
139  if self._no_temp_warn:
140  rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
141  self._home_dir = home_dir
142 
143  self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
144 
148 
149  self._temp_timer = None
150  self._usage_timer = None
151 
152  self._temp_stat = DiagnosticStatus()
153  self._temp_stat.name = "%s HD Temperature" % diag_hostname
154  self._temp_stat.level = DiagnosticStatus.ERROR
155  self._temp_stat.hardware_id = hostname
156  self._temp_stat.message = 'No Data'
157  self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'),
158  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
159 
160  if self._home_dir != '':
161  self._usage_stat = DiagnosticStatus()
162  self._usage_stat.level = DiagnosticStatus.ERROR
163  self._usage_stat.hardware_id = hostname
164  self._usage_stat.name = '%s HD Usage' % diag_hostname
165  self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
166  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
167  self.check_disk_usage()
168 
169  self.check_temps()
170 
171  ## Must have the lock to cancel everything
172  def cancel_timers(self):
173  if self._temp_timer:
174  self._temp_timer.cancel()
175  self._temp_timer = None
176 
177  if self._usage_timer:
178  self._usage_timer.cancel()
179  self._usage_timer = None
180 
181  def check_temps(self):
182  if rospy.is_shutdown():
183  with self._mutex:
184  self.cancel_timers()
185  return
186 
187  diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
188  KeyValue(key = 'Time Since Last Update', value = '0' ) ]
189  diag_level = DiagnosticStatus.OK
190  diag_message = 'OK'
191 
192  temp_ok, drives, makes, temps = get_hddtemp_data()
193 
194  for index in range(0, len(drives)):
195  temp = temps[index]
196 
197  if not unicode(temp).isnumeric() and drives[index] not in REMOVABLE:
198  temp_level = DiagnosticStatus.ERROR
199  temp_ok = False
200  elif not unicode(temp).isnumeric() and drives[index] in REMOVABLE:
201  temp_level = DiagnosticStatus.OK
202  temp = "Removed"
203  else:
204  temp_level = DiagnosticStatus.OK
205  if float(temp) > hd_temp_warn:
206  temp_level = DiagnosticStatus.WARN
207  if float(temp) > hd_temp_error:
208  temp_level = DiagnosticStatus.ERROR
209 
210  diag_level = max(diag_level, temp_level)
211 
212  diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level]))
213  diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index]))
214  diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
215  diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp))
216 
217  if not temp_ok:
218  diag_level = DiagnosticStatus.ERROR
219 
220  with self._mutex:
221  self._last_temp_time = rospy.get_time()
222  self._temp_stat.values = diag_strs
223  self._temp_stat.level = diag_level
224 
225  # Give No Data message if we have no reading
226  self._temp_stat.message = temp_dict[diag_level]
227  if not temp_ok:
228  self._temp_stat.message = 'Error'
229 
230  if self._no_temp_warn and temp_ok:
231  self._temp_stat.level = DiagnosticStatus.OK
232 
233  if not rospy.is_shutdown():
234  self._temp_timer = threading.Timer(10.0, self.check_temps)
235  self._temp_timer.start()
236  else:
237  self.cancel_timers()
238 
239  def check_disk_usage(self):
240  if rospy.is_shutdown():
241  with self._mutex:
242  self.cancel_timers()
243  return
244 
245  diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
246  KeyValue(key = 'Time Since Last Update', value = '0' ) ]
247  diag_level = DiagnosticStatus.OK
248  diag_message = 'OK'
249 
250  try:
251  p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir],
252  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
253  stdout, stderr = p.communicate()
254  retcode = p.returncode
255 
256  if (retcode == 0):
257 
258  diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
259  row_count = 0
260  for row in stdout.split('\n'):
261  if len(row.split()) < 2:
262  continue
263  if not unicode(row.split()[1]).isnumeric() or float(row.split()[1]) < 10: # Ignore small drives
264  continue
265 
266  row_count += 1
267  g_available = row.split()[-3]
268  name = row.split()[0]
269  size = row.split()[1]
270  mount_pt = row.split()[-1]
271 
272  if (float(g_available) > low_hd_level):
273  level = DiagnosticStatus.OK
274  elif (float(g_available) > critical_hd_level):
275  level = DiagnosticStatus.WARN
276  else:
277  level = DiagnosticStatus.ERROR
278 
279  diag_vals.append(KeyValue(
280  key = 'Disk %d Name' % row_count, value = name))
281  diag_vals.append(KeyValue(
282  key = 'Disk %d Available' % row_count, value = g_available))
283  diag_vals.append(KeyValue(
284  key = 'Disk %d Size' % row_count, value = size))
285  diag_vals.append(KeyValue(
286  key = 'Disk %d Status' % row_count, value = stat_dict[level]))
287  diag_vals.append(KeyValue(
288  key = 'Disk %d Mount Point' % row_count, value = mount_pt))
289 
290  diag_level = max(diag_level, level)
291  diag_message = usage_dict[diag_level]
292 
293  else:
294  diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
295  diag_level = DiagnosticStatus.ERROR
296  diag_message = stat_dict[diag_level]
297 
298 
299  except:
300  rospy.logerr(traceback.format_exc())
301 
302  diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
303  diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))
304 
305  diag_level = DiagnosticStatus.ERROR
306  diag_message = stat_dict[diag_level]
307 
308  # Update status
309  with self._mutex:
310  self._last_usage_time = rospy.get_time()
311  self._usage_stat.values = diag_vals
312  self._usage_stat.message = diag_message
313  self._usage_stat.level = diag_level
314 
315  if not rospy.is_shutdown():
316  self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
317  self._usage_timer.start()
318  else:
319  self.cancel_timers()
320 
321 
322  def publish_stats(self):
323  with self._mutex:
325 
326  msg = DiagnosticArray()
327  msg.header.stamp = rospy.get_rostime()
328  msg.status.append(self._temp_stat)
329  if self._home_dir != '':
331  msg.status.append(self._usage_stat)
332 
333  #for s in msg.status:
334  #for v in s.values:
335  #print v, type(v.value)
336 
337  if rospy.get_time() - self._last_publish_time > 0.5:
338  self._diag_pub.publish(msg)
339  self._last_publish_time = rospy.get_time()
340 
341 
342 
343 
344 ##\todo Need to check HD input/output too using iostat
345 
346 if __name__ == '__main__':
347  hostname = socket.gethostname()
348 
349  import optparse
350  parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
351  parser.add_option("--diag-hostname", dest="diag_hostname",
352  help="Computer name in diagnostics output (ex: 'c1')",
353  metavar="DIAG_HOSTNAME",
354  action="store", default = hostname)
355  options, args = parser.parse_args(rospy.myargv())
356 
357  home_dir = ''
358  if len(args) > 1:
359  home_dir = args[1]
360 
361  hostname_clean = string.translate(hostname, string.maketrans('-','_'))
362  try:
363  rospy.init_node('hd_monitor_%s' % hostname_clean)
364  except rospy.exceptions.ROSInitException:
365  print 'HD monitor is unable to initialize node. Master may not be running.'
366  sys.exit(0)
367 
368  hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
369  rate = rospy.Rate(1.0)
370 
371  try:
372  while not rospy.is_shutdown():
373  rate.sleep()
374  hd_monitor.publish_stats()
375  except KeyboardInterrupt:
376  pass
377  except Exception, e:
378  traceback.print_exc()
379 
380  hd_monitor.cancel_timers()
381  sys.exit(0)
382 
383 
384 
def update_status_stale(stat, last_update_time)
Definition: hd_monitor.py:109
def cancel_timers(self)
Must have the lock to cancel everything.
Definition: hd_monitor.py:172
def __init__(self, hostname, diag_hostname, home_dir='')
Definition: hd_monitor.py:134
def get_hddtemp_data(hostname='localhost', port=7634)
Connects to hddtemp daemon to get temp, HD make.
Definition: hd_monitor.py:67


diagnostic_common_diagnostics
Author(s): Brice Rebsamen
autogenerated on Thu Oct 8 2020 03:19:37