hd_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 # -*- coding: utf-8 -*-
3 #
4 # Software License Agreement (BSD License)
5 #
6 # Copyright (c) 2009, Willow Garage, Inc.
7 # All rights reserved.
8 #
9 # Redistribution and use in source and binary forms, with or without
10 # modification, are permitted provided that the following conditions
11 # are met:
12 #
13 # * Redistributions of source code must retain the above copyright
14 # notice, this list of conditions and the following disclaimer.
15 # * Redistributions in binary form must reproduce the above
16 # copyright notice, this list of conditions and the following
17 # disclaimer in the documentation and/or other materials provided
18 # with the distribution.
19 # * Neither the name of the Willow Garage nor the names of its
20 # contributors may be used to endorse or promote products derived
21 # from this software without specific prior written permission.
22 #
23 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
24 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
25 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
26 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
27 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
28 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
29 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
30 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
31 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
32 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
33 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 # POSSIBILITY OF SUCH DAMAGE.
35 
36 # \author Kevin Watts
37 
38 from __future__ import with_statement
39 import roslib
40 import rospy
41 import traceback
42 import threading
43 import sys
44 import subprocess
45 import socket
46 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
47 
48 roslib.load_manifest('diagnostic_common_diagnostics')
49 low_hd_level = 5
50 critical_hd_level = 1
51 
52 hd_temp_warn = 55 # 3580, setting to 55C to after checking manual
53 hd_temp_error = 70 # Above this temperature, hard drives will have serious problems
54 
55 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
56 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' }
57 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' }
58 
59 REMOVABLE = ['/dev/sg1', '/dev/sdb'] # Store removable drives so we can ignore if removed
60 
61 
62 # Connects to hddtemp daemon to get temp, HD make.
63 def get_hddtemp_data(hostname='localhost', port=7634):
64  try:
65  hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
66  hd_sock.connect((hostname, port))
67  sock_data = b''
68  while True:
69  newdat = hd_sock.recv(1024)
70  if len(newdat) == 0:
71  break
72  sock_data = sock_data + newdat
73  hd_sock.close()
74 
75  sock_vals = sock_data.decode().split('|')
76 
77  # Format of output looks like ' | DRIVE | MAKE | TEMP | '
78  idx = 0
79 
80  drives = []
81  makes = []
82  temps = []
83  while idx + 5 < len(sock_vals):
84  this_drive = sock_vals[idx + 1]
85  this_make = sock_vals[idx + 2]
86  this_temp = sock_vals[idx + 3]
87 
88  # Sometimes we get duplicate makes if hard drives are mounted
89  # to two different points
90  if this_make in makes:
91  idx += 5
92  continue
93 
94  drives.append(this_drive)
95  makes.append(this_make)
96  temps.append(this_temp)
97 
98  idx += 5
99 
100  return True, drives, makes, temps
101  except Exception:
102  rospy.logerr(traceback.format_exc())
103  return False, ['Exception'], [traceback.format_exc()], ['0']
104 
105 
106 def update_status_stale(stat, last_update_time):
107  time_since_update = rospy.get_time() - last_update_time
108 
109  stale_status = 'OK'
110  if 20 < time_since_update <= 35:
111  stale_status = 'Lagging'
112  if stat.level == DiagnosticStatus.OK:
113  stat.message = stale_status
114  elif stat.message.find(stale_status) < 0:
115  stat.message = ', '.join([stat.message, stale_status])
116  stat.level = max(stat.level, DiagnosticStatus.WARN)
117  if time_since_update > 35:
118  stale_status = 'Stale'
119  if stat.level == DiagnosticStatus.OK:
120  stat.message = stale_status
121  elif stat.message.find(stale_status) < 0:
122  stat.message = ', '.join([stat.message, stale_status])
123  stat.level = max(stat.level, DiagnosticStatus.ERROR)
124 
125  stat.values.pop(0)
126  stat.values.pop(0)
127  stat.values.insert(0, KeyValue(key='Update Status', value=stale_status))
128  stat.values.insert(1, KeyValue(key='Time Since Update', value=str(time_since_update)))
129 
130 
131 class hd_monitor():
132  def __init__(self, hostname, diag_hostname, home_dir=''):
133  self._mutex = threading.Lock()
134 
135  self._hostname = hostname
136  self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
137  if self._no_temp_warn:
138  rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
139  self._home_dir = home_dir
140 
141  self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
142 
146 
147  self._temp_timer = None
148  self._usage_timer = None
149 
150  self._temp_stat = DiagnosticStatus()
151  self._temp_stat.name = "%s HD Temperature" % diag_hostname
152  self._temp_stat.level = DiagnosticStatus.ERROR
153  self._temp_stat.hardware_id = hostname
154  self._temp_stat.message = 'No Data'
155  self._temp_stat.values = [KeyValue(key='Update Status', value='No Data'),
156  KeyValue(key='Time Since Last Update', value='N/A')]
157 
158  if self._home_dir != '':
159  self._usage_stat = DiagnosticStatus()
160  self._usage_stat.level = DiagnosticStatus.ERROR
161  self._usage_stat.hardware_id = hostname
162  self._usage_stat.name = '%s HD Usage' % diag_hostname
163  self._usage_stat.values = [KeyValue(key='Update Status', value='No Data' ),
164  KeyValue(key='Time Since Last Update', value='N/A')]
165  self.check_disk_usage()
166 
167  self.check_temps()
168 
169  # Must have the lock to cancel everything
170  def cancel_timers(self):
171  if self._temp_timer:
172  self._temp_timer.cancel()
173  self._temp_timer = None
174 
175  if self._usage_timer:
176  self._usage_timer.cancel()
177  self._usage_timer = None
178 
179  def check_temps(self):
180  if rospy.is_shutdown():
181  with self._mutex:
182  self.cancel_timers()
183  return
184 
185  diag_strs = [KeyValue(key='Update Status', value='OK'),
186  KeyValue(key='Time Since Last Update', value='0')]
187  diag_level = DiagnosticStatus.OK
188 
189  temp_ok, drives, makes, temps = get_hddtemp_data()
190 
191  for index in range(0, len(drives)):
192  temp = temps[index]
193 
194  if not temp.isnumeric() and drives[index] not in REMOVABLE:
195  temp_level = DiagnosticStatus.ERROR
196  temp_ok = False
197  elif not temp.isnumeric() and drives[index] in REMOVABLE:
198  temp_level = DiagnosticStatus.OK
199  temp = "Removed"
200  else:
201  temp_level = DiagnosticStatus.OK
202  if float(temp) > hd_temp_warn:
203  temp_level = DiagnosticStatus.WARN
204  if float(temp) > hd_temp_error:
205  temp_level = DiagnosticStatus.ERROR
206 
207  diag_level = max(diag_level, temp_level)
208 
209  diag_strs.append(KeyValue(key='Disk %d Temp Status' % index, value=temp_dict[temp_level]))
210  diag_strs.append(KeyValue(key='Disk %d Mount Pt.' % index, value=drives[index]))
211  diag_strs.append(KeyValue(key='Disk %d Device ID' % index, value=makes[index]))
212  diag_strs.append(KeyValue(key='Disk %d Temp' % index, value=temp))
213 
214  if not temp_ok:
215  diag_level = DiagnosticStatus.ERROR
216 
217  with self._mutex:
218  self._last_temp_time = rospy.get_time()
219  self._temp_stat.values = diag_strs
220  self._temp_stat.level = diag_level
221 
222  # Give No Data message if we have no reading
223  self._temp_stat.message = temp_dict[diag_level]
224  if not temp_ok:
225  self._temp_stat.message = 'Error'
226 
227  if self._no_temp_warn and temp_ok:
228  self._temp_stat.level = DiagnosticStatus.OK
229 
230  if not rospy.is_shutdown():
231  self._temp_timer = threading.Timer(10.0, self.check_temps)
232  self._temp_timer.start()
233  else:
234  self.cancel_timers()
235 
236  def check_disk_usage(self):
237  if rospy.is_shutdown():
238  with self._mutex:
239  self.cancel_timers()
240  return
241 
242  diag_vals = [KeyValue(key='Update Status', value='OK'),
243  KeyValue(key='Time Since Last Update', value='0')]
244  diag_level = DiagnosticStatus.OK
245  diag_message = 'OK'
246 
247  try:
248  p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir],
249  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
250  stdout, stderr = p.communicate()
251  retcode = p.returncode
252 
253  if retcode == 0:
254 
255  diag_vals.append(KeyValue(key='Disk Space Reading', value='OK'))
256  row_count = 0
257  for row in stdout.split('\n'):
258  if len(row.split()) < 2:
259  continue
260  if not row.split()[1].isnumeric() or float(row.split()[1]) < 10: # Ignore small drives
261  continue
262 
263  row_count += 1
264  g_available = row.split()[-3]
265  name = row.split()[0]
266  size = row.split()[1]
267  mount_pt = row.split()[-1]
268 
269  if float(g_available) > low_hd_level:
270  level = DiagnosticStatus.OK
271  elif float(g_available) > critical_hd_level:
272  level = DiagnosticStatus.WARN
273  else:
274  level = DiagnosticStatus.ERROR
275 
276  diag_vals.append(KeyValue(
277  key='Disk %d Name' % row_count, value=name))
278  diag_vals.append(KeyValue(
279  key='Disk %d Available' % row_count, value=g_available))
280  diag_vals.append(KeyValue(
281  key='Disk %d Size' % row_count, value=size))
282  diag_vals.append(KeyValue(
283  key='Disk %d Status' % row_count, value=stat_dict[level]))
284  diag_vals.append(KeyValue(
285  key='Disk %d Mount Point' % row_count, value=mount_pt))
286 
287  diag_level = max(diag_level, level)
288  diag_message = usage_dict[diag_level]
289 
290  else:
291  diag_vals.append(KeyValue(key='Disk Space Reading', value='Failed'))
292  diag_level = DiagnosticStatus.ERROR
293  diag_message = stat_dict[diag_level]
294 
295  except Exception:
296  rospy.logerr(traceback.format_exc())
297 
298  diag_vals.append(KeyValue(key='Disk Space Reading', value='Exception'))
299  diag_vals.append(KeyValue(key='Disk Space Ex', value=traceback.format_exc()))
300 
301  diag_level = DiagnosticStatus.ERROR
302  diag_message = stat_dict[diag_level]
303 
304  # Update status
305  with self._mutex:
306  self._last_usage_time = rospy.get_time()
307  self._usage_stat.values = diag_vals
308  self._usage_stat.message = diag_message
309  self._usage_stat.level = diag_level
310 
311  if not rospy.is_shutdown():
312  self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
313  self._usage_timer.start()
314  else:
315  self.cancel_timers()
316 
317  def publish_stats(self):
318  with self._mutex:
320 
321  msg = DiagnosticArray()
322  msg.header.stamp = rospy.get_rostime()
323  msg.status.append(self._temp_stat)
324  if self._home_dir != '':
326  msg.status.append(self._usage_stat)
327 
328  if rospy.get_time() - self._last_publish_time > 0.5:
329  self._diag_pub.publish(msg)
330  self._last_publish_time = rospy.get_time()
331 
332 
333 # Todo: Need to check HD input/output too using iostat
334 
335 if __name__ == '__main__':
336  hostname = socket.gethostname()
337 
338  import optparse
339  parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
340  parser.add_option("--diag-hostname", dest="diag_hostname",
341  help="Computer name in diagnostics output (ex: 'c1')",
342  metavar="DIAG_HOSTNAME",
343  action="store", default = hostname)
344  options, args = parser.parse_args(rospy.myargv())
345 
346  home_dir = ''
347  if len(args) > 1:
348  home_dir = args[1]
349 
350  hostname_clean = hostname.translate(hostname.maketrans('-', '_'))
351  try:
352  rospy.init_node('hd_monitor_%s' % hostname_clean)
353  except rospy.exceptions.ROSInitException:
354  print('HD monitor is unable to initialize node. Master may not be running.')
355  sys.exit(0)
356 
357  hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
358  rate = rospy.Rate(1.0)
359 
360  try:
361  while not rospy.is_shutdown():
362  rate.sleep()
363  hd_monitor.publish_stats()
364  except KeyboardInterrupt:
365  pass
366  except Exception as e:
367  traceback.print_exc()
368  raise e
369 
370  hd_monitor.cancel_timers()
371  sys.exit(0)
372 
373 
374 
diagnostic_common_diagnostics.hd_monitor.hd_monitor._mutex
_mutex
Definition: hd_monitor.py:133
diagnostic_common_diagnostics.hd_monitor.hd_monitor.check_disk_usage
def check_disk_usage(self)
Definition: hd_monitor.py:236
diagnostic_common_diagnostics.hd_monitor.hd_monitor._last_publish_time
_last_publish_time
Definition: hd_monitor.py:145
diagnostic_common_diagnostics.hd_monitor.hd_monitor._usage_timer
_usage_timer
Definition: hd_monitor.py:148
diagnostic_common_diagnostics.hd_monitor.update_status_stale
def update_status_stale(stat, last_update_time)
Definition: hd_monitor.py:106
diagnostic_common_diagnostics.hd_monitor.get_hddtemp_data
def get_hddtemp_data(hostname='localhost', port=7634)
Definition: hd_monitor.py:63
diagnostic_common_diagnostics.hd_monitor.hd_monitor._last_temp_time
_last_temp_time
Definition: hd_monitor.py:143
diagnostic_common_diagnostics.hd_monitor.hd_monitor._temp_stat
_temp_stat
Definition: hd_monitor.py:150
diagnostic_common_diagnostics.hd_monitor.hd_monitor._diag_pub
_diag_pub
Definition: hd_monitor.py:141
diagnostic_common_diagnostics.hd_monitor.hd_monitor._home_dir
_home_dir
Definition: hd_monitor.py:139
diagnostic_common_diagnostics.hd_monitor.hd_monitor._last_usage_time
_last_usage_time
Definition: hd_monitor.py:144
diagnostic_common_diagnostics.hd_monitor.hd_monitor.cancel_timers
def cancel_timers(self)
Definition: hd_monitor.py:170
diagnostic_common_diagnostics.hd_monitor.hd_monitor._hostname
_hostname
Definition: hd_monitor.py:135
diagnostic_common_diagnostics.hd_monitor.hd_monitor._no_temp_warn
_no_temp_warn
Definition: hd_monitor.py:136
diagnostic_common_diagnostics.hd_monitor.hd_monitor._usage_stat
_usage_stat
Definition: hd_monitor.py:159
diagnostic_common_diagnostics.hd_monitor.hd_monitor.publish_stats
def publish_stats(self)
Definition: hd_monitor.py:317
diagnostic_common_diagnostics.hd_monitor.hd_monitor.check_temps
def check_temps(self)
Definition: hd_monitor.py:179
diagnostic_common_diagnostics.hd_monitor.hd_monitor._temp_timer
_temp_timer
Definition: hd_monitor.py:147
diagnostic_common_diagnostics.hd_monitor.hd_monitor.__init__
def __init__(self, hostname, diag_hostname, home_dir='')
Definition: hd_monitor.py:132
diagnostic_common_diagnostics.hd_monitor.hd_monitor
Definition: hd_monitor.py:131


diagnostic_common_diagnostics
Author(s): Brice Rebsamen
autogenerated on Tue Nov 15 2022 03:17:21