hd_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # Software License Agreement (BSD License)
4 #
5 # Copyright (c) 2009, Willow Garage, Inc.
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions
10 # are met:
11 #
12 # * Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # * Redistributions in binary form must reproduce the above
15 # copyright notice, this list of conditions and the following
16 # disclaimer in the documentation and/or other materials provided
17 # with the distribution.
18 # * Neither the name of the Willow Garage nor the names of its
19 # contributors may be used to endorse or promote products derived
20 # from this software without specific prior written permission.
21 #
22 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 # POSSIBILITY OF SUCH DAMAGE.
34 
35 ##\author Kevin Watts
36 
37 from __future__ import with_statement
38 import roslib
39 roslib.load_manifest('pr2_computer_monitor')
40 
41 import rospy
42 
43 import traceback
44 import threading
45 from threading import Timer
46 import sys, os, time
47 from time import sleep
48 import subprocess
49 
50 import socket
51 
52 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
53 
54 ##### monkey-patch to suppress threading error message in python 2.7.3
55 ##### See http://stackoverflow.com/questions/13193278/understand-python-threading-bug
56 if sys.version_info[:3] == (2, 7, 3):
57  import threading
58  threading._DummyThread._Thread__stop = lambda x: 42
59 #####
60 
61 low_hd_level = 5
62 critical_hd_level = 1
63 
64 hd_temp_warn = 55 #3580, setting to 55C to after checking manual
65 hd_temp_error = 70 # Above this temperature, hard drives will have serious problems
66 
67 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
68 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' }
69 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' }
70 
71 REMOVABLE = ['/dev/sda'] # Store removable drives so we can ignore if removed
72 
73 ## Connects to hddtemp daemon to get temp, HD make.
74 def get_hddtemp_data(hostname = 'localhost', port = 7634):
75  try:
76  hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
77  hd_sock.connect((hostname, port))
78  sock_data = ''
79  while True:
80  newdat = hd_sock.recv(1024)
81  if len(newdat) == 0:
82  break
83  sock_data = sock_data + newdat
84  hd_sock.close()
85 
86  sock_vals = sock_data.split('|')
87 
88  # Format of output looks like ' | DRIVE | MAKE | TEMP | '
89  idx = 0
90 
91  drives = []
92  makes = []
93  temps = []
94  while idx + 5 < len(sock_vals):
95  this_drive = sock_vals[idx + 1]
96  this_make = sock_vals[idx + 2]
97  this_temp = sock_vals[idx + 3]
98 
99  # Sometimes we get duplicate makes if hard drives are mounted
100  # to two different points
101  if this_make in makes:
102  idx += 5
103  continue
104 
105  drives.append(this_drive)
106  makes.append(this_make)
107  temps.append(this_temp)
108 
109  idx += 5
110 
111  return True, drives, makes, temps
112  except:
113  rospy.logerr(traceback.format_exc())
114  return False, [ 'Exception' ], [ traceback.format_exc() ], [ 0 ]
115 
116 def update_status_stale(stat, last_update_time):
117  time_since_update = rospy.get_time() - last_update_time
118 
119  stale_status = 'OK'
120  if time_since_update > 20 and time_since_update <= 35:
121  stale_status = 'Lagging'
122  if stat.level == DiagnosticStatus.OK:
123  stat.message = stale_status
124  elif stat.message.find(stale_status) < 0:
125  stat.message = ', '.join([stat.message, stale_status])
126  stat.level = max(stat.level, DiagnosticStatus.WARN)
127  if time_since_update > 35:
128  stale_status = 'Stale'
129  if stat.level == DiagnosticStatus.OK:
130  stat.message = stale_status
131  elif stat.message.find(stale_status) < 0:
132  stat.message = ', '.join([stat.message, stale_status])
133  stat.level = max(stat.level, DiagnosticStatus.ERROR)
134 
135  stat.values.pop(0)
136  stat.values.pop(0)
137  stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
138  stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
139 
140 class hd_monitor():
141  def __init__(self, hostname, diag_hostname, home_dir = ''):
142  self._mutex = threading.Lock()
143 
144  self._hostname = hostname
145  self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
146  if self._no_temp_warn:
147  rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
148  self._home_dir = home_dir
149 
150  self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
151 
155 
156  self._temp_timer = None
157  self._usage_timer = None
158 
159  self._temp_stat = DiagnosticStatus()
160  self._temp_stat.name = "%s HD Temperature" % diag_hostname
161  self._temp_stat.level = DiagnosticStatus.ERROR
162  self._temp_stat.hardware_id = hostname
163  self._temp_stat.message = 'No Data'
164  self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'),
165  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
166 
167  if self._home_dir != '':
168  self._usage_stat = DiagnosticStatus()
169  self._usage_stat.level = DiagnosticStatus.ERROR
170  self._usage_stat.hardware_id = hostname
171  self._usage_stat.name = '%s HD Usage' % diag_hostname
172  self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
173  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
174  self.check_disk_usage()
175 
176  self.check_temps()
177 
178  ## Must have the lock to cancel everything
179  def cancel_timers(self):
180  if self._temp_timer:
181  self._temp_timer.cancel()
182  self._temp_timer = None
183 
184  if self._usage_timer:
185  self._usage_timer.cancel()
186  self._usage_timer = None
187 
188  def check_temps(self):
189  if rospy.is_shutdown():
190  with self._mutex:
191  self.cancel_timers()
192  return
193 
194  diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
195  KeyValue(key = 'Time Since Last Update', value = '0' ) ]
196  diag_level = DiagnosticStatus.OK
197  diag_message = 'OK'
198 
199  temp_ok, drives, makes, temps = get_hddtemp_data()
200 
201  for index in range(0, len(drives)):
202  temp = temps[index]
203 
204  if not unicode(temp).isnumeric() and drives[index] not in REMOVABLE:
205  temp_level = DiagnosticStatus.ERROR
206  temp_ok = False
207  elif not unicode(temp).isnumeric() and drives[index] in REMOVABLE:
208  temp_level = DiagnosticStatus.OK
209  temp = "Removed"
210  else:
211  temp_level = DiagnosticStatus.OK
212  if float(temp) > hd_temp_warn:
213  temp_level = DiagnosticStatus.WARN
214  if float(temp) > hd_temp_error:
215  temp_level = DiagnosticStatus.ERROR
216 
217  diag_level = max(diag_level, temp_level)
218 
219  diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level]))
220  diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index]))
221  diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
222  diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp))
223 
224  if not temp_ok:
225  diag_level = DiagnosticStatus.ERROR
226 
227  with self._mutex:
228  self._last_temp_time = rospy.get_time()
229  self._temp_stat.values = diag_strs
230  self._temp_stat.level = diag_level
231 
232  # Give No Data message if we have no reading
233  self._temp_stat.message = temp_dict[diag_level]
234  if not temp_ok:
235  self._temp_stat.message = 'Error'
236 
237  if self._no_temp_warn and temp_ok:
238  self._temp_stat.level = DiagnosticStatus.OK
239 
240  if not rospy.is_shutdown():
241  self._temp_timer = threading.Timer(10.0, self.check_temps)
242  self._temp_timer.start()
243  else:
244  self.cancel_timers()
245 
246  def check_disk_usage(self):
247  if rospy.is_shutdown():
248  with self._mutex:
249  self.cancel_timers()
250  return
251 
252  diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
253  KeyValue(key = 'Time Since Last Update', value = '0' ) ]
254  diag_level = DiagnosticStatus.OK
255  diag_message = 'OK'
256 
257  try:
258  p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir],
259  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
260  stdout, stderr = p.communicate()
261  retcode = p.returncode
262 
263  if (retcode == 0):
264 
265  diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
266  row_count = 0
267  for row in stdout.split('\n'):
268  if len(row.split()) < 2:
269  continue
270  if not unicode(row.split()[1]).isnumeric() or float(row.split()[1]) < 10: # Ignore small drives
271  continue
272 
273  row_count += 1
274  g_available = row.split()[-3]
275  name = row.split()[0]
276  size = row.split()[1]
277  mount_pt = row.split()[-1]
278 
279  if (float(g_available) > low_hd_level):
280  level = DiagnosticStatus.OK
281  elif (float(g_available) > critical_hd_level):
282  level = DiagnosticStatus.WARN
283  else:
284  level = DiagnosticStatus.ERROR
285 
286  diag_vals.append(KeyValue(
287  key = 'Disk %d Name' % row_count, value = name))
288  diag_vals.append(KeyValue(
289  key = 'Disk %d Available' % row_count, value = g_available))
290  diag_vals.append(KeyValue(
291  key = 'Disk %d Size' % row_count, value = size))
292  diag_vals.append(KeyValue(
293  key = 'Disk %d Status' % row_count, value = stat_dict[level]))
294  diag_vals.append(KeyValue(
295  key = 'Disk %d Mount Point' % row_count, value = mount_pt))
296 
297  diag_level = max(diag_level, level)
298  diag_message = usage_dict[diag_level]
299 
300  else:
301  diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
302  diag_level = DiagnosticStatus.ERROR
303  diag_message = stat_dict[diag_level]
304 
305 
306  except:
307  rospy.logerr(traceback.format_exc())
308 
309  diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
310  diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))
311 
312  diag_level = DiagnosticStatus.ERROR
313  diag_message = stat_dict[diag_level]
314 
315  # Update status
316  with self._mutex:
317  self._last_usage_time = rospy.get_time()
318  self._usage_stat.values = diag_vals
319  self._usage_stat.message = diag_message
320  self._usage_stat.level = diag_level
321 
322  if not rospy.is_shutdown():
323  self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
324  self._usage_timer.start()
325  else:
326  self.cancel_timers()
327 
328 
329  def publish_stats(self):
330  with self._mutex:
332 
333  msg = DiagnosticArray()
334  msg.header.stamp = rospy.get_rostime()
335  msg.status.append(self._temp_stat)
336  if self._home_dir != '':
338  msg.status.append(self._usage_stat)
339 
340  if rospy.get_time() - self._last_publish_time > 0.5:
341  self._diag_pub.publish(msg)
342  self._last_publish_time = rospy.get_time()
343 
344 
345 
346 
347 ##\todo Need to check HD input/output too using iostat
348 
349 if __name__ == '__main__':
350  hostname = socket.gethostname()
351 
352  import optparse
353  parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
354  parser.add_option("--diag-hostname", dest="diag_hostname",
355  help="Computer name in diagnostics output (ex: 'c1')",
356  metavar="DIAG_HOSTNAME",
357  action="store", default = hostname)
358  options, args = parser.parse_args(rospy.myargv())
359 
360  home_dir = ''
361  if len(args) > 1:
362  home_dir = args[1]
363 
364  try:
365  rospy.init_node('hd_monitor_%s' % hostname)
366  except rospy.exceptions.ROSInitException:
367  print('HD monitor is unable to initialize node. Master may not be running.')
368  sys.exit(0)
369 
370  hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
371  rate = rospy.Rate(1.0)
372 
373  try:
374  while not rospy.is_shutdown():
375  rate.sleep()
376  hd_monitor.publish_stats()
377  except KeyboardInterrupt:
378  pass
379  except Exception as e:
380  traceback.print_exc()
381 
382  hd_monitor.cancel_timers()
383  sys.exit(0)
384 
385 
386 
def get_hddtemp_data(hostname='localhost', port=7634)
Connects to hddtemp daemon to get temp, HD make.
Definition: hd_monitor.py:74
def publish_stats(self)
Definition: hd_monitor.py:329
def cancel_timers(self)
Must have the lock to cancel everything.
Definition: hd_monitor.py:179
def __init__(self, hostname, diag_hostname, home_dir='')
Definition: hd_monitor.py:141
def update_status_stale(stat, last_update_time)
Definition: hd_monitor.py:116
def check_disk_usage(self)
Definition: hd_monitor.py:246


pr2_computer_monitor
Author(s): Kevin Watts (watts@willowgarage.com)
autogenerated on Tue Jun 1 2021 02:50:51