hd_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # Software License Agreement (BSD License)
4 #
5 # Copyright (c) 2009, Willow Garage, Inc.
6 # All rights reserved.
7 #
8 # Redistribution and use in source and binary forms, with or without
9 # modification, are permitted provided that the following conditions
10 # are met:
11 #
12 # * Redistributions of source code must retain the above copyright
13 # notice, this list of conditions and the following disclaimer.
14 # * Redistributions in binary form must reproduce the above
15 # copyright notice, this list of conditions and the following
16 # disclaimer in the documentation and/or other materials provided
17 # with the distribution.
18 # * Neither the name of the Willow Garage nor the names of its
19 # contributors may be used to endorse or promote products derived
20 # from this software without specific prior written permission.
21 #
22 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
23 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
24 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
25 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
26 # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
27 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
28 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
29 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
30 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
32 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
33 # POSSIBILITY OF SUCH DAMAGE.
34 
35 
36 
37 from __future__ import with_statement
38 import roslib
39 roslib.load_manifest('pr2_computer_monitor')
40 
41 import rospy
42 
43 import traceback
44 import threading
45 from threading import Timer
46 import sys, os, time
47 from time import sleep
48 import subprocess
49 
50 import socket
51 
52 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
53 
54 
56 if sys.version_info[:3] == (2, 7, 3):
57  import threading
58  threading._DummyThread._Thread__stop = lambda x: 42
59 
60 
61 low_hd_level = 5
62 critical_hd_level = 1
63 
64 hd_temp_warn = 55 #3580, setting to 55C to after checking manual
65 hd_temp_error = 70 # Above this temperature, hard drives will have serious problems
66 
67 stat_dict = { 0: 'OK', 1: 'Warning', 2: 'Error' }
68 temp_dict = { 0: 'OK', 1: 'Hot', 2: 'Critical Hot' }
69 usage_dict = { 0: 'OK', 1: 'Low Disk Space', 2: 'Very Low Disk Space' }
70 
71 REMOVABLE = ['/dev/sda'] # Store removable drives so we can ignore if removed
72 
73 
74 def get_hddtemp_data(hostname = 'localhost', port = 7634):
75  try:
76  hd_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
77  hd_sock.connect((hostname, port))
78  sock_data = ''
79  while True:
80  newdat = hd_sock.recv(1024)
81  if len(newdat) == 0:
82  break
83  newdat = newdat.decode()
84  sock_data = sock_data + newdat
85  hd_sock.close()
86 
87  sock_vals = sock_data.split('|')
88 
89  # Format of output looks like ' | DRIVE | MAKE | TEMP | '
90  idx = 0
91 
92  drives = []
93  makes = []
94  temps = []
95  while idx + 5 < len(sock_vals):
96  this_drive = sock_vals[idx + 1]
97  this_make = sock_vals[idx + 2]
98  this_temp = sock_vals[idx + 3]
99 
100  # Sometimes we get duplicate makes if hard drives are mounted
101  # to two different points
102  if this_make in makes:
103  idx += 5
104  continue
105 
106  drives.append(this_drive)
107  makes.append(this_make)
108  temps.append(this_temp)
109 
110  idx += 5
111 
112  return True, drives, makes, temps
113  except:
114  rospy.logerr(traceback.format_exc())
115  return False, [ 'Exception' ], [ traceback.format_exc() ], [ 0 ]
116 
117 def update_status_stale(stat, last_update_time):
118  time_since_update = rospy.get_time() - last_update_time
119 
120  stale_status = 'OK'
121  if time_since_update > 20 and time_since_update <= 35:
122  stale_status = 'Lagging'
123  if stat.level == DiagnosticStatus.OK:
124  stat.message = stale_status
125  elif stat.message.find(stale_status) < 0:
126  stat.message = ', '.join([stat.message, stale_status])
127  stat.level = max(stat.level, DiagnosticStatus.WARN)
128  if time_since_update > 35:
129  stale_status = 'Stale'
130  if stat.level == DiagnosticStatus.OK:
131  stat.message = stale_status
132  elif stat.message.find(stale_status) < 0:
133  stat.message = ', '.join([stat.message, stale_status])
134  stat.level = max(stat.level, DiagnosticStatus.ERROR)
135 
136  stat.values.pop(0)
137  stat.values.pop(0)
138  stat.values.insert(0, KeyValue(key = 'Update Status', value = stale_status))
139  stat.values.insert(1, KeyValue(key = 'Time Since Update', value = str(time_since_update)))
140 
141 class hd_monitor():
142  def __init__(self, hostname, diag_hostname, home_dir = ''):
143  self._mutex = threading.Lock()
144 
145  self._hostname = hostname
146  self._no_temp_warn = rospy.get_param('~no_hd_temp_warn', False)
147  if self._no_temp_warn:
148  rospy.logwarn('Not warning for HD temperatures is deprecated. This will be removed in D-turtle')
149  self._home_dir = home_dir
150 
151  self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=10)
152 
156 
157  self._temp_timer = None
158  self._usage_timer = None
159 
160  self._temp_stat = DiagnosticStatus()
161  self._temp_stat.name = "%s HD Temperature" % diag_hostname
162  self._temp_stat.level = DiagnosticStatus.ERROR
163  self._temp_stat.hardware_id = hostname
164  self._temp_stat.message = 'No Data'
165  self._temp_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data'),
166  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
167 
168  if self._home_dir != '':
169  self._usage_stat = DiagnosticStatus()
170  self._usage_stat.level = DiagnosticStatus.ERROR
171  self._usage_stat.hardware_id = hostname
172  self._usage_stat.name = '%s HD Usage' % diag_hostname
173  self._usage_stat.values = [ KeyValue(key = 'Update Status', value = 'No Data' ),
174  KeyValue(key = 'Time Since Last Update', value = 'N/A') ]
175  self.check_disk_usage()
176 
177  self.check_temps()
178 
179 
180  def cancel_timers(self):
181  if self._temp_timer:
182  self._temp_timer.cancel()
183  self._temp_timer = None
184 
185  if self._usage_timer:
186  self._usage_timer.cancel()
187  self._usage_timer = None
188 
189  def check_temps(self):
190  if rospy.is_shutdown():
191  with self._mutex:
192  self.cancel_timers()
193  return
194 
195  diag_strs = [ KeyValue(key = 'Update Status', value = 'OK' ) ,
196  KeyValue(key = 'Time Since Last Update', value = '0' ) ]
197  diag_level = DiagnosticStatus.OK
198  diag_message = 'OK'
199 
200  temp_ok, drives, makes, temps = get_hddtemp_data()
201 
202  for index in range(0, len(drives)):
203  temp = temps[index]
204 
205  if not temp.isnumeric() and drives[index] not in REMOVABLE:
206  temp_level = DiagnosticStatus.ERROR
207  temp_ok = False
208  elif not temp.isnumeric() and drives[index] in REMOVABLE:
209  temp_level = DiagnosticStatus.OK
210  temp = "Removed"
211  else:
212  temp_level = DiagnosticStatus.OK
213  if float(temp) > hd_temp_warn:
214  temp_level = DiagnosticStatus.WARN
215  if float(temp) > hd_temp_error:
216  temp_level = DiagnosticStatus.ERROR
217 
218  diag_level = max(diag_level, temp_level)
219 
220  diag_strs.append(KeyValue(key = 'Disk %d Temp Status' % index, value = temp_dict[temp_level]))
221  diag_strs.append(KeyValue(key = 'Disk %d Mount Pt.' % index, value = drives[index]))
222  diag_strs.append(KeyValue(key = 'Disk %d Device ID' % index, value = makes[index]))
223  diag_strs.append(KeyValue(key = 'Disk %d Temp' % index, value = temp))
224 
225  if not temp_ok:
226  diag_level = DiagnosticStatus.ERROR
227 
228  with self._mutex:
229  self._last_temp_time = rospy.get_time()
230  self._temp_stat.values = diag_strs
231  self._temp_stat.level = diag_level
232 
233  # Give No Data message if we have no reading
234  self._temp_stat.message = temp_dict[diag_level]
235  if not temp_ok:
236  self._temp_stat.message = 'Error'
237 
238  if self._no_temp_warn and temp_ok:
239  self._temp_stat.level = DiagnosticStatus.OK
240 
241  if not rospy.is_shutdown():
242  self._temp_timer = threading.Timer(10.0, self.check_temps)
243  self._temp_timer.start()
244  else:
245  self.cancel_timers()
246 
247  def check_disk_usage(self):
248  if rospy.is_shutdown():
249  with self._mutex:
250  self.cancel_timers()
251  return
252 
253  diag_vals = [ KeyValue(key = 'Update Status', value = 'OK' ),
254  KeyValue(key = 'Time Since Last Update', value = '0' ) ]
255  diag_level = DiagnosticStatus.OK
256  diag_message = 'OK'
257 
258  try:
259  p = subprocess.Popen(["df", "-P", "--block-size=1G", self._home_dir],
260  stdout=subprocess.PIPE, stderr=subprocess.PIPE)
261  stdout, stderr = p.communicate()
262  stdout = stdout.decode()
263  retcode = p.returncode
264 
265  if (retcode == 0):
266 
267  diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'OK'))
268  row_count = 0
269  for row in stdout.split('\n'):
270  if len(row.split()) < 2:
271  continue
272  if not row.split()[1].isnumeric() or float(row.split()[1]) < 10: # Ignore small drives
273  continue
274 
275  row_count += 1
276  g_available = row.split()[-3]
277  name = row.split()[0]
278  size = row.split()[1]
279  mount_pt = row.split()[-1]
280 
281  if (float(g_available) > low_hd_level):
282  level = DiagnosticStatus.OK
283  elif (float(g_available) > critical_hd_level):
284  level = DiagnosticStatus.WARN
285  else:
286  level = DiagnosticStatus.ERROR
287 
288  diag_vals.append(KeyValue(
289  key = 'Disk %d Name' % row_count, value = name))
290  diag_vals.append(KeyValue(
291  key = 'Disk %d Available' % row_count, value = g_available))
292  diag_vals.append(KeyValue(
293  key = 'Disk %d Size' % row_count, value = size))
294  diag_vals.append(KeyValue(
295  key = 'Disk %d Status' % row_count, value = stat_dict[level]))
296  diag_vals.append(KeyValue(
297  key = 'Disk %d Mount Point' % row_count, value = mount_pt))
298 
299  diag_level = max(diag_level, level)
300  diag_message = usage_dict[diag_level]
301 
302  else:
303  diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Failed'))
304  diag_level = DiagnosticStatus.ERROR
305  diag_message = stat_dict[diag_level]
306 
307 
308  except:
309  rospy.logerr(traceback.format_exc())
310 
311  diag_vals.append(KeyValue(key = 'Disk Space Reading', value = 'Exception'))
312  diag_vals.append(KeyValue(key = 'Disk Space Ex', value = traceback.format_exc()))
313 
314  diag_level = DiagnosticStatus.ERROR
315  diag_message = stat_dict[diag_level]
316 
317  # Update status
318  with self._mutex:
319  self._last_usage_time = rospy.get_time()
320  self._usage_stat.values = diag_vals
321  self._usage_stat.message = diag_message
322  self._usage_stat.level = diag_level
323 
324  if not rospy.is_shutdown():
325  self._usage_timer = threading.Timer(5.0, self.check_disk_usage)
326  self._usage_timer.start()
327  else:
328  self.cancel_timers()
329 
330 
331  def publish_stats(self):
332  with self._mutex:
334 
335  msg = DiagnosticArray()
336  msg.header.stamp = rospy.get_rostime()
337  msg.status.append(self._temp_stat)
338  if self._home_dir != '':
340  msg.status.append(self._usage_stat)
341 
342  if rospy.get_time() - self._last_publish_time > 0.5:
343  self._diag_pub.publish(msg)
344  self._last_publish_time = rospy.get_time()
345 
346 
347 
348 
349 
350 
351 if __name__ == '__main__':
352  hostname = socket.gethostname()
353 
354  import optparse
355  parser = optparse.OptionParser(usage="usage: hd_monitor.py [--diag-hostname=cX]")
356  parser.add_option("--diag-hostname", dest="diag_hostname",
357  help="Computer name in diagnostics output (ex: 'c1')",
358  metavar="DIAG_HOSTNAME",
359  action="store", default = hostname)
360  options, args = parser.parse_args(rospy.myargv())
361 
362  home_dir = ''
363  if len(args) > 1:
364  home_dir = args[1]
365 
366  try:
367  rospy.init_node('hd_monitor_%s' % hostname)
368  except rospy.exceptions.ROSInitException:
369  print('HD monitor is unable to initialize node. Master may not be running.')
370  sys.exit(0)
371 
372  hd_monitor = hd_monitor(hostname, options.diag_hostname, home_dir)
373  rate = rospy.Rate(1.0)
374 
375  try:
376  while not rospy.is_shutdown():
377  rate.sleep()
378  hd_monitor.publish_stats()
379  except KeyboardInterrupt:
380  pass
381  except Exception as e:
382  traceback.print_exc()
383 
384  hd_monitor.cancel_timers()
385  sys.exit(0)
386 
387 
388 
hd_monitor.hd_monitor._mutex
_mutex
Definition: hd_monitor.py:143
hd_monitor.hd_monitor.publish_stats
def publish_stats(self)
Definition: hd_monitor.py:331
hd_monitor.hd_monitor.check_temps
def check_temps(self)
Definition: hd_monitor.py:189
hd_monitor.get_hddtemp_data
def get_hddtemp_data(hostname='localhost', port=7634)
Connects to hddtemp daemon to get temp, HD make.
Definition: hd_monitor.py:74
hd_monitor.hd_monitor._no_temp_warn
_no_temp_warn
Definition: hd_monitor.py:146
hd_monitor.hd_monitor._hostname
_hostname
Definition: hd_monitor.py:145
hd_monitor.hd_monitor.cancel_timers
def cancel_timers(self)
Must have the lock to cancel everything.
Definition: hd_monitor.py:180
hd_monitor.hd_monitor._last_usage_time
_last_usage_time
Definition: hd_monitor.py:154
hd_monitor.hd_monitor._usage_timer
_usage_timer
Definition: hd_monitor.py:158
hd_monitor.hd_monitor._temp_stat
_temp_stat
Definition: hd_monitor.py:160
hd_monitor.hd_monitor._last_temp_time
_last_temp_time
Definition: hd_monitor.py:153
hd_monitor.hd_monitor._diag_pub
_diag_pub
Definition: hd_monitor.py:151
hd_monitor.hd_monitor._last_publish_time
_last_publish_time
Definition: hd_monitor.py:155
hd_monitor.hd_monitor._home_dir
_home_dir
Definition: hd_monitor.py:149
hd_monitor.hd_monitor._usage_stat
_usage_stat
Definition: hd_monitor.py:169
hd_monitor.hd_monitor.check_disk_usage
def check_disk_usage(self)
Definition: hd_monitor.py:247
hd_monitor.hd_monitor._temp_timer
_temp_timer
Definition: hd_monitor.py:157
hd_monitor.update_status_stale
def update_status_stale(stat, last_update_time)
Definition: hd_monitor.py:117
hd_monitor.hd_monitor.__init__
def __init__(self, hostname, diag_hostname, home_dir='')
Definition: hd_monitor.py:142
hd_monitor.hd_monitor
Definition: hd_monitor.py:141


pr2_computer_monitor
Author(s): Kevin Watts (watts@willowgarage.com)
autogenerated on Thu Sep 26 2024 02:53:24