cpu_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # Copyright 2017 Fraunhofer Institute for Manufacturing Engineering and Automation (IPA)
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 
19 
20 import sys, os, time
21 import subprocess
22 import string
23 import socket
24 import psutil
25 import requests
26 import numpy as np
27 import math
28 
29 import rospy
30 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
31 
32 stat_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'Warning', DiagnosticStatus.ERROR: 'Error', DiagnosticStatus.STALE: 'Stale' }
33 
34 class CPUMonitor():
35  def __init__(self, hostname, diag_hostname):
36  self._check_ipmi = rospy.get_param('~check_ipmi_tool', False)
37  self._check_core_temps = rospy.get_param('~check_core_temps', False)
38 
39  self._core_load_warn = rospy.get_param('~core_load_warn', 90)
40  self._core_load_error = rospy.get_param('~core_load_error', 110)
41  self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
42  self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
43  self._core_temp_warn = rospy.get_param('~core_temp_warn', 90)
44  self._core_temp_error = rospy.get_param('~core_temp_error', 95)
45  self._mem_warn = rospy.get_param('~mem_warn', 25)
46  self._mem_error = rospy.get_param('~mem_error', 1)
47 
48  self._check_thermal_throttling_events = rospy.get_param('~check_thermal_throttling_events', False)
49  self._thermal_throttling_threshold = rospy.get_param('~thermal_throttling_threshold', 1000)
50 
51  self._check_idlejitter = rospy.get_param('~check_idlejitter', False)
52  self._idlejitter_min_threshold = rospy.get_param('~idlejitter_min_threshold', 50000)
53  self._idlejitter_max_threshold = rospy.get_param('~idlejitter_max_threshold', 2000000)
54  self._idlejitter_average_threshold = rospy.get_param('~idlejitter_average_threshold', 200000)
55 
56  self._num_cores = rospy.get_param('~num_cores', psutil.cpu_count())
57 
58  # Get temp_input files
60 
61  # CPU stats
62  self._info_stat = DiagnosticStatus()
63  self._info_stat.name = '%s CPU Info' % diag_hostname
64  self._info_stat.level = DiagnosticStatus.WARN
65  self._info_stat.hardware_id = hostname
66  self._info_stat.message = 'No Data'
67  self._info_stat.values = []
68 
69  self._usage_stat = DiagnosticStatus()
70  self._usage_stat.name = '%s CPU Usage' % diag_hostname
71  self._usage_stat.level = DiagnosticStatus.WARN
72  self._usage_stat.hardware_id = hostname
73  self._usage_stat.message = 'No Data'
74  self._usage_stat.values = []
75 
76  self._memory_stat = DiagnosticStatus()
77  self._memory_stat.name = '%s Memory Usage' % diag_hostname
78  self._memory_stat.level = DiagnosticStatus.WARN
79  self._memory_stat.hardware_id = hostname
80  self._memory_stat.message = 'No Data'
81  self._memory_stat.values = []
82 
83  self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=1)
84  self._publish_timer = rospy.Timer(rospy.Duration(1.0), self.publish_stats)
85  self._info_timer = rospy.Timer(rospy.Duration(5.0), self.check_info)
86  self._usage_timer = rospy.Timer(rospy.Duration(5.0), self.check_usage)
87  self._memory_timer = rospy.Timer(rospy.Duration(5.0), self.check_memory)
88 
89  ##\brief Output entire IPMI data set
90  def check_ipmi(self):
91  diag_vals = []
92  diag_msgs = []
93  diag_level = DiagnosticStatus.OK
94 
95  try:
96  p = subprocess.Popen('sudo ipmitool sdr',
97  stdout = subprocess.PIPE,
98  stderr = subprocess.PIPE, shell = True)
99  stdout, stderr = p.communicate()
100  retcode = p.returncode
101  try:
102  stdout = stdout.decode() #python3
103  except (UnicodeDecodeError, AttributeError):
104  pass
105 
106  if retcode != 0:
107  diag_level = DiagnosticStatus.ERROR
108  diag_msgs = [ 'ipmitool Error' ]
109  diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ,
110  KeyValue(key = 'Output', value = stdout) ]
111  return diag_vals, diag_msgs, diag_level
112 
113  lines = stdout.split('\n')
114  if len(lines) < 2:
115  diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
116 
117  diag_msgs = [ 'No ipmitool response' ]
118  diag_level = DiagnosticStatus.ERROR
119 
120  return diag_vals, diag_msgs, diag_level
121 
122  for ln in lines:
123  if len(ln) < 3:
124  continue
125 
126  words = ln.split('|')
127  if len(words) < 3:
128  continue
129 
130  name = words[0].strip()
131  ipmi_val = words[1].strip()
132  #stat_byte = words[2].strip()
133 
134  # CPU temps
135  if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
136  if words[1].strip().endswith('degrees C'):
137  tmp = ipmi_val.rstrip(' degrees C').lstrip()
138  try:
139  temperature = float(tmp)
140  diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
141 
142  #cpu_name = name.split()[0]
143  if temperature >= self._core_temp_error: # CPU should shut down here
144  diag_level = max(diag_level, DiagnosticStatus.ERROR)
145  diag_msgs.append('CPU Hot')
146  elif temperature >= self._core_temp_warn:
147  diag_level = max(diag_level, DiagnosticStatus.WARN)
148  diag_msgs.append('CPU Warm')
149  except ValueError:
150  diag_level = max(diag_level, DiagnosticStatus.ERROR)
151  diag_msgs.append('Error: temperature not numeric')
152  else:
153  diag_vals.append(KeyValue(key = name, value = words[1]))
154 
155 
156  # MP, BP, FP temps
157  if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
158  if ipmi_val.endswith('degrees C'):
159  tmp = ipmi_val.rstrip(' degrees C').lstrip()
160  diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
161  # Give temp warning
162  dev_name = name.split()[0]
163  try:
164  temperature = float(tmp)
165 
166  if temperature >= 60 and temperature < 75:
167  diag_level = max(diag_level, DiagnosticStatus.WARN)
168  diag_msgs.append('%s Warm' % dev_name)
169 
170  if temperature >= 75:
171  diag_level = max(diag_level, DiagnosticStatus.ERROR)
172  diag_msgs.append('%s Hot' % dev_name)
173  except ValueError:
174  diag_level = max(diag_level, DiagnosticStatus.ERROR)
175  diag_msgs.append('%s Error: temperature not numeric' % dev_name)
176  else:
177  diag_vals.append(KeyValue(key = name, value = ipmi_val))
178 
179  # CPU fan speeds
180  if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
181  if ipmi_val.endswith('RPM'):
182  rpm = ipmi_val.rstrip(' RPM').lstrip()
183  try:
184  if int(rpm) == 0:
185  diag_level = max(diag_level, DiagnosticStatus.ERROR)
186  diag_msgs.append('CPU Fan Off')
187 
188  diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
189  except ValueError:
190  diag_vals.append(KeyValue(key = name, value = ipmi_val))
191 
192  # If CPU is hot we get an alarm from ipmitool, report that too
193  # CPU should shut down if we get a hot alarm, so report as error
194  if name.startswith('CPU') and name.endswith('hot'):
195  if ipmi_val == '0x01':
196  diag_vals.append(KeyValue(key = name, value = 'OK'))
197  else:
198  diag_vals.append(KeyValue(key = name, value = 'Hot'))
199  diag_level = max(diag_level, DiagnosticStatus.ERROR)
200  diag_msgs.append('CPU Hot Alarm')
201 
202  except Exception as e:
203  diag_level = DiagnosticStatus.ERROR
204  diag_msgs = [ 'IPMI Exception' ]
205  diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
206 
207  return diag_vals, diag_msgs, diag_level
208 
209  ##\brief Check CPU core temps
210 
212  def check_core_temps(self):
213  diag_vals = []
214  diag_msgs = []
215  diag_level = DiagnosticStatus.OK
216 
217  try:
218  for device_type, devices in list(self._temp_vals.items()):
219  for dev in devices:
220  cmd = 'cat %s' % dev[1]
221  p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
222  stderr = subprocess.PIPE, shell = True)
223  stdout, stderr = p.communicate()
224  retcode = p.returncode
225  try:
226  stdout = stdout.decode() #python3
227  except (UnicodeDecodeError, AttributeError):
228  pass
229 
230  if retcode != 0:
231  diag_level = DiagnosticStatus.ERROR
232  diag_msgs = [ 'Core Temp Error' ]
233  diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
234  KeyValue(key = 'Output', value = stdout) ]
235  return diag_vals, diag_msgs, diag_level
236 
237  tmp = stdout.strip()
238  if device_type == 'platform':
239  try:
240  temp = float(tmp) / 1000
241  diag_vals.append(KeyValue(key = 'Temp '+dev[0], value = str(temp)))
242 
243  if temp >= self._core_temp_error:
244  diag_level = max(diag_level, DiagnosticStatus.OK) #do not set ERROR
245  diag_msgs.append('CPU Hot')
246  elif temp >= self._core_temp_warn:
247  diag_level = max(diag_level, DiagnosticStatus.OK) #do not set WARN
248  diag_msgs.append('CPU Warm')
249  except ValueError:
250  diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
251  diag_vals.append(KeyValue(key = 'Temp '+dev[0], value = tmp))
252  else: # device_type == `virtual`
253  diag_vals.append(KeyValue(key = 'Temp '+dev[0], value = tmp))
254 
255  except Exception as e:
256  diag_level = DiagnosticStatus.ERROR
257  diag_msgs = [ 'Core Temp Exception' ]
258  diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
259 
260  return diag_vals, diag_msgs, diag_level
261 
262  ##\brief Checks clock speed from reading from CPU info
263  def check_clock_speed(self):
264  diag_vals = []
265  diag_msgs = []
266  diag_level = DiagnosticStatus.OK
267 
268  try:
269  # get current freq
270  p = subprocess.Popen('cat /proc/cpuinfo | grep MHz',
271  stdout = subprocess.PIPE,
272  stderr = subprocess.PIPE, shell = True)
273  stdout, stderr = p.communicate()
274  retcode = p.returncode
275  try:
276  stdout = stdout.decode() #python3
277  except (UnicodeDecodeError, AttributeError):
278  pass
279 
280  if retcode != 0:
281  diag_level = DiagnosticStatus.ERROR
282  diag_msgs = [ 'Clock Speed Error' ]
283  diag_vals = [ KeyValue(key = 'Clock Speed Error', value = stderr),
284  KeyValue(key = 'Output', value = stdout) ]
285  return (diag_vals, diag_msgs, diag_level)
286 
287  for index, ln in enumerate(stdout.split('\n')):
288  words = ln.split(':')
289  if len(words) < 2:
290  continue
291 
292  speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal
293  diag_vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
294  try:
295  _ = float(speed)
296  except ValueError:
297  diag_level = max(diag_level, DiagnosticStatus.ERROR)
298  diag_msgs = [ 'Clock speed not numeric' ]
299 
300  # get max freq
301  p = subprocess.Popen('lscpu | grep "max MHz"',
302  stdout = subprocess.PIPE,
303  stderr = subprocess.PIPE, shell = True)
304  stdout, stderr = p.communicate()
305  retcode = p.returncode
306  try:
307  stdout = stdout.decode() #python3
308  except (UnicodeDecodeError, AttributeError):
309  pass
310 
311  if retcode != 0:
312  diag_level = DiagnosticStatus.ERROR
313  diag_msgs = [ 'Clock Speed Error' ]
314  diag_vals = [ KeyValue(key = 'Clock Speed Error', value = stderr),
315  KeyValue(key = 'Output', value = stdout) ]
316  return (diag_vals, diag_msgs, diag_level)
317 
318  diag_vals.append(KeyValue(key = stdout.split(':')[0].strip(), value = str(stdout.split(':')[1].strip())))
319 
320  # get min freq
321  p = subprocess.Popen('lscpu | grep "min MHz"',
322  stdout = subprocess.PIPE,
323  stderr = subprocess.PIPE, shell = True)
324  stdout, stderr = p.communicate()
325  retcode = p.returncode
326  try:
327  stdout = stdout.decode() #python3
328  except (UnicodeDecodeError, AttributeError):
329  pass
330 
331  if retcode != 0:
332  diag_level = DiagnosticStatus.ERROR
333  diag_msgs = [ 'Clock Speed Error' ]
334  diag_vals = [ KeyValue(key = 'Clock Speed Error', value = stderr),
335  KeyValue(key = 'Output', value = stdout) ]
336  return (diag_vals, diag_msgs, diag_level)
337 
338  diag_vals.append(KeyValue(key = stdout.split(':')[0].strip(), value = str(stdout.split(':')[1].strip())))
339 
340  except Exception as e:
341  diag_level = DiagnosticStatus.ERROR
342  diag_msgs = [ 'Clock Speed Exception' ]
343  diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
344 
345  return diag_vals, diag_msgs, diag_level
346 
347  ##\brief Uses 'uptime' to see load average
348  def check_uptime(self):
349  diag_vals = []
350  diag_msg = ''
351  diag_level = DiagnosticStatus.OK
352 
353  load_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Load', DiagnosticStatus.ERROR: 'Very High Load' }
354 
355  try:
356  p = subprocess.Popen('uptime', stdout = subprocess.PIPE,
357  stderr = subprocess.PIPE, shell = True)
358  stdout, stderr = p.communicate()
359  retcode = p.returncode
360  try:
361  stdout = stdout.decode() #python3
362  except (UnicodeDecodeError, AttributeError):
363  pass
364 
365  if retcode != 0:
366  diag_level = DiagnosticStatus.ERROR
367  diag_msg = 'Uptime Error'
368  diag_vals = [ KeyValue(key = 'Uptime Error', value = stderr),
369  KeyValue(key = 'Output', value = stdout) ]
370  return (diag_vals, diag_msg, diag_level)
371 
372  upvals = stdout.split()
373  load1 = upvals[-3].rstrip(',').replace(',', '.')
374  load5 = upvals[-2].rstrip(',').replace(',', '.')
375  load15 = upvals[-1].replace(',', '.')
376  num_users = upvals[-7]
377 
378  # Give warning if we go over load limit
379  if float(load1) > self._load1_threshold or float(load5) > self._load5_threshold:
380  diag_level = DiagnosticStatus.WARN
381 
382  diag_vals.append(KeyValue(key = 'Load Average Status', value = load_dict[diag_level]))
383  diag_vals.append(KeyValue(key = '1 min Load Average', value = load1))
384  diag_vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(self._load1_threshold)))
385  diag_vals.append(KeyValue(key = '5 min Load Average', value = load5))
386  diag_vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(self._load5_threshold)))
387  diag_vals.append(KeyValue(key = '15 min Load Average', value = load15))
388  diag_vals.append(KeyValue(key = 'Number of Users', value = num_users))
389 
390  diag_msg = load_dict[diag_level]
391 
392  except Exception as e:
393  diag_level = DiagnosticStatus.ERROR
394  diag_msg = 'Uptime Exception'
395  diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
396 
397  return diag_vals, diag_msg, diag_level
398 
399  ##\brief Uses 'free -m' to check free memory
400  def check_free_memory(self):
401  diag_vals = []
402  diag_msg = ''
403  diag_level = DiagnosticStatus.OK
404 
405  mem_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'Low Memory', DiagnosticStatus.ERROR: 'Very Low Memory' }
406 
407  try:
408  p = subprocess.Popen('free -m',
409  stdout = subprocess.PIPE,
410  stderr = subprocess.PIPE, shell = True)
411  stdout, stderr = p.communicate()
412  retcode = p.returncode
413  try:
414  stdout = stdout.decode() #python3
415  except (UnicodeDecodeError, AttributeError):
416  pass
417 
418  if retcode != 0:
419  diag_level = DiagnosticStatus.ERROR
420  diag_msg = 'Memory Usage Error'
421  diag_vals = [ KeyValue(key = 'Memory Usage Error', value = stderr),
422  KeyValue(key = 'Output', value = stdout) ]
423  return (diag_vals, diag_msg, diag_level)
424 
425  rows = stdout.split('\n')
426 
427  # Mem
428  data = rows[1].split()
429  total_mem = data[1]
430  used_mem = data[2]
431  free_mem = data[3]
432  shared_mem = data[4]
433  cache_mem = data[5]
434  available_mem = data[6]
435 
436  diag_level = DiagnosticStatus.OK
437  if float(free_mem) < self._mem_warn:
438  diag_level = DiagnosticStatus.WARN
439  if float(free_mem) < self._mem_error:
440  diag_level = DiagnosticStatus.ERROR
441 
442  diag_vals.append(KeyValue(key = 'Mem Status', value = mem_dict[diag_level]))
443  diag_vals.append(KeyValue(key = 'Mem Total', value = total_mem))
444  diag_vals.append(KeyValue(key = 'Mem Used', value = used_mem))
445  diag_vals.append(KeyValue(key = 'Mem Free', value = free_mem))
446  diag_vals.append(KeyValue(key = 'Mem Shared', value = shared_mem))
447  diag_vals.append(KeyValue(key = 'Mem Buff/Cache', value = cache_mem))
448  diag_vals.append(KeyValue(key = 'Mem Available', value = available_mem))
449 
450  # Swap
451  data = rows[2].split()
452  total_mem = data[1]
453  used_mem = data[2]
454  free_mem = data[3]
455 
456  diag_vals.append(KeyValue(key = 'Swap Total', value = total_mem))
457  diag_vals.append(KeyValue(key = 'Swap Used', value = used_mem))
458  diag_vals.append(KeyValue(key = 'Swap Free', value = free_mem))
459 
460  diag_msg = mem_dict[diag_level]
461 
462  except Exception as e:
463  diag_level = DiagnosticStatus.ERROR
464  diag_msg = 'Memory Usage Exception'
465  diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
466 
467  return diag_vals, diag_msg, diag_level
468 
469  ##\brief Use mpstat to find CPU usage
470  def check_mpstat(self):
471  diag_vals = []
472  diag_msg = ''
473  diag_level = DiagnosticStatus.OK
474 
475  load_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Load', DiagnosticStatus.ERROR: 'Error' }
476 
477  try:
478  p = subprocess.Popen('mpstat -P ALL 1 1',
479  stdout = subprocess.PIPE,
480  stderr = subprocess.PIPE, shell = True)
481  stdout, stderr = p.communicate()
482  retcode = p.returncode
483  try:
484  stdout = stdout.decode() #python3
485  except (UnicodeDecodeError, AttributeError):
486  pass
487 
488  if retcode != 0:
489  diag_level = DiagnosticStatus.ERROR
490  diag_msg = 'CPU Usage Error'
491  diag_vals = [ KeyValue(key = 'CPU Usage Error', value = stderr),
492  KeyValue(key = 'Output', value = stdout) ]
493  return (diag_vals, diag_msg, diag_level)
494 
495  # Check which column '%idle' is, #4539
496  # mpstat output changed between 8.06 and 8.1
497  rows = stdout.split('\n')
498  col_names = rows[2].split()
499  idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
500 
501  num_cores = 0
502  cores_loaded = 0
503  for index, row in enumerate(stdout.split('\n')):
504  if index < 3:
505  continue
506 
507  # Skip row containing 'all' data
508  if row.find('all') > -1:
509  continue
510 
511  lst = row.split()
512  if len(lst) < 8:
513  continue
514 
515  ## Ignore 'Average: ...' data
516  if lst[0].startswith('Average'):
517  continue
518 
519  cpu_name = '%d' % (num_cores)
520  idle = lst[idle_col].replace(',', '.')
521  user = lst[3].replace(',', '.')
522  nice = lst[4].replace(',', '.')
523  system = lst[5].replace(',', '.')
524 
525  core_level = DiagnosticStatus.OK
526  usage = float(user) + float(nice)
527  if usage > self._core_load_warn:
528  cores_loaded += 1
529  core_level = DiagnosticStatus.WARN
530  if usage > self._core_load_error:
531  core_level = DiagnosticStatus.ERROR
532 
533  diag_vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
534  diag_vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
535  diag_vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
536  diag_vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
537  diag_vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
538 
539  num_cores += 1
540 
541  # Warn for high load only if we have <= 2 cores that aren't loaded
542  if num_cores - cores_loaded <= 2 and num_cores > 2:
543  diag_level = DiagnosticStatus.WARN
544 
545  # Check the number of cores if self._num_cores > 0, #4850
546  if self._num_cores > 0 and self._num_cores != num_cores:
547  diag_level = DiagnosticStatus.ERROR
548  diag_msg = 'Incorrect number of CPU cores: Expected %d, got %d. Computer may have not booted properly.' % self._num_cores, num_cores
549  return diag_vals, diag_msg, diag_level
550 
551  diag_msg = load_dict[diag_level]
552 
553  except Exception as e:
554  diag_level = DiagnosticStatus.ERROR
555  diag_msg = 'CPU Usage Exception'
556  diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
557 
558  return diag_vals, diag_msg, diag_level
559 
560 
561  def query_netdata(self, chart, after):
562  NETDATA_URI = 'http://127.0.0.1:19999/api/v1/data?chart=%s&format=json&after=-%d'
563  url = NETDATA_URI % (chart, int(after))
564 
565  try:
566  r = requests.get(url)
567  except requests.ConnectionError as ex:
568  rospy.logerr("NetData ConnectionError %r", ex)
569  return None
570 
571  if r.status_code != 200:
572  rospy.logerr("NetData request not successful with status_code %d", r.status_code)
573  return None
574 
575  rdata = r.json()
576 
577  sdata = list(zip(*rdata['data']))
578  d = dict()
579 
580  for idx, label in enumerate(rdata['labels']):
581  np_array = np.array(sdata[idx])
582  if np_array.dtype == np.object:
583  rospy.logwarn("Data from NetData malformed")
584  return None
585  d[label] = np_array
586 
587  return d
588 
589 
590  def check_core_throttling(self, interval=1):
591  throt_dict = {DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Thermal Throttling Events',
592  DiagnosticStatus.ERROR: 'No valid Data from NetData'}
593 
594  throt_level = DiagnosticStatus.OK
595 
596  vals = []
597 
598  netdata = self.query_netdata('cpu.core_throttling', interval)
599 
600  for i in range(self._num_cores):
601  lbl = 'CPU %d Thermal Throttling Events' % i
602  netdata_key = 'cpu%d' % i
603 
604  core_mean = 'N/A'
605  if netdata is not None and netdata_key in netdata:
606  core_data = netdata[netdata_key]
607  if core_data is not None:
608  core_mean = np.mean(core_data)
609 
610  if core_mean > self._thermal_throttling_threshold:
611  throt_level = DiagnosticStatus.WARN
612  else:
613  throt_level = DiagnosticStatus.ERROR
614 
615  vals.append(KeyValue(key=lbl, value='%r' % core_mean))
616 
617  vals.insert(0, KeyValue(key='Thermal Throttling Status', value=throt_dict[throt_level]))
618  vals.append(KeyValue(key='Thermal Throttling Threshold', value=str(self._thermal_throttling_threshold)))
619 
620  return throt_level, throt_dict[throt_level], vals
621 
622 
623  def check_idlejitter(self, interval=1):
624  jitter_dict = {DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High IDLE Jitter',
625  DiagnosticStatus.ERROR: 'No valid Data from NetData'}
626 
627  jitter_level = DiagnosticStatus.OK
628 
629  vals = []
630 
631  netdata = self.query_netdata('system.idlejitter', interval)
632 
633  metric_list = [
634  ('IDLE Jitter Min', 'min', self._idlejitter_min_threshold, np.min),
635  ('IDLE Jitter Max', 'max', self._idlejitter_max_threshold, np.max),
636  ('IDLE Jitter Average', 'average', self._idlejitter_average_threshold, np.mean),
637  ]
638 
639  for metric_label, metric_key, metric_threshold, aggregate_fnc in metric_list:
640  metric_aggreagte = 'N/A'
641  if netdata is not None and metric_key in netdata:
642  metric_data = netdata[metric_key]
643  if metric_data is not None:
644  metric_aggreagte = aggregate_fnc(metric_data)
645 
646  if metric_aggreagte > metric_threshold:
647  jitter_level = DiagnosticStatus.WARN
648  else:
649  jitter_level = DiagnosticStatus.ERROR
650 
651  vals.append(KeyValue(key=metric_label, value=str(metric_aggreagte)))
652  vals.append(KeyValue(key=metric_label + ' Threshold', value=str(metric_threshold)))
653 
654  vals.insert(0, KeyValue(key='IDLE Jitter Status', value=jitter_dict[jitter_level]))
655 
656  return jitter_level, jitter_dict[jitter_level], vals
657 
658 
659  ##\brief Returns names for core temperature files
661  devices = {}
662  platform_vals = []
663  virtual_vals = []
664  try:
665  #platform devices
666  p = subprocess.Popen('find /sys/devices/platform -name temp*_input',
667  stdout = subprocess.PIPE,
668  stderr = subprocess.PIPE, shell = True)
669  stdout, stderr = p.communicate()
670  retcode = p.returncode
671  try:
672  stdout = stdout.decode() #python3
673  except (UnicodeDecodeError, AttributeError):
674  pass
675 
676  if retcode != 0:
677  rospy.logerr('Error find core temp locations: %s' % stderr)
678  return []
679 
680  for ln in stdout.split('\n'):
681  if ln:
682  device_path, device_file = os.path.split(ln.strip())
683  device_label = device_path+'/'+device_file.split('_')[0]+'_label'
684  name = open(device_label, 'r').read()
685  pair = (name.strip(), ln.strip())
686  platform_vals.append(pair)
687 
688  #virtual devices
689  p = subprocess.Popen('find /sys/devices/virtual -name temp*_input',
690  stdout = subprocess.PIPE,
691  stderr = subprocess.PIPE, shell = True)
692  stdout, stderr = p.communicate()
693  retcode = p.returncode
694  try:
695  stdout = stdout.decode() #python3
696  except (UnicodeDecodeError, AttributeError):
697  pass
698 
699  if retcode != 0:
700  rospy.logerr('Error find core temp locations: %s' % stderr)
701  return []
702 
703  for ln in stdout.split('\n'):
704  if ln:
705  device_path, device_file = os.path.split(ln.strip())
706  name = open(device_path+'/name', 'r').read()
707  pair = (name.strip(), ln.strip())
708  virtual_vals.append(pair)
709 
710  devices['platform'] = platform_vals
711  devices['virtual'] = virtual_vals
712  return devices
713  except Exception as e:
714  rospy.logerr('Exception finding temp vals: {}'.format(e))
715  return []
716 
717 
718  def check_info(self, event):
719  diag_vals = []
720  diag_msgs = []
721  diag_level = DiagnosticStatus.OK
722 
723  if self._check_ipmi:
724  ipmi_vals, ipmi_msgs, ipmi_level = self.check_ipmi()
725  diag_vals.extend(ipmi_vals)
726  diag_msgs.extend(ipmi_msgs)
727  diag_level = max(diag_level, ipmi_level)
728 
729  if self._check_core_temps:
730  core_vals, core_msgs, core_level = self.check_core_temps()
731  diag_vals.extend(core_vals)
732  diag_msgs.extend(core_msgs)
733  diag_level = max(diag_level, core_level)
734 
735  clock_vals, clock_msgs, clock_level = self.check_clock_speed()
736  diag_vals.extend(clock_vals)
737  diag_msgs.extend(clock_msgs)
738  diag_level = max(diag_level, clock_level)
739 
740  diag_log = set(diag_msgs)
741  if len(diag_log) > DiagnosticStatus.OK:
742  message = ', '.join(diag_log)
743  else:
744  message = stat_dict[diag_level]
745 
746  self._info_stat.values = diag_vals
747  self._info_stat.message = message
748  self._info_stat.level = diag_level
749 
750  def check_usage(self, event):
751  diag_vals = []
752  diag_msgs = []
753  diag_level = DiagnosticStatus.OK
754 
755  # Check mpstat
756  mp_vals, mp_msg, mp_level = self.check_mpstat()
757  diag_vals.extend(mp_vals)
758  if mp_level > DiagnosticStatus.OK:
759  diag_msgs.append(mp_msg)
760  diag_level = max(diag_level, mp_level)
761 
762  # Check NetData cpu.core_throttling
764  interval = math.ceil(self._usage_timer._period.to_sec())
765  throt_level, throt_msg, throt_vals = self.check_core_throttling(interval=interval)
766  diag_vals.extend(throt_vals)
767  if throt_level > 0:
768  diag_msgs.append(throt_msg)
769  diag_level = max(diag_level, throt_level)
770 
771  # Check NetData system.idlejitter
772  if self._check_idlejitter:
773  interval = math.ceil(self._usage_timer._period.to_sec())
774  jitter_level, jitter_msg, jitter_vals = self.check_idlejitter(interval=interval)
775  diag_vals.extend(jitter_vals)
776  if jitter_level > 0:
777  diag_msgs.append(jitter_msg)
778  diag_level = max(diag_level, jitter_level)
779 
780  # Check uptime
781  up_vals, up_msg, up_level = self.check_uptime()
782  diag_vals.extend(up_vals)
783  if up_level > DiagnosticStatus.OK:
784  diag_msgs.append(up_msg)
785  diag_level = max(diag_level, up_level)
786 
787  if diag_msgs and diag_level > DiagnosticStatus.OK:
788  usage_msg = ', '.join(set(diag_msgs))
789  else:
790  usage_msg = stat_dict[diag_level]
791 
792  self._usage_stat.values = diag_vals
793  self._usage_stat.message = usage_msg
794  self._usage_stat.level = diag_level
795 
796  def check_memory(self, event):
797  diag_vals = []
798  diag_msgs = []
799  diag_level = DiagnosticStatus.OK
800 
801  # Check memory
802  mem_vals, mem_msg, mem_level = self.check_free_memory()
803  diag_vals.extend(mem_vals)
804  if mem_level > DiagnosticStatus.OK:
805  diag_msgs.append(mem_msg)
806  diag_level = max(diag_level, mem_level)
807 
808  if diag_msgs and diag_level > DiagnosticStatus.OK:
809  memory_msg = ', '.join(set(diag_msgs))
810  else:
811  memory_msg = stat_dict[diag_level]
812 
813  self._memory_stat.values = diag_vals
814  self._memory_stat.message = memory_msg
815  self._memory_stat.level = diag_level
816 
817  def publish_stats(self, event):
818  msg = DiagnosticArray()
819  msg.header.stamp = rospy.get_rostime()
820  msg.status.append(self._info_stat)
821  msg.status.append(self._usage_stat)
822  msg.status.append(self._memory_stat)
823  self._diag_pub.publish(msg)
824 
825 
826 if __name__ == '__main__':
827  hostname = socket.gethostname()
828 
829  import optparse
830  parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
831  parser.add_option("--diag-hostname", dest="diag_hostname",
832  help="Computer name in diagnostics output (ex: 'c1')",
833  metavar="DIAG_HOSTNAME",
834  action="store", default = hostname)
835  options, args = parser.parse_args(rospy.myargv())
836 
837  try:
838  rospy.init_node('cpu_monitor_%s' % hostname)
839  except rospy.exceptions.ROSInitException:
840  print('CPU monitor is unable to initialize node. Master may not be running.')
841  sys.exit(0)
842 
843  cpu_node = CPUMonitor(hostname, options.diag_hostname)
844  rospy.spin()
def check_clock_speed(self)
Checks clock speed from reading from CPU info.
Definition: cpu_monitor.py:263
def check_memory(self, event)
Definition: cpu_monitor.py:796
def check_usage(self, event)
Definition: cpu_monitor.py:750
def check_core_throttling(self, interval=1)
Definition: cpu_monitor.py:590
def check_free_memory(self)
Uses &#39;free -m&#39; to check free memory.
Definition: cpu_monitor.py:400
def check_ipmi(self)
Output entire IPMI data set.
Definition: cpu_monitor.py:90
def get_core_temp_names(self)
Returns names for core temperature files.
Definition: cpu_monitor.py:660
def query_netdata(self, chart, after)
Definition: cpu_monitor.py:561
def check_uptime(self)
Uses &#39;uptime&#39; to see load average.
Definition: cpu_monitor.py:348
def __init__(self, hostname, diag_hostname)
Definition: cpu_monitor.py:35
def publish_stats(self, event)
Definition: cpu_monitor.py:817
def check_info(self, event)
Definition: cpu_monitor.py:718
def check_idlejitter(self, interval=1)
Definition: cpu_monitor.py:623
def check_mpstat(self)
Use mpstat to find CPU usage.
Definition: cpu_monitor.py:470
def check_core_temps(self)
Check CPU core temps.
Definition: cpu_monitor.py:212


cob_monitoring
Author(s): Florian Weisshardt , Felix Messmer
autogenerated on Wed Apr 7 2021 03:03:11