cpu_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # Copyright 2017 Fraunhofer Institute for Manufacturing Engineering and Automation (IPA)
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 
19 
20 import sys
21 import traceback
22 import socket
23 import psutil
24 import numpy as np
25 import math
26 import requests
27 
28 import rospy
29 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
30 
31 from netdata_interface.netdata_interface import NetdataInterface
32 
33 stat_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'Warning', DiagnosticStatus.ERROR: 'Error', DiagnosticStatus.STALE: 'Stale' }
34 
35 class CPUMonitor():
36  def __init__(self, hostname, diag_hostname):
38 
39  self._check_core_temps = rospy.get_param('~check_core_temps', False)
40  self._core_load_warn = rospy.get_param('~core_load_warn', 90)
41  self._core_load_error = rospy.get_param('~core_load_error', 110)
42  self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
43  self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
44  self._core_temp_warn = rospy.get_param('~core_temp_warn', 90)
45  self._core_temp_error = rospy.get_param('~core_temp_error', 95)
46  self._mem_warn = rospy.get_param('~mem_warn', 25)
47  self._mem_error = rospy.get_param('~mem_error', 1)
48 
49  self._check_thermal_throttling_events = rospy.get_param('~check_thermal_throttling_events', False)
50  self._thermal_throttling_threshold = rospy.get_param('~thermal_throttling_threshold', 1000)
51 
52  self._check_idlejitter = rospy.get_param('~check_idlejitter', False)
53  self._idlejitter_min_threshold = rospy.get_param('~idlejitter_min_threshold', 50000)
54  self._idlejitter_max_threshold = rospy.get_param('~idlejitter_max_threshold', 2000000)
55  self._idlejitter_average_threshold = rospy.get_param('~idlejitter_average_threshold', 200000)
56 
57  self._num_cores = rospy.get_param('~num_cores', psutil.cpu_count())
58 
59  # CPU stats
60  self._info_stat = DiagnosticStatus()
61  self._info_stat.name = '%s CPU Info' % diag_hostname
62  self._info_stat.level = DiagnosticStatus.WARN
63  self._info_stat.hardware_id = hostname
64  self._info_stat.message = 'No Data'
65  self._info_stat.values = []
66 
67  self._usage_stat = DiagnosticStatus()
68  self._usage_stat.name = '%s CPU Usage' % diag_hostname
69  self._usage_stat.level = DiagnosticStatus.WARN
70  self._usage_stat.hardware_id = hostname
71  self._usage_stat.message = 'No Data'
72  self._usage_stat.values = []
73 
74  self._memory_stat = DiagnosticStatus()
75  self._memory_stat.name = '%s Memory Usage' % diag_hostname
76  self._memory_stat.level = DiagnosticStatus.WARN
77  self._memory_stat.hardware_id = hostname
78  self._memory_stat.message = 'No Data'
79  self._memory_stat.values = []
80 
81  self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=1)
82  self._publish_timer = rospy.Timer(rospy.Duration(1.0), self.publish_stats)
83  self._info_timer = rospy.Timer(rospy.Duration(5.0), self.check_info)
84  self._usage_timer = rospy.Timer(rospy.Duration(5.0), self.check_usage)
85  self._memory_timer = rospy.Timer(rospy.Duration(5.0), self.check_memory)
86 
87 
88  def check_core_temps(self, interval=1):
89  diag_vals = []
90  diag_msgs = []
91  diag_level = DiagnosticStatus.OK
92 
93  try:
94  # _ vs -
95  netdata_module_name_core_temps = ['sensors.coretemp_isa_0000_temperature',
96  'sensors.coretemp-isa-0000_temperature']
97  error_count = 0
98  netdata_module_name_err = ''
99  for name in netdata_module_name_core_temps:
100  try:
101  netdata_core_temp, error = self._netdata_interface.query_netdata(name, interval)
102 
103  # count individual connection errors for mutliple chart names (different netdata versions)
104  except requests.ConnectionError as err:
105  error_count += 1
106  netdata_module_name_err += name + ' '
107 
108  netdata_core_temp = None
109  error = str(err)
110 
111 
112  if netdata_core_temp:
113  break
114 
115  netdata_module_name_err = "{} of {} failed: {}".format(error_count, len(netdata_module_name_core_temps), netdata_module_name_err)
116 
117  if not netdata_core_temp:
118  diag_level = DiagnosticStatus.WARN
119  diag_msgs = [ 'Core Temp Error' ]
120  diag_vals = [ KeyValue(key = 'Core Temp Error', value = 'Could not fetch data from netdata'),
121  KeyValue(key = 'Failed Chart Names', value = netdata_module_name_err),
122  KeyValue(key = 'Output', value = netdata_core_temp),
123  KeyValue(key = 'Error', value= error) ]
124  return (diag_vals, diag_msgs, diag_level)
125 
126  del netdata_core_temp['time']
127  del netdata_core_temp['Package id 0']
128 
129  for core_no, values in netdata_core_temp.items():
130  mean_temp = np.mean(values)
131  try:
132  diag_vals.append(KeyValue(key = 'Temp %s' % core_no, value = str(mean_temp)))
133 
134  if mean_temp >= self._core_temp_error:
135  diag_level = max(diag_level, DiagnosticStatus.OK) #do not set ERROR
136  diag_msgs.append('CPU Hot')
137  elif mean_temp >= self._core_temp_warn:
138  diag_level = max(diag_level, DiagnosticStatus.OK) #do not set WARN
139  diag_msgs.append('CPU Warm')
140  except ValueError:
141  diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
142  diag_vals.append(KeyValue(key = 'Temp %s' % core_no, value = str(mean_temp)))
143 
144  except Exception as e:
145  diag_level = DiagnosticStatus.ERROR
146  diag_msgs = [ 'Core Temp Exception' ]
147  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
148 
149  return diag_vals, diag_msgs, diag_level
150 
151 
152  def check_clock_speed(self, interval=1):
153  diag_vals = []
154  diag_msgs = []
155  diag_level = DiagnosticStatus.OK
156 
157  try:
158  netdata_cpu_freq, error = self._netdata_interface.query_netdata('cpu.cpufreq', interval)
159  if not netdata_cpu_freq:
160  diag_level = DiagnosticStatus.WARN
161  diag_msgs = [ 'Clock Speed Error' ]
162  diag_vals = [ KeyValue(key = 'Clock Speed Error', value = 'Could not fetch data from netdata'),
163  KeyValue(key = 'Output', value = netdata_cpu_freq),
164  KeyValue(key = 'Error', value= error) ]
165  return (diag_vals, diag_msgs, diag_level)
166 
167  del netdata_cpu_freq["time"]
168 
169  for cpu_name, values in netdata_cpu_freq.items():
170  diag_vals.append(KeyValue(key = 'Core %d (MHz)' % int(cpu_name[-1]), value = str(np.mean(values))))
171 
172  # get max freq
173  netdata_info = self._netdata_interface.query_netdata_info()
174  if not netdata_info:
175  diag_level = DiagnosticStatus.WARN
176  diag_msgs = [ 'Clock Speed Error' ]
177  diag_vals = [ KeyValue(key = 'Clock Speed Error', value = 'Could not fetch data from netdata'),
178  KeyValue(key = 'Output', value = netdata_info) ]
179  return (diag_vals, diag_msgs, diag_level)
180 
181  max_cpu_freq = float(netdata_info['cpu_freq'])/1e6
182  diag_vals.append(KeyValue(key = 'Maximum Frequency (MHz)', value = str(max_cpu_freq)))
183 
184  except requests.ConnectionError as e:
185  diag_level = DiagnosticStatus.ERROR
186  diag_msgs = [ 'Clock Speed Connection Error' ]
187  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
188 
189  except Exception as e:
190  diag_level = DiagnosticStatus.ERROR
191  diag_msgs = [ 'Clock Speed Exception' ]
192  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
193 
194  return diag_vals, diag_msgs, diag_level
195 
196 
197  def check_uptime(self, interval=1):
198  diag_vals = []
199  diag_msg = ''
200  diag_level = DiagnosticStatus.OK
201 
202  try:
203  netdata_uptime, error = self._netdata_interface.query_netdata('system.uptime', interval)
204  if not netdata_uptime:
205  diag_level = DiagnosticStatus.WARN
206  diag_msg = 'Uptime Error'
207  diag_vals = [ KeyValue(key = 'Uptime Error', value = 'Could not fetch data from netdata'),
208  KeyValue(key = 'Output', value = netdata_uptime),
209  KeyValue(key = 'Error', value= error) ]
210  return (diag_vals, diag_msg, diag_level)
211 
212  del netdata_uptime['time']
213 
214  diag_vals.append(KeyValue(key = 'Uptime', value = str(np.max(netdata_uptime['uptime'].astype(float)))))
215 
216  except requests.ConnectionError as e:
217  diag_level = DiagnosticStatus.ERROR
218  diag_msg = 'Uptime Connection Error'
219  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
220 
221  except Exception as e:
222  diag_level = DiagnosticStatus.ERROR
223  diag_msg = 'Uptime Exception'
224  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
225 
226  return diag_vals, diag_msg, diag_level
227 
228 
229  def check_load(self, interval=1):
230  diag_vals = []
231  diag_msg = ''
232  diag_level = DiagnosticStatus.OK
233 
234  load_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Load', DiagnosticStatus.ERROR: 'Very High Load' }
235 
236  try:
237  netdata_cpu_load, error = self._netdata_interface.query_netdata('system.load', interval)
238  if not netdata_cpu_load:
239  diag_level = DiagnosticStatus.WARN
240  diag_msg = 'Load Error'
241  diag_vals = [ KeyValue(key = 'Load Error', value = 'Could not fetch data from netdata'),
242  KeyValue(key = 'Output', value = netdata_cpu_load),
243  KeyValue(key = 'Error', value= error) ]
244  return (diag_vals, diag_msg, diag_level)
245 
246  del netdata_cpu_load['time']
247 
248  load1 = np.mean(netdata_cpu_load['load1'].astype(float))
249  load5 = np.mean(netdata_cpu_load['load5'].astype(float))
250  load15 = np.mean(netdata_cpu_load['load15'].astype(float))
251 
252  # Give warning if we go over load limit
253  if float(load1) > self._load1_threshold or float(load5) > self._load5_threshold:
254  diag_level = DiagnosticStatus.WARN
255 
256  diag_vals.append(KeyValue(key = 'Load Average Status', value = load_dict[diag_level]))
257  diag_vals.append(KeyValue(key = '1 min Load Average', value = str(load1)))
258  diag_vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(self._load1_threshold)))
259  diag_vals.append(KeyValue(key = '5 min Load Average', value = str(load5)))
260  diag_vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(self._load5_threshold)))
261  diag_vals.append(KeyValue(key = '15 min Load Average', value = str(load15)))
262 
263  diag_msg = load_dict[diag_level]
264 
265  except requests.ConnectionError as e:
266  diag_level = DiagnosticStatus.ERROR
267  diag_msg = 'Load Connection Error'
268  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
269 
270  except Exception as e:
271  diag_level = DiagnosticStatus.ERROR
272  diag_msg = 'Load Exception'
273  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
274 
275  return diag_vals, diag_msg, diag_level
276 
277 
278 
279  def check_free_memory(self, interval=1):
280  diag_vals = []
281  diag_msg = ''
282  diag_level = DiagnosticStatus.OK
283 
284  mem_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'Low Memory', DiagnosticStatus.ERROR: 'Very Low Memory' }
285 
286  try:
287  netdata_mem, error = self._netdata_interface.query_netdata('system.ram', interval)
288  if not netdata_mem:
289  diag_level = DiagnosticStatus.WARN
290  diag_msg = 'Memory Usage Error'
291  diag_vals = [ KeyValue(key = 'Memory Usage Error', value = 'Could not fetch data from netdata'),
292  KeyValue(key = 'Output', value = netdata_mem),
293  KeyValue(key = 'Error', value= error) ]
294  return (diag_vals, diag_msg, diag_level)
295 
296  del netdata_mem['time']
297 
298  # Mem
299  memory_vals = {k: np.mean(v.astype(float)) for k, v in netdata_mem.items()}
300  total_mem = sum([val for val in memory_vals.values()])
301  free_mem = memory_vals['free']
302  used_mem = memory_vals['used']
303  cache_mem = memory_vals['cached'] + memory_vals['buffers']
304 
305  diag_level = DiagnosticStatus.OK
306  if float(free_mem) < self._mem_warn:
307  diag_level = DiagnosticStatus.WARN
308  if float(free_mem) < self._mem_error:
309  diag_level = DiagnosticStatus.ERROR
310 
311  diag_vals.append(KeyValue(key = 'Mem Status', value = mem_dict[diag_level]))
312  diag_vals.append(KeyValue(key = 'Mem Total', value = str(total_mem)))
313  diag_vals.append(KeyValue(key = 'Mem Used', value = str(used_mem)))
314  diag_vals.append(KeyValue(key = 'Mem Free', value = str(free_mem)))
315  diag_vals.append(KeyValue(key = 'Mem Buff/Cache', value = str(cache_mem)))
316 
317  netdata_swp, error = self._netdata_interface.query_netdata('system.swap', interval)
318  if not netdata_swp:
319  diag_level = DiagnosticStatus.WARN
320  diag_msg = 'Swap Usage Error'
321  diag_vals = [ KeyValue(key = 'Swap Usage Error', value = 'Could not fetch data from netdata'),
322  KeyValue(key = 'Output', value = netdata_swp),
323  KeyValue(key = 'Error', value= error) ]
324  return (diag_vals, diag_msg, diag_level)
325 
326  del netdata_swp['time']
327 
328  # Swap
329  swap_vals = {k: np.mean(v.astype(float)) for k, v in netdata_swp.items()}
330  total_swp = sum([val for val in swap_vals.values()])
331  free_swp = swap_vals['free']
332  used_swp = swap_vals['used']
333 
334  diag_vals.append(KeyValue(key = 'Swap Total', value = str(total_swp)))
335  diag_vals.append(KeyValue(key = 'Swap Used', value = str(used_swp)))
336  diag_vals.append(KeyValue(key = 'Swap Free', value = str(free_swp)))
337 
338  diag_msg = mem_dict[diag_level]
339 
340  except requests.ConnectionError as e:
341  diag_level = DiagnosticStatus.ERROR
342  diag_msg = 'Memory Usage Connection Error'
343  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
344 
345  except Exception as e:
346  diag_level = DiagnosticStatus.ERROR
347  diag_msg = 'Memory Usage Exception'
348  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
349 
350  return diag_vals, diag_msg, diag_level
351 
352 
353  def check_cpu_util(self, interval=1):
354  diag_vals = []
355  diag_msg = ''
356  diag_level = DiagnosticStatus.OK
357 
358  load_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Load', DiagnosticStatus.ERROR: 'Error' }
359 
360  try:
361  netdata_info = self._netdata_interface.query_netdata_info()
362  if not netdata_info:
363  diag_level = DiagnosticStatus.WARN
364  diag_msg = 'CPU Usage Error'
365  diag_vals = [ KeyValue(key = 'CPU Usage Error', value = 'Could not fetch data from netdata'),
366  KeyValue(key = 'Output', value = netdata_info) ]
367  return (diag_vals, diag_msg, diag_level)
368 
369  num_cores = int(netdata_info['cores_total'])
370  netdata_system_cpu, error = self._netdata_interface.query_netdata('system.cpu', interval)
371  if not netdata_system_cpu:
372  diag_level = DiagnosticStatus.WARN
373  diag_msg = 'CPU Usage Error'
374  diag_vals = [ KeyValue(key = 'CPU Usage Error', value = 'Could not fetch data from netdata'),
375  KeyValue(key = 'Output', value = netdata_system_cpu),
376  KeyValue(key = 'Error', value= error) ]
377  return (diag_vals, diag_msg, diag_level)
378 
379  netdata_cpu_util = [self._netdata_interface.query_netdata('cpu.cpu%d' % i, interval) for i in range(num_cores)]
380  netdata_cpu_idle = [self._netdata_interface.query_netdata('cpu.cpu%d_cpuidle' % i, interval) for i in range(num_cores)]
381 
382  if any([data == None for data, error in netdata_cpu_util]):
383  diag_level = DiagnosticStatus.ERROR
384  diag_msg = 'CPU Util Error'
385  diag_vals = [ KeyValue(key = 'CPU Util Error', value = 'Could not fetch data from netdata'),
386  KeyValue(key = 'Output', value = netdata_cpu_util) ]
387  return (diag_vals, diag_msg, diag_level)
388  if any([data == None for data, error in netdata_cpu_idle]):
389  diag_level = DiagnosticStatus.ERROR
390  diag_msg = 'CPU Idle Error'
391  diag_vals = [ KeyValue(key = 'CPU Idle Error', value = 'Could not fetch data from netdata'),
392  KeyValue(key = 'Output', value = netdata_cpu_idle) ]
393  return (diag_vals, diag_msg, diag_level)
394 
395  cores_loaded = 0
396  for i_cpu in range(num_cores):
397 
398  cpu_name = 'Core %d' % (i_cpu)
399  idle = 100 - np.mean(netdata_cpu_idle[i_cpu][0]['C0 (active)'])
400  user = np.mean(netdata_cpu_util[i_cpu][0]['user'])
401  nice = np.mean(netdata_cpu_util[i_cpu][0]['nice'])
402  system = np.mean(netdata_cpu_util[i_cpu][0]['system'])
403 
404  core_level = DiagnosticStatus.OK
405  usage = float(user) + float(nice)
406  if usage > self._core_load_warn:
407  cores_loaded += 1
408  core_level = DiagnosticStatus.WARN
409  if usage > self._core_load_error:
410  core_level = DiagnosticStatus.ERROR
411 
412  diag_vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
413  diag_vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = str(user)))
414  diag_vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = str(nice)))
415  diag_vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = str(system)))
416  diag_vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = str(idle)))
417 
418  # Warn for high load only if we have <= 2 cores that aren't loaded
419  if num_cores - cores_loaded <= 2 and num_cores > 2:
420  diag_level = DiagnosticStatus.WARN
421 
422  diag_msg = load_dict[diag_level]
423 
424  except requests.ConnectionError as e:
425  diag_level = DiagnosticStatus.ERROR
426  diag_msg = 'CPU Usage Connection Error'
427  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
428 
429  except Exception as e:
430  diag_level = DiagnosticStatus.ERROR
431  diag_msg = 'CPU Usage Exception'
432  diag_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
433 
434  return diag_vals, diag_msg, diag_level
435 
436 
437  def check_core_throttling(self, interval=1):
438  throt_dict = {DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Thermal Throttling Events',
439  DiagnosticStatus.ERROR: 'No valid Data from NetData'}
440 
441  throt_level = DiagnosticStatus.OK
442  throt_msg = ''
443  throt_vals = []
444 
445  try:
446  netdata, error = self._netdata_interface.query_netdata('cpu.core_throttling', interval)
447  if not netdata:
448  throt_level = DiagnosticStatus.WARN
449  throt_msg = 'Core Throttling Error'
450  throt_vals = [ KeyValue(key = 'Core Throttling Error', value = 'Could not fetch data from netdata'),
451  KeyValue(key = 'Output', value = netdata),
452  KeyValue(key = 'Error', value= error) ]
453  return (throt_vals, throt_msg, throt_level)
454 
455  for i in range(self._num_cores):
456  lbl = 'CPU %d Thermal Throttling Events' % i
457  netdata_key = 'cpu%d' % i
458 
459  core_mean = 'N/A'
460  if netdata_key in netdata:
461  core_data = netdata[netdata_key]
462  if core_data is not None:
463  core_mean = np.mean(core_data)
464 
465  if core_mean > self._thermal_throttling_threshold:
466  throt_level = DiagnosticStatus.WARN
467  else:
468  throt_level = DiagnosticStatus.ERROR
469 
470  throt_vals.append(KeyValue(key=lbl, value='%r' % core_mean))
471 
472  throt_vals.insert(0, KeyValue(key='Thermal Throttling Status', value=throt_msg))
473  throt_vals.append(KeyValue(key='Thermal Throttling Threshold', value=str(self._thermal_throttling_threshold)))
474 
475  throt_msg = throt_dict[throt_level]
476 
477  except requests.ConnectionError as e:
478  throt_level = DiagnosticStatus.ERROR
479  throt_msg = 'Thermal Throttling Connection Error'
480  throt_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
481 
482  except Exception as e:
483  throt_level = DiagnosticStatus.ERROR
484  throt_msg = 'Thermal Throttling Exception'
485  throt_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
486 
487  return throt_vals, throt_msg, throt_level
488 
489 
490  def check_idlejitter(self, interval=1):
491  jitter_dict = {DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High IDLE Jitter',
492  DiagnosticStatus.ERROR: 'No valid Data from NetData'}
493 
494  jitter_level = DiagnosticStatus.OK
495  jitter_msg = ''
496  jitter_vals = []
497 
498  try:
499  netdata, error = self._netdata_interface.query_netdata('system.idlejitter', interval)
500  if not netdata:
501  jitter_level = DiagnosticStatus.WARN
502  jitter_msg = 'IDLE Jitter Error'
503  jitter_vals = [ KeyValue(key = 'Core Throttling Error', value = 'Could not fetch data from netdata'),
504  KeyValue(key = 'Output', value = netdata),
505  KeyValue(key = 'Error', value= error) ]
506  return (jitter_vals, jitter_msg, jitter_level)
507 
508  metric_list = [
509  ('IDLE Jitter Min', 'min', self._idlejitter_min_threshold, np.min),
510  ('IDLE Jitter Max', 'max', self._idlejitter_max_threshold, np.max),
511  ('IDLE Jitter Average', 'average', self._idlejitter_average_threshold, np.mean),
512  ]
513 
514  for metric_label, metric_key, metric_threshold, aggregate_fnc in metric_list:
515  metric_aggreagte = 'N/A'
516  if netdata is not None and metric_key in netdata:
517  metric_data = netdata[metric_key]
518  if metric_data is not None:
519  metric_aggreagte = aggregate_fnc(metric_data)
520 
521  if metric_aggreagte > metric_threshold:
522  jitter_level = DiagnosticStatus.WARN
523  else:
524  jitter_level = DiagnosticStatus.ERROR
525 
526  jitter_vals.append(KeyValue(key=metric_label, value=str(metric_aggreagte)))
527  jitter_vals.append(KeyValue(key=metric_label + ' Threshold', value=str(metric_threshold)))
528 
529  jitter_vals.insert(0, KeyValue(key='IDLE Jitter Status', value=jitter_msg))
530  jitter_msg = jitter_dict[jitter_level]
531 
532  except requests.ConnectionError as e:
533  jitter_level = DiagnosticStatus.ERROR
534  jitter_msg = 'IDLE Jitter Connection Error'
535  jitter_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
536 
537  except Exception as e:
538  jitter_level = DiagnosticStatus.ERROR
539  jitter_msg = 'IDLE Jitter Exception'
540  jitter_vals = [ KeyValue(key = 'Exception', value = str(e)), KeyValue(key = 'Traceback', value = str(traceback.format_exc())) ]
541 
542  return jitter_vals, jitter_msg, jitter_level
543 
544 
545  def check_info(self, event):
546  diag_vals = []
547  diag_msgs = []
548  diag_level = DiagnosticStatus.OK
549 
550  if self._check_core_temps:
551  interval = math.ceil(self._usage_timer._period.to_sec())
552  core_vals, core_msgs, core_level = self.check_core_temps(interval=interval)
553  diag_vals.extend(core_vals)
554  diag_msgs.extend(core_msgs)
555  diag_level = max(diag_level, core_level)
556 
557  clock_vals, clock_msgs, clock_level = self.check_clock_speed()
558  diag_vals.extend(clock_vals)
559  diag_msgs.extend(clock_msgs)
560  diag_level = max(diag_level, clock_level)
561 
562  diag_log = set(diag_msgs)
563  if len(diag_log) > DiagnosticStatus.OK:
564  message = ', '.join(diag_log)
565  else:
566  message = stat_dict[diag_level]
567 
568  self._info_stat.values = diag_vals
569  self._info_stat.message = message
570  self._info_stat.level = diag_level
571 
572  def check_usage(self, event):
573  diag_vals = []
574  diag_msgs = []
575  diag_level = DiagnosticStatus.OK
576 
577  interval = math.ceil(self._usage_timer._period.to_sec())
578 
579  # Check mpstat
580  mp_vals, mp_msg, mp_level = self.check_cpu_util(interval=interval)
581  diag_vals.extend(mp_vals)
582  if mp_level > DiagnosticStatus.OK:
583  diag_msgs.append(mp_msg)
584  diag_level = max(diag_level, mp_level)
585 
586  # Check NetData cpu.core_throttling
588  throt_vals, throt_msg, throt_level = self.check_core_throttling(interval=interval)
589  diag_vals.extend(throt_vals)
590  if throt_level > 0:
591  diag_msgs.append(throt_msg)
592  diag_level = max(diag_level, throt_level)
593 
594  # Check NetData system.idlejitter
595  if self._check_idlejitter:
596  jitter_vals, jitter_msg, jitter_level = self.check_idlejitter(interval=interval)
597  diag_vals.extend(jitter_vals)
598  if jitter_level > 0:
599  diag_msgs.append(jitter_msg)
600  diag_level = max(diag_level, jitter_level)
601 
602  # Check uptime
603  up_vals, up_msg, up_level = self.check_uptime(interval=interval)
604  diag_vals.extend(up_vals)
605  if up_level > DiagnosticStatus.OK:
606  diag_msgs.append(up_msg)
607  diag_level = max(diag_level, up_level)
608 
609  # Check load
610  load_vals, load_msg, load_level = self.check_load(interval=interval)
611  diag_vals.extend(load_vals)
612  if load_level > DiagnosticStatus.OK:
613  diag_msgs.append(load_msg)
614  diag_level = max(diag_level, load_level)
615 
616  if diag_msgs and diag_level > DiagnosticStatus.OK:
617  usage_msg = ', '.join(set(diag_msgs))
618  else:
619  usage_msg = stat_dict[diag_level]
620 
621  self._usage_stat.values = diag_vals
622  self._usage_stat.message = usage_msg
623  self._usage_stat.level = diag_level
624 
625  def check_memory(self, event):
626  diag_vals = []
627  diag_msgs = []
628  diag_level = DiagnosticStatus.OK
629 
630  # Check memory
631  interval = math.ceil(self._memory_timer._period.to_sec())
632  mem_vals, mem_msg, mem_level = self.check_free_memory(interval=interval)
633  diag_vals.extend(mem_vals)
634  if mem_level > DiagnosticStatus.OK:
635  diag_msgs.append(mem_msg)
636  diag_level = max(diag_level, mem_level)
637 
638  if diag_msgs and diag_level > DiagnosticStatus.OK:
639  memory_msg = ', '.join(set(diag_msgs))
640  else:
641  memory_msg = stat_dict[diag_level]
642 
643  self._memory_stat.values = diag_vals
644  self._memory_stat.message = memory_msg
645  self._memory_stat.level = diag_level
646 
647  def publish_stats(self, event):
648  msg = DiagnosticArray()
649  msg.header.stamp = rospy.get_rostime()
650  msg.status.append(self._info_stat)
651  msg.status.append(self._usage_stat)
652  msg.status.append(self._memory_stat)
653  self._diag_pub.publish(msg)
654 
655 
656 if __name__ == '__main__':
657  hostname = socket.gethostname()
658 
659  import optparse
660  parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
661  parser.add_option("--diag-hostname", dest="diag_hostname",
662  help="Computer name in diagnostics output (ex: 'c1')",
663  metavar="DIAG_HOSTNAME",
664  action="store", default = hostname)
665  options, args = parser.parse_args(rospy.myargv())
666 
667  try:
668  rospy.init_node('cpu_monitor_%s' % hostname)
669  except rospy.exceptions.ROSInitException:
670  print('CPU monitor is unable to initialize node. Master may not be running.')
671  sys.exit(0)
672 
673  cpu_node = CPUMonitor(hostname, options.diag_hostname)
674  rospy.spin()
def check_memory(self, event)
Definition: cpu_monitor.py:625
def check_usage(self, event)
Definition: cpu_monitor.py:572
def check_core_throttling(self, interval=1)
Definition: cpu_monitor.py:437
def check_uptime(self, interval=1)
Uses &#39;uptime&#39; to see system uptime.
Definition: cpu_monitor.py:197
def check_clock_speed(self, interval=1)
Checks clock speed from reading from CPU info.
Definition: cpu_monitor.py:152
def check_cpu_util(self, interval=1)
Definition: cpu_monitor.py:353
def check_load(self, interval=1)
Uses &#39;system.load&#39; to see load average.
Definition: cpu_monitor.py:229
def __init__(self, hostname, diag_hostname)
Definition: cpu_monitor.py:36
def publish_stats(self, event)
Definition: cpu_monitor.py:647
def check_info(self, event)
Definition: cpu_monitor.py:545
def check_idlejitter(self, interval=1)
Definition: cpu_monitor.py:490
def check_core_temps(self, interval=1)
Check CPU core temps.
Definition: cpu_monitor.py:88
def check_free_memory(self, interval=1)
Uses &#39;free -m&#39; to check free memory.
Definition: cpu_monitor.py:279


cob_monitoring
Author(s): Florian Weisshardt , Felix Messmer
autogenerated on Tue May 2 2023 02:26:09