cpu_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 #
3 # Copyright 2017 Fraunhofer Institute for Manufacturing Engineering and Automation (IPA)
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 
17 
18 
19 
20 import sys
21 import traceback
22 import socket
23 import psutil
24 import numpy as np
25 import math
26 import requests
27 
28 import rospy
29 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
30 
31 from netdata_interface.netdata_interface import NetdataInterface
32 
# Human-readable text for each diagnostic level; used as the status message
# when a check has nothing more specific to report.
stat_dict = {
    DiagnosticStatus.OK: 'OK',
    DiagnosticStatus.WARN: 'Warning',
    DiagnosticStatus.ERROR: 'Error',
    DiagnosticStatus.STALE: 'Stale',
}
34 
class CPUMonitor():
    """Publish CPU, load and memory diagnostics gathered from netdata.

    Three DiagnosticStatus messages ("CPU Info", "CPU Usage", "Memory Usage")
    are refreshed by periodic timers and published together on /diagnostics.
    Every ``check_*`` helper taking ``interval`` returns a
    ``(values, message(s), level)`` tuple and never raises: failures are
    reported through the returned diagnostic level instead.
    """

    def __init__(self, hostname, diag_hostname):
        """Read the node parameters, prepare the status messages and start the timers.

        :param hostname: used as ``hardware_id`` of every status message.
        :param diag_hostname: prefix of the status names shown in diagnostics.
        """
        # Every check queries netdata through this interface, so it has to
        # exist before any timer is started.
        self._netdata_interface = NetdataInterface()

        self._check_core_temps = rospy.get_param('~check_core_temps', False)
        self._core_load_warn = rospy.get_param('~core_load_warn', 90)
        self._core_load_error = rospy.get_param('~core_load_error', 110)
        self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
        self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
        self._core_temp_warn = rospy.get_param('~core_temp_warn', 90)
        self._core_temp_error = rospy.get_param('~core_temp_error', 95)
        self._mem_warn = rospy.get_param('~mem_warn', 25)
        self._mem_error = rospy.get_param('~mem_error', 1)

        self._check_thermal_throttling_events = rospy.get_param('~check_thermal_throttling_events', False)
        self._thermal_throttling_threshold = rospy.get_param('~thermal_throttling_threshold', 1000)

        self._check_idlejitter = rospy.get_param('~check_idlejitter', False)
        self._idlejitter_min_threshold = rospy.get_param('~idlejitter_min_threshold', 50000)
        self._idlejitter_max_threshold = rospy.get_param('~idlejitter_max_threshold', 2000000)
        self._idlejitter_average_threshold = rospy.get_param('~idlejitter_average_threshold', 200000)

        self._num_cores = rospy.get_param('~num_cores', psutil.cpu_count())

        # Queried lazily by check_info() and cached afterwards.
        self._netdata_version = None

        # All three statuses report WARN / 'No Data' until their first check ran.
        self._info_stat = self._new_status('%s CPU Info' % diag_hostname, hostname)
        self._usage_stat = self._new_status('%s CPU Usage' % diag_hostname, hostname)
        self._memory_stat = self._new_status('%s Memory Usage' % diag_hostname, hostname)

        self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=1)
        self._publish_timer = rospy.Timer(rospy.Duration(1.0), self.publish_stats)
        self._info_timer = rospy.Timer(rospy.Duration(5.0), self.check_info)
        self._usage_timer = rospy.Timer(rospy.Duration(5.0), self.check_usage)
        self._memory_timer = rospy.Timer(rospy.Duration(5.0), self.check_memory)

    @staticmethod
    def _new_status(name, hardware_id):
        """Return a DiagnosticStatus preset to WARN / 'No Data'."""
        stat = DiagnosticStatus()
        stat.name = name
        stat.level = DiagnosticStatus.WARN
        stat.hardware_id = hardware_id
        stat.message = 'No Data'
        stat.values = []
        return stat

    @staticmethod
    def _no_data_vals(label, details):
        """Build the KeyValue list reported when netdata delivered no data.

        :param label: key of the leading 'Could not fetch data' entry.
        :param details: iterable of ``(key, value)`` pairs appended in order.
        """
        vals = [KeyValue(key=label, value='Could not fetch data from netdata')]
        # KeyValue.value is a string field; the raw outputs are None, dicts or
        # lists, so they are converted explicitly to stay publishable.
        vals.extend(KeyValue(key=key, value=str(value)) for key, value in details)
        return vals

    @staticmethod
    def _exception_vals(exc):
        """Build the KeyValue list describing the exception currently being handled."""
        return [KeyValue(key='Exception', value=str(exc)),
                KeyValue(key='Traceback', value=str(traceback.format_exc()))]

    @staticmethod
    def _core_index(cpu_name):
        """Return the numeric suffix of a dimension name, e.g. 12 for 'cpu12'."""
        prefix = cpu_name.rstrip('0123456789')
        return int(cpu_name[len(prefix):])

    def _query_first_available(self, chart_names, interval):
        """Query several alternative chart names and stop at the first with data.

        Connection errors are counted per chart name instead of being raised.

        :returns: ``(data, error, failure_summary)``; ``data`` is falsy when
            no chart delivered anything.
        """
        data, error = None, None
        failed = []
        for chart in chart_names:
            try:
                data, error = self._netdata_interface.query_netdata(chart, interval)
            except requests.ConnectionError as err:
                failed.append(chart)
                data, error = None, str(err)
            if data:
                break

        summary = "{} of {} failed: {}".format(
            len(failed), len(chart_names), ''.join(chart + ' ' for chart in failed))
        return data, error, summary

    def check_core_temps(self, interval=1):
        """Report the mean temperature of every CPU core.

        High temperatures only add a 'CPU Hot' / 'CPU Warm' message; the
        level is deliberately left untouched for them.

        :returns: ``(values, messages, level)``
        """
        diag_vals = []
        diag_msgs = []
        diag_level = DiagnosticStatus.OK

        try:
            # The chart is named with '_' or '-' depending on the netdata version.
            chart_names = ['sensors.coretemp_isa_0000_temperature',
                           'sensors.coretemp-isa-0000_temperature']
            netdata_core_temp, error, failed_charts = self._query_first_available(chart_names, interval)

            if not netdata_core_temp:
                diag_vals = self._no_data_vals('Core Temp Error',
                                               [('Failed Chart Names', failed_charts),
                                                ('Output', netdata_core_temp),
                                                ('Error', error)])
                return (diag_vals, ['Core Temp Error'], DiagnosticStatus.WARN)

            del netdata_core_temp['time']
            # Only per-core dimensions are reported; tolerate a missing package entry.
            netdata_core_temp.pop('Package id 0', None)

            for core_no, values in netdata_core_temp.items():
                mean_temp = np.mean(values)
                try:
                    diag_vals.append(KeyValue(key='Temp %s' % core_no, value=str(mean_temp)))

                    # Intentionally message-only: neither case raises the level.
                    if mean_temp >= self._core_temp_error:
                        diag_msgs.append('CPU Hot')
                    elif mean_temp >= self._core_temp_warn:
                        diag_msgs.append('CPU Warm')
                except ValueError:
                    # Error if not numeric value
                    diag_level = max(diag_level, DiagnosticStatus.ERROR)
                    diag_vals.append(KeyValue(key='Temp %s' % core_no, value=str(mean_temp)))

        except Exception as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msgs = ['Core Temp Exception']
            diag_vals = self._exception_vals(e)

        return diag_vals, diag_msgs, diag_level

    def check_clock_speed(self, interval=1):
        """Report the mean clock frequency per core and the maximum frequency.

        :returns: ``(values, messages, level)``
        """
        diag_vals = []
        diag_msgs = []
        diag_level = DiagnosticStatus.OK

        try:
            netdata_cpu_freq, error = self._netdata_interface.query_netdata('cpu.cpufreq', interval)
            if not netdata_cpu_freq:
                diag_vals = self._no_data_vals('Clock Speed Error',
                                               [('Output', netdata_cpu_freq), ('Error', error)])
                return (diag_vals, ['Clock Speed Error'], DiagnosticStatus.WARN)

            del netdata_cpu_freq["time"]

            for cpu_name, values in netdata_cpu_freq.items():
                # Parse the whole numeric suffix so 'cpu10' is core 10, not core 0.
                diag_vals.append(KeyValue(key='Core %d (MHz)' % self._core_index(cpu_name),
                                          value=str(np.mean(values))))

            # get max freq
            netdata_info = self._netdata_interface.query_netdata_info()
            if not netdata_info:
                diag_vals = self._no_data_vals('Clock Speed Error', [('Output', netdata_info)])
                return (diag_vals, ['Clock Speed Error'], DiagnosticStatus.WARN)

            max_cpu_freq = float(netdata_info['cpu_freq']) / 1e6
            diag_vals.append(KeyValue(key='Maximum Frequency (MHz)', value=str(max_cpu_freq)))

        except requests.ConnectionError as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msgs = ['Clock Speed Connection Error']
            diag_vals = self._exception_vals(e)

        except Exception as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msgs = ['Clock Speed Exception']
            diag_vals = self._exception_vals(e)

        return diag_vals, diag_msgs, diag_level

    def check_uptime(self, interval=1):
        """Report the largest uptime sample of netdata's 'system.uptime' chart.

        :returns: ``(values, message, level)``
        """
        diag_vals = []
        diag_msg = ''
        diag_level = DiagnosticStatus.OK

        try:
            netdata_uptime, error = self._netdata_interface.query_netdata('system.uptime', interval)
            if not netdata_uptime:
                diag_vals = self._no_data_vals('Uptime Error',
                                               [('Output', netdata_uptime), ('Error', error)])
                return (diag_vals, 'Uptime Error', DiagnosticStatus.WARN)

            del netdata_uptime['time']

            uptime = np.max(netdata_uptime['uptime'].astype(float))
            diag_vals.append(KeyValue(key='Uptime', value=str(uptime)))

        except requests.ConnectionError as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msg = 'Uptime Connection Error'
            diag_vals = self._exception_vals(e)

        except Exception as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msg = 'Uptime Exception'
            diag_vals = self._exception_vals(e)

        return diag_vals, diag_msg, diag_level

    def check_load(self, interval=1):
        """Report the 1/5/15 minute load averages from 'system.load'.

        The level becomes WARN when the 1 or 5 minute average exceeds its
        configured threshold.

        :returns: ``(values, message, level)``
        """
        diag_vals = []
        diag_msg = ''
        diag_level = DiagnosticStatus.OK

        load_dict = {DiagnosticStatus.OK: 'OK',
                     DiagnosticStatus.WARN: 'High Load',
                     DiagnosticStatus.ERROR: 'Very High Load'}

        try:
            netdata_cpu_load, error = self._netdata_interface.query_netdata('system.load', interval)
            if not netdata_cpu_load:
                diag_vals = self._no_data_vals('Load Error',
                                               [('Output', netdata_cpu_load), ('Error', error)])
                return (diag_vals, 'Load Error', DiagnosticStatus.WARN)

            del netdata_cpu_load['time']

            load1 = np.mean(netdata_cpu_load['load1'].astype(float))
            load5 = np.mean(netdata_cpu_load['load5'].astype(float))
            load15 = np.mean(netdata_cpu_load['load15'].astype(float))

            # Give warning if we go over load limit
            if float(load1) > self._load1_threshold or float(load5) > self._load5_threshold:
                diag_level = DiagnosticStatus.WARN

            diag_msg = load_dict[diag_level]
            diag_vals.extend([
                KeyValue(key='Load Average Status', value=diag_msg),
                KeyValue(key='1 min Load Average', value=str(load1)),
                KeyValue(key='1 min Load Average Threshold', value=str(self._load1_threshold)),
                KeyValue(key='5 min Load Average', value=str(load5)),
                KeyValue(key='5 min Load Average Threshold', value=str(self._load5_threshold)),
                KeyValue(key='15 min Load Average', value=str(load15)),
            ])

        except requests.ConnectionError as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msg = 'Load Connection Error'
            diag_vals = self._exception_vals(e)

        except Exception as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msg = 'Load Exception'
            diag_vals = self._exception_vals(e)

        return diag_vals, diag_msg, diag_level

    def check_free_memory(self, interval=1):
        """Report RAM and swap usage from netdata.

        The level follows the mean 'free' value of 'system.ram': below
        ``~mem_warn`` it is WARN, below ``~mem_error`` it is ERROR. When only
        the swap chart is unavailable, the memory values are kept and the
        level is raised to at least WARN.

        :returns: ``(values, message, level)``
        """
        diag_vals = []
        diag_msg = ''
        diag_level = DiagnosticStatus.OK

        mem_dict = {DiagnosticStatus.OK: 'OK',
                    DiagnosticStatus.WARN: 'Low Memory',
                    DiagnosticStatus.ERROR: 'Very Low Memory'}

        try:
            netdata_mem, error = self._netdata_interface.query_netdata('system.ram', interval)
            if not netdata_mem:
                diag_vals = self._no_data_vals('Memory Usage Error',
                                               [('Output', netdata_mem), ('Error', error)])
                return (diag_vals, 'Memory Usage Error', DiagnosticStatus.WARN)

            del netdata_mem['time']

            # Mem
            memory_vals = {k: np.mean(v.astype(float)) for k, v in netdata_mem.items()}
            total_mem = sum(memory_vals.values())
            free_mem = memory_vals['free']
            used_mem = memory_vals['used']
            cache_mem = memory_vals['cached'] + memory_vals['buffers']

            if float(free_mem) < self._mem_error:
                diag_level = DiagnosticStatus.ERROR
            elif float(free_mem) < self._mem_warn:
                diag_level = DiagnosticStatus.WARN

            diag_vals.extend([
                KeyValue(key='Mem Status', value=mem_dict[diag_level]),
                KeyValue(key='Mem Total', value=str(total_mem)),
                KeyValue(key='Mem Used', value=str(used_mem)),
                KeyValue(key='Mem Free', value=str(free_mem)),
                KeyValue(key='Mem Buff/Cache', value=str(cache_mem)),
            ])

            # Netdata versions differ in chart names
            netdata_swp, error, failed_charts = self._query_first_available(
                ['mem.swap', 'system.swap'], interval)

            if not netdata_swp:
                # Keep what is known about RAM and never hide a low-memory
                # ERROR behind the (less severe) swap query failure.
                diag_vals.extend(self._no_data_vals('Swap Usage Error',
                                                    [('Failed Chart Names', failed_charts),
                                                     ('Output', netdata_swp),
                                                     ('Error', error)]))
                diag_msg = 'Swap Usage Error'
                if diag_level > DiagnosticStatus.OK:
                    diag_msg = mem_dict[diag_level] + ', ' + diag_msg
                return (diag_vals, diag_msg, max(diag_level, DiagnosticStatus.WARN))

            del netdata_swp['time']

            # Swap
            swap_vals = {k: np.mean(v.astype(float)) for k, v in netdata_swp.items()}
            total_swp = sum(swap_vals.values())
            free_swp = swap_vals['free']
            used_swp = swap_vals['used']

            diag_vals.extend([
                KeyValue(key='Swap Total', value=str(total_swp)),
                KeyValue(key='Swap Used', value=str(used_swp)),
                KeyValue(key='Swap Free', value=str(free_swp)),
            ])

            diag_msg = mem_dict[diag_level]

        except requests.ConnectionError as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msg = 'Memory Usage Connection Error'
            diag_vals = self._exception_vals(e)

        except Exception as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msg = 'Memory Usage Exception'
            diag_vals = self._exception_vals(e)

        return diag_vals, diag_msg, diag_level

    def check_cpu_util(self, interval=1):
        """Report user/nice/system/idle utilisation for every core.

        A core counts as loaded when user + nice exceeds ``~core_load_warn``.
        The overall level becomes WARN when at most two cores are not loaded
        (only on machines with more than two cores).

        :returns: ``(values, message, level)``
        """
        diag_vals = []
        diag_msg = ''
        diag_level = DiagnosticStatus.OK

        load_dict = {DiagnosticStatus.OK: 'OK',
                     DiagnosticStatus.WARN: 'High Load',
                     DiagnosticStatus.ERROR: 'Error'}

        try:
            netdata_info = self._netdata_interface.query_netdata_info()
            if not netdata_info:
                diag_vals = self._no_data_vals('CPU Usage Error', [('Output', netdata_info)])
                return (diag_vals, 'CPU Usage Error', DiagnosticStatus.WARN)

            num_cores = int(netdata_info['cores_total'])
            netdata_system_cpu, error = self._netdata_interface.query_netdata('system.cpu', interval)
            if not netdata_system_cpu:
                diag_vals = self._no_data_vals('CPU Usage Error',
                                               [('Output', netdata_system_cpu), ('Error', error)])
                return (diag_vals, 'CPU Usage Error', DiagnosticStatus.WARN)

            # Each entry is the (data, error) pair returned for one core.
            netdata_cpu_util = [self._netdata_interface.query_netdata('cpu.cpu%d' % i, interval)
                                for i in range(num_cores)]
            netdata_cpu_idle = [self._netdata_interface.query_netdata('cpu.cpu%d_cpuidle' % i, interval)
                                for i in range(num_cores)]

            if any(data is None for data, _ in netdata_cpu_util):
                diag_vals = self._no_data_vals('CPU Util Error', [('Output', netdata_cpu_util)])
                return (diag_vals, 'CPU Util Error', DiagnosticStatus.ERROR)
            if any(data is None for data, _ in netdata_cpu_idle):
                diag_vals = self._no_data_vals('CPU Idle Error', [('Output', netdata_cpu_idle)])
                return (diag_vals, 'CPU Idle Error', DiagnosticStatus.ERROR)

            cores_loaded = 0
            for i_cpu in range(num_cores):
                cpu_name = 'Core %d' % (i_cpu)
                util_data = netdata_cpu_util[i_cpu][0]
                idle_data = netdata_cpu_idle[i_cpu][0]

                # Idle share is whatever is not spent in the active C0 state.
                idle = 100 - np.mean(idle_data['C0 (active)'])
                user = np.mean(util_data['user'])
                nice = np.mean(util_data['nice'])
                system = np.mean(util_data['system'])

                core_level = DiagnosticStatus.OK
                usage = float(user) + float(nice)
                if usage > self._core_load_warn:
                    cores_loaded += 1
                    core_level = DiagnosticStatus.WARN
                if usage > self._core_load_error:
                    core_level = DiagnosticStatus.ERROR

                diag_vals.extend([
                    KeyValue(key='CPU %s Status' % cpu_name, value=load_dict[core_level]),
                    KeyValue(key='CPU %s User' % cpu_name, value=str(user)),
                    KeyValue(key='CPU %s Nice' % cpu_name, value=str(nice)),
                    KeyValue(key='CPU %s System' % cpu_name, value=str(system)),
                    KeyValue(key='CPU %s Idle' % cpu_name, value=str(idle)),
                ])

            # Warn for high load only if we have <= 2 cores that aren't loaded
            if num_cores - cores_loaded <= 2 and num_cores > 2:
                diag_level = DiagnosticStatus.WARN

            diag_msg = load_dict[diag_level]

        except requests.ConnectionError as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msg = 'CPU Usage Connection Error'
            diag_vals = self._exception_vals(e)

        except Exception as e:
            diag_level = DiagnosticStatus.ERROR
            diag_msg = 'CPU Usage Exception'
            diag_vals = self._exception_vals(e)

        return diag_vals, diag_msg, diag_level

    def check_core_throttling(self, interval=1):
        """Report the mean thermal throttling event count of every core.

        A core above ``~thermal_throttling_threshold`` raises the level to
        WARN; a core without data raises it to ERROR. The level is never
        lowered again by a later core.

        :returns: ``(values, message, level)``
        """
        throt_dict = {DiagnosticStatus.OK: 'OK',
                      DiagnosticStatus.WARN: 'High Thermal Throttling Events',
                      DiagnosticStatus.ERROR: 'No valid Data from NetData'}

        throt_level = DiagnosticStatus.OK
        throt_msg = ''
        throt_vals = []

        try:
            netdata, error = self._netdata_interface.query_netdata('cpu.core_throttling', interval)
            if not netdata:
                throt_vals = self._no_data_vals('Core Throttling Error',
                                                [('Output', netdata), ('Error', error)])
                return (throt_vals, 'Core Throttling Error', DiagnosticStatus.WARN)

            for i in range(self._num_cores):
                lbl = 'CPU %d Thermal Throttling Events' % i
                netdata_key = 'cpu%d' % i

                core_data = netdata[netdata_key] if netdata_key in netdata else None
                if core_data is None:
                    core_mean = 'N/A'
                    throt_level = max(throt_level, DiagnosticStatus.ERROR)
                else:
                    core_mean = np.mean(core_data)
                    if core_mean > self._thermal_throttling_threshold:
                        throt_level = max(throt_level, DiagnosticStatus.WARN)

                throt_vals.append(KeyValue(key=lbl, value=str(core_mean)))

            # The message has to be known before it is put into the status entry.
            throt_msg = throt_dict[throt_level]
            throt_vals.insert(0, KeyValue(key='Thermal Throttling Status', value=throt_msg))
            throt_vals.append(KeyValue(key='Thermal Throttling Threshold',
                                       value=str(self._thermal_throttling_threshold)))

        except requests.ConnectionError as e:
            throt_level = DiagnosticStatus.ERROR
            throt_msg = 'Thermal Throttling Connection Error'
            throt_vals = self._exception_vals(e)

        except Exception as e:
            throt_level = DiagnosticStatus.ERROR
            throt_msg = 'Thermal Throttling Exception'
            throt_vals = self._exception_vals(e)

        return throt_vals, throt_msg, throt_level

    def check_idlejitter(self, interval=1):
        """Report min, max and average of netdata's 'system.idlejitter' chart.

        A metric above its configured threshold raises the level to WARN; a
        metric without data raises it to ERROR. The level is never lowered
        again by a later metric.

        :returns: ``(values, message, level)``
        """
        jitter_dict = {DiagnosticStatus.OK: 'OK',
                       DiagnosticStatus.WARN: 'High IDLE Jitter',
                       DiagnosticStatus.ERROR: 'No valid Data from NetData'}

        jitter_level = DiagnosticStatus.OK
        jitter_msg = ''
        jitter_vals = []

        try:
            netdata, error = self._netdata_interface.query_netdata('system.idlejitter', interval)
            if not netdata:
                jitter_vals = self._no_data_vals('IDLE Jitter Error',
                                                 [('Output', netdata), ('Error', error)])
                return (jitter_vals, 'IDLE Jitter Error', DiagnosticStatus.WARN)

            # (label, netdata dimension, threshold, aggregation over the samples)
            metric_list = [
                ('IDLE Jitter Min', 'min', self._idlejitter_min_threshold, np.min),
                ('IDLE Jitter Max', 'max', self._idlejitter_max_threshold, np.max),
                ('IDLE Jitter Average', 'average', self._idlejitter_average_threshold, np.mean),
            ]

            for metric_label, metric_key, metric_threshold, aggregate_fnc in metric_list:
                metric_data = netdata[metric_key] if metric_key in netdata else None
                if metric_data is None:
                    metric_aggregate = 'N/A'
                    jitter_level = max(jitter_level, DiagnosticStatus.ERROR)
                else:
                    metric_aggregate = aggregate_fnc(metric_data)
                    if metric_aggregate > metric_threshold:
                        jitter_level = max(jitter_level, DiagnosticStatus.WARN)

                jitter_vals.append(KeyValue(key=metric_label, value=str(metric_aggregate)))
                jitter_vals.append(KeyValue(key=metric_label + ' Threshold', value=str(metric_threshold)))

            # The message has to be known before it is put into the status entry.
            jitter_msg = jitter_dict[jitter_level]
            jitter_vals.insert(0, KeyValue(key='IDLE Jitter Status', value=jitter_msg))

        except requests.ConnectionError as e:
            jitter_level = DiagnosticStatus.ERROR
            jitter_msg = 'IDLE Jitter Connection Error'
            jitter_vals = self._exception_vals(e)

        except Exception as e:
            jitter_level = DiagnosticStatus.ERROR
            jitter_msg = 'IDLE Jitter Exception'
            jitter_vals = self._exception_vals(e)

        return jitter_vals, jitter_msg, jitter_level

    def check_info(self, event):
        """Timer callback: refresh the 'CPU Info' status.

        :param event: timer event passed by rospy.Timer (unused).
        """
        diag_vals = []
        diag_msgs = []
        diag_level = DiagnosticStatus.OK

        if self._netdata_version is None:
            # An exception escaping this callback would end the periodic
            # updates, so a failing version query is reported instead.
            try:
                self._netdata_version = self._netdata_interface.query_netdata_info()['version']
            except Exception as e:
                diag_level = DiagnosticStatus.WARN
                diag_msgs.append('Netdata Version Error')
                diag_vals.append(KeyValue(key='Netdata Version Error', value=str(e)))

        if self._netdata_version is not None:
            diag_vals.append(KeyValue(key='Netdata version', value=str(self._netdata_version)))

        if self._check_core_temps:
            interval = math.ceil(self._info_timer._period.to_sec())
            core_vals, core_msgs, core_level = self.check_core_temps(interval=interval)
            diag_vals.extend(core_vals)
            diag_msgs.extend(core_msgs)
            diag_level = max(diag_level, core_level)

        clock_vals, clock_msgs, clock_level = self.check_clock_speed()
        diag_vals.extend(clock_vals)
        diag_msgs.extend(clock_msgs)
        diag_level = max(diag_level, clock_level)

        # Messages are shown even at level OK (e.g. 'CPU Hot' is message-only).
        if diag_msgs:
            message = ', '.join(sorted(set(diag_msgs)))
        else:
            message = stat_dict[diag_level]

        self._info_stat.values = diag_vals
        self._info_stat.message = message
        self._info_stat.level = diag_level

    def check_usage(self, event):
        """Timer callback: refresh the 'CPU Usage' status.

        Combines utilisation, uptime and load; thermal throttling and idle
        jitter are only included when enabled by their parameters.

        :param event: timer event passed by rospy.Timer (unused).
        """
        diag_vals = []
        diag_msgs = []
        diag_level = DiagnosticStatus.OK

        interval = math.ceil(self._usage_timer._period.to_sec())

        checks = [self.check_cpu_util]
        if self._check_thermal_throttling_events:
            checks.append(self.check_core_throttling)
        if self._check_idlejitter:
            checks.append(self.check_idlejitter)
        checks.extend([self.check_uptime, self.check_load])

        for check in checks:
            vals, msg, level = check(interval=interval)
            diag_vals.extend(vals)
            # Only checks that found a problem contribute to the message.
            if level > DiagnosticStatus.OK:
                diag_msgs.append(msg)
            diag_level = max(diag_level, level)

        if diag_msgs and diag_level > DiagnosticStatus.OK:
            usage_msg = ', '.join(sorted(set(diag_msgs)))
        else:
            usage_msg = stat_dict[diag_level]

        self._usage_stat.values = diag_vals
        self._usage_stat.message = usage_msg
        self._usage_stat.level = diag_level

    def check_memory(self, event):
        """Timer callback: refresh the 'Memory Usage' status.

        :param event: timer event passed by rospy.Timer (unused).
        """
        interval = math.ceil(self._memory_timer._period.to_sec())
        mem_vals, mem_msg, mem_level = self.check_free_memory(interval=interval)

        if mem_level > DiagnosticStatus.OK:
            memory_msg = mem_msg
        else:
            memory_msg = stat_dict[mem_level]

        self._memory_stat.values = list(mem_vals)
        self._memory_stat.message = memory_msg
        self._memory_stat.level = mem_level

    def publish_stats(self, event):
        """Timer callback: publish the three current statuses on /diagnostics.

        :param event: timer event passed by rospy.Timer (unused).
        """
        msg = DiagnosticArray()
        msg.header.stamp = rospy.get_rostime()
        msg.status.extend([self._info_stat, self._usage_stat, self._memory_stat])
        self._diag_pub.publish(msg)
683 
if __name__ == '__main__':
    import optparse

    # The local hostname is used for the node name and hardware id, and is
    # the default name shown in the diagnostics output.
    hostname = socket.gethostname()

    parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
    parser.add_option(
        "--diag-hostname",
        action="store",
        dest="diag_hostname",
        default=hostname,
        metavar="DIAG_HOSTNAME",
        help="Computer name in diagnostics output (ex: 'c1')")
    # rospy.myargv() is passed so the parser only sees what rospy leaves in argv.
    options, args = parser.parse_args(rospy.myargv())

    node_name = 'cpu_monitor_%s' % hostname
    try:
        rospy.init_node(node_name)
    except rospy.exceptions.ROSInitException:
        print('CPU monitor is unable to initialize node. Master may not be running.')
        sys.exit(0)

    # Keep a reference so the monitor and its timers stay alive while spinning.
    cpu_node = CPUMonitor(hostname, options.diag_hostname)
    rospy.spin()
cpu_monitor.CPUMonitor._core_temp_warn
_core_temp_warn
Definition: cpu_monitor.py:44
cpu_monitor.CPUMonitor._info_timer
_info_timer
Definition: cpu_monitor.py:85
cpu_monitor.CPUMonitor.check_memory
def check_memory(self, event)
Definition: cpu_monitor.py:653
cpu_monitor.CPUMonitor._netdata_version
_netdata_version
Definition: cpu_monitor.py:59
cpu_monitor.CPUMonitor._diag_pub
_diag_pub
Definition: cpu_monitor.py:83
netdata_interface.netdata_interface.NetdataInterface
Definition: netdata_interface.py:6
cpu_monitor.CPUMonitor._idlejitter_max_threshold
_idlejitter_max_threshold
Definition: cpu_monitor.py:54
cpu_monitor.CPUMonitor.__init__
def __init__(self, hostname, diag_hostname)
Definition: cpu_monitor.py:36
cpu_monitor.CPUMonitor.check_uptime
def check_uptime(self, interval=1)
Uses 'uptime' to see system uptime.
Definition: cpu_monitor.py:199
cpu_monitor.CPUMonitor.check_usage
def check_usage(self, event)
Definition: cpu_monitor.py:600
netdata_interface.netdata_interface
Definition: netdata_interface.py:1
cpu_monitor.CPUMonitor.check_cpu_util
def check_cpu_util(self, interval=1)
Definition: cpu_monitor.py:376
cpu_monitor.CPUMonitor._core_load_error
_core_load_error
Definition: cpu_monitor.py:41
cpu_monitor.CPUMonitor._usage_stat
_usage_stat
Definition: cpu_monitor.py:69
cpu_monitor.CPUMonitor.check_load
def check_load(self, interval=1)
Uses 'system.load' to see load average.
Definition: cpu_monitor.py:231
cpu_monitor.CPUMonitor.check_core_throttling
def check_core_throttling(self, interval=1)
Definition: cpu_monitor.py:460
cpu_monitor.CPUMonitor._core_temp_error
_core_temp_error
Definition: cpu_monitor.py:45
cpu_monitor.CPUMonitor._netdata_interface
_netdata_interface
Definition: cpu_monitor.py:37
cpu_monitor.CPUMonitor._publish_timer
_publish_timer
Definition: cpu_monitor.py:84
cpu_monitor.CPUMonitor._check_idlejitter
_check_idlejitter
Definition: cpu_monitor.py:52
cpu_monitor.CPUMonitor._info_stat
_info_stat
Definition: cpu_monitor.py:62
cpu_monitor.CPUMonitor._idlejitter_average_threshold
_idlejitter_average_threshold
Definition: cpu_monitor.py:55
cpu_monitor.CPUMonitor.publish_stats
def publish_stats(self, event)
Definition: cpu_monitor.py:675
cpu_monitor.CPUMonitor._mem_warn
_mem_warn
Definition: cpu_monitor.py:46
cpu_monitor.CPUMonitor._memory_timer
_memory_timer
Definition: cpu_monitor.py:87
cpu_monitor.CPUMonitor
Definition: cpu_monitor.py:35
cpu_monitor.CPUMonitor._idlejitter_min_threshold
_idlejitter_min_threshold
Definition: cpu_monitor.py:53
cpu_monitor.CPUMonitor._check_thermal_throttling_events
_check_thermal_throttling_events
Definition: cpu_monitor.py:49
cpu_monitor.CPUMonitor._check_core_temps
_check_core_temps
Definition: cpu_monitor.py:39
cpu_monitor.CPUMonitor.check_clock_speed
def check_clock_speed(self, interval=1)
Checks clock speed from reading from CPU info.
Definition: cpu_monitor.py:154
cpu_monitor.CPUMonitor._memory_stat
_memory_stat
Definition: cpu_monitor.py:76
cpu_monitor.CPUMonitor._thermal_throttling_threshold
_thermal_throttling_threshold
Definition: cpu_monitor.py:50
cpu_monitor.CPUMonitor._load5_threshold
_load5_threshold
Definition: cpu_monitor.py:43
cpu_monitor.CPUMonitor.check_info
def check_info(self, event)
Definition: cpu_monitor.py:568
cpu_monitor.CPUMonitor._num_cores
_num_cores
Definition: cpu_monitor.py:57
cpu_monitor.CPUMonitor._load1_threshold
_load1_threshold
Definition: cpu_monitor.py:42
cpu_monitor.CPUMonitor._usage_timer
_usage_timer
Definition: cpu_monitor.py:86
cpu_monitor.CPUMonitor.check_free_memory
def check_free_memory(self, interval=1)
Uses 'free -m' to check free memory.
Definition: cpu_monitor.py:281
cpu_monitor.CPUMonitor._core_load_warn
_core_load_warn
Definition: cpu_monitor.py:40
cpu_monitor.CPUMonitor.check_core_temps
def check_core_temps(self, interval=1)
Check CPU core temps.
Definition: cpu_monitor.py:90
cpu_monitor.CPUMonitor._mem_error
_mem_error
Definition: cpu_monitor.py:47
cpu_monitor.CPUMonitor.check_idlejitter
def check_idlejitter(self, interval=1)
Definition: cpu_monitor.py:513


cob_monitoring
Author(s): Florian Weisshardt, Felix Messmer
autogenerated on Fri Aug 2 2024 09:45:52