29 from diagnostic_msgs.msg
import DiagnosticArray, DiagnosticStatus, KeyValue
33 stat_dict = { DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'Warning', DiagnosticStatus.ERROR:
'Error', DiagnosticStatus.STALE:
'Stale' }
57 self.
_num_cores = rospy.get_param(
'~num_cores', psutil.cpu_count())
63 self.
_info_stat.name =
'%s CPU Info' % diag_hostname
70 self.
_usage_stat.name =
'%s CPU Usage' % diag_hostname
77 self.
_memory_stat.name =
'%s Memory Usage' % diag_hostname
83 self.
_diag_pub = rospy.Publisher(
'/diagnostics', DiagnosticArray, queue_size=1)
93 diag_level = DiagnosticStatus.OK
97 netdata_module_name_core_temps = [
'sensors.coretemp_isa_0000_temperature',
98 'sensors.coretemp-isa-0000_temperature']
100 netdata_module_name_err =
''
101 for name
in netdata_module_name_core_temps:
106 except requests.ConnectionError
as err:
108 netdata_module_name_err += name +
' '
110 netdata_core_temp =
None
114 if netdata_core_temp:
117 netdata_module_name_err =
"{} of {} failed: {}".format(error_count, len(netdata_module_name_core_temps), netdata_module_name_err)
119 if not netdata_core_temp:
120 diag_level = DiagnosticStatus.WARN
121 diag_msgs = [
'Core Temp Error' ]
122 diag_vals = [ KeyValue(key =
'Core Temp Error', value =
'Could not fetch data from netdata'),
123 KeyValue(key =
'Failed Chart Names', value = netdata_module_name_err),
124 KeyValue(key =
'Output', value = netdata_core_temp),
125 KeyValue(key =
'Error', value= error) ]
126 return (diag_vals, diag_msgs, diag_level)
128 del netdata_core_temp[
'time']
129 del netdata_core_temp[
'Package id 0']
131 for core_no, values
in netdata_core_temp.items():
132 mean_temp = np.mean(values)
134 diag_vals.append(KeyValue(key =
'Temp %s' % core_no, value = str(mean_temp)))
137 diag_level = max(diag_level, DiagnosticStatus.OK)
138 diag_msgs.append(
'CPU Hot')
140 diag_level = max(diag_level, DiagnosticStatus.OK)
141 diag_msgs.append(
'CPU Warm')
143 diag_level = max(diag_level, DiagnosticStatus.ERROR)
144 diag_vals.append(KeyValue(key =
'Temp %s' % core_no, value = str(mean_temp)))
146 except Exception
as e:
147 diag_level = DiagnosticStatus.ERROR
148 diag_msgs = [
'Core Temp Exception' ]
149 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
151 return diag_vals, diag_msgs, diag_level
157 diag_level = DiagnosticStatus.OK
160 netdata_cpu_freq, error = self.
_netdata_interface.query_netdata(
'cpu.cpufreq', interval)
161 if not netdata_cpu_freq:
162 diag_level = DiagnosticStatus.WARN
163 diag_msgs = [
'Clock Speed Error' ]
164 diag_vals = [ KeyValue(key =
'Clock Speed Error', value =
'Could not fetch data from netdata'),
165 KeyValue(key =
'Output', value = netdata_cpu_freq),
166 KeyValue(key =
'Error', value= error) ]
167 return (diag_vals, diag_msgs, diag_level)
169 del netdata_cpu_freq[
"time"]
171 for cpu_name, values
in netdata_cpu_freq.items():
172 diag_vals.append(KeyValue(key =
'Core %d (MHz)' % int(cpu_name[-1]), value = str(np.mean(values))))
177 diag_level = DiagnosticStatus.WARN
178 diag_msgs = [
'Clock Speed Error' ]
179 diag_vals = [ KeyValue(key =
'Clock Speed Error', value =
'Could not fetch data from netdata'),
180 KeyValue(key =
'Output', value = netdata_info) ]
181 return (diag_vals, diag_msgs, diag_level)
183 max_cpu_freq = float(netdata_info[
'cpu_freq'])/1e6
184 diag_vals.append(KeyValue(key =
'Maximum Frequency (MHz)', value = str(max_cpu_freq)))
186 except requests.ConnectionError
as e:
187 diag_level = DiagnosticStatus.ERROR
188 diag_msgs = [
'Clock Speed Connection Error' ]
189 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
191 except Exception
as e:
192 diag_level = DiagnosticStatus.ERROR
193 diag_msgs = [
'Clock Speed Exception' ]
194 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
196 return diag_vals, diag_msgs, diag_level
202 diag_level = DiagnosticStatus.OK
205 netdata_uptime, error = self.
_netdata_interface.query_netdata(
'system.uptime', interval)
206 if not netdata_uptime:
207 diag_level = DiagnosticStatus.WARN
208 diag_msg =
'Uptime Error'
209 diag_vals = [ KeyValue(key =
'Uptime Error', value =
'Could not fetch data from netdata'),
210 KeyValue(key =
'Output', value = netdata_uptime),
211 KeyValue(key =
'Error', value= error) ]
212 return (diag_vals, diag_msg, diag_level)
214 del netdata_uptime[
'time']
216 diag_vals.append(KeyValue(key =
'Uptime', value = str(np.max(netdata_uptime[
'uptime'].astype(float)))))
218 except requests.ConnectionError
as e:
219 diag_level = DiagnosticStatus.ERROR
220 diag_msg =
'Uptime Connection Error'
221 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
223 except Exception
as e:
224 diag_level = DiagnosticStatus.ERROR
225 diag_msg =
'Uptime Exception'
226 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
228 return diag_vals, diag_msg, diag_level
234 diag_level = DiagnosticStatus.OK
236 load_dict = { DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'High Load', DiagnosticStatus.ERROR:
'Very High Load' }
239 netdata_cpu_load, error = self.
_netdata_interface.query_netdata(
'system.load', interval)
240 if not netdata_cpu_load:
241 diag_level = DiagnosticStatus.WARN
242 diag_msg =
'Load Error'
243 diag_vals = [ KeyValue(key =
'Load Error', value =
'Could not fetch data from netdata'),
244 KeyValue(key =
'Output', value = netdata_cpu_load),
245 KeyValue(key =
'Error', value= error) ]
246 return (diag_vals, diag_msg, diag_level)
248 del netdata_cpu_load[
'time']
250 load1 = np.mean(netdata_cpu_load[
'load1'].astype(float))
251 load5 = np.mean(netdata_cpu_load[
'load5'].astype(float))
252 load15 = np.mean(netdata_cpu_load[
'load15'].astype(float))
256 diag_level = DiagnosticStatus.WARN
258 diag_vals.append(KeyValue(key =
'Load Average Status', value = load_dict[diag_level]))
259 diag_vals.append(KeyValue(key =
'1 min Load Average', value = str(load1)))
260 diag_vals.append(KeyValue(key =
'1 min Load Average Threshold', value = str(self.
_load1_threshold)))
261 diag_vals.append(KeyValue(key =
'5 min Load Average', value = str(load5)))
262 diag_vals.append(KeyValue(key =
'5 min Load Average Threshold', value = str(self.
_load5_threshold)))
263 diag_vals.append(KeyValue(key =
'15 min Load Average', value = str(load15)))
265 diag_msg = load_dict[diag_level]
267 except requests.ConnectionError
as e:
268 diag_level = DiagnosticStatus.ERROR
269 diag_msg =
'Load Connection Error'
270 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
272 except Exception
as e:
273 diag_level = DiagnosticStatus.ERROR
274 diag_msg =
'Load Exception'
275 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
277 return diag_vals, diag_msg, diag_level
284 diag_level = DiagnosticStatus.OK
286 mem_dict = { DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'Low Memory', DiagnosticStatus.ERROR:
'Very Low Memory' }
291 diag_level = DiagnosticStatus.WARN
292 diag_msg =
'Memory Usage Error'
293 diag_vals = [ KeyValue(key =
'Memory Usage Error', value =
'Could not fetch data from netdata'),
294 KeyValue(key =
'Output', value = netdata_mem),
295 KeyValue(key =
'Error', value= error) ]
296 return (diag_vals, diag_msg, diag_level)
298 del netdata_mem[
'time']
301 memory_vals = {k: np.mean(v.astype(float))
for k, v
in netdata_mem.items()}
302 total_mem = sum([val
for val
in memory_vals.values()])
303 free_mem = memory_vals[
'free']
304 used_mem = memory_vals[
'used']
305 cache_mem = memory_vals[
'cached'] + memory_vals[
'buffers']
307 diag_level = DiagnosticStatus.OK
309 diag_level = DiagnosticStatus.WARN
311 diag_level = DiagnosticStatus.ERROR
313 diag_vals.append(KeyValue(key =
'Mem Status', value = mem_dict[diag_level]))
314 diag_vals.append(KeyValue(key =
'Mem Total', value = str(total_mem)))
315 diag_vals.append(KeyValue(key =
'Mem Used', value = str(used_mem)))
316 diag_vals.append(KeyValue(key =
'Mem Free', value = str(free_mem)))
317 diag_vals.append(KeyValue(key =
'Mem Buff/Cache', value = str(cache_mem)))
320 netdata_swap_charts = [
'mem.swap',
'system.swap']
322 netdata_chart_err =
''
323 for chart
in netdata_swap_charts:
328 except requests.ConnectionError
as err:
330 netdata_chart_err += chart +
' '
338 netdata_chart_err =
"{} of {} failed: {}".format(error_count, len(netdata_swap_charts), netdata_chart_err)
341 diag_level = DiagnosticStatus.WARN
342 diag_msg =
'Swap Usage Error'
343 diag_vals = [ KeyValue(key=
'Swap Usage Error', value=
'Could not fetch data from netdata'),
344 KeyValue(key=
'Failed Chart Names', value=netdata_chart_err),
345 KeyValue(key=
'Output', value=netdata_swp),
346 KeyValue(key=
'Error', value=error) ]
347 return (diag_vals, diag_msg, diag_level)
349 del netdata_swp[
'time']
352 swap_vals = {k: np.mean(v.astype(float))
for k, v
in netdata_swp.items()}
353 total_swp = sum([val
for val
in swap_vals.values()])
354 free_swp = swap_vals[
'free']
355 used_swp = swap_vals[
'used']
357 diag_vals.append(KeyValue(key =
'Swap Total', value = str(total_swp)))
358 diag_vals.append(KeyValue(key =
'Swap Used', value = str(used_swp)))
359 diag_vals.append(KeyValue(key =
'Swap Free', value = str(free_swp)))
361 diag_msg = mem_dict[diag_level]
363 except requests.ConnectionError
as e:
364 diag_level = DiagnosticStatus.ERROR
365 diag_msg =
'Memory Usage Connection Error'
366 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
368 except Exception
as e:
369 diag_level = DiagnosticStatus.ERROR
370 diag_msg =
'Memory Usage Exception'
371 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
373 return diag_vals, diag_msg, diag_level
379 diag_level = DiagnosticStatus.OK
381 load_dict = { DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'High Load', DiagnosticStatus.ERROR:
'Error' }
386 diag_level = DiagnosticStatus.WARN
387 diag_msg =
'CPU Usage Error'
388 diag_vals = [ KeyValue(key =
'CPU Usage Error', value =
'Could not fetch data from netdata'),
389 KeyValue(key =
'Output', value = netdata_info) ]
390 return (diag_vals, diag_msg, diag_level)
392 num_cores = int(netdata_info[
'cores_total'])
393 netdata_system_cpu, error = self.
_netdata_interface.query_netdata(
'system.cpu', interval)
394 if not netdata_system_cpu:
395 diag_level = DiagnosticStatus.WARN
396 diag_msg =
'CPU Usage Error'
397 diag_vals = [ KeyValue(key =
'CPU Usage Error', value =
'Could not fetch data from netdata'),
398 KeyValue(key =
'Output', value = netdata_system_cpu),
399 KeyValue(key =
'Error', value= error) ]
400 return (diag_vals, diag_msg, diag_level)
402 netdata_cpu_util = [self.
_netdata_interface.query_netdata(
'cpu.cpu%d' % i, interval)
for i
in range(num_cores)]
403 netdata_cpu_idle = [self.
_netdata_interface.query_netdata(
'cpu.cpu%d_cpuidle' % i, interval)
for i
in range(num_cores)]
405 if any([data ==
None for data, error
in netdata_cpu_util]):
406 diag_level = DiagnosticStatus.ERROR
407 diag_msg =
'CPU Util Error'
408 diag_vals = [ KeyValue(key =
'CPU Util Error', value =
'Could not fetch data from netdata'),
409 KeyValue(key =
'Output', value = netdata_cpu_util) ]
410 return (diag_vals, diag_msg, diag_level)
411 if any([data ==
None for data, error
in netdata_cpu_idle]):
412 diag_level = DiagnosticStatus.ERROR
413 diag_msg =
'CPU Idle Error'
414 diag_vals = [ KeyValue(key =
'CPU Idle Error', value =
'Could not fetch data from netdata'),
415 KeyValue(key =
'Output', value = netdata_cpu_idle) ]
416 return (diag_vals, diag_msg, diag_level)
419 for i_cpu
in range(num_cores):
421 cpu_name =
'Core %d' % (i_cpu)
422 idle = 100 - np.mean(netdata_cpu_idle[i_cpu][0][
'C0 (active)'])
423 user = np.mean(netdata_cpu_util[i_cpu][0][
'user'])
424 nice = np.mean(netdata_cpu_util[i_cpu][0][
'nice'])
425 system = np.mean(netdata_cpu_util[i_cpu][0][
'system'])
427 core_level = DiagnosticStatus.OK
428 usage = float(user) + float(nice)
431 core_level = DiagnosticStatus.WARN
433 core_level = DiagnosticStatus.ERROR
435 diag_vals.append(KeyValue(key =
'CPU %s Status' % cpu_name, value = load_dict[core_level]))
436 diag_vals.append(KeyValue(key =
'CPU %s User' % cpu_name, value = str(user)))
437 diag_vals.append(KeyValue(key =
'CPU %s Nice' % cpu_name, value = str(nice)))
438 diag_vals.append(KeyValue(key =
'CPU %s System' % cpu_name, value = str(system)))
439 diag_vals.append(KeyValue(key =
'CPU %s Idle' % cpu_name, value = str(idle)))
442 if num_cores - cores_loaded <= 2
and num_cores > 2:
443 diag_level = DiagnosticStatus.WARN
445 diag_msg = load_dict[diag_level]
447 except requests.ConnectionError
as e:
448 diag_level = DiagnosticStatus.ERROR
449 diag_msg =
'CPU Usage Connection Error'
450 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
452 except Exception
as e:
453 diag_level = DiagnosticStatus.ERROR
454 diag_msg =
'CPU Usage Exception'
455 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
457 return diag_vals, diag_msg, diag_level
461 throt_dict = {DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'High Thermal Throttling Events',
462 DiagnosticStatus.ERROR:
'No valid Data from NetData'}
464 throt_level = DiagnosticStatus.OK
469 netdata, error = self.
_netdata_interface.query_netdata(
'cpu.core_throttling', interval)
471 throt_level = DiagnosticStatus.WARN
472 throt_msg =
'Core Throttling Error'
473 throt_vals = [ KeyValue(key =
'Core Throttling Error', value =
'Could not fetch data from netdata'),
474 KeyValue(key =
'Output', value = netdata),
475 KeyValue(key =
'Error', value= error) ]
476 return (throt_vals, throt_msg, throt_level)
479 lbl =
'CPU %d Thermal Throttling Events' % i
480 netdata_key =
'cpu%d' % i
483 if netdata_key
in netdata:
484 core_data = netdata[netdata_key]
485 if core_data
is not None:
486 core_mean = np.mean(core_data)
489 throt_level = DiagnosticStatus.WARN
491 throt_level = DiagnosticStatus.ERROR
493 throt_vals.append(KeyValue(key=lbl, value=
'%r' % core_mean))
495 throt_vals.insert(0, KeyValue(key=
'Thermal Throttling Status', value=throt_msg))
498 throt_msg = throt_dict[throt_level]
500 except requests.ConnectionError
as e:
501 throt_level = DiagnosticStatus.ERROR
502 throt_msg =
'Thermal Throttling Connection Error'
503 throt_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
505 except Exception
as e:
506 throt_level = DiagnosticStatus.ERROR
507 throt_msg =
'Thermal Throttling Exception'
508 throt_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
510 return throt_vals, throt_msg, throt_level
514 jitter_dict = {DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'High IDLE Jitter',
515 DiagnosticStatus.ERROR:
'No valid Data from NetData'}
517 jitter_level = DiagnosticStatus.OK
524 jitter_level = DiagnosticStatus.WARN
525 jitter_msg =
'IDLE Jitter Error'
526 jitter_vals = [ KeyValue(key =
'Core Throttling Error', value =
'Could not fetch data from netdata'),
527 KeyValue(key =
'Output', value = netdata),
528 KeyValue(key =
'Error', value= error) ]
529 return (jitter_vals, jitter_msg, jitter_level)
537 for metric_label, metric_key, metric_threshold, aggregate_fnc
in metric_list:
538 metric_aggreagte =
'N/A'
539 if netdata
is not None and metric_key
in netdata:
540 metric_data = netdata[metric_key]
541 if metric_data
is not None:
542 metric_aggreagte = aggregate_fnc(metric_data)
544 if metric_aggreagte > metric_threshold:
545 jitter_level = DiagnosticStatus.WARN
547 jitter_level = DiagnosticStatus.ERROR
549 jitter_vals.append(KeyValue(key=metric_label, value=str(metric_aggreagte)))
550 jitter_vals.append(KeyValue(key=metric_label +
' Threshold', value=str(metric_threshold)))
552 jitter_vals.insert(0, KeyValue(key=
'IDLE Jitter Status', value=jitter_msg))
553 jitter_msg = jitter_dict[jitter_level]
555 except requests.ConnectionError
as e:
556 jitter_level = DiagnosticStatus.ERROR
557 jitter_msg =
'IDLE Jitter Connection Error'
558 jitter_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
560 except Exception
as e:
561 jitter_level = DiagnosticStatus.ERROR
562 jitter_msg =
'IDLE Jitter Exception'
563 jitter_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
565 return jitter_vals, jitter_msg, jitter_level
571 diag_level = DiagnosticStatus.OK
576 diag_vals.append(KeyValue(key=
'Netdata version', value=self.
_netdata_version))
579 interval = math.ceil(self.
_usage_timer._period.to_sec())
581 diag_vals.extend(core_vals)
582 diag_msgs.extend(core_msgs)
583 diag_level = max(diag_level, core_level)
586 diag_vals.extend(clock_vals)
587 diag_msgs.extend(clock_msgs)
588 diag_level = max(diag_level, clock_level)
590 diag_log = set(diag_msgs)
591 if len(diag_log) > DiagnosticStatus.OK:
592 message =
', '.join(diag_log)
594 message = stat_dict[diag_level]
603 diag_level = DiagnosticStatus.OK
605 interval = math.ceil(self.
_usage_timer._period.to_sec())
608 mp_vals, mp_msg, mp_level = self.
check_cpu_util(interval=interval)
609 diag_vals.extend(mp_vals)
610 if mp_level > DiagnosticStatus.OK:
611 diag_msgs.append(mp_msg)
612 diag_level = max(diag_level, mp_level)
617 diag_vals.extend(throt_vals)
619 diag_msgs.append(throt_msg)
620 diag_level = max(diag_level, throt_level)
624 jitter_vals, jitter_msg, jitter_level = self.
check_idlejitter(interval=interval)
625 diag_vals.extend(jitter_vals)
627 diag_msgs.append(jitter_msg)
628 diag_level = max(diag_level, jitter_level)
631 up_vals, up_msg, up_level = self.
check_uptime(interval=interval)
632 diag_vals.extend(up_vals)
633 if up_level > DiagnosticStatus.OK:
634 diag_msgs.append(up_msg)
635 diag_level = max(diag_level, up_level)
638 load_vals, load_msg, load_level = self.
check_load(interval=interval)
639 diag_vals.extend(load_vals)
640 if load_level > DiagnosticStatus.OK:
641 diag_msgs.append(load_msg)
642 diag_level = max(diag_level, load_level)
644 if diag_msgs
and diag_level > DiagnosticStatus.OK:
645 usage_msg =
', '.join(set(diag_msgs))
647 usage_msg = stat_dict[diag_level]
656 diag_level = DiagnosticStatus.OK
661 diag_vals.extend(mem_vals)
662 if mem_level > DiagnosticStatus.OK:
663 diag_msgs.append(mem_msg)
664 diag_level = max(diag_level, mem_level)
666 if diag_msgs
and diag_level > DiagnosticStatus.OK:
667 memory_msg =
', '.join(set(diag_msgs))
669 memory_msg = stat_dict[diag_level]
676 msg = DiagnosticArray()
677 msg.header.stamp = rospy.get_rostime()
684 if __name__ ==
'__main__':
685 hostname = socket.gethostname()
688 parser = optparse.OptionParser(usage=
"usage: cpu_monitor.py [--diag-hostname=cX]")
689 parser.add_option(
"--diag-hostname", dest=
"diag_hostname",
690 help=
"Computer name in diagnostics output (ex: 'c1')",
691 metavar=
"DIAG_HOSTNAME",
692 action=
"store", default = hostname)
693 options, args = parser.parse_args(rospy.myargv())
696 rospy.init_node(
'cpu_monitor_%s' % hostname)
697 except rospy.exceptions.ROSInitException:
698 print(
'CPU monitor is unable to initialize node. Master may not be running.')