29 from diagnostic_msgs.msg
import DiagnosticArray, DiagnosticStatus, KeyValue
33 stat_dict = { DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'Warning', DiagnosticStatus.ERROR:
'Error', DiagnosticStatus.STALE:
'Stale' }
57 self.
_num_cores = rospy.get_param(
'~num_cores', psutil.cpu_count())
61 self.
_info_stat.name =
'%s CPU Info' % diag_hostname
68 self.
_usage_stat.name =
'%s CPU Usage' % diag_hostname
75 self.
_memory_stat.name =
'%s Memory Usage' % diag_hostname
81 self.
_diag_pub = rospy.Publisher(
'/diagnostics', DiagnosticArray, queue_size=1)
91 diag_level = DiagnosticStatus.OK
95 netdata_module_name_core_temps = [
'sensors.coretemp_isa_0000_temperature',
96 'sensors.coretemp-isa-0000_temperature']
98 netdata_module_name_err =
'' 99 for name
in netdata_module_name_core_temps:
104 except requests.ConnectionError
as err:
106 netdata_module_name_err += name +
' ' 108 netdata_core_temp =
None 112 if netdata_core_temp:
115 netdata_module_name_err =
"{} of {} failed: {}".format(error_count, len(netdata_module_name_core_temps), netdata_module_name_err)
117 if not netdata_core_temp:
118 diag_level = DiagnosticStatus.WARN
119 diag_msgs = [
'Core Temp Error' ]
120 diag_vals = [ KeyValue(key =
'Core Temp Error', value =
'Could not fetch data from netdata'),
121 KeyValue(key =
'Failed Chart Names', value = netdata_module_name_err),
122 KeyValue(key =
'Output', value = netdata_core_temp),
123 KeyValue(key =
'Error', value= error) ]
124 return (diag_vals, diag_msgs, diag_level)
126 del netdata_core_temp[
'time']
127 del netdata_core_temp[
'Package id 0']
129 for core_no, values
in netdata_core_temp.items():
130 mean_temp = np.mean(values)
132 diag_vals.append(KeyValue(key =
'Temp %s' % core_no, value = str(mean_temp)))
135 diag_level = max(diag_level, DiagnosticStatus.OK)
136 diag_msgs.append(
'CPU Hot')
138 diag_level = max(diag_level, DiagnosticStatus.OK)
139 diag_msgs.append(
'CPU Warm')
141 diag_level = max(diag_level, DiagnosticStatus.ERROR)
142 diag_vals.append(KeyValue(key =
'Temp %s' % core_no, value = str(mean_temp)))
144 except Exception
as e:
145 diag_level = DiagnosticStatus.ERROR
146 diag_msgs = [
'Core Temp Exception' ]
147 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
149 return diag_vals, diag_msgs, diag_level
155 diag_level = DiagnosticStatus.OK
158 netdata_cpu_freq, error = self.
_netdata_interface.query_netdata(
'cpu.cpufreq', interval)
159 if not netdata_cpu_freq:
160 diag_level = DiagnosticStatus.WARN
161 diag_msgs = [
'Clock Speed Error' ]
162 diag_vals = [ KeyValue(key =
'Clock Speed Error', value =
'Could not fetch data from netdata'),
163 KeyValue(key =
'Output', value = netdata_cpu_freq),
164 KeyValue(key =
'Error', value= error) ]
165 return (diag_vals, diag_msgs, diag_level)
167 del netdata_cpu_freq[
"time"]
169 for cpu_name, values
in netdata_cpu_freq.items():
170 diag_vals.append(KeyValue(key =
'Core %d (MHz)' % int(cpu_name[-1]), value = str(np.mean(values))))
175 diag_level = DiagnosticStatus.WARN
176 diag_msgs = [
'Clock Speed Error' ]
177 diag_vals = [ KeyValue(key =
'Clock Speed Error', value =
'Could not fetch data from netdata'),
178 KeyValue(key =
'Output', value = netdata_info) ]
179 return (diag_vals, diag_msgs, diag_level)
181 max_cpu_freq = float(netdata_info[
'cpu_freq'])/1e6
182 diag_vals.append(KeyValue(key =
'Maximum Frequency (MHz)', value = str(max_cpu_freq)))
184 except requests.ConnectionError
as e:
185 diag_level = DiagnosticStatus.ERROR
186 diag_msgs = [
'Clock Speed Connection Error' ]
187 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
189 except Exception
as e:
190 diag_level = DiagnosticStatus.ERROR
191 diag_msgs = [
'Clock Speed Exception' ]
192 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
194 return diag_vals, diag_msgs, diag_level
200 diag_level = DiagnosticStatus.OK
203 netdata_uptime, error = self.
_netdata_interface.query_netdata(
'system.uptime', interval)
204 if not netdata_uptime:
205 diag_level = DiagnosticStatus.WARN
206 diag_msg =
'Uptime Error' 207 diag_vals = [ KeyValue(key =
'Uptime Error', value =
'Could not fetch data from netdata'),
208 KeyValue(key =
'Output', value = netdata_uptime),
209 KeyValue(key =
'Error', value= error) ]
210 return (diag_vals, diag_msg, diag_level)
212 del netdata_uptime[
'time']
214 diag_vals.append(KeyValue(key =
'Uptime', value = str(np.max(netdata_uptime[
'uptime'].astype(float)))))
216 except requests.ConnectionError
as e:
217 diag_level = DiagnosticStatus.ERROR
218 diag_msg =
'Uptime Connection Error' 219 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
221 except Exception
as e:
222 diag_level = DiagnosticStatus.ERROR
223 diag_msg =
'Uptime Exception' 224 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
226 return diag_vals, diag_msg, diag_level
232 diag_level = DiagnosticStatus.OK
234 load_dict = { DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'High Load', DiagnosticStatus.ERROR:
'Very High Load' }
237 netdata_cpu_load, error = self.
_netdata_interface.query_netdata(
'system.load', interval)
238 if not netdata_cpu_load:
239 diag_level = DiagnosticStatus.WARN
240 diag_msg =
'Load Error' 241 diag_vals = [ KeyValue(key =
'Load Error', value =
'Could not fetch data from netdata'),
242 KeyValue(key =
'Output', value = netdata_cpu_load),
243 KeyValue(key =
'Error', value= error) ]
244 return (diag_vals, diag_msg, diag_level)
246 del netdata_cpu_load[
'time']
248 load1 = np.mean(netdata_cpu_load[
'load1'].astype(float))
249 load5 = np.mean(netdata_cpu_load[
'load5'].astype(float))
250 load15 = np.mean(netdata_cpu_load[
'load15'].astype(float))
254 diag_level = DiagnosticStatus.WARN
256 diag_vals.append(KeyValue(key =
'Load Average Status', value = load_dict[diag_level]))
257 diag_vals.append(KeyValue(key =
'1 min Load Average', value = str(load1)))
258 diag_vals.append(KeyValue(key =
'1 min Load Average Threshold', value = str(self.
_load1_threshold)))
259 diag_vals.append(KeyValue(key =
'5 min Load Average', value = str(load5)))
260 diag_vals.append(KeyValue(key =
'5 min Load Average Threshold', value = str(self.
_load5_threshold)))
261 diag_vals.append(KeyValue(key =
'15 min Load Average', value = str(load15)))
263 diag_msg = load_dict[diag_level]
265 except requests.ConnectionError
as e:
266 diag_level = DiagnosticStatus.ERROR
267 diag_msg =
'Load Connection Error' 268 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
270 except Exception
as e:
271 diag_level = DiagnosticStatus.ERROR
272 diag_msg =
'Load Exception' 273 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
275 return diag_vals, diag_msg, diag_level
282 diag_level = DiagnosticStatus.OK
284 mem_dict = { DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'Low Memory', DiagnosticStatus.ERROR:
'Very Low Memory' }
289 diag_level = DiagnosticStatus.WARN
290 diag_msg =
'Memory Usage Error' 291 diag_vals = [ KeyValue(key =
'Memory Usage Error', value =
'Could not fetch data from netdata'),
292 KeyValue(key =
'Output', value = netdata_mem),
293 KeyValue(key =
'Error', value= error) ]
294 return (diag_vals, diag_msg, diag_level)
296 del netdata_mem[
'time']
299 memory_vals = {k: np.mean(v.astype(float))
for k, v
in netdata_mem.items()}
300 total_mem = sum([val
for val
in memory_vals.values()])
301 free_mem = memory_vals[
'free']
302 used_mem = memory_vals[
'used']
303 cache_mem = memory_vals[
'cached'] + memory_vals[
'buffers']
305 diag_level = DiagnosticStatus.OK
307 diag_level = DiagnosticStatus.WARN
309 diag_level = DiagnosticStatus.ERROR
311 diag_vals.append(KeyValue(key =
'Mem Status', value = mem_dict[diag_level]))
312 diag_vals.append(KeyValue(key =
'Mem Total', value = str(total_mem)))
313 diag_vals.append(KeyValue(key =
'Mem Used', value = str(used_mem)))
314 diag_vals.append(KeyValue(key =
'Mem Free', value = str(free_mem)))
315 diag_vals.append(KeyValue(key =
'Mem Buff/Cache', value = str(cache_mem)))
319 diag_level = DiagnosticStatus.WARN
320 diag_msg =
'Swap Usage Error' 321 diag_vals = [ KeyValue(key =
'Swap Usage Error', value =
'Could not fetch data from netdata'),
322 KeyValue(key =
'Output', value = netdata_swp),
323 KeyValue(key =
'Error', value= error) ]
324 return (diag_vals, diag_msg, diag_level)
326 del netdata_swp[
'time']
329 swap_vals = {k: np.mean(v.astype(float))
for k, v
in netdata_swp.items()}
330 total_swp = sum([val
for val
in swap_vals.values()])
331 free_swp = swap_vals[
'free']
332 used_swp = swap_vals[
'used']
334 diag_vals.append(KeyValue(key =
'Swap Total', value = str(total_swp)))
335 diag_vals.append(KeyValue(key =
'Swap Used', value = str(used_swp)))
336 diag_vals.append(KeyValue(key =
'Swap Free', value = str(free_swp)))
338 diag_msg = mem_dict[diag_level]
340 except requests.ConnectionError
as e:
341 diag_level = DiagnosticStatus.ERROR
342 diag_msg =
'Memory Usage Connection Error' 343 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
345 except Exception
as e:
346 diag_level = DiagnosticStatus.ERROR
347 diag_msg =
'Memory Usage Exception' 348 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
350 return diag_vals, diag_msg, diag_level
356 diag_level = DiagnosticStatus.OK
358 load_dict = { DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'High Load', DiagnosticStatus.ERROR:
'Error' }
363 diag_level = DiagnosticStatus.WARN
364 diag_msg =
'CPU Usage Error' 365 diag_vals = [ KeyValue(key =
'CPU Usage Error', value =
'Could not fetch data from netdata'),
366 KeyValue(key =
'Output', value = netdata_info) ]
367 return (diag_vals, diag_msg, diag_level)
369 num_cores = int(netdata_info[
'cores_total'])
370 netdata_system_cpu, error = self.
_netdata_interface.query_netdata(
'system.cpu', interval)
371 if not netdata_system_cpu:
372 diag_level = DiagnosticStatus.WARN
373 diag_msg =
'CPU Usage Error' 374 diag_vals = [ KeyValue(key =
'CPU Usage Error', value =
'Could not fetch data from netdata'),
375 KeyValue(key =
'Output', value = netdata_system_cpu),
376 KeyValue(key =
'Error', value= error) ]
377 return (diag_vals, diag_msg, diag_level)
379 netdata_cpu_util = [self.
_netdata_interface.query_netdata(
'cpu.cpu%d' % i, interval)
for i
in range(num_cores)]
380 netdata_cpu_idle = [self.
_netdata_interface.query_netdata(
'cpu.cpu%d_cpuidle' % i, interval)
for i
in range(num_cores)]
382 if any([data ==
None for data, error
in netdata_cpu_util]):
383 diag_level = DiagnosticStatus.ERROR
384 diag_msg =
'CPU Util Error' 385 diag_vals = [ KeyValue(key =
'CPU Util Error', value =
'Could not fetch data from netdata'),
386 KeyValue(key =
'Output', value = netdata_cpu_util) ]
387 return (diag_vals, diag_msg, diag_level)
388 if any([data ==
None for data, error
in netdata_cpu_idle]):
389 diag_level = DiagnosticStatus.ERROR
390 diag_msg =
'CPU Idle Error' 391 diag_vals = [ KeyValue(key =
'CPU Idle Error', value =
'Could not fetch data from netdata'),
392 KeyValue(key =
'Output', value = netdata_cpu_idle) ]
393 return (diag_vals, diag_msg, diag_level)
396 for i_cpu
in range(num_cores):
398 cpu_name =
'Core %d' % (i_cpu)
399 idle = 100 - np.mean(netdata_cpu_idle[i_cpu][0][
'C0 (active)'])
400 user = np.mean(netdata_cpu_util[i_cpu][0][
'user'])
401 nice = np.mean(netdata_cpu_util[i_cpu][0][
'nice'])
402 system = np.mean(netdata_cpu_util[i_cpu][0][
'system'])
404 core_level = DiagnosticStatus.OK
405 usage = float(user) + float(nice)
408 core_level = DiagnosticStatus.WARN
410 core_level = DiagnosticStatus.ERROR
412 diag_vals.append(KeyValue(key =
'CPU %s Status' % cpu_name, value = load_dict[core_level]))
413 diag_vals.append(KeyValue(key =
'CPU %s User' % cpu_name, value = str(user)))
414 diag_vals.append(KeyValue(key =
'CPU %s Nice' % cpu_name, value = str(nice)))
415 diag_vals.append(KeyValue(key =
'CPU %s System' % cpu_name, value = str(system)))
416 diag_vals.append(KeyValue(key =
'CPU %s Idle' % cpu_name, value = str(idle)))
419 if num_cores - cores_loaded <= 2
and num_cores > 2:
420 diag_level = DiagnosticStatus.WARN
422 diag_msg = load_dict[diag_level]
424 except requests.ConnectionError
as e:
425 diag_level = DiagnosticStatus.ERROR
426 diag_msg =
'CPU Usage Connection Error' 427 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
429 except Exception
as e:
430 diag_level = DiagnosticStatus.ERROR
431 diag_msg =
'CPU Usage Exception' 432 diag_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
434 return diag_vals, diag_msg, diag_level
438 throt_dict = {DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'High Thermal Throttling Events',
439 DiagnosticStatus.ERROR:
'No valid Data from NetData'}
441 throt_level = DiagnosticStatus.OK
446 netdata, error = self.
_netdata_interface.query_netdata(
'cpu.core_throttling', interval)
448 throt_level = DiagnosticStatus.WARN
449 throt_msg =
'Core Throttling Error' 450 throt_vals = [ KeyValue(key =
'Core Throttling Error', value =
'Could not fetch data from netdata'),
451 KeyValue(key =
'Output', value = netdata),
452 KeyValue(key =
'Error', value= error) ]
453 return (throt_vals, throt_msg, throt_level)
456 lbl =
'CPU %d Thermal Throttling Events' % i
457 netdata_key =
'cpu%d' % i
460 if netdata_key
in netdata:
461 core_data = netdata[netdata_key]
462 if core_data
is not None:
463 core_mean = np.mean(core_data)
466 throt_level = DiagnosticStatus.WARN
468 throt_level = DiagnosticStatus.ERROR
470 throt_vals.append(KeyValue(key=lbl, value=
'%r' % core_mean))
472 throt_vals.insert(0, KeyValue(key=
'Thermal Throttling Status', value=throt_msg))
475 throt_msg = throt_dict[throt_level]
477 except requests.ConnectionError
as e:
478 throt_level = DiagnosticStatus.ERROR
479 throt_msg =
'Thermal Throttling Connection Error' 480 throt_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
482 except Exception
as e:
483 throt_level = DiagnosticStatus.ERROR
484 throt_msg =
'Thermal Throttling Exception' 485 throt_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
487 return throt_vals, throt_msg, throt_level
491 jitter_dict = {DiagnosticStatus.OK:
'OK', DiagnosticStatus.WARN:
'High IDLE Jitter',
492 DiagnosticStatus.ERROR:
'No valid Data from NetData'}
494 jitter_level = DiagnosticStatus.OK
501 jitter_level = DiagnosticStatus.WARN
502 jitter_msg =
'IDLE Jitter Error' 503 jitter_vals = [ KeyValue(key =
'Core Throttling Error', value =
'Could not fetch data from netdata'),
504 KeyValue(key =
'Output', value = netdata),
505 KeyValue(key =
'Error', value= error) ]
506 return (jitter_vals, jitter_msg, jitter_level)
514 for metric_label, metric_key, metric_threshold, aggregate_fnc
in metric_list:
515 metric_aggreagte =
'N/A' 516 if netdata
is not None and metric_key
in netdata:
517 metric_data = netdata[metric_key]
518 if metric_data
is not None:
519 metric_aggreagte = aggregate_fnc(metric_data)
521 if metric_aggreagte > metric_threshold:
522 jitter_level = DiagnosticStatus.WARN
524 jitter_level = DiagnosticStatus.ERROR
526 jitter_vals.append(KeyValue(key=metric_label, value=str(metric_aggreagte)))
527 jitter_vals.append(KeyValue(key=metric_label +
' Threshold', value=str(metric_threshold)))
529 jitter_vals.insert(0, KeyValue(key=
'IDLE Jitter Status', value=jitter_msg))
530 jitter_msg = jitter_dict[jitter_level]
532 except requests.ConnectionError
as e:
533 jitter_level = DiagnosticStatus.ERROR
534 jitter_msg =
'IDLE Jitter Connection Error' 535 jitter_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
537 except Exception
as e:
538 jitter_level = DiagnosticStatus.ERROR
539 jitter_msg =
'IDLE Jitter Exception' 540 jitter_vals = [ KeyValue(key =
'Exception', value = str(e)), KeyValue(key =
'Traceback', value = str(traceback.format_exc())) ]
542 return jitter_vals, jitter_msg, jitter_level
548 diag_level = DiagnosticStatus.OK
551 interval = math.ceil(self.
_usage_timer._period.to_sec())
553 diag_vals.extend(core_vals)
554 diag_msgs.extend(core_msgs)
555 diag_level = max(diag_level, core_level)
558 diag_vals.extend(clock_vals)
559 diag_msgs.extend(clock_msgs)
560 diag_level = max(diag_level, clock_level)
562 diag_log = set(diag_msgs)
563 if len(diag_log) > DiagnosticStatus.OK:
564 message =
', '.join(diag_log)
566 message = stat_dict[diag_level]
575 diag_level = DiagnosticStatus.OK
577 interval = math.ceil(self.
_usage_timer._period.to_sec())
580 mp_vals, mp_msg, mp_level = self.
check_cpu_util(interval=interval)
581 diag_vals.extend(mp_vals)
582 if mp_level > DiagnosticStatus.OK:
583 diag_msgs.append(mp_msg)
584 diag_level = max(diag_level, mp_level)
589 diag_vals.extend(throt_vals)
591 diag_msgs.append(throt_msg)
592 diag_level = max(diag_level, throt_level)
596 jitter_vals, jitter_msg, jitter_level = self.
check_idlejitter(interval=interval)
597 diag_vals.extend(jitter_vals)
599 diag_msgs.append(jitter_msg)
600 diag_level = max(diag_level, jitter_level)
603 up_vals, up_msg, up_level = self.
check_uptime(interval=interval)
604 diag_vals.extend(up_vals)
605 if up_level > DiagnosticStatus.OK:
606 diag_msgs.append(up_msg)
607 diag_level = max(diag_level, up_level)
610 load_vals, load_msg, load_level = self.
check_load(interval=interval)
611 diag_vals.extend(load_vals)
612 if load_level > DiagnosticStatus.OK:
613 diag_msgs.append(load_msg)
614 diag_level = max(diag_level, load_level)
616 if diag_msgs
and diag_level > DiagnosticStatus.OK:
617 usage_msg =
', '.join(set(diag_msgs))
619 usage_msg = stat_dict[diag_level]
628 diag_level = DiagnosticStatus.OK
633 diag_vals.extend(mem_vals)
634 if mem_level > DiagnosticStatus.OK:
635 diag_msgs.append(mem_msg)
636 diag_level = max(diag_level, mem_level)
638 if diag_msgs
and diag_level > DiagnosticStatus.OK:
639 memory_msg =
', '.join(set(diag_msgs))
641 memory_msg = stat_dict[diag_level]
648 msg = DiagnosticArray()
649 msg.header.stamp = rospy.get_rostime()
656 if __name__ ==
'__main__':
657 hostname = socket.gethostname()
660 parser = optparse.OptionParser(usage=
"usage: cpu_monitor.py [--diag-hostname=cX]")
661 parser.add_option(
"--diag-hostname", dest=
"diag_hostname",
662 help=
"Computer name in diagnostics output (ex: 'c1')",
663 metavar=
"DIAG_HOSTNAME",
664 action=
"store", default = hostname)
665 options, args = parser.parse_args(rospy.myargv())
668 rospy.init_node(
'cpu_monitor_%s' % hostname)
669 except rospy.exceptions.ROSInitException:
670 print(
'CPU monitor is unable to initialize node. Master may not be running.')
def check_memory(self, event)
def check_usage(self, event)
def check_core_throttling(self, interval=1)
def check_uptime(self, interval=1)
Uses 'system.uptime' to see system uptime.
def check_clock_speed(self, interval=1)
Checks clock speed by reading 'cpu.cpufreq' from netdata.
def check_cpu_util(self, interval=1)
_idlejitter_max_threshold
def check_load(self, interval=1)
Uses 'system.load' to see load average.
def __init__(self, hostname, diag_hostname)
_idlejitter_average_threshold
_thermal_throttling_threshold
def publish_stats(self, event)
def check_info(self, event)
def check_idlejitter(self, interval=1)
_idlejitter_min_threshold
def check_core_temps(self, interval=1)
Check CPU core temps.
def check_free_memory(self, interval=1)
Uses memory and swap data from netdata to check free memory.
_check_thermal_throttling_events