37 from __future__
import with_statement
39 roslib.load_manifest(
'pr2_computer_monitor')
45 from threading
import Timer
47 from time
import sleep
53 from diagnostic_msgs.msg
import DiagnosticArray, DiagnosticStatus, KeyValue
57 if sys.version_info[:3] == (2, 7, 3):
59 threading._DummyThread._Thread__stop =
lambda x: 42
62 stat_dict = { 0:
'OK', 1:
'Warning', 2:
'Error' }
68 diag_level = DiagnosticStatus.OK
71 p = subprocess.Popen(
'sudo ipmitool sdr',
72 stdout = subprocess.PIPE,
73 stderr = subprocess.PIPE, shell =
True)
74 stdout, stderr = p.communicate()
75 retcode = p.returncode
78 diag_level = DiagnosticStatus.ERROR
79 diag_msgs = [
'ipmitool Error' ]
80 diag_vals = [ KeyValue(key =
'IPMI Error', value = stderr) ]
81 return diag_vals, diag_msgs, diag_level
83 lines = stdout.split(
'\n')
85 diag_vals = [ KeyValue(key =
'ipmitool status', value =
'No output') ]
87 diag_msgs = [
'No ipmitool response' ]
88 diag_level = DiagnosticStatus.ERROR
90 return diag_vals, diag_msgs, diag_level
100 name = words[0].strip()
101 ipmi_val = words[1].strip()
102 stat_byte = words[2].strip()
105 if words[0].startswith(
'CPU')
and words[0].strip().endswith(
'Temp'):
106 if words[1].strip().endswith(
'degrees C'):
107 tmp = ipmi_val.rstrip(
' degrees C').lstrip()
108 if unicode(tmp).isnumeric():
109 temperature = float(tmp)
110 diag_vals.append(KeyValue(key = name +
' (C)', value = tmp))
112 cpu_name = name.split()[0]
113 if temperature >= 80
and temperature < 89:
114 diag_level = max(diag_level, DiagnosticStatus.WARN)
115 if diag_msgs.count(
'CPU Hot') == 0:
116 diag_msgs.append(
'CPU Warm')
118 if temperature >= 89:
119 diag_level = max(diag_level, DiagnosticStatus.ERROR)
120 diag_msgs.append(
'CPU Hot')
122 if diag_msgs.count(
'CPU Warm') > 0:
123 idx = diag_msgs.index(
'CPU Warm')
126 diag_vals.append(KeyValue(key = name, value = words[1]))
130 if name ==
'MB Temp' or name ==
'BP Temp' or name ==
'FP Temp':
131 if ipmi_val.endswith(
'degrees C'):
132 tmp = ipmi_val.rstrip(
' degrees C').lstrip()
133 diag_vals.append(KeyValue(key = name +
' (C)', value = tmp))
135 dev_name = name.split()[0]
136 if unicode(tmp).isnumeric():
137 temperature = float(tmp)
139 if temperature >= 60
and temperature < 75:
140 diag_level = max(diag_level, DiagnosticStatus.WARN)
141 diag_msgs.append(
'%s Warm' % dev_name)
143 if temperature >= 75:
144 diag_level = max(diag_level, DiagnosticStatus.ERROR)
145 diag_msgs.append(
'%s Hot' % dev_name)
147 diag_level = max(diag_level, DiagnosticStatus.ERROR)
148 diag_msgs.append(
'%s Error' % dev_name)
150 diag_vals.append(KeyValue(key = name, value = ipmi_val))
153 if (name.startswith(
'CPU')
and name.endswith(
'Fan'))
or name ==
'MB Fan':
154 if ipmi_val.endswith(
'RPM'):
155 rpm = ipmi_val.rstrip(
' RPM').lstrip()
156 if unicode(rpm).isnumeric():
158 diag_level = max(diag_level, DiagnosticStatus.ERROR)
159 diag_msgs.append(
'CPU Fan Off')
161 diag_vals.append(KeyValue(key = name +
' RPM', value = rpm))
163 diag_vals.append(KeyValue(key = name, value = ipmi_val))
167 if name.startswith(
'CPU')
and name.endswith(
'hot'):
168 if ipmi_val ==
'0x01':
169 diag_vals.append(KeyValue(key = name, value =
'OK'))
171 diag_vals.append(KeyValue(key = name, value =
'Hot'))
172 diag_level = max(diag_level, DiagnosticStatus.ERROR)
173 diag_msgs.append(
'CPU Hot Alarm')
175 except Exception
as e:
176 diag_vals.append(KeyValue(key =
'Exception', value = traceback.format_exc()))
177 diag_level = DiagnosticStatus.ERROR
178 diag_msgs.append(
'Exception')
180 return diag_vals, diag_msgs, diag_level
192 for index, temp_str
in enumerate(sys_temp_strings):
193 if len(temp_str) < 5:
196 cmd =
'cat %s' % temp_str
197 p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
198 stderr = subprocess.PIPE, shell =
True)
199 stdout, stderr = p.communicate()
200 retcode = p.returncode
203 diag_level = DiagnosticStatus.ERROR
204 diag_msg = [
'Core Temp Error' ]
205 diag_vals = [ KeyValue(key =
'Core Temp Error', value = stderr),
206 KeyValue(key =
'Output', value = stdout) ]
207 return diag_vals, diag_msgs, diag_level
210 if unicode(tmp).isnumeric():
211 temp = float(tmp) / 1000
212 diag_vals.append(KeyValue(key =
'Core %d Temp' % index, value = str(temp)))
214 if temp >= 85
and temp < 90:
215 diag_level = max(diag_level, DiagnosticStatus.WARN)
216 diag_msgs.append(
'Warm')
218 diag_level = max(diag_level, DiagnosticStatus.ERROR)
219 diag_msgs.append(
'Hot')
221 diag_level = max(diag_level, DiagnosticStatus.ERROR)
222 diag_vals.append(KeyValue(key =
'Core %s Temp' % index, value = tmp))
224 return diag_vals, diag_msgs, diag_level
230 lvl = DiagnosticStatus.OK
233 p = subprocess.Popen(
'cat /proc/cpuinfo | grep MHz',
234 stdout = subprocess.PIPE,
235 stderr = subprocess.PIPE, shell =
True)
236 stdout, stderr = p.communicate()
237 retcode = p.returncode
240 lvl = DiagnosticStatus.ERROR
241 msgs = [
'Clock speed error' ]
242 vals = [ KeyValue(key =
'Clock speed error', value = stderr),
243 KeyValue(key =
'Output', value = stdout) ]
245 return (vals, msgs, lvl)
247 for index, ln
in enumerate(stdout.split(
'\n')):
248 words = ln.split(
':')
252 speed = words[1].strip().split(
'.')[0]
253 vals.append(KeyValue(key =
'Core %d MHz' % index, value = speed))
254 if unicode(speed).isnumeric():
257 if mhz < 2240
and mhz > 2150:
258 lvl = max(lvl, DiagnosticStatus.WARN)
260 lvl = max(lvl, DiagnosticStatus.ERROR)
263 lvl = max(lvl, DiagnosticStatus.ERROR)
265 if not enforce_speed:
266 lvl = DiagnosticStatus.OK
268 if lvl == DiagnosticStatus.WARN
and enforce_speed:
269 msgs = [
'Core slowing' ]
270 elif lvl == DiagnosticStatus.ERROR
and enforce_speed:
271 msgs = [
'Core throttled' ]
273 except Exception
as e:
274 rospy.logerr(traceback.format_exc())
275 lvl = DiagnosticStatus.ERROR
276 msgs.append(
'Exception')
277 vals.append(KeyValue(key =
'Exception', value = traceback.format_exc()))
279 return vals, msgs, lvl
285 level = DiagnosticStatus.OK
288 load_dict = { 0:
'OK', 1:
'High Load', 2:
'Very High Load' }
291 p = subprocess.Popen(
'uptime', stdout = subprocess.PIPE,
292 stderr = subprocess.PIPE, shell =
True)
293 stdout, stderr = p.communicate()
294 retcode = p.returncode
297 vals.append(KeyValue(key =
'uptime Failed', value = stderr))
298 return DiagnosticStatus.ERROR, vals
300 upvals = stdout.split()
301 load1 = upvals[-3].rstrip(
',')
302 load5 = upvals[-2].rstrip(
',')
304 num_users = upvals[-7]
307 if float(load1) > load1_threshold
or float(load5) > load5_threshold:
308 level = DiagnosticStatus.WARN
310 vals.append(KeyValue(key =
'Load Average Status', value = load_dict[level]))
311 vals.append(KeyValue(key =
'1 min Load Average', value = load1))
312 vals.append(KeyValue(key =
'1 min Load Average Threshold', value = str(load1_threshold)))
313 vals.append(KeyValue(key =
'5 min Load Average', value = load5))
314 vals.append(KeyValue(key =
'5 min Load Average Threshold', value = str(load5_threshold)))
315 vals.append(KeyValue(key =
'15 min Load Average', value = load15))
316 vals.append(KeyValue(key =
'Number of Users', value = num_users))
318 except Exception
as e:
319 rospy.logerr(traceback.format_exc())
320 level = DiagnosticStatus.ERROR
321 vals.append(KeyValue(key =
'Load Average Status', value = traceback.format_exc()))
323 return level, load_dict[level], vals
329 level = DiagnosticStatus.OK
332 mem_dict = { 0:
'OK', 1:
'Low Memory', 2:
'Very Low Memory' }
335 p = subprocess.Popen(
'free -m',
336 stdout = subprocess.PIPE,
337 stderr = subprocess.PIPE, shell =
True)
338 stdout, stderr = p.communicate()
339 retcode = p.returncode
342 values.append(KeyValue(key =
"\"free -m\" Call Error", value = str(retcode)))
343 return DiagnosticStatus.ERROR, values
345 rows = stdout.split(
'\n')
346 data = rows[1].split()
351 level = DiagnosticStatus.OK
352 if float(free_mem) < 25:
353 level = DiagnosticStatus.WARN
354 if float(free_mem) < 1:
355 level = DiagnosticStatus.ERROR
357 values.append(KeyValue(key =
'Memory Status', value = mem_dict[level]))
358 values.append(KeyValue(key =
'Total Memory', value = total_mem))
359 values.append(KeyValue(key =
'Used Memory', value = used_mem))
360 values.append(KeyValue(key =
'Free Memory', value = free_mem))
362 msg = mem_dict[level]
363 except Exception
as e:
364 rospy.logerr(traceback.format_exc())
365 msg =
'Memory Usage Check Error' 366 values.append(KeyValue(key = msg, value = str(e)))
367 level = DiagnosticStatus.ERROR
369 return level, mem_dict[level], values
376 has_warned_mpstat =
False 377 has_error_core_count =
False 380 mp_level = DiagnosticStatus.OK
382 load_dict = { 0:
'OK', 1:
'High Load', 2:
'Error' }
385 p = subprocess.Popen(
'mpstat -P ALL 1 1',
386 stdout = subprocess.PIPE,
387 stderr = subprocess.PIPE, shell =
True)
388 stdout, stderr = p.communicate()
389 retcode = p.returncode
392 global has_warned_mpstat
393 if not has_warned_mpstat:
394 rospy.logerr(
"mpstat failed to run for cpu_monitor. Return code %d.", retcode)
395 has_warned_mpstat =
True 397 mp_level = DiagnosticStatus.ERROR
398 vals.append(KeyValue(key =
'\"mpstat\" Call Error', value = str(retcode)))
399 return mp_level,
'Unable to Check CPU Usage', vals
403 rows = stdout.split(
'\n')
404 col_names = rows[2].split()
405 idle_col = -1
if (len(col_names) > 2
and col_names[-1] ==
'%idle')
else -2
409 for index, row
in enumerate(stdout.split(
'\n')):
414 if row.find(
'all') > -1:
422 if lst[0].startswith(
'Average'):
425 cpu_name =
'%d' % (num_cores)
426 idle = lst[idle_col].replace(
',',
'.')
427 user = lst[3].replace(
',',
'.')
428 nice = lst[4].replace(
',',
'.')
429 system = lst[5].replace(
',',
'.')
432 usage = float(user) + float(nice)
434 rospy.logwarn(
'Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
440 core_level = DiagnosticStatus.WARN
442 core_level = DiagnosticStatus.ERROR
444 vals.append(KeyValue(key =
'CPU %s Status' % cpu_name, value = load_dict[core_level]))
445 vals.append(KeyValue(key =
'CPU %s User' % cpu_name, value = user))
446 vals.append(KeyValue(key =
'CPU %s Nice' % cpu_name, value = nice))
447 vals.append(KeyValue(key =
'CPU %s System' % cpu_name, value = system))
448 vals.append(KeyValue(key =
'CPU %s Idle' % cpu_name, value = idle))
453 if num_cores - cores_loaded <= 2
and num_cores > 2:
454 mp_level = DiagnosticStatus.WARN
457 if core_count > 0
and core_count != num_cores:
458 mp_level = DiagnosticStatus.ERROR
459 global has_error_core_count
460 if not has_error_core_count:
461 rospy.logerr(
'Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
462 core_count, num_cores)
463 has_error_core_count =
True 464 return DiagnosticStatus.ERROR,
'Incorrect number of CPU cores', vals
466 except Exception
as e:
467 mp_level = DiagnosticStatus.ERROR
468 vals.append(KeyValue(key =
'mpstat Exception', value = str(e)))
470 return mp_level, load_dict[mp_level], vals
477 p = subprocess.Popen(
'find /sys/devices -name temp1_input',
478 stdout = subprocess.PIPE,
479 stderr = subprocess.PIPE, shell =
True)
480 stdout, stderr = p.communicate()
481 retcode = p.returncode
484 rospy.logerr(
'Error find core temp locations: %s' % stderr)
487 for ln
in stdout.split(
'\n'):
488 temp_vals.append(ln.strip())
492 rospy.logerr(
'Exception finding temp vals: %s' % traceback.format_exc())
496 time_since_update = rospy.get_time() - last_update_time
499 if time_since_update > 20
and time_since_update <= 35:
500 stale_status =
'Lagging' 501 if stat.level == DiagnosticStatus.OK:
502 stat.message = stale_status
503 elif stat.message.find(stale_status) < 0:
504 stat.message =
', '.join([stat.message, stale_status])
505 stat.level = max(stat.level, DiagnosticStatus.WARN)
506 if time_since_update > 35:
507 stale_status =
'Stale' 508 if stat.level == DiagnosticStatus.OK:
509 stat.message = stale_status
510 elif stat.message.find(stale_status) < 0:
511 stat.message =
', '.join([stat.message, stale_status])
512 stat.level = max(stat.level, DiagnosticStatus.ERROR)
517 stat.values.insert(0, KeyValue(key =
'Update Status', value = stale_status))
518 stat.values.insert(1, KeyValue(key =
'Time Since Update', value = str(time_since_update)))
523 self.
_diag_pub = rospy.Publisher(
'/diagnostics', DiagnosticArray, queue_size=10)
532 rospy.logwarn(
'Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
535 rospy.logwarn(
'NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
551 self._temp_stat.name =
'%s CPU Temperature' % diag_hostname
552 self._temp_stat.level = 1
553 self._temp_stat.hardware_id = hostname
554 self._temp_stat.message =
'No Data' 555 self._temp_stat.values = [ KeyValue(key =
'Update Status', value =
'No Data' ),
556 KeyValue(key =
'Time Since Last Update', value =
'N/A') ]
559 self._usage_stat.name =
'%s CPU Usage' % diag_hostname
560 self._usage_stat.level = 1
561 self._usage_stat.hardware_id = hostname
562 self._usage_stat.message =
'No Data' 563 self._usage_stat.values = [ KeyValue(key =
'Update Status', value =
'No Data' ),
564 KeyValue(key =
'Time Since Last Update', value =
'N/A') ]
567 self._nfs_stat.name =
'%s NFS IO' % diag_hostname
568 self._nfs_stat.level = 1
569 self._nfs_stat.hardware_id = hostname
570 self._nfs_stat.message =
'No Data' 571 self._nfs_stat.values = [ KeyValue(key =
'Update Status', value =
'No Data' ),
572 KeyValue(key =
'Time Since Last Update', value =
'N/A') ]
587 rospy.logerr(
'Restarting temperature check thread in cpu_monitor. This should not happen')
591 self._temps_timer.cancel()
594 except Exception
as e:
595 rospy.logerr(
'Unable to restart temp thread. Error: %s' % traceback.format_exc())
601 self._temps_timer.cancel()
604 self._nfs_timer.cancel()
607 self._usage_timer.cancel()
610 if rospy.is_shutdown():
617 vals = [ KeyValue(key =
'Update Status', value =
'OK' ),
618 KeyValue(key =
'Time Since Last Update', value = str(0) )]
621 p = subprocess.Popen(
'iostat -n',
622 stdout = subprocess.PIPE,
623 stderr = subprocess.PIPE, shell =
True)
624 stdout, stderr = p.communicate()
625 retcode = p.returncode
628 nfs_level = DiagnosticStatus.ERROR
630 vals.append(KeyValue(key =
'\"iostat -n\" Call Error', value = str(e)))
634 for index, row
in enumerate(stdout.split(
'\n')):
645 read_blk_dir = lst[3]
646 write_blk_dir = lst[4]
650 vals.append(KeyValue(
651 key =
'%s Read Blks/s' % file_sys, value=read_blk))
652 vals.append(KeyValue(
653 key =
'%s Write Blks/s' % file_sys, value=write_blk))
654 vals.append(KeyValue(
655 key =
'%s Read Blk dir/s' % file_sys, value=read_blk_dir))
656 vals.append(KeyValue(
657 key =
'%s Write Blks dir/s' % file_sys, value=write_blk_dir))
658 vals.append(KeyValue(
659 key =
'%s Read Blks srv/s' % file_sys, value=r_blk_srv))
660 vals.append(KeyValue(
661 key =
'%s Write Blks srv/s' % file_sys, value=w_blk_srv))
663 except Exception
as e:
664 rospy.logerr(traceback.format_exc())
665 nfs_level = DiagnosticStatus.ERROR
667 vals.append(KeyValue(key =
'Exception', value = str(e)))
670 self._nfs_stat.level = nfs_level
671 self._nfs_stat.message = msg
672 self._nfs_stat.values = vals
676 if not rospy.is_shutdown():
678 self._nfs_timer.start()
685 if rospy.is_shutdown():
690 diag_vals = [ KeyValue(key =
'Update Status', value =
'OK' ),
691 KeyValue(key =
'Time Since Last Update', value = str(0) ) ]
696 ipmi_vals, ipmi_msgs, ipmi_level =
check_ipmi()
697 diag_vals.extend(ipmi_vals)
698 diag_msgs.extend(ipmi_msgs)
699 diag_level = max(diag_level, ipmi_level)
703 diag_vals.extend(core_vals)
704 diag_msgs.extend(core_msgs)
705 diag_level = max(diag_level, core_level)
708 diag_vals.extend(clock_vals)
709 diag_msgs.extend(clock_msgs)
710 diag_level = max(diag_level, clock_level)
712 diag_log = set(diag_msgs)
713 if len(diag_log) > 0:
714 message =
', '.join(diag_log)
716 message = stat_dict[diag_level]
721 self._temp_stat.level = diag_level
722 self._temp_stat.message = message
723 self._temp_stat.values = diag_vals
725 if not rospy.is_shutdown():
727 self._temps_timer.start()
732 if rospy.is_shutdown():
738 diag_vals = [ KeyValue(key =
'Update Status', value =
'OK' ),
739 KeyValue(key =
'Time Since Last Update', value = 0 )]
744 diag_vals.extend(mp_vals)
746 diag_msgs.append(mp_msg)
747 diag_level = max(diag_level, mp_level)
751 diag_vals.extend(up_vals)
753 diag_msgs.append(up_msg)
754 diag_level = max(diag_level, uptime_level)
758 diag_vals.extend(mem_vals)
760 diag_msgs.append(mem_msg)
761 diag_level = max(diag_level, mem_level)
763 if diag_msgs
and diag_level > 0:
764 usage_msg =
', '.join(set(diag_msgs))
766 usage_msg = stat_dict[diag_level]
771 self._usage_stat.level = diag_level
772 self._usage_stat.values = diag_vals
774 self._usage_stat.message = usage_msg
776 if not rospy.is_shutdown():
778 self._usage_timer.start()
790 msg = DiagnosticArray()
791 msg.header.stamp = rospy.get_rostime()
798 self._diag_pub.publish(msg)
808 if __name__ ==
'__main__':
809 hostname = socket.gethostname()
812 parser = optparse.OptionParser(usage=
"usage: cpu_monitor.py [--diag-hostname=cX]")
813 parser.add_option(
"--diag-hostname", dest=
"diag_hostname",
814 help=
"Computer name in diagnostics output (ex: 'c1')",
815 metavar=
"DIAG_HOSTNAME",
816 action=
"store", default = hostname)
817 options, args = parser.parse_args(rospy.myargv())
820 rospy.init_node(
'cpu_monitor_%s' % hostname)
821 except rospy.exceptions.ROSInitException:
822 print(
'CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr)
827 rate = rospy.Rate(1.0)
829 while not rospy.is_shutdown():
831 cpu_node.publish_stats()
832 except KeyboardInterrupt:
834 except Exception
as e:
835 traceback.print_exc()
836 rospy.logerr(traceback.format_exc())
838 cpu_node.cancel_timers()
def get_core_temp_names()
Returns names for core temperature files Returns list of names, each name can be read like file...
def check_uptime(load1_threshold, load5_threshold)
Uses 'uptime' to see load average.
def check_temps(self)
Call every 10sec at minimum.
def check_core_temps(sys_temp_strings)
Check CPU core temps.
def check_clock_speed(enforce_speed)
Checks clock speed from reading from CPU info.
def update_status_stale(stat, last_update_time)
def check_memory()
Uses 'free -m' to check free memory.
def __init__(self, hostname, diag_hostname)
def _restart_temp_check(self)
def check_mpstat(core_count=-1)
def cancel_timers(self)
Must have the lock to cancel everything.