37 from __future__
import with_statement, print_function
39 roslib.load_manifest(
'pr2_computer_monitor')
45 from threading
import Timer
47 from time
import sleep
53 from diagnostic_msgs.msg
import DiagnosticArray, DiagnosticStatus, KeyValue
57 if sys.version_info[:3] == (2, 7, 3):
59 threading._DummyThread._Thread__stop =
lambda x: 42
62 stat_dict = { 0:
'OK', 1:
'Warning', 2:
'Error' }
68 diag_level = DiagnosticStatus.OK
71 p = subprocess.Popen(
'sudo ipmitool sdr',
72 stdout = subprocess.PIPE,
73 stderr = subprocess.PIPE, shell =
True)
74 stdout, stderr = p.communicate()
75 stdout = stdout.decode()
76 stderr = stderr.decode()
77 retcode = p.returncode
80 diag_level = DiagnosticStatus.ERROR
81 diag_msgs = [
'ipmitool Error' ]
82 diag_vals = [ KeyValue(key =
'IPMI Error', value = stderr) ]
83 return diag_vals, diag_msgs, diag_level
85 lines = stdout.split(
'\n')
87 diag_vals = [ KeyValue(key =
'ipmitool status', value =
'No output') ]
89 diag_msgs = [
'No ipmitool response' ]
90 diag_level = DiagnosticStatus.ERROR
92 return diag_vals, diag_msgs, diag_level
102 name = words[0].strip()
103 ipmi_val = words[1].strip()
104 stat_byte = words[2].strip()
107 if words[0].startswith(
'CPU')
and words[0].strip().endswith(
'Temp'):
108 if words[1].strip().endswith(
'degrees C'):
109 tmp = ipmi_val.rstrip(
' degrees C').lstrip()
111 temperature = float(tmp)
112 diag_vals.append(KeyValue(key = name +
' (C)', value = tmp))
114 cpu_name = name.split()[0]
115 if temperature >= 80
and temperature < 89:
116 diag_level = max(diag_level, DiagnosticStatus.WARN)
117 if diag_msgs.count(
'CPU Hot') == 0:
118 diag_msgs.append(
'CPU Warm')
120 if temperature >= 89:
121 diag_level = max(diag_level, DiagnosticStatus.ERROR)
122 diag_msgs.append(
'CPU Hot')
124 if diag_msgs.count(
'CPU Warm') > 0:
125 idx = diag_msgs.index(
'CPU Warm')
128 diag_vals.append(KeyValue(key = name, value = words[1]))
132 if name ==
'MB Temp' or name ==
'BP Temp' or name ==
'FP Temp':
133 if ipmi_val.endswith(
'degrees C'):
134 tmp = ipmi_val.rstrip(
' degrees C').lstrip()
135 diag_vals.append(KeyValue(key = name +
' (C)', value = tmp))
137 dev_name = name.split()[0]
139 temperature = float(tmp)
141 if temperature >= 60
and temperature < 75:
142 diag_level = max(diag_level, DiagnosticStatus.WARN)
143 diag_msgs.append(
'%s Warm' % dev_name)
145 if temperature >= 75:
146 diag_level = max(diag_level, DiagnosticStatus.ERROR)
147 diag_msgs.append(
'%s Hot' % dev_name)
149 diag_level = max(diag_level, DiagnosticStatus.ERROR)
150 diag_msgs.append(
'%s Error' % dev_name)
152 diag_vals.append(KeyValue(key = name, value = ipmi_val))
155 if (name.startswith(
'CPU')
and name.endswith(
'Fan'))
or name ==
'MB Fan':
156 if ipmi_val.endswith(
'RPM'):
157 rpm = ipmi_val.rstrip(
' RPM').lstrip()
160 diag_level = max(diag_level, DiagnosticStatus.ERROR)
161 diag_msgs.append(
'CPU Fan Off')
163 diag_vals.append(KeyValue(key = name +
' RPM', value = rpm))
165 diag_vals.append(KeyValue(key = name, value = ipmi_val))
169 if name.startswith(
'CPU')
and name.endswith(
'hot'):
170 if ipmi_val ==
'0x01':
171 diag_vals.append(KeyValue(key = name, value =
'OK'))
173 diag_vals.append(KeyValue(key = name, value =
'Hot'))
174 diag_level = max(diag_level, DiagnosticStatus.ERROR)
175 diag_msgs.append(
'CPU Hot Alarm')
177 except Exception
as e:
178 diag_vals.append(KeyValue(key =
'Exception', value = traceback.format_exc()))
179 diag_level = DiagnosticStatus.ERROR
180 diag_msgs.append(
'Exception')
182 return diag_vals, diag_msgs, diag_level
194 for index, temp_str
in enumerate(sys_temp_strings):
195 if len(temp_str) < 5:
198 cmd =
'cat %s' % temp_str
199 p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
200 stderr = subprocess.PIPE, shell =
True)
201 stdout, stderr = p.communicate()
202 stdout = stdout.decode()
203 stderr = stderr.decode()
204 retcode = p.returncode
207 diag_level = DiagnosticStatus.ERROR
208 diag_msg = [
'Core Temp Error' ]
209 diag_vals = [ KeyValue(key =
'Core Temp Error', value = stderr),
210 KeyValue(key =
'Output', value = stdout) ]
211 return diag_vals, diag_msgs, diag_level
215 temp = float(tmp) / 1000
216 diag_vals.append(KeyValue(key =
'Core %d Temp' % index, value = str(temp)))
218 if temp >= 85
and temp < 90:
219 diag_level = max(diag_level, DiagnosticStatus.WARN)
220 diag_msgs.append(
'Warm')
222 diag_level = max(diag_level, DiagnosticStatus.ERROR)
223 diag_msgs.append(
'Hot')
225 diag_level = max(diag_level, DiagnosticStatus.ERROR)
226 diag_vals.append(KeyValue(key =
'Core %s Temp' % index, value = tmp))
228 return diag_vals, diag_msgs, diag_level
234 lvl = DiagnosticStatus.OK
237 p = subprocess.Popen(
'cat /proc/cpuinfo | grep MHz',
238 stdout = subprocess.PIPE,
239 stderr = subprocess.PIPE, shell =
True)
240 stdout, stderr = p.communicate()
241 stdout = stdout.decode()
242 stderr = stderr.decode()
243 retcode = p.returncode
246 lvl = DiagnosticStatus.ERROR
247 msgs = [
'Clock speed error' ]
248 vals = [ KeyValue(key =
'Clock speed error', value = stderr),
249 KeyValue(key =
'Output', value = stdout) ]
251 return (vals, msgs, lvl)
253 for index, ln
in enumerate(stdout.split(
'\n')):
254 words = ln.split(
':')
258 speed = words[1].strip().split(
'.')[0]
259 vals.append(KeyValue(key =
'Core %d MHz' % index, value = speed))
260 if speed.isnumeric():
263 if mhz < 2240
and mhz > 2150:
264 lvl = max(lvl, DiagnosticStatus.WARN)
266 lvl = max(lvl, DiagnosticStatus.ERROR)
269 lvl = max(lvl, DiagnosticStatus.ERROR)
271 if not enforce_speed:
272 lvl = DiagnosticStatus.OK
274 if lvl == DiagnosticStatus.WARN
and enforce_speed:
275 msgs = [
'Core slowing' ]
276 elif lvl == DiagnosticStatus.ERROR
and enforce_speed:
277 msgs = [
'Core throttled' ]
279 except Exception
as e:
280 rospy.logerr(traceback.format_exc())
281 lvl = DiagnosticStatus.ERROR
282 msgs.append(
'Exception')
283 vals.append(KeyValue(key =
'Exception', value = traceback.format_exc()))
285 return vals, msgs, lvl
291 level = DiagnosticStatus.OK
294 load_dict = { 0:
'OK', 1:
'High Load', 2:
'Very High Load' }
297 p = subprocess.Popen(
'uptime', stdout = subprocess.PIPE,
298 stderr = subprocess.PIPE, shell =
True)
299 stdout, stderr = p.communicate()
300 stdout = stdout.decode()
301 stderr = stderr.decode()
302 retcode = p.returncode
305 vals.append(KeyValue(key =
'uptime Failed', value = stderr))
306 return DiagnosticStatus.ERROR, vals
308 upvals = stdout.split()
309 load1 = upvals[-3].rstrip(
',')
310 load5 = upvals[-2].rstrip(
',')
312 num_users = upvals[-7]
315 if float(load1) > load1_threshold
or float(load5) > load5_threshold:
316 level = DiagnosticStatus.WARN
318 vals.append(KeyValue(key =
'Load Average Status', value = load_dict[level]))
319 vals.append(KeyValue(key =
'1 min Load Average', value = load1))
320 vals.append(KeyValue(key =
'1 min Load Average Threshold', value = str(load1_threshold)))
321 vals.append(KeyValue(key =
'5 min Load Average', value = load5))
322 vals.append(KeyValue(key =
'5 min Load Average Threshold', value = str(load5_threshold)))
323 vals.append(KeyValue(key =
'15 min Load Average', value = load15))
324 vals.append(KeyValue(key =
'Number of Users', value = num_users))
326 except Exception
as e:
327 rospy.logerr(traceback.format_exc())
328 level = DiagnosticStatus.ERROR
329 vals.append(KeyValue(key =
'Load Average Status', value = traceback.format_exc()))
331 return level, load_dict[level], vals
337 level = DiagnosticStatus.OK
340 mem_dict = { 0:
'OK', 1:
'Low Memory', 2:
'Very Low Memory' }
343 p = subprocess.Popen(
'free -m',
344 stdout = subprocess.PIPE,
345 stderr = subprocess.PIPE, shell =
True)
346 stdout, stderr = p.communicate()
347 stdout = stdout.decode()
348 retcode = p.returncode
351 values.append(KeyValue(key =
"\"free -m\" Call Error", value = str(retcode)))
352 return DiagnosticStatus.ERROR, values
354 rows = stdout.split(
'\n')
355 data = rows[1].split()
360 level = DiagnosticStatus.OK
361 if float(free_mem) < 25:
362 level = DiagnosticStatus.WARN
363 if float(free_mem) < 1:
364 level = DiagnosticStatus.ERROR
366 values.append(KeyValue(key =
'Memory Status', value = mem_dict[level]))
367 values.append(KeyValue(key =
'Total Memory', value = total_mem))
368 values.append(KeyValue(key =
'Used Memory', value = used_mem))
369 values.append(KeyValue(key =
'Free Memory', value = free_mem))
371 msg = mem_dict[level]
372 except Exception
as e:
373 rospy.logerr(traceback.format_exc())
374 msg =
'Memory Usage Check Error'
375 values.append(KeyValue(key = msg, value = str(e)))
376 level = DiagnosticStatus.ERROR
378 return level, mem_dict[level], values
385 has_warned_mpstat =
False
386 has_error_core_count =
False
389 mp_level = DiagnosticStatus.OK
391 load_dict = { 0:
'OK', 1:
'High Load', 2:
'Error' }
394 p = subprocess.Popen(
'mpstat -P ALL 1 1',
395 stdout = subprocess.PIPE,
396 stderr = subprocess.PIPE, shell =
True)
397 stdout, stderr = p.communicate()
398 stdout = stdout.decode()
399 stderr = stderr.decode()
400 retcode = p.returncode
403 global has_warned_mpstat
404 if not has_warned_mpstat:
405 rospy.logerr(
"mpstat failed to run for cpu_monitor. Return code %d.", retcode)
406 has_warned_mpstat =
True
408 mp_level = DiagnosticStatus.ERROR
409 vals.append(KeyValue(key =
'\"mpstat\" Call Error', value = str(retcode)))
410 return mp_level,
'Unable to Check CPU Usage', vals
414 rows = stdout.split(
'\n')
415 col_names = rows[2].split()
416 idle_col = -1
if (len(col_names) > 2
and col_names[-1] ==
'%idle')
else -2
420 for index, row
in enumerate(stdout.split(
'\n')):
425 if row.find(
'all') > -1:
433 if lst[0].startswith(
'Average'):
436 cpu_name =
'%d' % (num_cores)
437 idle = lst[idle_col].replace(
',',
'.')
438 user = lst[3].replace(
',',
'.')
439 nice = lst[4].replace(
',',
'.')
440 system = lst[5].replace(
',',
'.')
443 usage = float(user) + float(nice)
445 rospy.logwarn(
'Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
451 core_level = DiagnosticStatus.WARN
453 core_level = DiagnosticStatus.ERROR
455 vals.append(KeyValue(key =
'CPU %s Status' % cpu_name, value = load_dict[core_level]))
456 vals.append(KeyValue(key =
'CPU %s User' % cpu_name, value = user))
457 vals.append(KeyValue(key =
'CPU %s Nice' % cpu_name, value = nice))
458 vals.append(KeyValue(key =
'CPU %s System' % cpu_name, value = system))
459 vals.append(KeyValue(key =
'CPU %s Idle' % cpu_name, value = idle))
464 if num_cores - cores_loaded <= 2
and num_cores > 2:
465 mp_level = DiagnosticStatus.WARN
468 if core_count > 0
and core_count != num_cores:
469 mp_level = DiagnosticStatus.ERROR
470 global has_error_core_count
471 if not has_error_core_count:
472 rospy.logerr(
'Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
473 core_count, num_cores)
474 has_error_core_count =
True
475 return DiagnosticStatus.ERROR,
'Incorrect number of CPU cores', vals
477 except Exception
as e:
478 mp_level = DiagnosticStatus.ERROR
479 vals.append(KeyValue(key =
'mpstat Exception', value = str(e)))
481 return mp_level, load_dict[mp_level], vals
488 p = subprocess.Popen(
'find /sys/devices -name temp1_input',
489 stdout = subprocess.PIPE,
490 stderr = subprocess.PIPE, shell =
True)
491 stdout, stderr = p.communicate()
492 stdout = stdout.decode()
493 stderr = stderr.decode()
494 retcode = p.returncode
497 rospy.logerr(
'Error find core temp locations: %s' % stderr)
500 for ln
in stdout.split(
'\n'):
501 temp_vals.append(ln.strip())
505 rospy.logerr(
'Exception finding temp vals: %s' % traceback.format_exc())
509 time_since_update = rospy.get_time() - last_update_time
512 if time_since_update > 20
and time_since_update <= 35:
513 stale_status =
'Lagging'
514 if stat.level == DiagnosticStatus.OK:
515 stat.message = stale_status
516 elif stat.message.find(stale_status) < 0:
517 stat.message =
', '.join([stat.message, stale_status])
518 stat.level = max(stat.level, DiagnosticStatus.WARN)
519 if time_since_update > 35:
520 stale_status =
'Stale'
521 if stat.level == DiagnosticStatus.OK:
522 stat.message = stale_status
523 elif stat.message.find(stale_status) < 0:
524 stat.message =
', '.join([stat.message, stale_status])
525 stat.level = max(stat.level, DiagnosticStatus.ERROR)
530 stat.values.insert(0, KeyValue(key =
'Update Status', value = stale_status))
531 stat.values.insert(1, KeyValue(key =
'Time Since Update', value = str(time_since_update)))
536 self.
_diag_pub = rospy.Publisher(
'/diagnostics', DiagnosticArray, queue_size=10)
545 rospy.logwarn(
'Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
548 rospy.logwarn(
'NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
564 self.
_temp_stat.name =
'%s CPU Temperature' % diag_hostname
568 self.
_temp_stat.values = [ KeyValue(key =
'Update Status', value =
'No Data' ),
569 KeyValue(key =
'Time Since Last Update', value =
'N/A') ]
572 self.
_usage_stat.name =
'%s CPU Usage' % diag_hostname
576 self.
_usage_stat.values = [ KeyValue(key =
'Update Status', value =
'No Data' ),
577 KeyValue(key =
'Time Since Last Update', value =
'N/A') ]
580 self.
_nfs_stat.name =
'%s NFS IO' % diag_hostname
584 self.
_nfs_stat.values = [ KeyValue(key =
'Update Status', value =
'No Data' ),
585 KeyValue(key =
'Time Since Last Update', value =
'N/A') ]
600 rospy.logerr(
'Restarting temperature check thread in cpu_monitor. This should not happen')
607 except Exception
as e:
608 rospy.logerr(
'Unable to restart temp thread. Error: %s' % traceback.format_exc())
623 if rospy.is_shutdown():
630 vals = [ KeyValue(key =
'Update Status', value =
'OK' ),
631 KeyValue(key =
'Time Since Last Update', value = str(0) )]
634 p = subprocess.Popen(
'iostat -n',
635 stdout = subprocess.PIPE,
636 stderr = subprocess.PIPE, shell =
True)
637 stdout, stderr = p.communicate()
638 stdout = stdout.decode()
639 stderr = stderr.decode()
640 retcode = p.returncode
643 nfs_level = DiagnosticStatus.ERROR
645 vals.append(KeyValue(key =
'\"iostat -n\" Call Error', value = str(e)))
649 for index, row
in enumerate(stdout.split(
'\n')):
660 read_blk_dir = lst[3]
661 write_blk_dir = lst[4]
665 vals.append(KeyValue(
666 key =
'%s Read Blks/s' % file_sys, value=read_blk))
667 vals.append(KeyValue(
668 key =
'%s Write Blks/s' % file_sys, value=write_blk))
669 vals.append(KeyValue(
670 key =
'%s Read Blk dir/s' % file_sys, value=read_blk_dir))
671 vals.append(KeyValue(
672 key =
'%s Write Blks dir/s' % file_sys, value=write_blk_dir))
673 vals.append(KeyValue(
674 key =
'%s Read Blks srv/s' % file_sys, value=r_blk_srv))
675 vals.append(KeyValue(
676 key =
'%s Write Blks srv/s' % file_sys, value=w_blk_srv))
678 except Exception
as e:
679 rospy.logerr(traceback.format_exc())
680 nfs_level = DiagnosticStatus.ERROR
682 vals.append(KeyValue(key =
'Exception', value = str(e)))
691 if not rospy.is_shutdown():
700 if rospy.is_shutdown():
705 diag_vals = [ KeyValue(key =
'Update Status', value =
'OK' ),
706 KeyValue(key =
'Time Since Last Update', value = str(0) ) ]
711 ipmi_vals, ipmi_msgs, ipmi_level =
check_ipmi()
712 diag_vals.extend(ipmi_vals)
713 diag_msgs.extend(ipmi_msgs)
714 diag_level = max(diag_level, ipmi_level)
718 diag_vals.extend(core_vals)
719 diag_msgs.extend(core_msgs)
720 diag_level = max(diag_level, core_level)
723 diag_vals.extend(clock_vals)
724 diag_msgs.extend(clock_msgs)
725 diag_level = max(diag_level, clock_level)
727 diag_log = set(diag_msgs)
728 if len(diag_log) > 0:
729 message =
', '.join(diag_log)
731 message = stat_dict[diag_level]
740 if not rospy.is_shutdown():
747 if rospy.is_shutdown():
753 diag_vals = [ KeyValue(key =
'Update Status', value =
'OK' ),
754 KeyValue(key =
'Time Since Last Update', value = 0 )]
759 diag_vals.extend(mp_vals)
761 diag_msgs.append(mp_msg)
762 diag_level = max(diag_level, mp_level)
766 diag_vals.extend(up_vals)
768 diag_msgs.append(up_msg)
769 diag_level = max(diag_level, uptime_level)
773 diag_vals.extend(mem_vals)
775 diag_msgs.append(mem_msg)
776 diag_level = max(diag_level, mem_level)
778 if diag_msgs
and diag_level > 0:
779 usage_msg =
', '.join(set(diag_msgs))
781 usage_msg = stat_dict[diag_level]
791 if not rospy.is_shutdown():
805 msg = DiagnosticArray()
806 msg.header.stamp = rospy.get_rostime()
823 if __name__ ==
'__main__':
824 hostname = socket.gethostname()
827 parser = optparse.OptionParser(usage=
"usage: cpu_monitor.py [--diag-hostname=cX]")
828 parser.add_option(
"--diag-hostname", dest=
"diag_hostname",
829 help=
"Computer name in diagnostics output (ex: 'c1')",
830 metavar=
"DIAG_HOSTNAME",
831 action=
"store", default = hostname)
832 options, args = parser.parse_args(rospy.myargv())
835 rospy.init_node(
'cpu_monitor_%s' % hostname)
836 except rospy.exceptions.ROSInitException:
837 print(
'CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr)
842 rate = rospy.Rate(1.0)
844 while not rospy.is_shutdown():
846 cpu_node.publish_stats()
847 except KeyboardInterrupt:
849 except Exception
as e:
850 traceback.print_exc()
851 rospy.logerr(traceback.format_exc())
853 cpu_node.cancel_timers()