37 from __future__ 
import with_statement
    39 roslib.load_manifest(
'pr2_computer_monitor')
    45 from threading 
import Timer
    47 from time 
import sleep
    53 from diagnostic_msgs.msg 
import DiagnosticArray, DiagnosticStatus, KeyValue
    57 if sys.version_info[:3] == (2, 7, 3):
    59     threading._DummyThread._Thread__stop = 
lambda x: 42
    62 stat_dict = { 0: 
'OK', 1: 
'Warning', 2: 
'Error' }
    68     diag_level = DiagnosticStatus.OK
    71         p = subprocess.Popen(
'sudo ipmitool sdr',
    72                              stdout = subprocess.PIPE,
    73                              stderr = subprocess.PIPE, shell = 
True)
    74         stdout, stderr = p.communicate()
    75         retcode = p.returncode
    78             diag_level = DiagnosticStatus.ERROR
    79             diag_msgs = [ 
'ipmitool Error' ]
    80             diag_vals = [ KeyValue(key = 
'IPMI Error', value = stderr) ]
    81             return diag_vals, diag_msgs, diag_level
    83         lines = stdout.split(
'\n')
    85             diag_vals = [ KeyValue(key = 
'ipmitool status', value = 
'No output') ]
    87             diag_msgs = [ 
'No ipmitool response' ]
    88             diag_level = DiagnosticStatus.ERROR
    90             return diag_vals, diag_msgs, diag_level
   100             name = words[0].strip()
   101             ipmi_val = words[1].strip()
   102             stat_byte = words[2].strip()
   105             if words[0].startswith(
'CPU') 
and words[0].strip().endswith(
'Temp'):
   106                 if words[1].strip().endswith(
'degrees C'):
   107                     tmp = ipmi_val.rstrip(
' degrees C').lstrip()
   108                     if unicode(tmp).isnumeric():
   109                         temperature = float(tmp)
   110                         diag_vals.append(KeyValue(key = name + 
' (C)', value = tmp))
   112                         cpu_name = name.split()[0]
   113                         if temperature >= 80 
and temperature < 89:
   114                             diag_level = max(diag_level, DiagnosticStatus.WARN)
   115                             if diag_msgs.count(
'CPU Hot') == 0:
   116                                 diag_msgs.append(
'CPU Warm')
   118                         if temperature >= 89: 
   119                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
   120                             diag_msgs.append(
'CPU Hot')                                
   122                             if diag_msgs.count(
'CPU Warm') > 0:
   123                                 idx = diag_msgs.index(
'CPU Warm')
   126                     diag_vals.append(KeyValue(key = name, value = words[1]))
   130             if name == 
'MB Temp' or name == 
'BP Temp' or name == 
'FP Temp':
   131                 if ipmi_val.endswith(
'degrees C'):
   132                     tmp = ipmi_val.rstrip(
' degrees C').lstrip()
   133                     diag_vals.append(KeyValue(key = name + 
' (C)', value = tmp))
   135                     dev_name = name.split()[0]
   136                     if unicode(tmp).isnumeric():
   137                         temperature = float(tmp)
   139                         if temperature >= 60 
and temperature < 75:
   140                             diag_level = max(diag_level, DiagnosticStatus.WARN)
   141                             diag_msgs.append(
'%s Warm' % dev_name)
   143                         if temperature >= 75:
   144                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
   145                             diag_msgs.append(
'%s Hot' % dev_name)
   147                         diag_level = max(diag_level, DiagnosticStatus.ERROR)
   148                         diag_msgs.append(
'%s Error' % dev_name)
   150                     diag_vals.append(KeyValue(key = name, value = ipmi_val))
   153             if (name.startswith(
'CPU') 
and name.endswith(
'Fan')) 
or name == 
'MB Fan':
   154                 if ipmi_val.endswith(
'RPM'):
   155                     rpm = ipmi_val.rstrip(
' RPM').lstrip()
   156                     if unicode(rpm).isnumeric():
   158                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
   159                             diag_msgs.append(
'CPU Fan Off')
   161                         diag_vals.append(KeyValue(key = name + 
' RPM', value = rpm))
   163                         diag_vals.append(KeyValue(key = name, value = ipmi_val))
   167             if name.startswith(
'CPU') 
and name.endswith(
'hot'):
   168                 if ipmi_val == 
'0x01':
   169                     diag_vals.append(KeyValue(key = name, value = 
'OK'))
   171                     diag_vals.append(KeyValue(key = name, value = 
'Hot'))
   172                     diag_level = max(diag_level, DiagnosticStatus.ERROR)
   173                     diag_msgs.append(
'CPU Hot Alarm')
   175     except Exception 
as e:
   176         diag_vals.append(KeyValue(key = 
'Exception', value = traceback.format_exc()))
   177         diag_level = DiagnosticStatus.ERROR
   178         diag_msgs.append(
'Exception')
   180     return diag_vals, diag_msgs, diag_level
   192     for index, temp_str 
in enumerate(sys_temp_strings):
   193         if len(temp_str) < 5:
   196         cmd = 
'cat %s' % temp_str
   197         p = subprocess.Popen(cmd, stdout = subprocess.PIPE, 
   198                              stderr = subprocess.PIPE, shell = 
True)
   199         stdout, stderr = p.communicate()
   200         retcode = p.returncode
   203             diag_level = DiagnosticStatus.ERROR
   204             diag_msg = [ 
'Core Temp Error' ]
   205             diag_vals = [ KeyValue(key = 
'Core Temp Error', value = stderr), 
   206                           KeyValue(key = 
'Output', value = stdout) ]
   207             return diag_vals, diag_msgs, diag_level
   210         if unicode(tmp).isnumeric():
   211             temp = float(tmp) / 1000
   212             diag_vals.append(KeyValue(key = 
'Core %d Temp' % index, value = str(temp)))
   214             if temp >= 85 
and temp < 90:
   215                 diag_level = max(diag_level, DiagnosticStatus.WARN)
   216                 diag_msgs.append(
'Warm')
   218                 diag_level = max(diag_level, DiagnosticStatus.ERROR)
   219                 diag_msgs.append(
'Hot')
   221             diag_level = max(diag_level, DiagnosticStatus.ERROR) 
   222             diag_vals.append(KeyValue(key = 
'Core %s Temp' % index, value = tmp))
   224     return diag_vals, diag_msgs, diag_level
   230     lvl = DiagnosticStatus.OK
   233         p = subprocess.Popen(
'cat /proc/cpuinfo | grep MHz', 
   234                              stdout = subprocess.PIPE,
   235                              stderr = subprocess.PIPE, shell = 
True)
   236         stdout, stderr = p.communicate()
   237         retcode = p.returncode
   240             lvl = DiagnosticStatus.ERROR
   241             msgs = [ 
'Clock speed error' ]
   242             vals = [ KeyValue(key = 
'Clock speed error', value = stderr), 
   243                      KeyValue(key = 
'Output', value = stdout) ]
   245             return (vals, msgs, lvl)
   247         for index, ln 
in enumerate(stdout.split(
'\n')):
   248             words = ln.split(
':')
   252             speed = words[1].strip().split(
'.')[0] 
   253             vals.append(KeyValue(key = 
'Core %d MHz' % index, value = speed))
   254             if unicode(speed).isnumeric():
   257                 if mhz < 2240 
and mhz > 2150:
   258                     lvl = max(lvl, DiagnosticStatus.WARN)
   260                     lvl = max(lvl, DiagnosticStatus.ERROR)
   263                 lvl = max(lvl, DiagnosticStatus.ERROR)
   265         if not enforce_speed:
   266             lvl = DiagnosticStatus.OK
   268         if lvl == DiagnosticStatus.WARN 
and enforce_speed:
   269             msgs = [ 
'Core slowing' ]
   270         elif lvl == DiagnosticStatus.ERROR 
and enforce_speed:
   271             msgs = [ 
'Core throttled' ]
   273     except Exception 
as e:
   274         rospy.logerr(traceback.format_exc())
   275         lvl = DiagnosticStatus.ERROR
   276         msgs.append(
'Exception')
   277         vals.append(KeyValue(key = 
'Exception', value = traceback.format_exc()))
   279     return vals, msgs, lvl
   285     level = DiagnosticStatus.OK
   288     load_dict = { 0: 
'OK', 1: 
'High Load', 2: 
'Very High Load' }
   291         p = subprocess.Popen(
'uptime', stdout = subprocess.PIPE, 
   292                              stderr = subprocess.PIPE, shell = 
True)
   293         stdout, stderr = p.communicate()
   294         retcode = p.returncode
   297             vals.append(KeyValue(key = 
'uptime Failed', value = stderr))
   298             return DiagnosticStatus.ERROR, vals
   300         upvals = stdout.split()
   301         load1 = upvals[-3].rstrip(
',')
   302         load5 = upvals[-2].rstrip(
',')
   304         num_users = upvals[-7]
   307         if float(load1) > load1_threshold 
or float(load5) > load5_threshold:
   308             level = DiagnosticStatus.WARN
   310         vals.append(KeyValue(key = 
'Load Average Status', value = load_dict[level]))
   311         vals.append(KeyValue(key = 
'1 min Load Average', value = load1))
   312         vals.append(KeyValue(key = 
'1 min Load Average Threshold', value = str(load1_threshold)))
   313         vals.append(KeyValue(key = 
'5 min Load Average', value = load5))
   314         vals.append(KeyValue(key = 
'5 min Load Average Threshold', value = str(load5_threshold)))
   315         vals.append(KeyValue(key = 
'15 min Load Average', value = load15))
   316         vals.append(KeyValue(key = 
'Number of Users', value = num_users))
   318     except Exception 
as e:
   319         rospy.logerr(traceback.format_exc())
   320         level = DiagnosticStatus.ERROR
   321         vals.append(KeyValue(key = 
'Load Average Status', value = traceback.format_exc()))
   323     return level, load_dict[level], vals
   329     level = DiagnosticStatus.OK
   332     mem_dict = { 0: 
'OK', 1: 
'Low Memory', 2: 
'Very Low Memory' }
   335         p = subprocess.Popen(
'free -m',
   336                              stdout = subprocess.PIPE,
   337                              stderr = subprocess.PIPE, shell = 
True)
   338         stdout, stderr = p.communicate()
   339         retcode = p.returncode
   342             values.append(KeyValue(key = 
"\"free -m\" Call Error", value = str(retcode)))
   343             return DiagnosticStatus.ERROR, values
   345         rows = stdout.split(
'\n')
   346         data = rows[1].split()
   351         level = DiagnosticStatus.OK
   352         if float(free_mem) < 25:
   353             level = DiagnosticStatus.WARN
   354         if float(free_mem) < 1:
   355             level = DiagnosticStatus.ERROR
   357         values.append(KeyValue(key = 
'Memory Status', value = mem_dict[level]))
   358         values.append(KeyValue(key = 
'Total Memory', value = total_mem))
   359         values.append(KeyValue(key = 
'Used Memory', value = used_mem))
   360         values.append(KeyValue(key = 
'Free Memory', value = free_mem))
   362         msg = mem_dict[level]
   363     except Exception 
as e:
   364         rospy.logerr(traceback.format_exc())
   365         msg = 
'Memory Usage Check Error'   366         values.append(KeyValue(key = msg, value = str(e)))
   367         level = DiagnosticStatus.ERROR
   369     return level, mem_dict[level], values
   376 has_warned_mpstat = 
False   377 has_error_core_count = 
False   380     mp_level = DiagnosticStatus.OK
   382     load_dict = { 0: 
'OK', 1: 
'High Load', 2: 
'Error' }
   385         p = subprocess.Popen(
'mpstat -P ALL 1 1',
   386                              stdout = subprocess.PIPE,
   387                              stderr = subprocess.PIPE, shell = 
True)
   388         stdout, stderr = p.communicate()
   389         retcode = p.returncode
   392             global has_warned_mpstat
   393             if not has_warned_mpstat:
   394                 rospy.logerr(
"mpstat failed to run for cpu_monitor. Return code %d.", retcode)
   395                 has_warned_mpstat = 
True   397             mp_level = DiagnosticStatus.ERROR
   398             vals.append(KeyValue(key = 
'\"mpstat\" Call Error', value = str(retcode)))
   399             return mp_level, 
'Unable to Check CPU Usage', vals
   403         rows = stdout.split(
'\n')
   404         col_names = rows[2].split()
   405         idle_col = -1 
if (len(col_names) > 2 
and col_names[-1] == 
'%idle') 
else -2
   409         for index, row 
in enumerate(stdout.split(
'\n')):
   414             if row.find(
'all') > -1:
   422             if lst[0].startswith(
'Average'):
   425             cpu_name = 
'%d' % (num_cores)
   426             idle = lst[idle_col].replace(
',', 
'.')
   427             user = lst[3].replace(
',', 
'.')
   428             nice = lst[4].replace(
',', 
'.')
   429             system = lst[5].replace(
',', 
'.')
   432             usage = float(user) + float(nice)
   434                 rospy.logwarn(
'Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
   440                 core_level = DiagnosticStatus.WARN
   442                 core_level = DiagnosticStatus.ERROR
   444             vals.append(KeyValue(key = 
'CPU %s Status' % cpu_name, value = load_dict[core_level]))
   445             vals.append(KeyValue(key = 
'CPU %s User' % cpu_name, value = user))
   446             vals.append(KeyValue(key = 
'CPU %s Nice' % cpu_name, value = nice))
   447             vals.append(KeyValue(key = 
'CPU %s System' % cpu_name, value = system))
   448             vals.append(KeyValue(key = 
'CPU %s Idle' % cpu_name, value = idle))
   453         if num_cores - cores_loaded <= 2 
and num_cores > 2:
   454             mp_level = DiagnosticStatus.WARN
   457         if core_count > 0 
and core_count != num_cores:
   458             mp_level = DiagnosticStatus.ERROR
   459             global has_error_core_count
   460             if not has_error_core_count:
   461                 rospy.logerr(
'Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
   462                               core_count, num_cores)
   463                 has_error_core_count = 
True   464             return DiagnosticStatus.ERROR, 
'Incorrect number of CPU cores', vals
   466     except Exception 
as e:
   467         mp_level = DiagnosticStatus.ERROR
   468         vals.append(KeyValue(key = 
'mpstat Exception', value = str(e)))
   470     return mp_level, load_dict[mp_level], vals
   477         p = subprocess.Popen(
'find /sys/devices -name temp1_input', 
   478                              stdout = subprocess.PIPE,
   479                              stderr = subprocess.PIPE, shell = 
True)
   480         stdout, stderr = p.communicate()
   481         retcode = p.returncode
   484             rospy.logerr(
'Error find core temp locations: %s' % stderr)
   487         for ln 
in stdout.split(
'\n'):
   488             temp_vals.append(ln.strip())
   492         rospy.logerr(
'Exception finding temp vals: %s' % traceback.format_exc())
   496     time_since_update = rospy.get_time() - last_update_time
   499     if time_since_update > 20 
and time_since_update <= 35:
   500         stale_status = 
'Lagging'   501         if stat.level == DiagnosticStatus.OK:
   502             stat.message = stale_status
   503         elif stat.message.find(stale_status) < 0:
   504             stat.message = 
', '.join([stat.message, stale_status])
   505         stat.level = max(stat.level, DiagnosticStatus.WARN)
   506     if time_since_update > 35:
   507         stale_status = 
'Stale'   508         if stat.level == DiagnosticStatus.OK:
   509             stat.message = stale_status
   510         elif stat.message.find(stale_status) < 0:
   511             stat.message = 
', '.join([stat.message, stale_status])
   512         stat.level = max(stat.level, DiagnosticStatus.ERROR)
   517     stat.values.insert(0, KeyValue(key = 
'Update Status', value = stale_status))
   518     stat.values.insert(1, KeyValue(key = 
'Time Since Update', value = str(time_since_update)))
   523         self.
_diag_pub = rospy.Publisher(
'/diagnostics', DiagnosticArray, queue_size=10)
   532             rospy.logwarn(
'Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
   535             rospy.logwarn(
'NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
   551         self._temp_stat.name = 
'%s CPU Temperature' % diag_hostname
   552         self._temp_stat.level = 1
   553         self._temp_stat.hardware_id = hostname
   554         self._temp_stat.message = 
'No Data'   555         self._temp_stat.values = [ KeyValue(key = 
'Update Status', value = 
'No Data' ),
   556                                    KeyValue(key = 
'Time Since Last Update', value = 
'N/A') ]
   559         self._usage_stat.name = 
'%s CPU Usage' % diag_hostname
   560         self._usage_stat.level = 1
   561         self._usage_stat.hardware_id = hostname
   562         self._usage_stat.message = 
'No Data'   563         self._usage_stat.values = [ KeyValue(key = 
'Update Status', value = 
'No Data' ),
   564                                     KeyValue(key = 
'Time Since Last Update', value = 
'N/A') ]
   567         self._nfs_stat.name = 
'%s NFS IO' % diag_hostname
   568         self._nfs_stat.level = 1
   569         self._nfs_stat.hardware_id = hostname
   570         self._nfs_stat.message = 
'No Data'   571         self._nfs_stat.values = [ KeyValue(key = 
'Update Status', value = 
'No Data' ),
   572                                   KeyValue(key = 
'Time Since Last Update', value = 
'N/A') ]
   587         rospy.logerr(
'Restarting temperature check thread in cpu_monitor. This should not happen')
   591                     self._temps_timer.cancel()
   594         except Exception 
as e:
   595             rospy.logerr(
'Unable to restart temp thread. Error: %s' % traceback.format_exc())
   601             self._temps_timer.cancel()
   604             self._nfs_timer.cancel()
   607             self._usage_timer.cancel()
   610         if rospy.is_shutdown():
   617         vals = [ KeyValue(key = 
'Update Status', value = 
'OK' ),
   618                  KeyValue(key = 
'Time Since Last Update', value = str(0) )]
   621             p = subprocess.Popen(
'iostat -n',
   622                                  stdout = subprocess.PIPE,
   623                                  stderr = subprocess.PIPE, shell = 
True)
   624             stdout, stderr = p.communicate()
   625             retcode = p.returncode
   628                 nfs_level = DiagnosticStatus.ERROR
   630                 vals.append(KeyValue(key = 
'\"iostat -n\" Call Error', value = str(e)))
   634             for index, row 
in enumerate(stdout.split(
'\n')):
   645                 read_blk_dir = lst[3]
   646                 write_blk_dir = lst[4]
   650                 vals.append(KeyValue(
   651                         key = 
'%s Read Blks/s' % file_sys, value=read_blk))
   652                 vals.append(KeyValue(
   653                         key = 
'%s Write Blks/s' % file_sys, value=write_blk))
   654                 vals.append(KeyValue(
   655                         key = 
'%s Read Blk dir/s' % file_sys, value=read_blk_dir))
   656                 vals.append(KeyValue(
   657                         key = 
'%s Write Blks dir/s' % file_sys, value=write_blk_dir))
   658                 vals.append(KeyValue(
   659                         key = 
'%s Read Blks srv/s' % file_sys, value=r_blk_srv))
   660                 vals.append(KeyValue(
   661                         key = 
'%s Write Blks srv/s' % file_sys, value=w_blk_srv))
   663         except Exception 
as e:
   664             rospy.logerr(traceback.format_exc())
   665             nfs_level = DiagnosticStatus.ERROR
   667             vals.append(KeyValue(key = 
'Exception', value = str(e)))
   670             self._nfs_stat.level = nfs_level
   671             self._nfs_stat.message = msg
   672             self._nfs_stat.values = vals
   676             if not rospy.is_shutdown():
   678                 self._nfs_timer.start()
   685         if rospy.is_shutdown():
   690         diag_vals = [ KeyValue(key = 
'Update Status', value = 
'OK' ),
   691                       KeyValue(key = 
'Time Since Last Update', value = str(0) ) ]
   696             ipmi_vals, ipmi_msgs, ipmi_level = 
check_ipmi()
   697             diag_vals.extend(ipmi_vals)
   698             diag_msgs.extend(ipmi_msgs)
   699             diag_level = max(diag_level, ipmi_level)
   703             diag_vals.extend(core_vals)
   704             diag_msgs.extend(core_msgs)
   705             diag_level = max(diag_level, core_level)
   708         diag_vals.extend(clock_vals)
   709         diag_msgs.extend(clock_msgs)
   710         diag_level = max(diag_level, clock_level)
   712         diag_log = set(diag_msgs)
   713         if len(diag_log) > 0:
   714             message = 
', '.join(diag_log)
   716             message = stat_dict[diag_level]
   721             self._temp_stat.level = diag_level
   722             self._temp_stat.message = message
   723             self._temp_stat.values = diag_vals
   725             if not rospy.is_shutdown():
   727                 self._temps_timer.start()
   732         if rospy.is_shutdown():
   738         diag_vals = [ KeyValue(key = 
'Update Status', value = 
'OK' ),
   739                       KeyValue(key = 
'Time Since Last Update', value = 0 )]
   744         diag_vals.extend(mp_vals)
   746             diag_msgs.append(mp_msg)
   747         diag_level = max(diag_level, mp_level)
   751         diag_vals.extend(up_vals)
   753             diag_msgs.append(up_msg)
   754         diag_level = max(diag_level, uptime_level)
   758         diag_vals.extend(mem_vals)
   760             diag_msgs.append(mem_msg)
   761         diag_level = max(diag_level, mem_level)
   763         if diag_msgs 
and diag_level > 0:
   764             usage_msg = 
', '.join(set(diag_msgs))
   766             usage_msg = stat_dict[diag_level]
   771             self._usage_stat.level = diag_level
   772             self._usage_stat.values = diag_vals
   774             self._usage_stat.message = usage_msg
   776             if not rospy.is_shutdown():
   778                 self._usage_timer.start()
   790             msg = DiagnosticArray()
   791             msg.header.stamp = rospy.get_rostime()
   798                 self._diag_pub.publish(msg)
   808 if __name__ == 
'__main__':
   809     hostname = socket.gethostname()
   812     parser = optparse.OptionParser(usage=
"usage: cpu_monitor.py [--diag-hostname=cX]")
   813     parser.add_option(
"--diag-hostname", dest=
"diag_hostname",
   814                       help=
"Computer name in diagnostics output (ex: 'c1')",
   815                       metavar=
"DIAG_HOSTNAME",
   816                       action=
"store", default = hostname)
   817     options, args = parser.parse_args(rospy.myargv())
   820         rospy.init_node(
'cpu_monitor_%s' % hostname)
   821     except rospy.exceptions.ROSInitException:
   822         print(
'CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr)
   827     rate = rospy.Rate(1.0)
   829         while not rospy.is_shutdown():
   831             cpu_node.publish_stats()
   832     except KeyboardInterrupt:
   834     except Exception 
as e:
   835         traceback.print_exc()
   836         rospy.logerr(traceback.format_exc())
   838     cpu_node.cancel_timers()
 def get_core_temp_names()
Returns names for core temperature files Returns list of names, each name can be read like file...
def check_uptime(load1_threshold, load5_threshold)
Uses 'uptime' to see load average. 
def check_temps(self)
Call every 10sec at minimum. 
def check_core_temps(sys_temp_strings)
Check CPU core temps. 
def check_clock_speed(enforce_speed)
Checks clock speed from reading from CPU info. 
def update_status_stale(stat, last_update_time)
def check_memory()
Uses 'free -m' to check free memory. 
def __init__(self, hostname, diag_hostname)
def _restart_temp_check(self)
def check_mpstat(core_count=-1)
def cancel_timers(self)
Must have the lock to cancel everything.