37 from __future__ 
import with_statement, print_function
 
   39 roslib.load_manifest(
'pr2_computer_monitor')
 
   45 from threading 
import Timer
 
   47 from time 
import sleep
 
   53 from diagnostic_msgs.msg 
import DiagnosticArray, DiagnosticStatus, KeyValue
 
   57 if sys.version_info[:3] == (2, 7, 3):
 
   59     threading._DummyThread._Thread__stop = 
lambda x: 42
 
   62 stat_dict = { 0: 
'OK', 1: 
'Warning', 2: 
'Error' }
 
   68     diag_level = DiagnosticStatus.OK
 
   71         p = subprocess.Popen(
'sudo ipmitool sdr',
 
   72                              stdout = subprocess.PIPE,
 
   73                              stderr = subprocess.PIPE, shell = 
True)
 
   74         stdout, stderr = p.communicate()
 
   75         stdout = stdout.decode()
 
   76         stderr = stderr.decode()
 
   77         retcode = p.returncode
 
   80             diag_level = DiagnosticStatus.ERROR
 
   81             diag_msgs = [ 
'ipmitool Error' ]
 
   82             diag_vals = [ KeyValue(key = 
'IPMI Error', value = stderr) ]
 
   83             return diag_vals, diag_msgs, diag_level
 
   85         lines = stdout.split(
'\n')
 
   87             diag_vals = [ KeyValue(key = 
'ipmitool status', value = 
'No output') ]
 
   89             diag_msgs = [ 
'No ipmitool response' ]
 
   90             diag_level = DiagnosticStatus.ERROR
 
   92             return diag_vals, diag_msgs, diag_level
 
  102             name = words[0].strip()
 
  103             ipmi_val = words[1].strip()
 
  104             stat_byte = words[2].strip()
 
  107             if words[0].startswith(
'CPU') 
and words[0].strip().endswith(
'Temp'):
 
  108                 if words[1].strip().endswith(
'degrees C'):
 
  109                     tmp = ipmi_val.rstrip(
' degrees C').lstrip()
 
  111                         temperature = float(tmp)
 
  112                         diag_vals.append(KeyValue(key = name + 
' (C)', value = tmp))
 
  114                         cpu_name = name.split()[0]
 
  115                         if temperature >= 80 
and temperature < 89:
 
  116                             diag_level = max(diag_level, DiagnosticStatus.WARN)
 
  117                             if diag_msgs.count(
'CPU Hot') == 0:
 
  118                                 diag_msgs.append(
'CPU Warm')
 
  120                         if temperature >= 89: 
 
  121                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
 
  122                             diag_msgs.append(
'CPU Hot')                                
 
  124                             if diag_msgs.count(
'CPU Warm') > 0:
 
  125                                 idx = diag_msgs.index(
'CPU Warm')
 
  128                     diag_vals.append(KeyValue(key = name, value = words[1]))
 
  132             if name == 
'MB Temp' or name == 
'BP Temp' or name == 
'FP Temp':
 
  133                 if ipmi_val.endswith(
'degrees C'):
 
  134                     tmp = ipmi_val.rstrip(
' degrees C').lstrip()
 
  135                     diag_vals.append(KeyValue(key = name + 
' (C)', value = tmp))
 
  137                     dev_name = name.split()[0]
 
  139                         temperature = float(tmp)
 
  141                         if temperature >= 60 
and temperature < 75:
 
  142                             diag_level = max(diag_level, DiagnosticStatus.WARN)
 
  143                             diag_msgs.append(
'%s Warm' % dev_name)
 
  145                         if temperature >= 75:
 
  146                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
 
  147                             diag_msgs.append(
'%s Hot' % dev_name)
 
  149                         diag_level = max(diag_level, DiagnosticStatus.ERROR)
 
  150                         diag_msgs.append(
'%s Error' % dev_name)
 
  152                     diag_vals.append(KeyValue(key = name, value = ipmi_val))
 
  155             if (name.startswith(
'CPU') 
and name.endswith(
'Fan')) 
or name == 
'MB Fan':
 
  156                 if ipmi_val.endswith(
'RPM'):
 
  157                     rpm = ipmi_val.rstrip(
' RPM').lstrip()
 
  160                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
 
  161                             diag_msgs.append(
'CPU Fan Off')
 
  163                         diag_vals.append(KeyValue(key = name + 
' RPM', value = rpm))
 
  165                         diag_vals.append(KeyValue(key = name, value = ipmi_val))
 
  169             if name.startswith(
'CPU') 
and name.endswith(
'hot'):
 
  170                 if ipmi_val == 
'0x01':
 
  171                     diag_vals.append(KeyValue(key = name, value = 
'OK'))
 
  173                     diag_vals.append(KeyValue(key = name, value = 
'Hot'))
 
  174                     diag_level = max(diag_level, DiagnosticStatus.ERROR)
 
  175                     diag_msgs.append(
'CPU Hot Alarm')
 
  177     except Exception 
as e:
 
  178         diag_vals.append(KeyValue(key = 
'Exception', value = traceback.format_exc()))
 
  179         diag_level = DiagnosticStatus.ERROR
 
  180         diag_msgs.append(
'Exception')
 
  182     return diag_vals, diag_msgs, diag_level
 
  194     for index, temp_str 
in enumerate(sys_temp_strings):
 
  195         if len(temp_str) < 5:
 
  198         cmd = 
'cat %s' % temp_str
 
  199         p = subprocess.Popen(cmd, stdout = subprocess.PIPE, 
 
  200                              stderr = subprocess.PIPE, shell = 
True)
 
  201         stdout, stderr = p.communicate()
 
  202         stdout = stdout.decode()
 
  203         stderr = stderr.decode()
 
  204         retcode = p.returncode
 
  207             diag_level = DiagnosticStatus.ERROR
 
  208             diag_msg = [ 
'Core Temp Error' ]
 
  209             diag_vals = [ KeyValue(key = 
'Core Temp Error', value = stderr), 
 
  210                           KeyValue(key = 
'Output', value = stdout) ]
 
  211             return diag_vals, diag_msgs, diag_level
 
  215             temp = float(tmp) / 1000
 
  216             diag_vals.append(KeyValue(key = 
'Core %d Temp' % index, value = str(temp)))
 
  218             if temp >= 85 
and temp < 90:
 
  219                 diag_level = max(diag_level, DiagnosticStatus.WARN)
 
  220                 diag_msgs.append(
'Warm')
 
  222                 diag_level = max(diag_level, DiagnosticStatus.ERROR)
 
  223                 diag_msgs.append(
'Hot')
 
  225             diag_level = max(diag_level, DiagnosticStatus.ERROR) 
 
  226             diag_vals.append(KeyValue(key = 
'Core %s Temp' % index, value = tmp))
 
  228     return diag_vals, diag_msgs, diag_level
 
  234     lvl = DiagnosticStatus.OK
 
  237         p = subprocess.Popen(
'cat /proc/cpuinfo | grep MHz', 
 
  238                              stdout = subprocess.PIPE,
 
  239                              stderr = subprocess.PIPE, shell = 
True)
 
  240         stdout, stderr = p.communicate()
 
  241         stdout = stdout.decode()
 
  242         stderr = stderr.decode()
 
  243         retcode = p.returncode
 
  246             lvl = DiagnosticStatus.ERROR
 
  247             msgs = [ 
'Clock speed error' ]
 
  248             vals = [ KeyValue(key = 
'Clock speed error', value = stderr), 
 
  249                      KeyValue(key = 
'Output', value = stdout) ]
 
  251             return (vals, msgs, lvl)
 
  253         for index, ln 
in enumerate(stdout.split(
'\n')):
 
  254             words = ln.split(
':')
 
  258             speed = words[1].strip().split(
'.')[0] 
 
  259             vals.append(KeyValue(key = 
'Core %d MHz' % index, value = speed))
 
  260             if speed.isnumeric():
 
  263                 if mhz < 2240 
and mhz > 2150:
 
  264                     lvl = max(lvl, DiagnosticStatus.WARN)
 
  266                     lvl = max(lvl, DiagnosticStatus.ERROR)
 
  269                 lvl = max(lvl, DiagnosticStatus.ERROR)
 
  271         if not enforce_speed:
 
  272             lvl = DiagnosticStatus.OK
 
  274         if lvl == DiagnosticStatus.WARN 
and enforce_speed:
 
  275             msgs = [ 
'Core slowing' ]
 
  276         elif lvl == DiagnosticStatus.ERROR 
and enforce_speed:
 
  277             msgs = [ 
'Core throttled' ]
 
  279     except Exception 
as e:
 
  280         rospy.logerr(traceback.format_exc())
 
  281         lvl = DiagnosticStatus.ERROR
 
  282         msgs.append(
'Exception')
 
  283         vals.append(KeyValue(key = 
'Exception', value = traceback.format_exc()))
 
  285     return vals, msgs, lvl
 
  291     level = DiagnosticStatus.OK
 
  294     load_dict = { 0: 
'OK', 1: 
'High Load', 2: 
'Very High Load' }
 
  297         p = subprocess.Popen(
'uptime', stdout = subprocess.PIPE, 
 
  298                              stderr = subprocess.PIPE, shell = 
True)
 
  299         stdout, stderr = p.communicate()
 
  300         stdout = stdout.decode()
 
  301         stderr = stderr.decode()
 
  302         retcode = p.returncode
 
  305             vals.append(KeyValue(key = 
'uptime Failed', value = stderr))
 
  306             return DiagnosticStatus.ERROR, vals
 
  308         upvals = stdout.split()
 
  309         load1 = upvals[-3].rstrip(
',')
 
  310         load5 = upvals[-2].rstrip(
',')
 
  312         num_users = upvals[-7]
 
  315         if float(load1) > load1_threshold 
or float(load5) > load5_threshold:
 
  316             level = DiagnosticStatus.WARN
 
  318         vals.append(KeyValue(key = 
'Load Average Status', value = load_dict[level]))
 
  319         vals.append(KeyValue(key = 
'1 min Load Average', value = load1))
 
  320         vals.append(KeyValue(key = 
'1 min Load Average Threshold', value = str(load1_threshold)))
 
  321         vals.append(KeyValue(key = 
'5 min Load Average', value = load5))
 
  322         vals.append(KeyValue(key = 
'5 min Load Average Threshold', value = str(load5_threshold)))
 
  323         vals.append(KeyValue(key = 
'15 min Load Average', value = load15))
 
  324         vals.append(KeyValue(key = 
'Number of Users', value = num_users))
 
  326     except Exception 
as e:
 
  327         rospy.logerr(traceback.format_exc())
 
  328         level = DiagnosticStatus.ERROR
 
  329         vals.append(KeyValue(key = 
'Load Average Status', value = traceback.format_exc()))
 
  331     return level, load_dict[level], vals
 
  337     level = DiagnosticStatus.OK
 
  340     mem_dict = { 0: 
'OK', 1: 
'Low Memory', 2: 
'Very Low Memory' }
 
  343         p = subprocess.Popen(
'free -m',
 
  344                              stdout = subprocess.PIPE,
 
  345                              stderr = subprocess.PIPE, shell = 
True)
 
  346         stdout, stderr = p.communicate()
 
  347         stdout = stdout.decode()
 
  348         retcode = p.returncode
 
  351             values.append(KeyValue(key = 
"\"free -m\" Call Error", value = str(retcode)))
 
  352             return DiagnosticStatus.ERROR, values
 
  354         rows = stdout.split(
'\n')
 
  355         data = rows[1].split()
 
  360         level = DiagnosticStatus.OK
 
  361         if float(free_mem) < 25:
 
  362             level = DiagnosticStatus.WARN
 
  363         if float(free_mem) < 1:
 
  364             level = DiagnosticStatus.ERROR
 
  366         values.append(KeyValue(key = 
'Memory Status', value = mem_dict[level]))
 
  367         values.append(KeyValue(key = 
'Total Memory', value = total_mem))
 
  368         values.append(KeyValue(key = 
'Used Memory', value = used_mem))
 
  369         values.append(KeyValue(key = 
'Free Memory', value = free_mem))
 
  371         msg = mem_dict[level]
 
  372     except Exception 
as e:
 
  373         rospy.logerr(traceback.format_exc())
 
  374         msg = 
'Memory Usage Check Error' 
  375         values.append(KeyValue(key = msg, value = str(e)))
 
  376         level = DiagnosticStatus.ERROR
 
  378     return level, mem_dict[level], values
 
  385 has_warned_mpstat = 
False 
  386 has_error_core_count = 
False 
  389     mp_level = DiagnosticStatus.OK
 
  391     load_dict = { 0: 
'OK', 1: 
'High Load', 2: 
'Error' }
 
  394         p = subprocess.Popen(
'mpstat -P ALL 1 1',
 
  395                              stdout = subprocess.PIPE,
 
  396                              stderr = subprocess.PIPE, shell = 
True)
 
  397         stdout, stderr = p.communicate()
 
  398         stdout = stdout.decode()
 
  399         stderr = stderr.decode()
 
  400         retcode = p.returncode
 
  403             global has_warned_mpstat
 
  404             if not has_warned_mpstat:
 
  405                 rospy.logerr(
"mpstat failed to run for cpu_monitor. Return code %d.", retcode)
 
  406                 has_warned_mpstat = 
True 
  408             mp_level = DiagnosticStatus.ERROR
 
  409             vals.append(KeyValue(key = 
'\"mpstat\" Call Error', value = str(retcode)))
 
  410             return mp_level, 
'Unable to Check CPU Usage', vals
 
  414         rows = stdout.split(
'\n')
 
  415         col_names = rows[2].split()
 
  416         idle_col = -1 
if (len(col_names) > 2 
and col_names[-1] == 
'%idle') 
else -2
 
  420         for index, row 
in enumerate(stdout.split(
'\n')):
 
  425             if row.find(
'all') > -1:
 
  433             if lst[0].startswith(
'Average'):
 
  436             cpu_name = 
'%d' % (num_cores)
 
  437             idle = lst[idle_col].replace(
',', 
'.')
 
  438             user = lst[3].replace(
',', 
'.')
 
  439             nice = lst[4].replace(
',', 
'.')
 
  440             system = lst[5].replace(
',', 
'.')
 
  443             usage = float(user) + float(nice)
 
  445                 rospy.logwarn(
'Read cpu usage of %f percent. Reverting to previous reading of %f percent'%(usage, usage_old))
 
  451                 core_level = DiagnosticStatus.WARN
 
  453                 core_level = DiagnosticStatus.ERROR
 
  455             vals.append(KeyValue(key = 
'CPU %s Status' % cpu_name, value = load_dict[core_level]))
 
  456             vals.append(KeyValue(key = 
'CPU %s User' % cpu_name, value = user))
 
  457             vals.append(KeyValue(key = 
'CPU %s Nice' % cpu_name, value = nice))
 
  458             vals.append(KeyValue(key = 
'CPU %s System' % cpu_name, value = system))
 
  459             vals.append(KeyValue(key = 
'CPU %s Idle' % cpu_name, value = idle))
 
  464         if num_cores - cores_loaded <= 2 
and num_cores > 2:
 
  465             mp_level = DiagnosticStatus.WARN
 
  468         if core_count > 0 
and core_count != num_cores:
 
  469             mp_level = DiagnosticStatus.ERROR
 
  470             global has_error_core_count
 
  471             if not has_error_core_count:
 
  472                 rospy.logerr(
'Error checking number of cores. Expected %d, got %d. Computer may have not booted properly.',
 
  473                               core_count, num_cores)
 
  474                 has_error_core_count = 
True 
  475             return DiagnosticStatus.ERROR, 
'Incorrect number of CPU cores', vals
 
  477     except Exception 
as e:
 
  478         mp_level = DiagnosticStatus.ERROR
 
  479         vals.append(KeyValue(key = 
'mpstat Exception', value = str(e)))
 
  481     return mp_level, load_dict[mp_level], vals
 
  488         p = subprocess.Popen(
'find /sys/devices -name temp1_input', 
 
  489                              stdout = subprocess.PIPE,
 
  490                              stderr = subprocess.PIPE, shell = 
True)
 
  491         stdout, stderr = p.communicate()
 
  492         stdout = stdout.decode()
 
  493         stderr = stderr.decode()
 
  494         retcode = p.returncode
 
  497             rospy.logerr(
'Error find core temp locations: %s' % stderr)
 
  500         for ln 
in stdout.split(
'\n'):
 
  501             temp_vals.append(ln.strip())
 
  505         rospy.logerr(
'Exception finding temp vals: %s' % traceback.format_exc())
 
  509     time_since_update = rospy.get_time() - last_update_time
 
  512     if time_since_update > 20 
and time_since_update <= 35:
 
  513         stale_status = 
'Lagging' 
  514         if stat.level == DiagnosticStatus.OK:
 
  515             stat.message = stale_status
 
  516         elif stat.message.find(stale_status) < 0:
 
  517             stat.message = 
', '.join([stat.message, stale_status])
 
  518         stat.level = max(stat.level, DiagnosticStatus.WARN)
 
  519     if time_since_update > 35:
 
  520         stale_status = 
'Stale' 
  521         if stat.level == DiagnosticStatus.OK:
 
  522             stat.message = stale_status
 
  523         elif stat.message.find(stale_status) < 0:
 
  524             stat.message = 
', '.join([stat.message, stale_status])
 
  525         stat.level = max(stat.level, DiagnosticStatus.ERROR)
 
  530     stat.values.insert(0, KeyValue(key = 
'Update Status', value = stale_status))
 
  531     stat.values.insert(1, KeyValue(key = 
'Time Since Update', value = str(time_since_update)))
 
  536         self.
_diag_pub = rospy.Publisher(
'/diagnostics', DiagnosticArray, queue_size=10)
 
  545             rospy.logwarn(
'Checking CPU core temperatures is deprecated. This will be removed in D-turtle')
 
  548             rospy.logwarn(
'NFS checking is deprecated for CPU monitor. This will be removed in D-turtle')
 
  564         self.
_temp_stat.name = 
'%s CPU Temperature' % diag_hostname
 
  568         self.
_temp_stat.values = [ KeyValue(key = 
'Update Status', value = 
'No Data' ),
 
  569                                    KeyValue(key = 
'Time Since Last Update', value = 
'N/A') ]
 
  572         self.
_usage_stat.name = 
'%s CPU Usage' % diag_hostname
 
  576         self.
_usage_stat.values = [ KeyValue(key = 
'Update Status', value = 
'No Data' ),
 
  577                                     KeyValue(key = 
'Time Since Last Update', value = 
'N/A') ]
 
  580         self.
_nfs_stat.name = 
'%s NFS IO' % diag_hostname
 
  584         self.
_nfs_stat.values = [ KeyValue(key = 
'Update Status', value = 
'No Data' ),
 
  585                                   KeyValue(key = 
'Time Since Last Update', value = 
'N/A') ]
 
  600         rospy.logerr(
'Restarting temperature check thread in cpu_monitor. This should not happen')
 
  607         except Exception 
as e:
 
  608             rospy.logerr(
'Unable to restart temp thread. Error: %s' % traceback.format_exc())
 
  623         if rospy.is_shutdown():
 
  630         vals = [ KeyValue(key = 
'Update Status', value = 
'OK' ),
 
  631                  KeyValue(key = 
'Time Since Last Update', value = str(0) )]
 
  634             p = subprocess.Popen(
'iostat -n',
 
  635                                  stdout = subprocess.PIPE,
 
  636                                  stderr = subprocess.PIPE, shell = 
True)
 
  637             stdout, stderr = p.communicate()
 
  638             stdout = stdout.decode()
 
  639             stderr = stderr.decode()
 
  640             retcode = p.returncode
 
  643                 nfs_level = DiagnosticStatus.ERROR
 
  645                 vals.append(KeyValue(key = 
'\"iostat -n\" Call Error', value = str(e)))
 
  649             for index, row 
in enumerate(stdout.split(
'\n')):
 
  660                 read_blk_dir = lst[3]
 
  661                 write_blk_dir = lst[4]
 
  665                 vals.append(KeyValue(
 
  666                         key = 
'%s Read Blks/s' % file_sys, value=read_blk))
 
  667                 vals.append(KeyValue(
 
  668                         key = 
'%s Write Blks/s' % file_sys, value=write_blk))
 
  669                 vals.append(KeyValue(
 
  670                         key = 
'%s Read Blk dir/s' % file_sys, value=read_blk_dir))
 
  671                 vals.append(KeyValue(
 
  672                         key = 
'%s Write Blks dir/s' % file_sys, value=write_blk_dir))
 
  673                 vals.append(KeyValue(
 
  674                         key = 
'%s Read Blks srv/s' % file_sys, value=r_blk_srv))
 
  675                 vals.append(KeyValue(
 
  676                         key = 
'%s Write Blks srv/s' % file_sys, value=w_blk_srv))
 
  678         except Exception 
as e:
 
  679             rospy.logerr(traceback.format_exc())
 
  680             nfs_level = DiagnosticStatus.ERROR
 
  682             vals.append(KeyValue(key = 
'Exception', value = str(e)))
 
  691             if not rospy.is_shutdown():
 
  700         if rospy.is_shutdown():
 
  705         diag_vals = [ KeyValue(key = 
'Update Status', value = 
'OK' ),
 
  706                       KeyValue(key = 
'Time Since Last Update', value = str(0) ) ]
 
  711             ipmi_vals, ipmi_msgs, ipmi_level = 
check_ipmi()
 
  712             diag_vals.extend(ipmi_vals)
 
  713             diag_msgs.extend(ipmi_msgs)
 
  714             diag_level = max(diag_level, ipmi_level)
 
  718             diag_vals.extend(core_vals)
 
  719             diag_msgs.extend(core_msgs)
 
  720             diag_level = max(diag_level, core_level)
 
  723         diag_vals.extend(clock_vals)
 
  724         diag_msgs.extend(clock_msgs)
 
  725         diag_level = max(diag_level, clock_level)
 
  727         diag_log = set(diag_msgs)
 
  728         if len(diag_log) > 0:
 
  729             message = 
', '.join(diag_log)
 
  731             message = stat_dict[diag_level]
 
  740             if not rospy.is_shutdown():
 
  747         if rospy.is_shutdown():
 
  753         diag_vals = [ KeyValue(key = 
'Update Status', value = 
'OK' ),
 
  754                       KeyValue(key = 
'Time Since Last Update', value = 0 )]
 
  759         diag_vals.extend(mp_vals)
 
  761             diag_msgs.append(mp_msg)
 
  762         diag_level = max(diag_level, mp_level)
 
  766         diag_vals.extend(up_vals)
 
  768             diag_msgs.append(up_msg)
 
  769         diag_level = max(diag_level, uptime_level)
 
  773         diag_vals.extend(mem_vals)
 
  775             diag_msgs.append(mem_msg)
 
  776         diag_level = max(diag_level, mem_level)
 
  778         if diag_msgs 
and diag_level > 0:
 
  779             usage_msg = 
', '.join(set(diag_msgs))
 
  781             usage_msg = stat_dict[diag_level]
 
  791             if not rospy.is_shutdown():
 
  805             msg = DiagnosticArray()
 
  806             msg.header.stamp = rospy.get_rostime()
 
  823 if __name__ == 
'__main__':
 
  824     hostname = socket.gethostname()
 
  827     parser = optparse.OptionParser(usage=
"usage: cpu_monitor.py [--diag-hostname=cX]")
 
  828     parser.add_option(
"--diag-hostname", dest=
"diag_hostname",
 
  829                       help=
"Computer name in diagnostics output (ex: 'c1')",
 
  830                       metavar=
"DIAG_HOSTNAME",
 
  831                       action=
"store", default = hostname)
 
  832     options, args = parser.parse_args(rospy.myargv())
 
  835         rospy.init_node(
'cpu_monitor_%s' % hostname)
 
  836     except rospy.exceptions.ROSInitException:
 
  837         print(
'CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr)
 
  842     rate = rospy.Rate(1.0)
 
  844         while not rospy.is_shutdown():
 
  846             cpu_node.publish_stats()
 
  847     except KeyboardInterrupt:
 
  849     except Exception 
as e:
 
  850         traceback.print_exc()
 
  851         rospy.logerr(traceback.format_exc())
 
  853     cpu_node.cancel_timers()