cpu_monitor.py
Go to the documentation of this file.
00001 #!/usr/bin/env python
00002 #
00003 # Copyright 2017 Fraunhofer Institute for Manufacturing Engineering and Automation (IPA)
00004 #
00005 # Licensed under the Apache License, Version 2.0 (the "License");
00006 # you may not use this file except in compliance with the License.
00007 # You may obtain a copy of the License at
00008 #
00009 #   http://www.apache.org/licenses/LICENSE-2.0
00010 #
00011 # Unless required by applicable law or agreed to in writing, software
00012 # distributed under the License is distributed on an "AS IS" BASIS,
00013 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
00014 # See the License for the specific language governing permissions and
00015 # limitations under the License.
00016 
00017 
00018 from __future__ import with_statement, print_function
00019 
00020 import sys, os, time
00021 import subprocess
00022 import string
00023 import socket
00024 import psutil
00025 import requests
00026 import numpy as np
00027 import math
00028 
00029 import rospy
00030 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00031 
00032 stat_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'Warning', DiagnosticStatus.ERROR: 'Error', DiagnosticStatus.STALE: 'Stale' }
00033 
00034 class CPUMonitor():
00035     def __init__(self, hostname, diag_hostname):
00036         self._check_ipmi = rospy.get_param('~check_ipmi_tool', False)
00037         self._check_core_temps = rospy.get_param('~check_core_temps', False)
00038 
00039         self._core_load_warn = rospy.get_param('~core_load_warn', 90)
00040         self._core_load_error = rospy.get_param('~core_load_error', 110)
00041         self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
00042         self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
00043         self._core_temp_warn = rospy.get_param('~core_temp_warn', 90)
00044         self._core_temp_error = rospy.get_param('~core_temp_error', 95)
00045         self._mem_warn = rospy.get_param('~mem_warn', 25)
00046         self._mem_error = rospy.get_param('~mem_error', 1)
00047 
00048         self._check_thermal_throttling_events = rospy.get_param('~check_thermal_throttling_events', False)
00049         self._thermal_throttling_threshold = rospy.get_param('~thermal_throttling_threshold', 1000)
00050 
00051         self._check_idlejitter = rospy.get_param('~check_idlejitter', False)
00052         self._idlejitter_min_threshold = rospy.get_param('~idlejitter_min_threshold', 50000)
00053         self._idlejitter_max_threshold = rospy.get_param('~idlejitter_max_threshold', 2000000)
00054         self._idlejitter_average_threshold = rospy.get_param('~idlejitter_average_threshold', 200000)
00055 
00056         if psutil.__version__ < '2.0.0':
00057             self._num_cores = rospy.get_param('~num_cores', psutil.NUM_CPUS)
00058         else:
00059             self._num_cores = rospy.get_param('~num_cores', psutil.cpu_count())
00060 
00061         # Get temp_input files
00062         self._temp_vals = self.get_core_temp_names()
00063 
00064         # CPU stats
00065         self._info_stat = DiagnosticStatus()
00066         self._info_stat.name = '%s CPU Info' % diag_hostname
00067         self._info_stat.level = DiagnosticStatus.WARN
00068         self._info_stat.hardware_id = hostname
00069         self._info_stat.message = 'No Data'
00070         self._info_stat.values = []
00071 
00072         self._usage_stat = DiagnosticStatus()
00073         self._usage_stat.name = '%s CPU Usage' % diag_hostname
00074         self._usage_stat.level = DiagnosticStatus.WARN
00075         self._usage_stat.hardware_id = hostname
00076         self._usage_stat.message = 'No Data'
00077         self._usage_stat.values = []
00078 
00079         self._memory_stat = DiagnosticStatus()
00080         self._memory_stat.name = '%s Memory Usage' % diag_hostname
00081         self._memory_stat.level = DiagnosticStatus.WARN
00082         self._memory_stat.hardware_id = hostname
00083         self._memory_stat.message = 'No Data'
00084         self._memory_stat.values = []
00085 
00086         self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=1)
00087         self._publish_timer = rospy.Timer(rospy.Duration(1.0), self.publish_stats)
00088         self._info_timer = rospy.Timer(rospy.Duration(5.0), self.check_info)
00089         self._usage_timer = rospy.Timer(rospy.Duration(5.0), self.check_usage)
00090         self._memory_timer = rospy.Timer(rospy.Duration(5.0), self.check_memory)
00091 
00092     ##\brief Output entire IPMI data set
00093     def check_ipmi(self):
00094         diag_vals = []
00095         diag_msgs = []
00096         diag_level = DiagnosticStatus.OK
00097 
00098         try:
00099             p = subprocess.Popen('sudo ipmitool sdr',
00100                                  stdout = subprocess.PIPE,
00101                                  stderr = subprocess.PIPE, shell = True)
00102             stdout, stderr = p.communicate()
00103             retcode = p.returncode
00104 
00105             if retcode != 0:
00106                 diag_level = DiagnosticStatus.ERROR
00107                 diag_msgs = [ 'ipmitool Error' ]
00108                 diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ,
00109                               KeyValue(key = 'Output', value = stdout) ]
00110                 return diag_vals, diag_msgs, diag_level
00111 
00112             lines = stdout.split('\n')
00113             if len(lines) < 2:
00114                 diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
00115 
00116                 diag_msgs = [ 'No ipmitool response' ]
00117                 diag_level = DiagnosticStatus.ERROR
00118 
00119                 return diag_vals, diag_msgs, diag_level
00120 
00121             for ln in lines:
00122                 if len(ln) < 3:
00123                     continue
00124 
00125                 words = ln.split('|')
00126                 if len(words) < 3:
00127                     continue
00128 
00129                 name = words[0].strip()
00130                 ipmi_val = words[1].strip()
00131                 stat_byte = words[2].strip()
00132 
00133                 # CPU temps
00134                 if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
00135                     if words[1].strip().endswith('degrees C'):
00136                         tmp = ipmi_val.rstrip(' degrees C').lstrip()
00137                         if unicode(tmp).isnumeric():
00138                             temperature = float(tmp)
00139                             diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00140 
00141                             cpu_name = name.split()[0]
00142                             if temperature >= self._core_temp_error: # CPU should shut down here
00143                                 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00144                                 diag_msgs.append('CPU Hot')
00145                             elif temperature >= self._core_temp_warn:
00146                                 diag_level = max(diag_level, DiagnosticStatus.WARN)
00147                                 diag_msgs.append('CPU Warm')
00148                     else:
00149                         diag_vals.append(KeyValue(key = name, value = words[1]))
00150 
00151 
00152                 # MP, BP, FP temps
00153                 if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
00154                     if ipmi_val.endswith('degrees C'):
00155                         tmp = ipmi_val.rstrip(' degrees C').lstrip()
00156                         diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00157                         # Give temp warning
00158                         dev_name = name.split()[0]
00159                         if unicode(tmp).isnumeric():
00160                             temperature = float(tmp)
00161 
00162                             if temperature >= 60 and temperature < 75:
00163                                 diag_level = max(diag_level, DiagnosticStatus.WARN)
00164                                 diag_msgs.append('%s Warm' % dev_name)
00165 
00166                             if temperature >= 75:
00167                                 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00168                                 diag_msgs.append('%s Hot' % dev_name)
00169                         else:
00170                             diag_level = max(diag_level, DiagnosticStatus.ERROR)
00171                             diag_msgs.append('%s Error' % dev_name)
00172                     else:
00173                         diag_vals.append(KeyValue(key = name, value = ipmi_val))
00174 
00175                 # CPU fan speeds
00176                 if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
00177                     if ipmi_val.endswith('RPM'):
00178                         rpm = ipmi_val.rstrip(' RPM').lstrip()
00179                         if unicode(rpm).isnumeric():
00180                             if int(rpm) == 0:
00181                                 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00182                                 diag_msgs.append('CPU Fan Off')
00183 
00184                             diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
00185                         else:
00186                             diag_vals.append(KeyValue(key = name, value = ipmi_val))
00187 
00188                 # If CPU is hot we get an alarm from ipmitool, report that too
00189                 # CPU should shut down if we get a hot alarm, so report as error
00190                 if name.startswith('CPU') and name.endswith('hot'):
00191                     if ipmi_val == '0x01':
00192                         diag_vals.append(KeyValue(key = name, value = 'OK'))
00193                     else:
00194                         diag_vals.append(KeyValue(key = name, value = 'Hot'))
00195                         diag_level = max(diag_level, DiagnosticStatus.ERROR)
00196                         diag_msgs.append('CPU Hot Alarm')
00197 
00198         except Exception, e:
00199             diag_level = DiagnosticStatus.ERROR
00200             diag_msgs = [ 'IPMI Exception' ]
00201             diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00202 
00203         return diag_vals, diag_msgs, diag_level
00204 
00205     ##\brief Check CPU core temps
00206     ##
00207     ## Read from every core, divide by 1000
00208     def check_core_temps(self):
00209         diag_vals = []
00210         diag_msgs = []
00211         diag_level = DiagnosticStatus.OK
00212 
00213         try:
00214             for device_type, devices in self._temp_vals.items():
00215                 for dev in devices:
00216                     cmd = 'cat %s' % dev[1]
00217                     p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
00218                                          stderr = subprocess.PIPE, shell = True)
00219                     stdout, stderr = p.communicate()
00220                     retcode = p.returncode
00221 
00222                     if retcode != 0:
00223                         diag_level = DiagnosticStatus.ERROR
00224                         diag_msg = [ 'Core Temp Error' ]
00225                         diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
00226                                       KeyValue(key = 'Output', value = stdout) ]
00227                         return diag_vals, diag_msgs, diag_level
00228 
00229                     tmp = stdout.strip()
00230                     if device_type == 'platform':
00231                         if unicode(tmp).isnumeric():
00232                             temp = float(tmp) / 1000
00233                             diag_vals.append(KeyValue(key = 'Temp '+dev[0], value = str(temp)))
00234 
00235                             if temp >= self._core_temp_error:
00236                                 diag_level = max(diag_level, DiagnosticStatus.OK) #do not set ERROR
00237                                 diag_msgs.append('CPU Hot')
00238                             elif temp >= self._core_temp_warn:
00239                                 diag_level = max(diag_level, DiagnosticStatus.OK) #do not set WARN
00240                                 diag_msgs.append('CPU Warm')
00241 
00242                         else:
00243                             diag_level = max(diag_level, DiagnosticStatus.ERROR) # Error if not numeric value
00244                             diag_vals.append(KeyValue(key = 'Temp '+dev[0], value = tmp))
00245                     else: # device_type == `virtual`
00246                         diag_vals.append(KeyValue(key = 'Temp '+dev[0], value = tmp))
00247 
00248         except Exception, e:
00249             diag_level = DiagnosticStatus.ERROR
00250             diag_msgs = [ 'Core Temp Exception' ]
00251             diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00252 
00253         return diag_vals, diag_msgs, diag_level
00254 
00255     ##\brief Checks clock speed from reading from CPU info
00256     def check_clock_speed(self):
00257         diag_vals = []
00258         diag_msgs = []
00259         diag_level = DiagnosticStatus.OK
00260 
00261         try:
00262             # get current freq
00263             p = subprocess.Popen('cat /proc/cpuinfo | grep MHz',
00264                                  stdout = subprocess.PIPE,
00265                                  stderr = subprocess.PIPE, shell = True)
00266             stdout, stderr = p.communicate()
00267             retcode = p.returncode
00268 
00269             if retcode != 0:
00270                 diag_level = DiagnosticStatus.ERROR
00271                 diag_msgs = [ 'Clock Speed Error' ]
00272                 diag_vals = [ KeyValue(key = 'Clock Speed Error', value = stderr),
00273                               KeyValue(key = 'Output', value = stdout) ]
00274                 return (diag_vals, diag_msgs, diag_level)
00275 
00276             for index, ln in enumerate(stdout.split('\n')):
00277                 words = ln.split(':')
00278                 if len(words) < 2:
00279                     continue
00280 
00281                 speed = words[1].strip().split('.')[0] # Conversion to float doesn't work with decimal
00282                 diag_vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
00283                 if not unicode(speed).isnumeric():
00284                     # Automatically give error if speed isn't a number
00285                     diag_level = max(diag_level, DiagnosticStatus.ERROR)
00286                     diag_msgs = [ 'Clock speed not numeric' ]
00287 
00288             # get max freq
00289             p = subprocess.Popen('lscpu | grep "max MHz"',
00290                                  stdout = subprocess.PIPE,
00291                                  stderr = subprocess.PIPE, shell = True)
00292             stdout, stderr = p.communicate()
00293             retcode = p.returncode
00294 
00295             if retcode != 0:
00296                 diag_level = DiagnosticStatus.ERROR
00297                 diag_msgs = [ 'Clock Speed Error' ]
00298                 diag_vals = [ KeyValue(key = 'Clock Speed Error', value = stderr),
00299                               KeyValue(key = 'Output', value = stdout) ]
00300                 return (diag_vals, diag_msgs, diag_level)
00301 
00302             diag_vals.append(KeyValue(key = stdout.split(':')[0].strip(), value = str(stdout.split(':')[1].strip())))
00303 
00304             # get min freq
00305             p = subprocess.Popen('lscpu | grep "min MHz"',
00306                                  stdout = subprocess.PIPE,
00307                                  stderr = subprocess.PIPE, shell = True)
00308             stdout, stderr = p.communicate()
00309             retcode = p.returncode
00310 
00311             if retcode != 0:
00312                 diag_level = DiagnosticStatus.ERROR
00313                 diag_msgs = [ 'Clock Speed Error' ]
00314                 diag_vals = [ KeyValue(key = 'Clock Speed Error', value = stderr),
00315                               KeyValue(key = 'Output', value = stdout) ]
00316                 return (diag_vals, diag_msgs, diag_level)
00317 
00318             diag_vals.append(KeyValue(key = stdout.split(':')[0].strip(), value = str(stdout.split(':')[1].strip())))
00319 
00320         except Exception, e:
00321             diag_level = DiagnosticStatus.ERROR
00322             diag_msgs = [ 'Clock Speed Exception' ]
00323             diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00324 
00325         return diag_vals, diag_msgs, diag_level
00326 
00327     ##\brief Uses 'uptime' to see load average
00328     def check_uptime(self):
00329         diag_vals = []
00330         diag_msg = ''
00331         diag_level = DiagnosticStatus.OK
00332 
00333         load_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Load', DiagnosticStatus.ERROR: 'Very High Load' }
00334 
00335         try:
00336             p = subprocess.Popen('uptime', stdout = subprocess.PIPE,
00337                                  stderr = subprocess.PIPE, shell = True)
00338             stdout, stderr = p.communicate()
00339             retcode = p.returncode
00340 
00341             if retcode != 0:
00342                 diag_level = DiagnosticStatus.ERROR
00343                 diag_msg = 'Uptime Error'
00344                 diag_vals = [ KeyValue(key = 'Uptime Error', value = stderr),
00345                               KeyValue(key = 'Output', value = stdout) ]
00346                 return (diag_vals, diag_msg, diag_level)
00347 
00348             upvals = stdout.split()
00349             load1 = upvals[-3].rstrip(',').replace(',', '.')
00350             load5 = upvals[-2].rstrip(',').replace(',', '.')
00351             load15 = upvals[-1].replace(',', '.')
00352             num_users = upvals[-7]
00353 
00354             # Give warning if we go over load limit
00355             if float(load1) > self._load1_threshold or float(load5) > self._load5_threshold:
00356                 diag_level = DiagnosticStatus.WARN
00357 
00358             diag_vals.append(KeyValue(key = 'Load Average Status', value = load_dict[diag_level]))
00359             diag_vals.append(KeyValue(key = '1 min Load Average', value = load1))
00360             diag_vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(self._load1_threshold)))
00361             diag_vals.append(KeyValue(key = '5 min Load Average', value = load5))
00362             diag_vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(self._load5_threshold)))
00363             diag_vals.append(KeyValue(key = '15 min Load Average', value = load15))
00364             diag_vals.append(KeyValue(key = 'Number of Users', value = num_users))
00365 
00366             diag_msg = load_dict[diag_level]
00367 
00368         except Exception, e:
00369             diag_level = DiagnosticStatus.ERROR
00370             diag_msg = 'Uptime Exception'
00371             diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00372 
00373         return diag_vals, diag_msg, diag_level
00374 
00375     ##\brief Uses 'free -m' to check free memory
00376     def check_free_memory(self):
00377         diag_vals = []
00378         diag_msg = ''
00379         diag_level = DiagnosticStatus.OK
00380 
00381         mem_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'Low Memory', DiagnosticStatus.ERROR: 'Very Low Memory' }
00382 
00383         try:
00384             p = subprocess.Popen('free -m',
00385                                  stdout = subprocess.PIPE,
00386                                  stderr = subprocess.PIPE, shell = True)
00387             stdout, stderr = p.communicate()
00388             retcode = p.returncode
00389 
00390             if retcode != 0:
00391                 diag_level = DiagnosticStatus.ERROR
00392                 diag_msg = 'Memory Usage Error'
00393                 diag_vals = [ KeyValue(key = 'Memory Usage Error', value = stderr),
00394                               KeyValue(key = 'Output', value = stdout) ]
00395                 return (diag_vals, diag_msg, diag_level)
00396 
00397             rows = stdout.split('\n')
00398 
00399             # Mem
00400             data = rows[1].split()
00401             total_mem = data[1]
00402             used_mem = data[2]
00403             free_mem = data[3]
00404             shared_mem = data[4]
00405             cache_mem = data[5]
00406             available_mem = data[6]
00407 
00408             diag_level = DiagnosticStatus.OK
00409             if float(free_mem) < self._mem_warn:
00410                 diag_level = DiagnosticStatus.WARN
00411             if float(free_mem) < self._mem_error:
00412                 diag_level = DiagnosticStatus.ERROR
00413 
00414             diag_vals.append(KeyValue(key = 'Mem Status', value = mem_dict[diag_level]))
00415             diag_vals.append(KeyValue(key = 'Mem Total', value = total_mem))
00416             diag_vals.append(KeyValue(key = 'Mem Used', value = used_mem))
00417             diag_vals.append(KeyValue(key = 'Mem Free', value = free_mem))
00418             diag_vals.append(KeyValue(key = 'Mem Shared', value = shared_mem))
00419             diag_vals.append(KeyValue(key = 'Mem Buff/Cache', value = cache_mem))
00420             diag_vals.append(KeyValue(key = 'Mem Available', value = available_mem))
00421 
00422             # Swap
00423             data = rows[2].split()
00424             total_mem = data[1]
00425             used_mem = data[2]
00426             free_mem = data[3]
00427 
00428             diag_vals.append(KeyValue(key = 'Swap Total', value = total_mem))
00429             diag_vals.append(KeyValue(key = 'Swap Used', value = used_mem))
00430             diag_vals.append(KeyValue(key = 'Swap Free', value = free_mem))
00431 
00432             diag_msg = mem_dict[diag_level]
00433 
00434         except Exception, e:
00435             diag_level = DiagnosticStatus.ERROR
00436             diag_msg = 'Memory Usage Exception'
00437             diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00438 
00439         return diag_vals, diag_msg, diag_level
00440 
00441     ##\brief Use mpstat to find CPU usage
00442     def check_mpstat(self):
00443         diag_vals = []
00444         diag_msg = ''
00445         diag_level = DiagnosticStatus.OK
00446 
00447         load_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Load', DiagnosticStatus.ERROR: 'Error' }
00448 
00449         try:
00450             p = subprocess.Popen('mpstat -P ALL 1 1',
00451                                  stdout = subprocess.PIPE,
00452                                  stderr = subprocess.PIPE, shell = True)
00453             stdout, stderr = p.communicate()
00454             retcode = p.returncode
00455 
00456             if retcode != 0:
00457                 diag_level = DiagnosticStatus.ERROR
00458                 diag_msg = 'CPU Usage Error'
00459                 diag_vals = [ KeyValue(key = 'CPU Usage Error', value = stderr),
00460                               KeyValue(key = 'Output', value = stdout) ]
00461                 return (diag_vals, diag_msg, diag_level)
00462 
00463             # Check which column '%idle' is, #4539
00464             # mpstat output changed between 8.06 and 8.1
00465             rows = stdout.split('\n')
00466             col_names = rows[2].split()
00467             idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
00468 
00469             num_cores = 0
00470             cores_loaded = 0
00471             for index, row in enumerate(stdout.split('\n')):
00472                 if index < 3:
00473                     continue
00474 
00475                 # Skip row containing 'all' data
00476                 if row.find('all') > -1:
00477                     continue
00478 
00479                 lst = row.split()
00480                 if len(lst) < 8:
00481                     continue
00482 
00483                 ## Ignore 'Average: ...' data
00484                 if lst[0].startswith('Average'):
00485                     continue
00486 
00487                 cpu_name = '%d' % (num_cores)
00488                 idle = lst[idle_col].replace(',', '.')
00489                 user = lst[3].replace(',', '.')
00490                 nice = lst[4].replace(',', '.')
00491                 system = lst[5].replace(',', '.')
00492 
00493                 core_level = DiagnosticStatus.OK
00494                 usage = float(user) + float(nice)
00495                 if usage > self._core_load_warn:
00496                     cores_loaded += 1
00497                     core_level = DiagnosticStatus.WARN
00498                 if usage > self._core_load_error:
00499                     core_level = DiagnosticStatus.ERROR
00500 
00501                 diag_vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
00502                 diag_vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
00503                 diag_vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
00504                 diag_vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
00505                 diag_vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
00506 
00507                 num_cores += 1
00508 
00509             # Warn for high load only if we have <= 2 cores that aren't loaded
00510             if num_cores - cores_loaded <= 2 and num_cores > 2:
00511                 diag_level = DiagnosticStatus.WARN
00512 
00513             # Check the number of cores if self._num_cores > 0, #4850
00514             if self._num_cores > 0 and self._num_cores != num_cores:
00515                 diag_level = DiagnosticStatus.ERROR
00516                 diag_msg = 'Incorrect number of CPU cores: Expected %d, got %d. Computer may have not booted properly.' % self._num_cores, num_cores
00517                 return diag_vals, diag_msg, diag_level
00518 
00519             diag_msg = load_dict[diag_level]
00520 
00521         except Exception, e:
00522             diag_level = DiagnosticStatus.ERROR
00523             diag_msg = 'CPU Usage Exception'
00524             diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00525 
00526         return diag_vals, diag_msg, diag_level
00527 
00528 
00529     def query_netdata(self, chart, after):
00530         NETDATA_URI = 'http://127.0.0.1:19999/api/v1/data?chart=%s&format=json&after=-%d'
00531         url = NETDATA_URI % (chart, int(after))
00532 
00533         try:
00534             r = requests.get(url)
00535         except requests.ConnectionError as ex:
00536             rospy.logerr("NetData ConnectionError %r", ex)
00537             return None
00538 
00539         if r.status_code != 200:
00540             rospy.logerr("NetData request not successful with status_code %d", r.status_code)
00541             return None
00542 
00543         rdata = r.json()
00544 
00545         sdata = zip(*rdata['data'])
00546         d = dict()
00547 
00548         for idx, label in enumerate(rdata['labels']):
00549             np_array = np.array(sdata[idx])
00550             if np_array.dtype == np.object:
00551                 rospy.logwarn("Data from NetData malformed")
00552                 return None
00553             d[label] = np_array
00554 
00555         return d
00556 
00557 
00558     def check_core_throttling(self, interval=1):
00559         throt_dict = {DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Thermal Throttling Events',
00560                       DiagnosticStatus.ERROR: 'No valid Data from NetData'}
00561 
00562         throt_level = DiagnosticStatus.OK
00563 
00564         vals = []
00565 
00566         netdata = self.query_netdata('cpu.core_throttling', interval)
00567 
00568         for i in range(self._num_cores):
00569             lbl = 'CPU %d Thermal Throttling Events' % i
00570             netdata_key = 'cpu%d' % i
00571 
00572             core_mean = 'N/A'
00573             if netdata is not None and netdata_key in netdata:
00574                 core_data = netdata[netdata_key]
00575                 if core_data is not None:
00576                     core_mean = np.mean(core_data)
00577 
00578                     if core_mean > self._thermal_throttling_threshold:
00579                         throt_level = DiagnosticStatus.WARN
00580             else:
00581                 throt_level = DiagnosticStatus.ERROR
00582 
00583             vals.append(KeyValue(key=lbl, value='%r' % core_mean))
00584 
00585         vals.insert(0, KeyValue(key='Thermal Throttling Status', value=throt_dict[throt_level]))
00586         vals.append(KeyValue(key='Thermal Throttling Threshold', value=str(self._thermal_throttling_threshold)))
00587 
00588         return throt_level, throt_dict[throt_level], vals
00589 
00590 
00591     def check_idlejitter(self, interval=1):
00592         jitter_dict = {DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High IDLE Jitter',
00593                        DiagnosticStatus.ERROR: 'No valid Data from NetData'}
00594 
00595         jitter_level = DiagnosticStatus.OK
00596 
00597         vals = []
00598 
00599         netdata = self.query_netdata('system.idlejitter', interval)
00600 
00601         metric_list = [
00602             ('IDLE Jitter Min', 'min', self._idlejitter_min_threshold, np.min),
00603             ('IDLE Jitter Max', 'max', self._idlejitter_max_threshold, np.max),
00604             ('IDLE Jitter Average', 'average', self._idlejitter_average_threshold, np.mean),
00605         ]
00606 
00607         for metric_label, metric_key, metric_threshold, aggregate_fnc in metric_list:
00608             metric_aggreagte = 'N/A'
00609             if netdata is not None and metric_key in netdata:
00610                 metric_data = netdata[metric_key]
00611                 if metric_data is not None:
00612                     metric_aggreagte = aggregate_fnc(metric_data)
00613 
00614                     if metric_aggreagte > metric_threshold:
00615                         jitter_level = DiagnosticStatus.WARN
00616             else:
00617                 jitter_level = DiagnosticStatus.ERROR
00618 
00619             vals.append(KeyValue(key=metric_label, value=str(metric_aggreagte)))
00620             vals.append(KeyValue(key=metric_label + ' Threshold', value=str(metric_threshold)))
00621 
00622         vals.insert(0, KeyValue(key='IDLE Jitter Status', value=jitter_dict[jitter_level]))
00623 
00624         return jitter_level, jitter_dict[jitter_level], vals
00625 
00626 
00627     ##\brief Returns names for core temperature files
00628     def get_core_temp_names(self):
00629         devices = {}
00630         platform_vals = []
00631         virtual_vals = []
00632         try:
00633             #platform devices
00634             p = subprocess.Popen('find /sys/devices/platform -name temp*_input',
00635                                  stdout = subprocess.PIPE,
00636                                  stderr = subprocess.PIPE, shell = True)
00637             stdout, stderr = p.communicate()
00638             retcode = p.returncode
00639 
00640             if retcode != 0:
00641                 rospy.logerr('Error find core temp locations: %s' % stderr)
00642                 return []
00643 
00644             for ln in stdout.split('\n'):
00645                 if ln:
00646                     device_path, device_file = os.path.split(ln.strip())
00647                     device_label = device_path+'/'+device_file.split('_')[0]+'_label'
00648                     name = open(device_label, 'r').read()
00649                     pair = (name.strip(), ln.strip())
00650                     platform_vals.append(pair)
00651 
00652             #virtual devices
00653             p = subprocess.Popen('find /sys/devices/virtual -name temp*_input',
00654                                  stdout = subprocess.PIPE,
00655                                  stderr = subprocess.PIPE, shell = True)
00656             stdout, stderr = p.communicate()
00657             retcode = p.returncode
00658 
00659             if retcode != 0:
00660                 rospy.logerr('Error find core temp locations: %s' % stderr)
00661                 return []
00662 
00663             for ln in stdout.split('\n'):
00664                 if ln:
00665                     device_path, device_file = os.path.split(ln.strip())
00666                     name = open(device_path+'/name', 'r').read()
00667                     pair = (name.strip(), ln.strip())
00668                     virtual_vals.append(pair)
00669 
00670             devices['platform'] = platform_vals
00671             devices['virtual'] = virtual_vals
00672             return devices
00673         except:
00674             rospy.logerr('Exception finding temp vals: %s' % str(e)())
00675             return []
00676 
00677 
00678     def check_info(self, event):
00679         diag_vals = []
00680         diag_msgs = []
00681         diag_level = DiagnosticStatus.OK
00682 
00683         if self._check_ipmi:
00684             ipmi_vals, ipmi_msgs, ipmi_level = self.check_ipmi()
00685             diag_vals.extend(ipmi_vals)
00686             diag_msgs.extend(ipmi_msgs)
00687             diag_level = max(diag_level, ipmi_level)
00688 
00689         if self._check_core_temps:
00690             core_vals, core_msgs, core_level = self.check_core_temps()
00691             diag_vals.extend(core_vals)
00692             diag_msgs.extend(core_msgs)
00693             diag_level = max(diag_level, core_level)
00694 
00695         clock_vals, clock_msgs, clock_level = self.check_clock_speed()
00696         diag_vals.extend(clock_vals)
00697         diag_msgs.extend(clock_msgs)
00698         diag_level = max(diag_level, clock_level)
00699 
00700         diag_log = set(diag_msgs)
00701         if len(diag_log) > DiagnosticStatus.OK:
00702             message = ', '.join(diag_log)
00703         else:
00704             message = stat_dict[diag_level]
00705 
00706         self._info_stat.values = diag_vals
00707         self._info_stat.message = message
00708         self._info_stat.level = diag_level
00709 
00710     def check_usage(self, event):
00711         diag_vals = []
00712         diag_msgs = []
00713         diag_level = DiagnosticStatus.OK
00714 
00715         # Check mpstat
00716         mp_vals, mp_msg, mp_level = self.check_mpstat()
00717         diag_vals.extend(mp_vals)
00718         if mp_level > DiagnosticStatus.OK:
00719             diag_msgs.append(mp_msg)
00720         diag_level = max(diag_level, mp_level)
00721 
00722         # Check NetData cpu.core_throttling
00723         if self._check_thermal_throttling_events:
00724             interval = math.ceil(self._usage_timer._period.to_sec())
00725             throt_level, throt_msg, throt_vals = self.check_core_throttling(interval=interval)
00726             diag_vals.extend(throt_vals)
00727             if throt_level > 0:
00728                 diag_msgs.append(throt_msg)
00729             diag_level = max(diag_level, throt_level)
00730 
00731         # Check NetData system.idlejitter
00732         if self._check_idlejitter:
00733             interval = math.ceil(self._usage_timer._period.to_sec())
00734             jitter_level, jitter_msg, jitter_vals = self.check_idlejitter(interval=interval)
00735             diag_vals.extend(jitter_vals)
00736             if jitter_level > 0:
00737                 diag_msgs.append(jitter_msg)
00738             diag_level = max(diag_level, jitter_level)
00739 
00740         # Check uptime
00741         up_vals, up_msg, up_level = self.check_uptime()
00742         diag_vals.extend(up_vals)
00743         if up_level > DiagnosticStatus.OK:
00744             diag_msgs.append(up_msg)
00745         diag_level = max(diag_level, up_level)
00746 
00747         if diag_msgs and diag_level > DiagnosticStatus.OK:
00748             usage_msg = ', '.join(set(diag_msgs))
00749         else:
00750             usage_msg = stat_dict[diag_level]
00751 
00752         self._usage_stat.values = diag_vals
00753         self._usage_stat.message = usage_msg
00754         self._usage_stat.level = diag_level
00755 
00756     def check_memory(self, event):
00757         diag_vals = []
00758         diag_msgs = []
00759         diag_level = DiagnosticStatus.OK
00760 
00761         # Check memory
00762         mem_vals, mem_msg, mem_level = self.check_free_memory()
00763         diag_vals.extend(mem_vals)
00764         if mem_level > DiagnosticStatus.OK:
00765             diag_msgs.append(mem_msg)
00766         diag_level = max(diag_level, mem_level)
00767 
00768         if diag_msgs and diag_level > DiagnosticStatus.OK:
00769             memory_msg = ', '.join(set(diag_msgs))
00770         else:
00771             memory_msg = stat_dict[diag_level]
00772 
00773         self._memory_stat.values = diag_vals
00774         self._memory_stat.message = memory_msg
00775         self._memory_stat.level = diag_level
00776 
00777     def publish_stats(self, event):
00778         msg = DiagnosticArray()
00779         msg.header.stamp = rospy.get_rostime()
00780         msg.status.append(self._info_stat)
00781         msg.status.append(self._usage_stat)
00782         msg.status.append(self._memory_stat)
00783         self._diag_pub.publish(msg)
00784 
00785 
00786 if __name__ == '__main__':
00787     hostname = socket.gethostname()
00788 
00789     import optparse
00790     parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
00791     parser.add_option("--diag-hostname", dest="diag_hostname",
00792                       help="Computer name in diagnostics output (ex: 'c1')",
00793                       metavar="DIAG_HOSTNAME",
00794                       action="store", default = hostname)
00795     options, args = parser.parse_args(rospy.myargv())
00796 
00797     try:
00798         rospy.init_node('cpu_monitor_%s' % hostname)
00799     except rospy.exceptions.ROSInitException:
00800         print('CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr)
00801         sys.exit(0)
00802 
00803     cpu_node = CPUMonitor(hostname, options.diag_hostname)
00804     rospy.spin()


cob_monitoring
Author(s): Florian Weisshardt , Felix Messmer
autogenerated on Sun Jun 9 2019 20:20:19