00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 from __future__ import with_statement, print_function
00019
00020 import sys, os, time
00021 import subprocess
00022 import string
00023 import socket
00024 import psutil
00025 import requests
00026 import numpy as np
00027 import math
00028
00029 import rospy
00030 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00031
00032 stat_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'Warning', DiagnosticStatus.ERROR: 'Error', DiagnosticStatus.STALE: 'Stale' }
00033
00034 class CPUMonitor():
00035 def __init__(self, hostname, diag_hostname):
00036 self._check_ipmi = rospy.get_param('~check_ipmi_tool', False)
00037 self._check_core_temps = rospy.get_param('~check_core_temps', False)
00038
00039 self._core_load_warn = rospy.get_param('~core_load_warn', 90)
00040 self._core_load_error = rospy.get_param('~core_load_error', 110)
00041 self._load1_threshold = rospy.get_param('~load1_threshold', 5.0)
00042 self._load5_threshold = rospy.get_param('~load5_threshold', 3.0)
00043 self._core_temp_warn = rospy.get_param('~core_temp_warn', 90)
00044 self._core_temp_error = rospy.get_param('~core_temp_error', 95)
00045 self._mem_warn = rospy.get_param('~mem_warn', 25)
00046 self._mem_error = rospy.get_param('~mem_error', 1)
00047
00048 self._check_thermal_throttling_events = rospy.get_param('~check_thermal_throttling_events', False)
00049 self._thermal_throttling_threshold = rospy.get_param('~thermal_throttling_threshold', 1000)
00050
00051 self._check_idlejitter = rospy.get_param('~check_idlejitter', False)
00052 self._idlejitter_min_threshold = rospy.get_param('~idlejitter_min_threshold', 50000)
00053 self._idlejitter_max_threshold = rospy.get_param('~idlejitter_max_threshold', 2000000)
00054 self._idlejitter_average_threshold = rospy.get_param('~idlejitter_average_threshold', 200000)
00055
00056 if psutil.__version__ < '2.0.0':
00057 self._num_cores = rospy.get_param('~num_cores', psutil.NUM_CPUS)
00058 else:
00059 self._num_cores = rospy.get_param('~num_cores', psutil.cpu_count())
00060
00061
00062 self._temp_vals = self.get_core_temp_names()
00063
00064
00065 self._info_stat = DiagnosticStatus()
00066 self._info_stat.name = '%s CPU Info' % diag_hostname
00067 self._info_stat.level = DiagnosticStatus.WARN
00068 self._info_stat.hardware_id = hostname
00069 self._info_stat.message = 'No Data'
00070 self._info_stat.values = []
00071
00072 self._usage_stat = DiagnosticStatus()
00073 self._usage_stat.name = '%s CPU Usage' % diag_hostname
00074 self._usage_stat.level = DiagnosticStatus.WARN
00075 self._usage_stat.hardware_id = hostname
00076 self._usage_stat.message = 'No Data'
00077 self._usage_stat.values = []
00078
00079 self._memory_stat = DiagnosticStatus()
00080 self._memory_stat.name = '%s Memory Usage' % diag_hostname
00081 self._memory_stat.level = DiagnosticStatus.WARN
00082 self._memory_stat.hardware_id = hostname
00083 self._memory_stat.message = 'No Data'
00084 self._memory_stat.values = []
00085
00086 self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=1)
00087 self._publish_timer = rospy.Timer(rospy.Duration(1.0), self.publish_stats)
00088 self._info_timer = rospy.Timer(rospy.Duration(5.0), self.check_info)
00089 self._usage_timer = rospy.Timer(rospy.Duration(5.0), self.check_usage)
00090 self._memory_timer = rospy.Timer(rospy.Duration(5.0), self.check_memory)
00091
00092
00093 def check_ipmi(self):
00094 diag_vals = []
00095 diag_msgs = []
00096 diag_level = DiagnosticStatus.OK
00097
00098 try:
00099 p = subprocess.Popen('sudo ipmitool sdr',
00100 stdout = subprocess.PIPE,
00101 stderr = subprocess.PIPE, shell = True)
00102 stdout, stderr = p.communicate()
00103 retcode = p.returncode
00104
00105 if retcode != 0:
00106 diag_level = DiagnosticStatus.ERROR
00107 diag_msgs = [ 'ipmitool Error' ]
00108 diag_vals = [ KeyValue(key = 'IPMI Error', value = stderr) ,
00109 KeyValue(key = 'Output', value = stdout) ]
00110 return diag_vals, diag_msgs, diag_level
00111
00112 lines = stdout.split('\n')
00113 if len(lines) < 2:
00114 diag_vals = [ KeyValue(key = 'ipmitool status', value = 'No output') ]
00115
00116 diag_msgs = [ 'No ipmitool response' ]
00117 diag_level = DiagnosticStatus.ERROR
00118
00119 return diag_vals, diag_msgs, diag_level
00120
00121 for ln in lines:
00122 if len(ln) < 3:
00123 continue
00124
00125 words = ln.split('|')
00126 if len(words) < 3:
00127 continue
00128
00129 name = words[0].strip()
00130 ipmi_val = words[1].strip()
00131 stat_byte = words[2].strip()
00132
00133
00134 if words[0].startswith('CPU') and words[0].strip().endswith('Temp'):
00135 if words[1].strip().endswith('degrees C'):
00136 tmp = ipmi_val.rstrip(' degrees C').lstrip()
00137 if unicode(tmp).isnumeric():
00138 temperature = float(tmp)
00139 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00140
00141 cpu_name = name.split()[0]
00142 if temperature >= self._core_temp_error:
00143 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00144 diag_msgs.append('CPU Hot')
00145 elif temperature >= self._core_temp_warn:
00146 diag_level = max(diag_level, DiagnosticStatus.WARN)
00147 diag_msgs.append('CPU Warm')
00148 else:
00149 diag_vals.append(KeyValue(key = name, value = words[1]))
00150
00151
00152
00153 if name == 'MB Temp' or name == 'BP Temp' or name == 'FP Temp':
00154 if ipmi_val.endswith('degrees C'):
00155 tmp = ipmi_val.rstrip(' degrees C').lstrip()
00156 diag_vals.append(KeyValue(key = name + ' (C)', value = tmp))
00157
00158 dev_name = name.split()[0]
00159 if unicode(tmp).isnumeric():
00160 temperature = float(tmp)
00161
00162 if temperature >= 60 and temperature < 75:
00163 diag_level = max(diag_level, DiagnosticStatus.WARN)
00164 diag_msgs.append('%s Warm' % dev_name)
00165
00166 if temperature >= 75:
00167 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00168 diag_msgs.append('%s Hot' % dev_name)
00169 else:
00170 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00171 diag_msgs.append('%s Error' % dev_name)
00172 else:
00173 diag_vals.append(KeyValue(key = name, value = ipmi_val))
00174
00175
00176 if (name.startswith('CPU') and name.endswith('Fan')) or name == 'MB Fan':
00177 if ipmi_val.endswith('RPM'):
00178 rpm = ipmi_val.rstrip(' RPM').lstrip()
00179 if unicode(rpm).isnumeric():
00180 if int(rpm) == 0:
00181 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00182 diag_msgs.append('CPU Fan Off')
00183
00184 diag_vals.append(KeyValue(key = name + ' RPM', value = rpm))
00185 else:
00186 diag_vals.append(KeyValue(key = name, value = ipmi_val))
00187
00188
00189
00190 if name.startswith('CPU') and name.endswith('hot'):
00191 if ipmi_val == '0x01':
00192 diag_vals.append(KeyValue(key = name, value = 'OK'))
00193 else:
00194 diag_vals.append(KeyValue(key = name, value = 'Hot'))
00195 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00196 diag_msgs.append('CPU Hot Alarm')
00197
00198 except Exception, e:
00199 diag_level = DiagnosticStatus.ERROR
00200 diag_msgs = [ 'IPMI Exception' ]
00201 diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00202
00203 return diag_vals, diag_msgs, diag_level
00204
00205
00206
00207
00208 def check_core_temps(self):
00209 diag_vals = []
00210 diag_msgs = []
00211 diag_level = DiagnosticStatus.OK
00212
00213 try:
00214 for device_type, devices in self._temp_vals.items():
00215 for dev in devices:
00216 cmd = 'cat %s' % dev[1]
00217 p = subprocess.Popen(cmd, stdout = subprocess.PIPE,
00218 stderr = subprocess.PIPE, shell = True)
00219 stdout, stderr = p.communicate()
00220 retcode = p.returncode
00221
00222 if retcode != 0:
00223 diag_level = DiagnosticStatus.ERROR
00224 diag_msg = [ 'Core Temp Error' ]
00225 diag_vals = [ KeyValue(key = 'Core Temp Error', value = stderr),
00226 KeyValue(key = 'Output', value = stdout) ]
00227 return diag_vals, diag_msgs, diag_level
00228
00229 tmp = stdout.strip()
00230 if device_type == 'platform':
00231 if unicode(tmp).isnumeric():
00232 temp = float(tmp) / 1000
00233 diag_vals.append(KeyValue(key = 'Temp '+dev[0], value = str(temp)))
00234
00235 if temp >= self._core_temp_error:
00236 diag_level = max(diag_level, DiagnosticStatus.OK)
00237 diag_msgs.append('CPU Hot')
00238 elif temp >= self._core_temp_warn:
00239 diag_level = max(diag_level, DiagnosticStatus.OK)
00240 diag_msgs.append('CPU Warm')
00241
00242 else:
00243 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00244 diag_vals.append(KeyValue(key = 'Temp '+dev[0], value = tmp))
00245 else:
00246 diag_vals.append(KeyValue(key = 'Temp '+dev[0], value = tmp))
00247
00248 except Exception, e:
00249 diag_level = DiagnosticStatus.ERROR
00250 diag_msgs = [ 'Core Temp Exception' ]
00251 diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00252
00253 return diag_vals, diag_msgs, diag_level
00254
00255
00256 def check_clock_speed(self):
00257 diag_vals = []
00258 diag_msgs = []
00259 diag_level = DiagnosticStatus.OK
00260
00261 try:
00262
00263 p = subprocess.Popen('cat /proc/cpuinfo | grep MHz',
00264 stdout = subprocess.PIPE,
00265 stderr = subprocess.PIPE, shell = True)
00266 stdout, stderr = p.communicate()
00267 retcode = p.returncode
00268
00269 if retcode != 0:
00270 diag_level = DiagnosticStatus.ERROR
00271 diag_msgs = [ 'Clock Speed Error' ]
00272 diag_vals = [ KeyValue(key = 'Clock Speed Error', value = stderr),
00273 KeyValue(key = 'Output', value = stdout) ]
00274 return (diag_vals, diag_msgs, diag_level)
00275
00276 for index, ln in enumerate(stdout.split('\n')):
00277 words = ln.split(':')
00278 if len(words) < 2:
00279 continue
00280
00281 speed = words[1].strip().split('.')[0]
00282 diag_vals.append(KeyValue(key = 'Core %d MHz' % index, value = speed))
00283 if not unicode(speed).isnumeric():
00284
00285 diag_level = max(diag_level, DiagnosticStatus.ERROR)
00286 diag_msgs = [ 'Clock speed not numeric' ]
00287
00288
00289 p = subprocess.Popen('lscpu | grep "max MHz"',
00290 stdout = subprocess.PIPE,
00291 stderr = subprocess.PIPE, shell = True)
00292 stdout, stderr = p.communicate()
00293 retcode = p.returncode
00294
00295 if retcode != 0:
00296 diag_level = DiagnosticStatus.ERROR
00297 diag_msgs = [ 'Clock Speed Error' ]
00298 diag_vals = [ KeyValue(key = 'Clock Speed Error', value = stderr),
00299 KeyValue(key = 'Output', value = stdout) ]
00300 return (diag_vals, diag_msgs, diag_level)
00301
00302 diag_vals.append(KeyValue(key = stdout.split(':')[0].strip(), value = str(stdout.split(':')[1].strip())))
00303
00304
00305 p = subprocess.Popen('lscpu | grep "min MHz"',
00306 stdout = subprocess.PIPE,
00307 stderr = subprocess.PIPE, shell = True)
00308 stdout, stderr = p.communicate()
00309 retcode = p.returncode
00310
00311 if retcode != 0:
00312 diag_level = DiagnosticStatus.ERROR
00313 diag_msgs = [ 'Clock Speed Error' ]
00314 diag_vals = [ KeyValue(key = 'Clock Speed Error', value = stderr),
00315 KeyValue(key = 'Output', value = stdout) ]
00316 return (diag_vals, diag_msgs, diag_level)
00317
00318 diag_vals.append(KeyValue(key = stdout.split(':')[0].strip(), value = str(stdout.split(':')[1].strip())))
00319
00320 except Exception, e:
00321 diag_level = DiagnosticStatus.ERROR
00322 diag_msgs = [ 'Clock Speed Exception' ]
00323 diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00324
00325 return diag_vals, diag_msgs, diag_level
00326
00327
00328 def check_uptime(self):
00329 diag_vals = []
00330 diag_msg = ''
00331 diag_level = DiagnosticStatus.OK
00332
00333 load_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Load', DiagnosticStatus.ERROR: 'Very High Load' }
00334
00335 try:
00336 p = subprocess.Popen('uptime', stdout = subprocess.PIPE,
00337 stderr = subprocess.PIPE, shell = True)
00338 stdout, stderr = p.communicate()
00339 retcode = p.returncode
00340
00341 if retcode != 0:
00342 diag_level = DiagnosticStatus.ERROR
00343 diag_msg = 'Uptime Error'
00344 diag_vals = [ KeyValue(key = 'Uptime Error', value = stderr),
00345 KeyValue(key = 'Output', value = stdout) ]
00346 return (diag_vals, diag_msg, diag_level)
00347
00348 upvals = stdout.split()
00349 load1 = upvals[-3].rstrip(',').replace(',', '.')
00350 load5 = upvals[-2].rstrip(',').replace(',', '.')
00351 load15 = upvals[-1].replace(',', '.')
00352 num_users = upvals[-7]
00353
00354
00355 if float(load1) > self._load1_threshold or float(load5) > self._load5_threshold:
00356 diag_level = DiagnosticStatus.WARN
00357
00358 diag_vals.append(KeyValue(key = 'Load Average Status', value = load_dict[diag_level]))
00359 diag_vals.append(KeyValue(key = '1 min Load Average', value = load1))
00360 diag_vals.append(KeyValue(key = '1 min Load Average Threshold', value = str(self._load1_threshold)))
00361 diag_vals.append(KeyValue(key = '5 min Load Average', value = load5))
00362 diag_vals.append(KeyValue(key = '5 min Load Average Threshold', value = str(self._load5_threshold)))
00363 diag_vals.append(KeyValue(key = '15 min Load Average', value = load15))
00364 diag_vals.append(KeyValue(key = 'Number of Users', value = num_users))
00365
00366 diag_msg = load_dict[diag_level]
00367
00368 except Exception, e:
00369 diag_level = DiagnosticStatus.ERROR
00370 diag_msg = 'Uptime Exception'
00371 diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00372
00373 return diag_vals, diag_msg, diag_level
00374
00375
00376 def check_free_memory(self):
00377 diag_vals = []
00378 diag_msg = ''
00379 diag_level = DiagnosticStatus.OK
00380
00381 mem_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'Low Memory', DiagnosticStatus.ERROR: 'Very Low Memory' }
00382
00383 try:
00384 p = subprocess.Popen('free -m',
00385 stdout = subprocess.PIPE,
00386 stderr = subprocess.PIPE, shell = True)
00387 stdout, stderr = p.communicate()
00388 retcode = p.returncode
00389
00390 if retcode != 0:
00391 diag_level = DiagnosticStatus.ERROR
00392 diag_msg = 'Memory Usage Error'
00393 diag_vals = [ KeyValue(key = 'Memory Usage Error', value = stderr),
00394 KeyValue(key = 'Output', value = stdout) ]
00395 return (diag_vals, diag_msg, diag_level)
00396
00397 rows = stdout.split('\n')
00398
00399
00400 data = rows[1].split()
00401 total_mem = data[1]
00402 used_mem = data[2]
00403 free_mem = data[3]
00404 shared_mem = data[4]
00405 cache_mem = data[5]
00406 available_mem = data[6]
00407
00408 diag_level = DiagnosticStatus.OK
00409 if float(free_mem) < self._mem_warn:
00410 diag_level = DiagnosticStatus.WARN
00411 if float(free_mem) < self._mem_error:
00412 diag_level = DiagnosticStatus.ERROR
00413
00414 diag_vals.append(KeyValue(key = 'Mem Status', value = mem_dict[diag_level]))
00415 diag_vals.append(KeyValue(key = 'Mem Total', value = total_mem))
00416 diag_vals.append(KeyValue(key = 'Mem Used', value = used_mem))
00417 diag_vals.append(KeyValue(key = 'Mem Free', value = free_mem))
00418 diag_vals.append(KeyValue(key = 'Mem Shared', value = shared_mem))
00419 diag_vals.append(KeyValue(key = 'Mem Buff/Cache', value = cache_mem))
00420 diag_vals.append(KeyValue(key = 'Mem Available', value = available_mem))
00421
00422
00423 data = rows[2].split()
00424 total_mem = data[1]
00425 used_mem = data[2]
00426 free_mem = data[3]
00427
00428 diag_vals.append(KeyValue(key = 'Swap Total', value = total_mem))
00429 diag_vals.append(KeyValue(key = 'Swap Used', value = used_mem))
00430 diag_vals.append(KeyValue(key = 'Swap Free', value = free_mem))
00431
00432 diag_msg = mem_dict[diag_level]
00433
00434 except Exception, e:
00435 diag_level = DiagnosticStatus.ERROR
00436 diag_msg = 'Memory Usage Exception'
00437 diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00438
00439 return diag_vals, diag_msg, diag_level
00440
00441
00442 def check_mpstat(self):
00443 diag_vals = []
00444 diag_msg = ''
00445 diag_level = DiagnosticStatus.OK
00446
00447 load_dict = { DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Load', DiagnosticStatus.ERROR: 'Error' }
00448
00449 try:
00450 p = subprocess.Popen('mpstat -P ALL 1 1',
00451 stdout = subprocess.PIPE,
00452 stderr = subprocess.PIPE, shell = True)
00453 stdout, stderr = p.communicate()
00454 retcode = p.returncode
00455
00456 if retcode != 0:
00457 diag_level = DiagnosticStatus.ERROR
00458 diag_msg = 'CPU Usage Error'
00459 diag_vals = [ KeyValue(key = 'CPU Usage Error', value = stderr),
00460 KeyValue(key = 'Output', value = stdout) ]
00461 return (diag_vals, diag_msg, diag_level)
00462
00463
00464
00465 rows = stdout.split('\n')
00466 col_names = rows[2].split()
00467 idle_col = -1 if (len(col_names) > 2 and col_names[-1] == '%idle') else -2
00468
00469 num_cores = 0
00470 cores_loaded = 0
00471 for index, row in enumerate(stdout.split('\n')):
00472 if index < 3:
00473 continue
00474
00475
00476 if row.find('all') > -1:
00477 continue
00478
00479 lst = row.split()
00480 if len(lst) < 8:
00481 continue
00482
00483
00484 if lst[0].startswith('Average'):
00485 continue
00486
00487 cpu_name = '%d' % (num_cores)
00488 idle = lst[idle_col].replace(',', '.')
00489 user = lst[3].replace(',', '.')
00490 nice = lst[4].replace(',', '.')
00491 system = lst[5].replace(',', '.')
00492
00493 core_level = DiagnosticStatus.OK
00494 usage = float(user) + float(nice)
00495 if usage > self._core_load_warn:
00496 cores_loaded += 1
00497 core_level = DiagnosticStatus.WARN
00498 if usage > self._core_load_error:
00499 core_level = DiagnosticStatus.ERROR
00500
00501 diag_vals.append(KeyValue(key = 'CPU %s Status' % cpu_name, value = load_dict[core_level]))
00502 diag_vals.append(KeyValue(key = 'CPU %s User' % cpu_name, value = user))
00503 diag_vals.append(KeyValue(key = 'CPU %s Nice' % cpu_name, value = nice))
00504 diag_vals.append(KeyValue(key = 'CPU %s System' % cpu_name, value = system))
00505 diag_vals.append(KeyValue(key = 'CPU %s Idle' % cpu_name, value = idle))
00506
00507 num_cores += 1
00508
00509
00510 if num_cores - cores_loaded <= 2 and num_cores > 2:
00511 diag_level = DiagnosticStatus.WARN
00512
00513
00514 if self._num_cores > 0 and self._num_cores != num_cores:
00515 diag_level = DiagnosticStatus.ERROR
00516 diag_msg = 'Incorrect number of CPU cores: Expected %d, got %d. Computer may have not booted properly.' % self._num_cores, num_cores
00517 return diag_vals, diag_msg, diag_level
00518
00519 diag_msg = load_dict[diag_level]
00520
00521 except Exception, e:
00522 diag_level = DiagnosticStatus.ERROR
00523 diag_msg = 'CPU Usage Exception'
00524 diag_vals = [ KeyValue(key = 'Exception', value = str(e)) ]
00525
00526 return diag_vals, diag_msg, diag_level
00527
00528
00529 def query_netdata(self, chart, after):
00530 NETDATA_URI = 'http://127.0.0.1:19999/api/v1/data?chart=%s&format=json&after=-%d'
00531 url = NETDATA_URI % (chart, int(after))
00532
00533 try:
00534 r = requests.get(url)
00535 except requests.ConnectionError as ex:
00536 rospy.logerr("NetData ConnectionError %r", ex)
00537 return None
00538
00539 if r.status_code != 200:
00540 rospy.logerr("NetData request not successful with status_code %d", r.status_code)
00541 return None
00542
00543 rdata = r.json()
00544
00545 sdata = zip(*rdata['data'])
00546 d = dict()
00547
00548 for idx, label in enumerate(rdata['labels']):
00549 np_array = np.array(sdata[idx])
00550 if np_array.dtype == np.object:
00551 rospy.logwarn("Data from NetData malformed")
00552 return None
00553 d[label] = np_array
00554
00555 return d
00556
00557
00558 def check_core_throttling(self, interval=1):
00559 throt_dict = {DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High Thermal Throttling Events',
00560 DiagnosticStatus.ERROR: 'No valid Data from NetData'}
00561
00562 throt_level = DiagnosticStatus.OK
00563
00564 vals = []
00565
00566 netdata = self.query_netdata('cpu.core_throttling', interval)
00567
00568 for i in range(self._num_cores):
00569 lbl = 'CPU %d Thermal Throttling Events' % i
00570 netdata_key = 'cpu%d' % i
00571
00572 core_mean = 'N/A'
00573 if netdata is not None and netdata_key in netdata:
00574 core_data = netdata[netdata_key]
00575 if core_data is not None:
00576 core_mean = np.mean(core_data)
00577
00578 if core_mean > self._thermal_throttling_threshold:
00579 throt_level = DiagnosticStatus.WARN
00580 else:
00581 throt_level = DiagnosticStatus.ERROR
00582
00583 vals.append(KeyValue(key=lbl, value='%r' % core_mean))
00584
00585 vals.insert(0, KeyValue(key='Thermal Throttling Status', value=throt_dict[throt_level]))
00586 vals.append(KeyValue(key='Thermal Throttling Threshold', value=str(self._thermal_throttling_threshold)))
00587
00588 return throt_level, throt_dict[throt_level], vals
00589
00590
00591 def check_idlejitter(self, interval=1):
00592 jitter_dict = {DiagnosticStatus.OK: 'OK', DiagnosticStatus.WARN: 'High IDLE Jitter',
00593 DiagnosticStatus.ERROR: 'No valid Data from NetData'}
00594
00595 jitter_level = DiagnosticStatus.OK
00596
00597 vals = []
00598
00599 netdata = self.query_netdata('system.idlejitter', interval)
00600
00601 metric_list = [
00602 ('IDLE Jitter Min', 'min', self._idlejitter_min_threshold, np.min),
00603 ('IDLE Jitter Max', 'max', self._idlejitter_max_threshold, np.max),
00604 ('IDLE Jitter Average', 'average', self._idlejitter_average_threshold, np.mean),
00605 ]
00606
00607 for metric_label, metric_key, metric_threshold, aggregate_fnc in metric_list:
00608 metric_aggreagte = 'N/A'
00609 if netdata is not None and metric_key in netdata:
00610 metric_data = netdata[metric_key]
00611 if metric_data is not None:
00612 metric_aggreagte = aggregate_fnc(metric_data)
00613
00614 if metric_aggreagte > metric_threshold:
00615 jitter_level = DiagnosticStatus.WARN
00616 else:
00617 jitter_level = DiagnosticStatus.ERROR
00618
00619 vals.append(KeyValue(key=metric_label, value=str(metric_aggreagte)))
00620 vals.append(KeyValue(key=metric_label + ' Threshold', value=str(metric_threshold)))
00621
00622 vals.insert(0, KeyValue(key='IDLE Jitter Status', value=jitter_dict[jitter_level]))
00623
00624 return jitter_level, jitter_dict[jitter_level], vals
00625
00626
00627
00628 def get_core_temp_names(self):
00629 devices = {}
00630 platform_vals = []
00631 virtual_vals = []
00632 try:
00633
00634 p = subprocess.Popen('find /sys/devices/platform -name temp*_input',
00635 stdout = subprocess.PIPE,
00636 stderr = subprocess.PIPE, shell = True)
00637 stdout, stderr = p.communicate()
00638 retcode = p.returncode
00639
00640 if retcode != 0:
00641 rospy.logerr('Error find core temp locations: %s' % stderr)
00642 return []
00643
00644 for ln in stdout.split('\n'):
00645 if ln:
00646 device_path, device_file = os.path.split(ln.strip())
00647 device_label = device_path+'/'+device_file.split('_')[0]+'_label'
00648 name = open(device_label, 'r').read()
00649 pair = (name.strip(), ln.strip())
00650 platform_vals.append(pair)
00651
00652
00653 p = subprocess.Popen('find /sys/devices/virtual -name temp*_input',
00654 stdout = subprocess.PIPE,
00655 stderr = subprocess.PIPE, shell = True)
00656 stdout, stderr = p.communicate()
00657 retcode = p.returncode
00658
00659 if retcode != 0:
00660 rospy.logerr('Error find core temp locations: %s' % stderr)
00661 return []
00662
00663 for ln in stdout.split('\n'):
00664 if ln:
00665 device_path, device_file = os.path.split(ln.strip())
00666 name = open(device_path+'/name', 'r').read()
00667 pair = (name.strip(), ln.strip())
00668 virtual_vals.append(pair)
00669
00670 devices['platform'] = platform_vals
00671 devices['virtual'] = virtual_vals
00672 return devices
00673 except:
00674 rospy.logerr('Exception finding temp vals: %s' % str(e)())
00675 return []
00676
00677
00678 def check_info(self, event):
00679 diag_vals = []
00680 diag_msgs = []
00681 diag_level = DiagnosticStatus.OK
00682
00683 if self._check_ipmi:
00684 ipmi_vals, ipmi_msgs, ipmi_level = self.check_ipmi()
00685 diag_vals.extend(ipmi_vals)
00686 diag_msgs.extend(ipmi_msgs)
00687 diag_level = max(diag_level, ipmi_level)
00688
00689 if self._check_core_temps:
00690 core_vals, core_msgs, core_level = self.check_core_temps()
00691 diag_vals.extend(core_vals)
00692 diag_msgs.extend(core_msgs)
00693 diag_level = max(diag_level, core_level)
00694
00695 clock_vals, clock_msgs, clock_level = self.check_clock_speed()
00696 diag_vals.extend(clock_vals)
00697 diag_msgs.extend(clock_msgs)
00698 diag_level = max(diag_level, clock_level)
00699
00700 diag_log = set(diag_msgs)
00701 if len(diag_log) > DiagnosticStatus.OK:
00702 message = ', '.join(diag_log)
00703 else:
00704 message = stat_dict[diag_level]
00705
00706 self._info_stat.values = diag_vals
00707 self._info_stat.message = message
00708 self._info_stat.level = diag_level
00709
00710 def check_usage(self, event):
00711 diag_vals = []
00712 diag_msgs = []
00713 diag_level = DiagnosticStatus.OK
00714
00715
00716 mp_vals, mp_msg, mp_level = self.check_mpstat()
00717 diag_vals.extend(mp_vals)
00718 if mp_level > DiagnosticStatus.OK:
00719 diag_msgs.append(mp_msg)
00720 diag_level = max(diag_level, mp_level)
00721
00722
00723 if self._check_thermal_throttling_events:
00724 interval = math.ceil(self._usage_timer._period.to_sec())
00725 throt_level, throt_msg, throt_vals = self.check_core_throttling(interval=interval)
00726 diag_vals.extend(throt_vals)
00727 if throt_level > 0:
00728 diag_msgs.append(throt_msg)
00729 diag_level = max(diag_level, throt_level)
00730
00731
00732 if self._check_idlejitter:
00733 interval = math.ceil(self._usage_timer._period.to_sec())
00734 jitter_level, jitter_msg, jitter_vals = self.check_idlejitter(interval=interval)
00735 diag_vals.extend(jitter_vals)
00736 if jitter_level > 0:
00737 diag_msgs.append(jitter_msg)
00738 diag_level = max(diag_level, jitter_level)
00739
00740
00741 up_vals, up_msg, up_level = self.check_uptime()
00742 diag_vals.extend(up_vals)
00743 if up_level > DiagnosticStatus.OK:
00744 diag_msgs.append(up_msg)
00745 diag_level = max(diag_level, up_level)
00746
00747 if diag_msgs and diag_level > DiagnosticStatus.OK:
00748 usage_msg = ', '.join(set(diag_msgs))
00749 else:
00750 usage_msg = stat_dict[diag_level]
00751
00752 self._usage_stat.values = diag_vals
00753 self._usage_stat.message = usage_msg
00754 self._usage_stat.level = diag_level
00755
00756 def check_memory(self, event):
00757 diag_vals = []
00758 diag_msgs = []
00759 diag_level = DiagnosticStatus.OK
00760
00761
00762 mem_vals, mem_msg, mem_level = self.check_free_memory()
00763 diag_vals.extend(mem_vals)
00764 if mem_level > DiagnosticStatus.OK:
00765 diag_msgs.append(mem_msg)
00766 diag_level = max(diag_level, mem_level)
00767
00768 if diag_msgs and diag_level > DiagnosticStatus.OK:
00769 memory_msg = ', '.join(set(diag_msgs))
00770 else:
00771 memory_msg = stat_dict[diag_level]
00772
00773 self._memory_stat.values = diag_vals
00774 self._memory_stat.message = memory_msg
00775 self._memory_stat.level = diag_level
00776
00777 def publish_stats(self, event):
00778 msg = DiagnosticArray()
00779 msg.header.stamp = rospy.get_rostime()
00780 msg.status.append(self._info_stat)
00781 msg.status.append(self._usage_stat)
00782 msg.status.append(self._memory_stat)
00783 self._diag_pub.publish(msg)
00784
00785
00786 if __name__ == '__main__':
00787 hostname = socket.gethostname()
00788
00789 import optparse
00790 parser = optparse.OptionParser(usage="usage: cpu_monitor.py [--diag-hostname=cX]")
00791 parser.add_option("--diag-hostname", dest="diag_hostname",
00792 help="Computer name in diagnostics output (ex: 'c1')",
00793 metavar="DIAG_HOSTNAME",
00794 action="store", default = hostname)
00795 options, args = parser.parse_args(rospy.myargv())
00796
00797 try:
00798 rospy.init_node('cpu_monitor_%s' % hostname)
00799 except rospy.exceptions.ROSInitException:
00800 print('CPU monitor is unable to initialize node. Master may not be running.', file=sys.stderr)
00801 sys.exit(0)
00802
00803 cpu_node = CPUMonitor(hostname, options.diag_hostname)
00804 rospy.spin()