00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024
00025
00026
00027
00028
00029
00030
00031
00032
00033
00034
00035
00036
00037 from __future__ import division
00038
00039 PKG = 'pr2_computer_monitor'
00040 import roslib; roslib.load_manifest(PKG)
00041
00042 import rospy
00043
00044 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
00045 from pr2_msgs.msg import GPUStatus
00046
00047 import subprocess
00048 import math
00049
00050 MAX_FAN_RPM = 4500
00051
00052 def _rads_to_rpm(rads):
00053 return rads / (2 * math.pi) * 60
00054
00055 def _rpm_to_rads(rpm):
00056 return rpm * (2 * math.pi) / 60
00057
00058 def gpu_status_to_diag(gpu_stat):
00059 stat = DiagnosticStatus()
00060 stat.name = 'GPU Status'
00061 stat.message = 'OK'
00062 stat.level = DiagnosticStatus.OK
00063 stat.hardware_id = gpu_stat.pci_device_id
00064
00065 stat.values.append(KeyValue(key='Product Name', value = gpu_stat.product_name))
00066 stat.values.append(KeyValue(key='PCI Device/Vendor ID', value = gpu_stat.pci_device_id))
00067 stat.values.append(KeyValue(key='PCI Location ID', value = gpu_stat.pci_location))
00068 stat.values.append(KeyValue(key='Display', value = gpu_stat.display))
00069 stat.values.append(KeyValue(key='Driver Version', value = gpu_stat.driver_version))
00070 stat.values.append(KeyValue(key='Temperature (C)', value = '%.0f' % gpu_stat.temperature))
00071 stat.values.append(KeyValue(key='Fan Speed (RPM)', value = '%.0f' % _rads_to_rpm(gpu_stat.fan_speed)))
00072 stat.values.append(KeyValue(key='Usage (%)', value = '%.0f' % gpu_stat.gpu_usage))
00073 stat.values.append(KeyValue(key='Memory (%)', value = '%.0f' % gpu_stat.memory_usage))
00074
00075
00076 if not gpu_stat.product_name or not gpu_stat.pci_device_id:
00077 stat.level = DiagnosticStatus.ERROR
00078 stat.message = 'No Device Data'
00079 return stat
00080
00081
00082 if gpu_stat.gpu_usage > 98:
00083 stat.level = max(stat.level, DiagnosticStatus.WARN)
00084 stat.message = 'High Load'
00085
00086
00087 if gpu_stat.temperature > 90:
00088 stat.level = max(stat.level, DiagnosticStatus.WARN)
00089 stat.message = 'High Temperature'
00090 if gpu_stat.temperature > 95:
00091 stat.level = max(stat.level, DiagnosticStatus.ERROR)
00092 stat.message = 'Temperature Alarm'
00093
00094
00095 if gpu_stat.fan_speed == 0:
00096 stat.level = max(stat.level, DiagnosticStatus.ERROR)
00097 stat.message = 'No Fan Speed'
00098
00099
00100
00101 return stat
00102
00103
00104 def _find_val(output, word):
00105 lines = output.split('\n')
00106 for line in lines:
00107 tple = line.split(':')
00108 if not len(tple) > 1:
00109 continue
00110
00111 name = tple[0].strip()
00112 val = ':'.join(tple[1:]).strip()
00113
00114 if not name.lower() == word.lower():
00115 continue
00116
00117 return val.strip()
00118
00119 return ''
00120
00121 def parse_smi_output(output):
00122 gpu_stat = GPUStatus()
00123
00124
00125 gpu_stat.product_name = _find_val(output, 'Product Name')
00126 gpu_stat.pci_device_id = _find_val(output, 'PCI Device/Vendor ID')
00127 gpu_stat.pci_location = _find_val(output, 'PCI Location ID')
00128 gpu_stat.display = _find_val(output, 'Display')
00129 gpu_stat.driver_version = _find_val(output, 'Driver Version')
00130
00131 temp_str = _find_val(output, 'Temperature')
00132 if temp_str:
00133 temp, units = temp_str.split()
00134 gpu_stat.temperature = int(temp)
00135
00136 fan_str = _find_val(output, 'Fan Speed')
00137 if fan_str:
00138
00139 fan_spd = float(fan_str.strip('\%').strip()) * 0.01 * MAX_FAN_RPM
00140
00141 gpu_stat.fan_speed = _rpm_to_rads(fan_spd)
00142
00143 usage_str = _find_val(output, 'GPU')
00144 if usage_str:
00145 usage = usage_str.strip('\%').strip()
00146 gpu_stat.gpu_usage = int(usage)
00147
00148 mem_str = _find_val(output, 'Memory')
00149 if mem_str:
00150 mem = mem_str.strip('\%').strip()
00151 gpu_stat.memory_usage = int(mem)
00152
00153 return gpu_stat
00154
00155 def get_gpu_status():
00156 p = subprocess.Popen('sudo nvidia-smi -a', stdout = subprocess.PIPE,
00157 stderr = subprocess.PIPE, shell = True)
00158 (o, e) = p.communicate()
00159
00160 if not p.returncode == 0:
00161 return ''
00162
00163 if not o: return ''
00164
00165 return o