net_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 
40 
41 # copied from https://github.com/ethz-asl/ros-system-monitor
42 
43 
44 
45 import rospy
46 
47 import traceback
48 import threading
49 import subprocess
50 import re
51 
52 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
53 
# Fallback status text, indexed by diagnostic level, used by
# NetMonitor.check_usage when no more specific message is available.
stat_dict = {
    0: 'OK',
    1: 'Warning',
    2: 'Error',
}
55 
56 
class NetMonitor(object):
    """ROS node publishing per-interface network diagnostics.

    Once per second the node samples traffic with ``ifstat``, reads a set of
    attributes and counters from ``/sys/class/net/<iface>/`` and stores the
    result in a single ``DiagnosticStatus``; a second timer publishes that
    status on ``/diagnostics``.
    """

    # Shell command used to sample traffic; the same text appears in the
    # 'Call Error' diagnostic key emitted by check_network.
    _IFSTAT_CMD = 'ifstat -q -S 1 1'

    # Interfaces for which an operstate of 'down'/'dormant' is an error.
    _WIRED_IFACE_RE = re.compile(r'(?:eth|eno)[0-9]+')

    # Byte counters from statistics/, published in MB under a readable key.
    _BYTE_STAT_KEYS = (
        ('Total received MB', 'rx_bytes'),
        ('Total transmitted MB', 'tx_bytes'),
    )

    # Counters from statistics/ that are published verbatim, in this order.
    # See https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-net-statistics
    _RAW_STAT_KEYS = (
        'collisions',
        'rx_errors',
        'rx_crc_errors',
        'rx_dropped',
        'rx_fifo_errors',
        'rx_frame_errors',
        'rx_length_errors',
        'rx_missed_errors',
        'rx_over_errors',
        'rx_packets',
        'tx_errors',
        'tx_aborted_errors',
        'tx_carrier_errors',
        'tx_fifo_errors',
        'tx_heartbeat_errors',
        'tx_window_errors',
        'tx_dropped',
        'tx_packets',
    )

    # Attributes read after 'carrier_changes' and published verbatim.
    # See https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-net
    _TRAILING_ATTR_KEYS = (
        'carrier_up_count',
        'carrier_down_count',
        'speed',
        'tx_queue_len',
    )

    # Human readable texts for the numeric sysfs values; unknown values are
    # published unchanged.
    _ADDR_ASSIGN_TYPES = {
        '0': 'permanent address',
        '1': 'randomly generated',
        '2': 'stolen from another device',
        '3': 'set using dev_set_mac_address',
    }
    _CARRIER_STATES = {
        '0': 'physical link is down',
        '1': 'physical link is up',
    }

    def __init__(self):
        """Initialise the node, read its parameters and start both timers."""
        rospy.init_node("net_monitor")
        self._mutex = threading.Lock()  # guards self._usage_stat
        self._diag_hostname = rospy.get_param('~diag_hostname', "localhost")
        # Warn when traffic / net_capacity exceeds this ratio.
        self._net_level_warn = rospy.get_param('~net_level_warn', 0.95)
        # Compared against the traffic figure that is reported as "(MB/s)".
        self._net_capacity = rospy.get_param('~net_capacity', 128)
        self._carrier_changes_threshold = rospy.get_param('~carrier_changes_threshold', 20)

        self._usage_stat = DiagnosticStatus()
        self._usage_stat.name = '%s Network Usage' % self._diag_hostname
        self._usage_stat.hardware_id = self._diag_hostname
        self._usage_stat.level = DiagnosticStatus.OK
        self._usage_stat.message = 'No Data'

        # Maps sysfs paths to their open file objects. Created before the
        # timers so that a callback can never observe a missing attribute.
        self._filehandles = {}

        self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=1)
        self._usage_timer = rospy.Timer(rospy.Duration(1), self.check_usage)
        self._diag_timer = rospy.Timer(rospy.Duration(1), self.publish_stats)

    def read_sysinfo(self, path):
        """Return the stripped first line of ``path``.

        The file object is cached and rewound on every call. If opening or
        reading fails, the cached handle is closed and forgotten so that the
        next call re-opens the file instead of failing on a dead handle.

        :param path: absolute path of the file to read
        :returns: first line of the file without surrounding whitespace
        :raises IOError: if the file cannot be opened or read
        """
        fh = self._filehandles.get(path)
        try:
            if fh is None:
                fh = open(path)
                self._filehandles[path] = fh
            fh.seek(0)
            return fh.readline().strip()
        except IOError:
            stale = self._filehandles.pop(path, None)
            if stale is not None:
                try:
                    stale.close()
                except IOError:
                    # Closing is best effort; the original error is re-raised.
                    pass
            raise

    def get_sys_net_stat(self, iface, sys):
        """Read ``/sys/class/net/<iface>/statistics/<sys>``.

        :returns: ``(0, value)`` on success, ``(-1, None)`` on ``IOError``
        """
        try:
            return 0, self.read_sysinfo('/sys/class/net/%s/statistics/%s' % (iface, sys))
        except IOError:
            return -1, None

    def get_sys_net(self, iface, sys):
        """Read ``/sys/class/net/<iface>/<sys>``.

        :returns: ``(0, value)`` on success, ``(-1, None)`` on ``IOError``
        """
        try:
            return 0, self.read_sysinfo('/sys/class/net/%s/%s' % (iface, sys))
        except IOError:
            return -1, None

    @staticmethod
    def _escalate(level, msg, new_level, new_msg):
        """Merge a new finding into the current ``(level, msg)`` pair.

        The finding replaces the current pair only if it is at least as
        severe, so a warning can never hide an error reported earlier.

        :returns: the resulting ``(level, msg)`` tuple
        """
        if new_level >= level:
            return new_level, new_msg
        return level, msg

    def _check_interface(self, index, iface, kb_in, kb_out, values):
        """Append all diagnostics of one interface to ``values``.

        :param index: position of the interface in the ifstat header
        :param iface: interface name
        :param kb_in: incoming traffic field from ifstat, or ``'n/a'``
        :param kb_out: outgoing traffic field from ifstat, or ``'n/a'``
        :param values: list of ``KeyValue`` that is extended in place
        :returns: list of ``(level, message)`` findings in detection order
        """
        issues = []
        values.append(KeyValue(key=str(index), value="======================="))
        values.append(KeyValue(key='Interface Name', value=iface))

        ret_code, cmd_out = self.get_sys_net(iface, 'operstate')
        if ret_code == 0:
            values.append(KeyValue(key='State', value=cmd_out))
            if self._WIRED_IFACE_RE.match(iface) and cmd_out in ('down', 'dormant'):
                issues.append((DiagnosticStatus.ERROR, 'Network Down'))

        # Traffic is only reported when ifstat delivered a number; the usage
        # ratio then stays at 0 and cannot trigger the warning.
        net_usage_in = 0
        if kb_in != 'n/a':
            mb_in = float(kb_in) / 1024
            values.append(KeyValue(key='Input Traffic', value=str(mb_in) + " (MB/s)"))
            net_usage_in = mb_in / self._net_capacity
        net_usage_out = 0
        if kb_out != 'n/a':
            mb_out = float(kb_out) / 1024
            values.append(KeyValue(key='Output Traffic', value=str(mb_out) + " (MB/s)"))
            net_usage_out = mb_out / self._net_capacity
        if net_usage_in > self._net_level_warn or net_usage_out > self._net_level_warn:
            issues.append((
                DiagnosticStatus.WARN,
                'High Network Usage (net_usage_in: {}, net_usage_out: {}, threshold: {})'.format(
                    net_usage_in, net_usage_out, self._net_level_warn)))

        ret_code, cmd_out = self.get_sys_net(iface, 'mtu')
        if ret_code == 0:
            values.append(KeyValue(key='MTU', value=cmd_out))

        for key, stat in self._BYTE_STAT_KEYS:
            ret_code, cmd_out = self.get_sys_net_stat(iface, stat)
            if ret_code == 0:
                values.append(KeyValue(key=key, value=str(float(cmd_out) / 1024 / 1024)))

        for stat in self._RAW_STAT_KEYS:
            ret_code, cmd_out = self.get_sys_net_stat(iface, stat)
            if ret_code == 0:
                values.append(KeyValue(key=stat, value=cmd_out))

        ret_code, cmd_out = self.get_sys_net(iface, 'addr_assign_type')
        if ret_code == 0:
            values.append(KeyValue(key='addr_assign_type',
                                   value=self._ADDR_ASSIGN_TYPES.get(cmd_out, cmd_out)))

        ret_code, cmd_out = self.get_sys_net(iface, 'address')
        if ret_code == 0:
            values.append(KeyValue(key='address', value=cmd_out))

        ret_code, cmd_out = self.get_sys_net(iface, 'carrier')
        if ret_code == 0:
            values.append(KeyValue(key='carrier',
                                   value=self._CARRIER_STATES.get(cmd_out, cmd_out)))

        ret_code, cmd_out = self.get_sys_net(iface, 'carrier_changes')
        if ret_code == 0:
            values.append(KeyValue(key='carrier_changes', value=cmd_out))
            if int(cmd_out) > self._carrier_changes_threshold:
                issues.append((
                    DiagnosticStatus.WARN,
                    'Network unstable (carrier_changes: {}, threshold: {})'.format(
                        cmd_out, self._carrier_changes_threshold)))

        for attr in self._TRAILING_ATTR_KEYS:
            ret_code, cmd_out = self.get_sys_net(iface, attr)
            if ret_code == 0:
                values.append(KeyValue(key=attr, value=cmd_out))

        return issues

    def check_network(self):
        """Sample all interfaces and build the diagnostic result.

        Runs ``ifstat`` once, then collects the sysfs values of every
        interface listed in its header row.

        :returns: ``(level, message, values)`` where ``values`` is a list of
                  ``KeyValue``. Any exception is logged and reported as an
                  ERROR result instead of being raised.
        """
        level = DiagnosticStatus.OK
        net_msg = 'OK'
        values = []
        try:
            p = subprocess.Popen(self._IFSTAT_CMD,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE, shell=True)
            stdout, _ = p.communicate()
            ret_code = p.returncode
            try:
                stdout = stdout.decode()  # python3
            except (UnicodeDecodeError, AttributeError):
                pass

            if ret_code != 0:
                values.append(KeyValue(key="\"ifstat -q -S 1 1\" Call Error",
                                       value=str(ret_code)))
                return DiagnosticStatus.ERROR, 'Call Error', values

            # Row 0 holds the interface names, row 2 the in/out pair of each
            # interface in the same order.
            rows = stdout.split('\n')
            ifaces = rows[0].split()
            data = rows[2].split()
            kb_in = data[0::2]
            kb_out = data[1::2]
            if len(kb_in) < len(ifaces) or len(kb_out) < len(ifaces):
                raise ValueError('unexpected ifstat output: %d interfaces but %d traffic fields'
                                 % (len(ifaces), len(data)))

            for index, iface in enumerate(ifaces):
                issues = self._check_interface(index, iface, kb_in[index], kb_out[index], values)
                for new_level, new_msg in issues:
                    level, net_msg = self._escalate(level, net_msg, new_level, new_msg)
        except Exception as e:
            # Top-level boundary of the timer callback: report, do not raise.
            rospy.logerr(traceback.format_exc())
            net_msg = 'Network Usage Check Error'
            values.append(KeyValue(key=net_msg, value=str(e)))
            values.append(KeyValue(key='Traceback', value=str(traceback.format_exc())))
            level = DiagnosticStatus.ERROR
        return level, net_msg, values

    def check_usage(self, _):
        """Timer callback: refresh the stored status from ``check_network``.

        :param _: timer event, unused
        """
        net_level, net_msg, net_vals = self.check_network()
        diag_level = max(DiagnosticStatus.OK, net_level)
        if net_level > DiagnosticStatus.OK:
            usage_msg = net_msg
        else:
            usage_msg = stat_dict[diag_level]
        with self._mutex:
            self._usage_stat.level = diag_level
            self._usage_stat.values = list(net_vals)
            self._usage_stat.message = usage_msg

    def publish_stats(self, _):
        """Timer callback: publish the stored status on ``/diagnostics``.

        :param _: timer event, unused
        """
        with self._mutex:
            msg = DiagnosticArray()
            msg.header.stamp = rospy.get_rostime()
            msg.status.append(self._usage_stat)
            self._diag_pub.publish(msg)
284 
285 
def main():
    """Create the monitor node and block until ROS shuts down."""
    # Constructing NetMonitor initialises the node and starts its timers;
    # the reference is held for the lifetime of the process.
    net_node = NetMonitor()
    rospy.spin()


if __name__ == '__main__':
    main()
net_monitor.NetMonitor._diag_pub
_diag_pub
Definition: net_monitor.py:71
net_monitor.NetMonitor._net_level_warn
_net_level_warn
Definition: net_monitor.py:62
net_monitor.NetMonitor.check_usage
def check_usage(self, _)
Definition: net_monitor.py:260
net_monitor.NetMonitor._usage_timer
_usage_timer
Definition: net_monitor.py:72
net_monitor.NetMonitor._usage_stat
_usage_stat
Definition: net_monitor.py:65
net_monitor.NetMonitor.get_sys_net_stat
def get_sys_net_stat(self, iface, sys)
Definition: net_monitor.py:85
net_monitor.NetMonitor._filehandles
_filehandles
Definition: net_monitor.py:75
net_monitor.NetMonitor._diag_hostname
_diag_hostname
Definition: net_monitor.py:61
net_monitor.NetMonitor.publish_stats
def publish_stats(self, _)
Definition: net_monitor.py:278
net_monitor.NetMonitor.check_network
def check_network(self)
Definition: net_monitor.py:97
net_monitor.NetMonitor._carrier_changes_threshold
_carrier_changes_threshold
Definition: net_monitor.py:64
net_monitor.NetMonitor.get_sys_net
def get_sys_net(self, iface, sys)
Definition: net_monitor.py:91
net_monitor.NetMonitor
Definition: net_monitor.py:57
net_monitor.NetMonitor._diag_timer
_diag_timer
Definition: net_monitor.py:73
net_monitor.NetMonitor.read_sysinfo
def read_sysinfo(self, path)
Definition: net_monitor.py:77
net_monitor.NetMonitor._mutex
_mutex
Definition: net_monitor.py:60
net_monitor.NetMonitor._net_capacity
_net_capacity
Definition: net_monitor.py:63
net_monitor.NetMonitor.__init__
def __init__(self)
Definition: net_monitor.py:58


cob_monitoring
Author(s): Florian Weisshardt , Felix Messmer
autogenerated on Fri Aug 2 2024 09:45:52