net_monitor.py
Go to the documentation of this file.
1 #!/usr/bin/env python
2 ############################################################################
3 # Copyright (C) 2009, Willow Garage, Inc. #
4 # Copyright (C) 2013 by Ralf Kaestner #
5 # ralf.kaestner@gmail.com #
6 # Copyright (C) 2013 by Jerome Maye #
7 # jerome.maye@mavt.ethz.ch #
8 # #
9 # All rights reserved. #
10 # #
11 # Redistribution and use in source and binary forms, with or without #
12 # modification, are permitted provided that the following conditions #
13 # are met: #
14 # #
15 # 1. Redistributions of source code must retain the above copyright #
16 # notice, this list of conditions and the following disclaimer. #
17 # #
18 # 2. Redistributions in binary form must reproduce the above copyright #
19 # notice, this list of conditions and the following disclaimer in #
20 # the documentation and/or other materials provided with the #
21 # distribution. #
22 # #
23 # 3. The name of the copyright holders may be used to endorse or #
24 # promote products derived from this software without specific #
25 # prior written permission. #
26 # #
27 # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS #
28 # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT #
29 # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS #
30 # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE #
31 # COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, #
32 # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, #
33 # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; #
34 # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER #
35 # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT #
36 # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN #
37 # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE #
38 # POSSIBILITY OF SUCH DAMAGE. #
39 ############################################################################
40 
41 # copied from https://github.com/ethz-asl/ros-system-monitor
42 
43 
44 
45 import rospy
46 
47 import traceback
48 import threading
49 import subprocess
50 import re
51 
52 from diagnostic_msgs.msg import DiagnosticArray, DiagnosticStatus, KeyValue
53 
# Fallback status text keyed by DiagnosticStatus level; used by
# NetMonitor.check_usage when the network check supplies no message of its own.
stat_dict = {
    0: 'OK',
    1: 'Warning',
    2: 'Error',
}
55 
56 
class NetMonitor(object):
    """ROS node that publishes per-interface network diagnostics.

    One timer samples ``ifstat`` and reads a set of attributes from
    ``/sys/class/net/<iface>/`` once per second, storing the result in a
    single ``DiagnosticStatus``. A second one-second timer publishes that
    status on ``/diagnostics``.

    ROS parameters (private namespace):
        ~diag_hostname: name used in the status name and as hardware id
            (default ``"localhost"``).
        ~net_level_warn: usage fraction above which a warning is raised
            (default ``0.95``).
        ~net_capacity: value the measured traffic is divided by to obtain the
            usage fraction; traffic is reported in MB/s, so presumably the
            link capacity in MB/s -- confirm against deployment
            (default ``128``).
        ~carrier_changes_threshold: ``carrier_changes`` count above which the
            link is reported as unstable (default ``20``).
    """

    # Shell command that takes a single one-second throughput sample.
    _IFSTAT_CMD = 'ifstat -q -S 1 1'

    # Interface names for which a 'down'/'dormant' operstate is an error.
    # Matched at the start of the name only (same as re.match on a string).
    _WIRED_IFACE_RE = re.compile(r'(?:eth|eno)[0-9]+')

    # Counters below /sys/class/net/<iface>/statistics/ that are published
    # verbatim, in this order
    # (https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-net-statistics).
    _RAW_STAT_KEYS = (
        'collisions',
        'rx_errors',
        'rx_crc_errors',
        'rx_dropped',
        'rx_fifo_errors',
        'rx_frame_errors',
        'rx_length_errors',
        'rx_missed_errors',
        'rx_over_errors',
        'rx_packets',
        'tx_errors',
        'tx_aborted_errors',
        'tx_carrier_errors',
        'tx_fifo_errors',
        'tx_heartbeat_errors',
        'tx_window_errors',
        'tx_dropped',
        'tx_packets',
    )

    # Human-readable text for selected /sys/class/net/<iface>/ attributes
    # (https://www.kernel.org/doc/Documentation/ABI/testing/sysfs-class-net).
    # Values that are not listed are published unchanged.
    _ADDR_ASSIGN_TYPES = {
        '0': 'permanent address',
        '1': 'randomly generated',
        '2': 'stolen from another device',
        '3': 'set using dev_set_mac_address',
    }
    _CARRIER_STATES = {
        '0': 'physical link is down',
        '1': 'physical link is up',
    }

    def __init__(self):
        """Initialise the ROS node, read its parameters and start the timers."""
        rospy.init_node("net_monitor")
        self._mutex = threading.Lock()  # held while _usage_stat is updated or published
        self._diag_hostname = rospy.get_param('~diag_hostname', "localhost")
        self._net_level_warn = rospy.get_param('~net_level_warn', 0.95)
        self._net_capacity = rospy.get_param('~net_capacity', 128)
        self._carrier_changes_threshold = rospy.get_param('~carrier_changes_threshold', 20)

        # Maps a sysfs path to its open file handle so that every poll can
        # re-read the file without re-opening it. Created before the timers
        # below because their callbacks use it.
        self._filehandles = {}

        self._usage_stat = DiagnosticStatus()
        self._usage_stat.name = '%s Network Usage' % self._diag_hostname
        self._usage_stat.hardware_id = self._diag_hostname
        self._usage_stat.level = DiagnosticStatus.OK
        self._usage_stat.message = 'No Data'

        self._diag_pub = rospy.Publisher('/diagnostics', DiagnosticArray, queue_size=1)
        self._usage_timer = rospy.Timer(rospy.Duration(1), self.check_usage)
        self._diag_timer = rospy.Timer(rospy.Duration(1), self.publish_stats)

    def read_sysinfo(self, path):
        """Return the first line of ``path`` with surrounding whitespace removed.

        The file is opened on first use and the handle is kept for later
        calls, which rewind it and read again. If reading from a cached
        handle fails, the handle is closed and forgotten so that the next
        call opens the path afresh instead of failing forever.

        Raises:
            IOError: if the file cannot be opened or read.
        """
        fh = self._filehandles.get(path)
        if fh is None:
            fh = open(path)
            self._filehandles[path] = fh
        try:
            fh.seek(0)
            return fh.readline().strip()
        except (IOError, OSError):
            self._filehandles.pop(path, None)
            fh.close()
            raise

    def get_sys_net_stat(self, iface, sys):
        """Read ``/sys/class/net/<iface>/statistics/<sys>``.

        Returns:
            ``(0, value)`` on success, ``(-1, None)`` if the read raised
            ``IOError``.
        """
        try:
            return 0, self.read_sysinfo('/sys/class/net/%s/statistics/%s' % (iface, sys))
        except IOError:
            return -1, None

    def get_sys_net(self, iface, sys):
        """Read ``/sys/class/net/<iface>/<sys>``.

        Returns:
            ``(0, value)`` on success, ``(-1, None)`` if the read raised
            ``IOError``.
        """
        try:
            return 0, self.read_sysinfo('/sys/class/net/%s/%s' % (iface, sys))
        except IOError:
            return -1, None

    @staticmethod
    def _escalate(level, msg, new_level, new_msg):
        """Return the ``(level, message)`` pair to keep after a new finding.

        The new finding wins when it is at least as severe as the current
        one; a less severe finding never replaces a more severe one.
        """
        if new_level >= level:
            return new_level, new_msg
        return level, msg

    @staticmethod
    def _bytes_to_mb(raw):
        """Convert a byte count given as text into a text number of MB."""
        return str(float(raw) / 1024 / 1024)

    def _append_attr(self, values, reader, iface, attr, key=None, convert=None):
        """Read one interface attribute and append it to ``values``.

        Args:
            values: list of ``KeyValue`` that is extended in place.
            reader: ``get_sys_net`` or ``get_sys_net_stat``.
            iface: interface name.
            attr: attribute (file) name to read.
            key: key to publish under; defaults to ``attr``.
            convert: optional callable turning the raw text into the
                published text.

        Returns:
            The raw text that was read, or ``None`` if the attribute could
            not be read (in which case nothing is appended).
        """
        ret_code, raw = reader(iface, attr)
        if ret_code != 0:
            return None
        shown = raw if convert is None else convert(raw)
        values.append(KeyValue(key=key or attr, value=shown))
        return raw

    def _append_traffic(self, values, key, kb_per_s):
        """Publish one traffic reading and return the usage fraction.

        ``kb_per_s`` is one ifstat value as text. ``'n/a'`` publishes
        nothing and counts as zero usage; otherwise the value divided by
        1024 is appended with an ``(MB/s)`` suffix and that number divided
        by ``net_capacity`` is returned.
        """
        if kb_per_s == 'n/a':
            return 0
        mb_per_s = float(kb_per_s) / 1024
        values.append(KeyValue(key=key, value=str(mb_per_s) + " (MB/s)"))
        return mb_per_s / self._net_capacity

    def check_network(self):
        """Sample all interfaces and build the diagnostic content.

        Returns:
            ``(level, message, values)`` where ``level`` is a
            ``DiagnosticStatus`` level, ``message`` describes the most
            severe finding (``'OK'`` if none) and ``values`` is the list of
            ``KeyValue`` entries collected for all interfaces.

        Any exception raised while sampling is logged and reported as an
        ERROR status instead of being propagated.
        """
        level = DiagnosticStatus.OK
        net_msg = 'OK'
        values = []
        try:
            # The command line is a constant, so running it through the shell
            # does not expose any external input.
            p = subprocess.Popen(self._IFSTAT_CMD,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE, shell=True)
            stdout, _ = p.communicate()
            ret_code = p.returncode
            try:
                stdout = stdout.decode()  # bytes on Python 3
            except (UnicodeDecodeError, AttributeError):
                pass

            if ret_code != 0:
                values.append(KeyValue(key="\"ifstat -q -S 1 1\" Call Error",
                                       value=str(ret_code)))
                return DiagnosticStatus.ERROR, 'Call Error', values

            # First row: interface names. Third row: alternating in/out
            # values, one pair per interface. The second row is skipped
            # (presumably the column header).
            rows = stdout.split('\n')
            ifaces = rows[0].split()
            rates = rows[2].split()
            kb_in = rates[0::2]
            kb_out = rates[1::2]

            for i, iface in enumerate(ifaces):
                values.append(KeyValue(key=str(i), value="======================="))
                values.append(KeyValue(key='Interface Name', value=iface))

                state = self._append_attr(values, self.get_sys_net, iface,
                                          'operstate', key='State')
                if (state in ('down', 'dormant')
                        and self._WIRED_IFACE_RE.match(iface)):
                    level, net_msg = self._escalate(
                        level, net_msg, DiagnosticStatus.ERROR, 'Network Down')

                net_usage_in = self._append_traffic(values, 'Input Traffic', kb_in[i])
                net_usage_out = self._append_traffic(values, 'Output Traffic', kb_out[i])
                if net_usage_in > self._net_level_warn or \
                        net_usage_out > self._net_level_warn:
                    level, net_msg = self._escalate(
                        level, net_msg, DiagnosticStatus.WARN,
                        'High Network Usage (net_usage_in: {}, net_usage_out: {}, threshold: {})'.format(net_usage_in, net_usage_out, self._net_level_warn))

                self._append_attr(values, self.get_sys_net, iface, 'mtu', key='MTU')

                self._append_attr(values, self.get_sys_net_stat, iface, 'rx_bytes',
                                  key='Total received MB', convert=self._bytes_to_mb)
                self._append_attr(values, self.get_sys_net_stat, iface, 'tx_bytes',
                                  key='Total transmitted MB', convert=self._bytes_to_mb)
                for attr in self._RAW_STAT_KEYS:
                    self._append_attr(values, self.get_sys_net_stat, iface, attr)

                self._append_attr(values, self.get_sys_net, iface, 'addr_assign_type',
                                  convert=lambda v: self._ADDR_ASSIGN_TYPES.get(v, v))
                self._append_attr(values, self.get_sys_net, iface, 'address')
                self._append_attr(values, self.get_sys_net, iface, 'carrier',
                                  convert=lambda v: self._CARRIER_STATES.get(v, v))

                changes = self._append_attr(values, self.get_sys_net, iface,
                                            'carrier_changes')
                if changes is not None and int(changes) > self._carrier_changes_threshold:
                    level, net_msg = self._escalate(
                        level, net_msg, DiagnosticStatus.WARN,
                        'Network unstable (carrier_changes: {}, threshold: {})'.format(changes, self._carrier_changes_threshold))

                for attr in ('carrier_up_count', 'carrier_down_count',
                             'speed', 'tx_queue_len'):
                    self._append_attr(values, self.get_sys_net, iface, attr)
        except Exception as e:
            # Top-level boundary of the periodic check: report the failure
            # through diagnostics rather than killing the timer callback.
            rospy.logerr(traceback.format_exc())
            net_msg = 'Network Usage Check Error'
            values.append(KeyValue(key=net_msg, value=str(e)))
            level = DiagnosticStatus.ERROR
        return level, net_msg, values

    def check_usage(self, _):
        """Timer callback: run the network check and store its result.

        The argument supplied by the timer is ignored.
        """
        net_level, net_msg, net_vals = self.check_network()
        diag_level = max(DiagnosticStatus.OK, net_level)
        if net_level > DiagnosticStatus.OK:
            usage_msg = net_msg
        else:
            usage_msg = stat_dict[diag_level]
        with self._mutex:
            self._usage_stat.level = diag_level
            self._usage_stat.values = list(net_vals)
            self._usage_stat.message = usage_msg

    def publish_stats(self, _):
        """Timer callback: publish the stored status on ``/diagnostics``.

        The argument supplied by the timer is ignored.
        """
        with self._mutex:
            msg = DiagnosticArray()
            msg.header.stamp = rospy.get_rostime()
            msg.status.append(self._usage_stat)
            self._diag_pub.publish(msg)
283 
284 
if __name__ == '__main__':
    # Creating the monitor initialises the ROS node and starts its timers;
    # the reference is kept in a module-level name while spin() runs.
    net_node = NetMonitor()
    rospy.spin()
def publish_stats(self, _)
Definition: net_monitor.py:277
def get_sys_net(self, iface, sys)
Definition: net_monitor.py:91
def get_sys_net_stat(self, iface, sys)
Definition: net_monitor.py:85
def check_usage(self, _)
Definition: net_monitor.py:259
def read_sysinfo(self, path)
Definition: net_monitor.py:77


cob_monitoring
Author(s): Florian Weisshardt, Felix Messmer
autogenerated on Wed Apr 7 2021 03:03:11