node_monitor.cpp
Go to the documentation of this file.
1 // Monitors a single node process
2 // Author: Max Schwarz <max.schwarz@uni-bonn.de>
3 
4 #include "node_monitor.h"
5 
6 #include <cerrno>
7 #include <csignal>
8 #include <cstdarg>
9 #include <cstdio>
10 #include <cstring>
11 #include <sstream>
12 
13 #include <fcntl.h>
14 #include <glob.h>
15 #include <pty.h>
16 #include <sys/types.h>
17 #include <sys/wait.h>
18 #include <sys/resource.h>
19 #include <sys/utsname.h>
20 #include <sys/prctl.h>
21 #include <unistd.h>
22 #include <wordexp.h>
23 
24 #include <boost/tokenizer.hpp>
25 #include <boost/range.hpp>
26 #include <boost/algorithm/string.hpp>
27 
28 #include <fmt/format.h>
29 
30 #define TASK_COMM_LEN 16 // from linux/sched.h
31 
32 namespace
33 {
34  template<typename... Args>
35  std::runtime_error error(const char* fmt, const Args& ... args)
36  {
37  return std::runtime_error(fmt::format(fmt, args...));
38  }
39 
40  // finally() from C++ core guidelines
41  template <class F>
42  class final_act
43  {
44  public:
45  explicit final_act(F f) noexcept
46  : f_(std::move(f)), invoke_(true) {}
47 
48  final_act(final_act&& other) noexcept
49  : f_(std::move(other.f_)),
50  invoke_(other.invoke_)
51  {
52  other.invoke_ = false;
53  }
54 
55  final_act(const final_act&) = delete;
56  final_act& operator=(const final_act&) = delete;
57 
58  ~final_act() noexcept
59  {
60  if (invoke_) f_();
61  }
62 
63  private:
64  F f_;
65  bool invoke_;
66  };
67  template <class F>
68  inline final_act<F> finally(const F& f) noexcept
69  {
70  return final_act<F>(f);
71  }
72 
73  template <class F>
74  inline final_act<F> finally(F&& f) noexcept
75  {
76  return final_act<F>(std::forward<F>(f));
77  }
78 
79  static bool g_coreIsRelative = true;
80  static bool g_coreIsRelative_valid = false;
81 }
82 
83 namespace rosmon
84 {
85 namespace monitor
86 {
87 
88 NodeMonitor::NodeMonitor(launch::Node::ConstPtr launchNode, FDWatcher::Ptr fdWatcher, ros::NodeHandle& nh)
89  : m_launchNode(std::move(launchNode))
90  , m_fdWatcher(std::move(fdWatcher))
91  , m_rxBuffer(4096)
92  , m_exitCode(0)
93  , m_command(CMD_STOP) // we start in stopped state
94  , m_restarting(false)
95 {
96  m_restartTimer = nh.createWallTimer(ros::WallDuration(1.0), boost::bind(&NodeMonitor::start, this), false, false);
98 
99  m_processWorkingDirectory = m_launchNode->workingDirectory();
100 
101  if(!g_coreIsRelative_valid)
102  {
103  char core_pattern[256];
104  int core_fd = open("/proc/sys/kernel/core_pattern", O_RDONLY | O_CLOEXEC);
105  if(core_fd < 0)
106  {
107  log("could not open /proc/sys/kernel/core_pattern: {}", strerror(errno));
108  return;
109  }
110 
111  int bytes = read(core_fd, core_pattern, sizeof(core_pattern)-1);
112  close(core_fd);
113 
114  if(bytes < 1)
115  {
116  log("Could not read /proc/sys/kernel/core_pattern: {}", strerror(errno));
117  return;
118  }
119 
120  g_coreIsRelative = (core_pattern[0] != '/');
121  g_coreIsRelative_valid = true;
122  }
123 }
124 
126 {
127  if(m_pid != -1)
128  {
129  kill(m_pid, SIGKILL);
130  }
131 }
132 
133 std::vector<std::string> NodeMonitor::composeCommand() const
134 {
135  if(m_launchNode->executable().empty())
136  {
137  throw error("Could not find node '{}' in package '{}'",
138  m_launchNode->type(), m_launchNode->package()
139  );
140  }
141 
142  // Start with the launch prefix...
143  std::vector<std::string> cmd = m_launchNode->launchPrefix();
144 
145  // add executable file
146  cmd.push_back(m_launchNode->executable());
147 
148  // add extra arguments from 'args'
149  auto args = m_launchNode->extraArguments();
150  std::copy(args.begin(), args.end(), std::back_inserter(cmd));
151 
152  // add parameter for node name
153  cmd.push_back("__name:=" + m_launchNode->name());
154 
155  // and finally add remappings.
156  for(auto map : m_launchNode->remappings())
157  {
158  cmd.push_back(map.first + ":=" + map.second);
159  }
160 
161  return cmd;
162 }
163 
165 {
166  m_command = CMD_RUN;
167 
170  m_restarting = false;
171 
172  if(running())
173  return;
174 
175  if(m_launchNode->coredumpsEnabled() && g_coreIsRelative && m_processWorkingDirectory.empty())
176  {
177  char tmpfile[256];
178  strncpy(tmpfile, "/tmp/rosmon-node-XXXXXX", sizeof(tmpfile));
179  tmpfile[sizeof(tmpfile)-1] = 0;
180  m_processWorkingDirectory = mkdtemp(tmpfile);
182  }
183 
184  ROS_INFO("rosmon: starting '%s'", m_launchNode->name().c_str());
185 
186  std::vector<std::string> cmd = composeCommand();
187 
188  std::vector<char*> args;
189  auto argsCleaner = finally([&args](){
190  for(char* arg : args)
191  free(arg);
192  });
193 
194  // Open pseudo-terminal
195  // NOTE: We are not using forkpty() here, as it is probably not safe in
196  // a multi-threaded process (see
197  // https://www.linuxprogrammingblog.com/threads-and-fork-think-twice-before-using-them)
198  int master, slave;
199 
200  if(openpty(&master, &slave, nullptr, nullptr, nullptr) == -1)
201  throw error("Could not open pseudo terminal for child process: {}", strerror(errno));
202 
203  // Compose args
204  {
205  args.push_back(strdup("rosrun"));
206  args.push_back(strdup("rosmon_core"));
207  args.push_back(strdup("_shim"));
208 
209  args.push_back(strdup("--tty"));
210  args.push_back(strdup(fmt::format("{}", slave).c_str()));
211 
212  if(!m_launchNode->namespaceString().empty())
213  {
214  args.push_back(strdup("--namespace"));
215  args.push_back(strdup(m_launchNode->namespaceString().c_str()));
216  }
217 
218  for(auto& pair : m_launchNode->extraEnvironment())
219  {
220  args.push_back(strdup("--env"));
221  args.push_back(strdup(fmt::format("{}={}", pair.first, pair.second).c_str()));
222  }
223 
224  if(m_launchNode->coredumpsEnabled())
225  {
226  args.push_back(strdup("--coredump"));
227 
228  if(g_coreIsRelative)
229  {
230  args.push_back(strdup("--coredump-relative"));
231  args.push_back(strdup(m_processWorkingDirectory.c_str()));
232  }
233  }
234 
235  args.push_back(strdup("--run"));
236 
237  for(auto& c : cmd)
238  args.push_back(strdup(c.c_str()));
239 
240  args.push_back(nullptr);
241  }
242 
243  // Fork!
244  int pid = fork();
245  if(pid < 0)
246  throw error("Could not fork(): {}", strerror(errno));
247 
248  if(pid == 0)
249  {
250  close(master);
251 
252  if(execvp("rosrun", args.data()) != 0)
253  {
254  std::stringstream ss;
255  for(const auto& part : cmd)
256  ss << part << " ";
257 
258  fmt::print(stderr, "Could not execute '{}': {}\n", ss.str(), strerror(errno));
259  }
260 
261  // We should not end up here...
262  std::abort();
263  }
264 
265  // Parent
266  close(slave);
267 
268  m_fd = master;
269  m_pid = pid;
270  m_fdWatcher->registerFD(m_fd, boost::bind(&NodeMonitor::communicate, this));
271 }
272 
274 {
275  if(restart)
277  else
279 
282 
283  if(!running())
284  return;
285 
286  // kill(-pid) sends the signal to all processes in the process group
287  kill(-m_pid, SIGINT);
288 
290 }
291 
293 {
294  if(running())
295  {
296  log("required SIGKILL");
297  kill(m_pid, SIGKILL);
298  }
299 
301 }
302 
304 {
307 
308  if(running())
309  stop(true);
310  else
311  start();
312 }
313 
315 {
316  if(m_pid != -1)
317  {
318  kill(-m_pid, SIGINT);
319  }
320 }
321 
323 {
324  if(m_pid != -1)
325  {
326  kill(-m_pid, SIGKILL);
327  }
328 }
329 
331 {
332  return m_pid != -1;
333 }
334 
336 {
337  if(running())
338  return STATE_RUNNING;
339 
340  if(m_restarting)
341  return STATE_WAITING;
342 
343  if(m_exitCode == 0)
344  return STATE_IDLE;
345 
346  return STATE_CRASHED;
347 }
348 
350 {
351  char buf[1024];
352  int bytes = read(m_fd, buf, sizeof(buf));
353 
354  if(bytes == 0 || (bytes < 0 && errno == EIO))
355  {
356  int status;
357 
358  while(true)
359  {
360  if(waitpid(m_pid, &status, 0) > 0)
361  break;
362 
363  if(errno == EINTR || errno == EAGAIN)
364  continue;
365 
366  throw error("{}: Could not waitpid(): {}", m_launchNode->name(), strerror(errno));
367  }
368 
369  if(WIFEXITED(status))
370  {
371  log("{} exited with status {}", name(), WEXITSTATUS(status));
372  ROS_INFO("rosmon: %s exited with status %d", name().c_str(), WEXITSTATUS(status));
373  m_exitCode = WEXITSTATUS(status);
374  }
375  else if(WIFSIGNALED(status))
376  {
377  log("{} died from signal {}", name(), WTERMSIG(status));
378  ROS_ERROR("rosmon: %s died from signal %d", name().c_str(), WTERMSIG(status));
379  m_exitCode = 255;
380  }
381 
382 #ifdef WCOREDUMP
383  if(WCOREDUMP(status))
384  {
385  if(!m_launchNode->launchPrefix().empty())
386  {
387  log("{} used launch-prefix, not collecting core dump as it is probably useless.", name());
388  }
389  else
390  {
391  // We have a chance to find the core dump...
392  log("{} left a core dump", name());
393  gatherCoredump(WTERMSIG(status));
394  }
395  }
396 #endif
397 
399  {
400  if(rmdir(m_processWorkingDirectory.c_str()) != 0)
401  {
402  log("Could not remove process working directory '{}' after process exit: {}",
403  m_processWorkingDirectory, strerror(errno)
404  );
405  }
407  }
408 
409  m_pid = -1;
410  m_fdWatcher->removeFD(m_fd);
411  close(m_fd);
412  m_fd = -1;
413 
414  if(m_command == CMD_RESTART || (m_command == CMD_RUN && m_launchNode->respawn()))
415  {
416  if(m_command == CMD_RESTART)
418  else
419  m_restartTimer.setPeriod(m_launchNode->respawnDelay());
420 
421  m_restartCount++;
423  m_restarting = true;
424  }
425 
426  exitedSignal(name());
427 
428  return;
429  }
430 
431  if(bytes < 0)
432  throw error("{}: Could not read: {}", name(), strerror(errno));
433 
434  for(int i = 0; i < bytes; ++i)
435  {
436  m_rxBuffer.push_back(buf[i]);
437  if(buf[i] == '\n')
438  {
439  m_rxBuffer.push_back(0);
440  m_rxBuffer.linearize();
441 
442  auto one = m_rxBuffer.array_one();
443  logMessageSignal(name(), one.first);
444 
445  m_rxBuffer.clear();
446  }
447  }
448 }
449 
450 template<typename... Args>
451 void NodeMonitor::log(const char* format, const Args& ... args)
452 {
453  logMessageSignal(name(), fmt::format(format, args...));
454 }
455 
456 static boost::iterator_range<std::string::const_iterator>
457 corePatternFormatFinder(std::string::const_iterator begin, std::string::const_iterator end)
458 {
459  for(; begin != end && begin+1 != end; ++begin)
460  {
461  if(*begin == '%')
462  return {begin, begin+2};
463  }
464 
465  return {end, end};
466 }
467 
469 {
470  char core_pattern[256];
471  int core_fd = open("/proc/sys/kernel/core_pattern", O_RDONLY | O_CLOEXEC);
472  if(core_fd < 0)
473  {
474  log("could not open /proc/sys/kernel/core_pattern: {}", strerror(errno));
475  return;
476  }
477 
478  int bytes = read(core_fd, core_pattern, sizeof(core_pattern)-1);
479  close(core_fd);
480 
481  if(bytes < 1)
482  {
483  log("Could not read /proc/sys/kernel/core_pattern: {}", strerror(errno));
484  return;
485  }
486 
487  core_pattern[bytes-1] = 0; // Strip off the newline at the end
488 
489  if(core_pattern[0] == '|')
490  {
491  // This may be apport, but apport still writes a "core" file if the
492  // limit is set appropriately.
493  strncpy(core_pattern, "core", sizeof(core_pattern));
494  }
495 
496  auto formatter = [&](boost::iterator_range<std::string::const_iterator> match) -> std::string {
497  char code = *(match.begin()+1);
498 
499  switch(code)
500  {
501  case '%':
502  return "%";
503  case 'p':
504  return std::to_string(m_pid);
505  case 'u':
506  return std::to_string(getuid());
507  case 'g':
508  return std::to_string(getgid());
509  case 's':
510  return std::to_string(signal);
511  case 't':
512  return "*"; // No chance
513  case 'h':
514  {
515  utsname uts;
516  memset(&uts, 0, sizeof(uts));
517  if(uname(&uts) != 0)
518  return "*";
519 
520  return uts.nodename;
521  }
522  case 'e':
523  return m_launchNode->type().substr(0, TASK_COMM_LEN-1);
524  case 'E':
525  {
526  std::string executable = m_launchNode->executable();
527  boost::replace_all(executable, "/", "!");
528  return executable;
529  }
530  case 'c':
531  {
532  rlimit limit;
533  memset(&limit, 0, sizeof(limit));
534  getrlimit(RLIMIT_CORE, &limit);
535 
536  // core limit is set to the maximum above
537  return std::to_string(limit.rlim_max);
538  }
539  default:
540  return "*";
541  }
542  };
543 
544  std::string coreGlob = boost::find_format_all_copy(std::string(core_pattern), corePatternFormatFinder, formatter);
545 
546  // If the pattern is not absolute, it is relative to our node's cwd.
547  if(coreGlob[0] != '/')
548  coreGlob = m_processWorkingDirectory + "/" + coreGlob;
549 
550  log("Determined pattern '{}'", coreGlob);
551 
552  glob_t results;
553  memset(&results, 0, sizeof(results));
554  int ret = glob(coreGlob.c_str(), GLOB_NOSORT, nullptr, &results);
555 
556  if(ret != 0 || results.gl_pathc == 0)
557  {
558  log("Could not find a matching core file :-(");
559  globfree(&results);
560  return;
561  }
562 
563  if(results.gl_pathc > 1)
564  {
565  log("Found multiple matching core files :-(");
566  globfree(&results);
567  return;
568  }
569 
570  std::string coreFile = results.gl_pathv[0];
571  globfree(&results);
572 
573  log("Found core file '{}'", coreFile);
574 
575  std::stringstream ss;
576 
577  ss << "gdb " << m_launchNode->executable() << " " << coreFile;
578 
579  m_debuggerCommand = ss.str();
580 }
581 
583 {
584  std::string cmd;
585 
586  if(coredumpAvailable())
587  cmd = m_debuggerCommand;
588  else
589  {
590  std::stringstream ss;
591  ss << "gdb -p " << m_pid;
592  cmd = ss.str();
593  }
594 
595  if(!getenv("DISPLAY"))
596  {
597  log("No X11 available, run gdb yourself: {}", cmd);
598  }
599  else
600  {
601  char* envTerm = getenv("ROSMON_DEBUGGER_TERMINAL");
602  std::string term = "xterm -e";
603  if(envTerm)
604  term = envTerm;
605  else if(getenv("KONSOLE_DBUS_SESSION"))
606  term = "konsole -e";
607  else if(getenv("VTE_VERSION"))
608  term = "gnome-terminal -e";
609 
610  log("Launching debugger: '{}'", cmd);
611 
612  // system() is not particularly elegant here, but we trust our cmd.
613  if(system((term + " '" + cmd + "' &").c_str()) != 0)
614  {
615  log("Could not launch debugger");
616  }
617  }
618 }
619 
620 
622 {
623  m_userTime = 0;
624  m_systemTime = 0;
625  m_memory = 0;
626 }
627 
628 void NodeMonitor::addCPUTime(uint64_t userTime, uint64_t systemTime)
629 {
630  m_userTime += userTime;
631  m_systemTime += systemTime;
632 }
633 
634 void NodeMonitor::addMemory(uint64_t memoryBytes)
635 {
636  m_memory += memoryBytes;
637 }
638 
639 void NodeMonitor::endStatUpdate(uint64_t elapsedTime)
640 {
641  m_userLoad = float(m_userTime) / elapsedTime;
642  m_systemLoad = float(m_systemTime) / elapsedTime;
643 }
644 
645 }
646 }
std::shared_ptr< const Node > ConstPtr
Definition: node.h:24
boost::signals2::signal< void(std::string)> exitedSignal
Signalled whenever the process exits.
Definition: node_monitor.h:183
void start()
Start the node.
void forceExit()
Finish shutdown sequence.
void setPeriod(const WallDuration &period, bool reset=true)
void addMemory(uint64_t memoryBytes)
void stop(bool restart=false)
Stop the node.
boost::circular_buffer< char > m_rxBuffer
Definition: node_monitor.h:205
std::string m_processWorkingDirectory
Definition: node_monitor.h:228
bool coredumpAvailable() const
Is a core dump available from a crash under rosmon control?
Definition: node_monitor.h:88
static boost::iterator_range< std::string::const_iterator > corePatternFormatFinder(std::string::const_iterator begin, std::string::const_iterator end)
void addCPUTime(uint64_t userTime, uint64_t systemTime)
Idle (e.g. exited with code 0)
Definition: node_monitor.h:33
float f(float t)
Definition: husl.c:142
void gatherCoredump(int signal)
#define ROS_INFO(...)
void launchDebugger()
Launch gdb interactively.
Waiting for automatic restart after crash.
Definition: node_monitor.h:36
WallTimer createWallTimer(WallDuration period, void(T::*callback)(const WallTimerEvent &), T *obj, bool oneshot=false, bool autostart=true) const
void log(const char *format, const Args &...args)
std::runtime_error error(const char *fmt, const Args &...args)
Definition: fd_watcher.cpp:15
#define TASK_COMM_LEN
int pid() const
Node PID.
Definition: node_monitor.h:168
ros::WallTimer m_stopCheckTimer
Definition: node_monitor.h:211
void shutdown()
Start shutdown sequence.
std::string arg(const std::string &name, const ParseContext &context)
bool running() const
Is the node running?
std::vector< std::string > composeCommand() const
void endStatUpdate(uint64_t elapsedTime)
launch::Node::ConstPtr m_launchNode
Definition: node_monitor.h:201
Crashed (i.e. exited with code != 0)
Definition: node_monitor.h:35
State state() const
Get process state.
boost::signals2::signal< void(std::string, std::string)> logMessageSignal
Logging signal.
Definition: node_monitor.h:180
void restart()
Restart the node.
std::string name() const
Node name.
Definition: node_monitor.h:160
#define ROS_ERROR(...)


rosmon_core
Author(s): Max Schwarz
autogenerated on Wed Jul 10 2019 03:10:12