16 #include <sys/types.h> 18 #include <sys/resource.h> 19 #include <sys/utsname.h> 20 #include <sys/prctl.h> 24 #include <boost/tokenizer.hpp> 25 #include <boost/range.hpp> 26 #include <boost/algorithm/string.hpp> 28 #include <fmt/format.h> 30 #define TASK_COMM_LEN 16 // from linux/sched.h 34 template<
typename... Args>
35 std::runtime_error
error(
const char* fmt,
const Args& ... args)
37 return std::runtime_error(fmt::format(fmt, args...));
45 explicit final_act(F
f) noexcept
46 : f_(std::move(f)), invoke_(
true) {}
48 final_act(final_act&& other) noexcept
49 : f_(std::move(other.f_)),
50 invoke_(other.invoke_)
52 other.invoke_ =
false;
55 final_act(
const final_act&) =
delete;
56 final_act& operator=(
const final_act&) =
delete;
68 inline final_act<F>
finally(
const F&
f) noexcept
70 return final_act<F>(
f);
74 inline final_act<F>
finally(F&& f) noexcept
76 return final_act<F>(std::forward<F>(
f));
79 static bool g_coreIsRelative =
true;
80 static bool g_coreIsRelative_valid =
false;
89 : m_launchNode(
std::move(launchNode))
90 , m_fdWatcher(
std::move(fdWatcher))
101 if(!g_coreIsRelative_valid)
103 char core_pattern[256];
104 int core_fd = open(
"/proc/sys/kernel/core_pattern", O_RDONLY | O_CLOEXEC);
107 log(
"could not open /proc/sys/kernel/core_pattern: {}", strerror(errno));
111 int bytes = read(core_fd, core_pattern,
sizeof(core_pattern)-1);
116 log(
"Could not read /proc/sys/kernel/core_pattern: {}", strerror(errno));
120 g_coreIsRelative = (core_pattern[0] !=
'/');
121 g_coreIsRelative_valid =
true;
129 kill(
m_pid, SIGKILL);
137 throw error(
"Could not find node '{}' in package '{}'",
143 std::vector<std::string> cmd =
m_launchNode->launchPrefix();
150 std::copy(args.begin(), args.end(), std::back_inserter(cmd));
158 cmd.push_back(map.first +
":=" + map.second);
178 strncpy(tmpfile,
"/tmp/rosmon-node-XXXXXX",
sizeof(tmpfile));
179 tmpfile[
sizeof(tmpfile)-1] = 0;
188 std::vector<char*> args;
189 auto argsCleaner =
finally([&args](){
190 for(
char*
arg : args)
200 if(openpty(&master, &slave,
nullptr,
nullptr,
nullptr) == -1)
201 throw error(
"Could not open pseudo terminal for child process: {}", strerror(errno));
205 args.push_back(strdup(
"rosrun"));
206 args.push_back(strdup(
"rosmon_core"));
207 args.push_back(strdup(
"_shim"));
209 args.push_back(strdup(
"--tty"));
210 args.push_back(strdup(fmt::format(
"{}", slave).c_str()));
214 args.push_back(strdup(
"--namespace"));
215 args.push_back(strdup(
m_launchNode->namespaceString().c_str()));
220 args.push_back(strdup(
"--env"));
221 args.push_back(strdup(fmt::format(
"{}={}", pair.first, pair.second).c_str()));
226 args.push_back(strdup(
"--coredump"));
230 args.push_back(strdup(
"--coredump-relative"));
235 args.push_back(strdup(
"--run"));
238 args.push_back(strdup(c.c_str()));
240 args.push_back(
nullptr);
246 throw error(
"Could not fork(): {}", strerror(errno));
252 if(execvp(
"rosrun", args.data()) != 0)
254 std::stringstream ss;
255 for(
const auto& part : cmd)
258 fmt::print(stderr,
"Could not execute '{}': {}\n", ss.str(), strerror(errno));
287 kill(-
m_pid, SIGINT);
296 log(
"required SIGKILL");
297 kill(
m_pid, SIGKILL);
318 kill(-
m_pid, SIGINT);
326 kill(-
m_pid, SIGKILL);
352 int bytes = read(
m_fd, buf,
sizeof(buf));
354 if(bytes == 0 || (bytes < 0 && errno == EIO))
360 if(waitpid(
m_pid, &status, 0) > 0)
363 if(errno == EINTR || errno == EAGAIN)
366 throw error(
"{}: Could not waitpid(): {}",
m_launchNode->name(), strerror(errno));
369 if(WIFEXITED(status))
371 log(
"{} exited with status {}",
name(), WEXITSTATUS(status));
372 ROS_INFO(
"rosmon: %s exited with status %d",
name().c_str(), WEXITSTATUS(status));
375 else if(WIFSIGNALED(status))
377 log(
"{} died from signal {}",
name(), WTERMSIG(status));
378 ROS_ERROR(
"rosmon: %s died from signal %d",
name().c_str(), WTERMSIG(status));
383 if(WCOREDUMP(status))
387 log(
"{} used launch-prefix, not collecting core dump as it is probably useless.",
name());
392 log(
"{} left a core dump",
name());
402 log(
"Could not remove process working directory '{}' after process exit: {}",
432 throw error(
"{}: Could not read: {}",
name(), strerror(errno));
434 for(
int i = 0; i < bytes; ++i)
450 template<
typename... Args>
456 static boost::iterator_range<std::string::const_iterator>
459 for(; begin != end && begin+1 != end; ++begin)
462 return {begin, begin+2};
470 char core_pattern[256];
471 int core_fd = open(
"/proc/sys/kernel/core_pattern", O_RDONLY | O_CLOEXEC);
474 log(
"could not open /proc/sys/kernel/core_pattern: {}", strerror(errno));
478 int bytes = read(core_fd, core_pattern,
sizeof(core_pattern)-1);
483 log(
"Could not read /proc/sys/kernel/core_pattern: {}", strerror(errno));
487 core_pattern[bytes-1] = 0;
489 if(core_pattern[0] ==
'|')
493 strncpy(core_pattern,
"core",
sizeof(core_pattern));
496 auto formatter = [&](boost::iterator_range<std::string::const_iterator> match) -> std::string {
497 char code = *(match.begin()+1);
504 return std::to_string(
m_pid);
506 return std::to_string(getuid());
508 return std::to_string(getgid());
510 return std::to_string(signal);
516 memset(&uts, 0,
sizeof(uts));
527 boost::replace_all(executable,
"/",
"!");
533 memset(&limit, 0,
sizeof(limit));
534 getrlimit(RLIMIT_CORE, &limit);
537 return std::to_string(limit.rlim_max);
544 std::string coreGlob = boost::find_format_all_copy(std::string(core_pattern),
corePatternFormatFinder, formatter);
547 if(coreGlob[0] !=
'/')
550 log(
"Determined pattern '{}'", coreGlob);
553 memset(&results, 0,
sizeof(results));
554 int ret = glob(coreGlob.c_str(), GLOB_NOSORT,
nullptr, &results);
556 if(ret != 0 || results.gl_pathc == 0)
558 log(
"Could not find a matching core file :-(");
563 if(results.gl_pathc > 1)
565 log(
"Found multiple matching core files :-(");
570 std::string coreFile = results.gl_pathv[0];
573 log(
"Found core file '{}'", coreFile);
575 std::stringstream ss;
577 ss <<
"gdb " <<
m_launchNode->executable() <<
" " << coreFile;
590 std::stringstream ss;
591 ss <<
"gdb -p " <<
m_pid;
595 if(!getenv(
"DISPLAY"))
597 log(
"No X11 available, run gdb yourself: {}", cmd);
601 char* envTerm = getenv(
"ROSMON_DEBUGGER_TERMINAL");
602 std::string term =
"xterm -e";
605 else if(getenv(
"KONSOLE_DBUS_SESSION"))
607 else if(getenv(
"VTE_VERSION"))
608 term =
"gnome-terminal -e";
610 log(
"Launching debugger: '{}'", cmd);
613 if(system((term +
" '" + cmd +
"' &").c_str()) != 0)
615 log(
"Could not launch debugger");
std::shared_ptr< const Node > ConstPtr
std::string m_debuggerCommand
boost::signals2::signal< void(std::string)> exitedSignal
Signalled whenever the process exits.
void start()
Start the node.
void forceExit()
Finish shutdown sequence.
void setPeriod(const WallDuration &period, bool reset=true)
void addMemory(uint64_t memoryBytes)
void stop(bool restart=false)
Stop the node.
boost::circular_buffer< char > m_rxBuffer
std::string m_processWorkingDirectory
bool coredumpAvailable() const
Is a core dump available from a crash under rosmon control?
static boost::iterator_range< std::string::const_iterator > corePatternFormatFinder(std::string::const_iterator begin, std::string::const_iterator end)
void addCPUTime(uint64_t userTime, uint64_t systemTime)
Idle (e.g. exited with code 0)
void gatherCoredump(int signal)
unsigned int m_restartCount
void launchDebugger()
Launch gdb interactively.
Waiting for automatic restart after crash.
WallTimer createWallTimer(WallDuration period, void(T::*callback)(const WallTimerEvent &), T *obj, bool oneshot=false, bool autostart=true) const
void log(const char *format, const Args &...args)
std::runtime_error error(const char *fmt, const Args &...args)
FDWatcher::Ptr m_fdWatcher
int pid() const
Node PID.
ros::WallTimer m_stopCheckTimer
void shutdown()
Start shutdown sequence.
std::string arg(const std::string &name, const ParseContext &context)
bool running() const
Is the node running?
std::vector< std::string > composeCommand() const
void endStatUpdate(uint64_t elapsedTime)
launch::Node::ConstPtr m_launchNode
Crashed (i.e. exited with code != 0)
State state() const
Get process state.
boost::signals2::signal< void(std::string, std::string)> logMessageSignal
Logging signal.
bool m_processWorkingDirectoryCreated
void restart()
Restart the node.
std::string name() const
Node name.
ros::WallTimer m_restartTimer