Improve error detection during server termination
We added default conditions, while working on #349 and #252:
Segmentation fault:
if self.process.output_beautifier.stderr:find('Segmentation fault') then
error(
('Segmentation fault during process termination (alias: %s, workdir: %s, pid: %d)\n%s')
...
)
end
Memory leak from sanitizer:
if self.process.output_beautifier.stderr:find('LeakSanitizer') then
error(
('Memory leak during process termination (alias: %s, workdir: %s, pid: %s)\n%s')
...
)
end
Obviously, if we add error handling they will extend the server:stop() method. It is necessary to think about some kind of uniform approach.
Error codes in Tarantool source code.
It would be nice to catch an exit code/a signal number. I've experimented around a bit.
It seems, we can't just use waitpid(), because libev reaps childs on its own using signalfd.
OTOH, we can handle SIGCHLD using libev from tarantool sources.
diff --git a/src/lua/popen.c b/src/lua/popen.c
index 942f99d083..9fdd4c2e7a 100644
--- a/src/lua/popen.c
+++ b/src/lua/popen.c
@@ -37,6 +37,8 @@
#include <small/region.h>
+#include "tarantool_ev.h"
+
#include "diag.h"
#include "core/popen.h"
#include "core/fiber.h"
@@ -2410,6 +2412,59 @@ lbox_popen_gc(struct lua_State *L)
return 0;
}
+struct ww {
+ pid_t pid;
+ bool done;
+ int wstatus;
+ ev_child ev_sigchld;
+};
+
+static void
+sigchld_handler(EV_P_ ev_child *w, int revents)
+{
+ (void)revents;
+
+ ev_child_stop(EV_A_ w);
+
+ struct ww *ww = container_of(w, struct ww, ev_sigchld);
+ assert(w->rpid == ww->pid);
+ ww->wstatus = w->rstatus;
+ ww->done = true;
+}
+
+static int
+lbox_popen_w(struct lua_State *L)
+{
+ struct ww ww;
+
+ ww.pid = lua_tointeger(L, 1);
+ ww.done = false;
+ ww.wstatus = 0;
+
+ ev_child_init(&ww.ev_sigchld, sigchld_handler, ww.pid, 0);
+ ev_child_start(EV_DEFAULT_ &ww.ev_sigchld);
+
+ while (!ww.done) {
+ fiber_sleep(0.1);
+ }
+
+ lua_createtable(L, 0, 2);
+
+ if (WIFEXITED(ww.wstatus)) {
+ lua_pushliteral(L, "exited");
+ lua_setfield(L, -2, "state");
+ lua_pushinteger(L, WEXITSTATUS(ww.wstatus));
+ lua_setfield(L, -2, "exit_code");
+ } else {
+ lua_pushliteral(L, "signaled");
+ lua_setfield(L, -2, "state");
+ lua_pushinteger(L, WTERMSIG(ww.wstatus));
+ lua_setfield(L, -2, "signo");
+ }
+
+ return 1;
+}
+
/* }}} */
/* {{{ Module initialization */
@@ -2453,6 +2508,7 @@ tarantool_lua_popen_init(struct lua_State *L)
static const struct luaL_Reg popen_methods[] = {
{"new", lbox_popen_new, },
{"shell", lbox_popen_shell, },
+ {"w", lbox_popen_w, },
{NULL, NULL},
};
luaT_newmodule(L, "popen", popen_methods);
And use this function from luatest.
diff --git a/luatest/process.lua b/luatest/process.lua
index 9d447b3..2084dec 100644
--- a/luatest/process.lua
+++ b/luatest/process.lua
@@ -4,6 +4,7 @@ local fun = require('fun')
local ffi = require('ffi')
local fio = require('fio')
local log = require('log')
+local fiber = require('fiber')
local Class = require('luatest.class')
local OutputBeautifier = require('luatest.output_beautifier')
@@ -55,7 +56,15 @@ function Process:start(path, args, env, options)
if output_beautifier then
output_beautifier:enable({track_pid = pid})
end
- return self:from({pid = pid, ignore_gc = options.ignore_gc, output_beautifier = output_beautifier})
+ local x = self:from({pid = pid, ignore_gc = options.ignore_gc, output_beautifier = output_beautifier})
+ x._wait_cond = fiber.cond()
+ x._wait_cond_signaled = false
+ fiber.create(function()
+ x.wstatus = require('popen').w(pid)
+ x._wait_cond_signaled = true
+ x._wait_cond:broadcast()
+ end)
+ return x
end
-- luacov: disable
if options.chdir then
@@ -91,6 +100,13 @@ function Process.mt:is_alive()
return self.pid ~= nil and self.class.is_pid_alive(self.pid)
end
+function Process.mt:wait()
+ if not self._wait_cond_signaled then
+ self._wait_cond:wait()
+ end
+ return self.wstatus
+end
+
function Process.kill_pid(pid, signal, options)
checks('number|string', '?number|string', {quiet = '?boolean'})
-- Signal values are platform-dependent so we can not use ffi here
diff --git a/luatest/server.lua b/luatest/server.lua
index 5506871..d5abcb5 100644
--- a/luatest/server.lua
+++ b/luatest/server.lua
@@ -440,15 +440,15 @@ function Server:stop()
self.net_box = nil
end
- if self.process and self.process:is_alive() then
- self.process:kill()
- local ok, err = pcall(wait_for_condition, 'process is terminated', self, function()
- return not self.process:is_alive()
- end)
- if not ok and not err:find('Process is terminated when waiting for') then
- error(err)
+ if self.process then
+ if self.process:is_alive() then
+ self.process:kill()
end
- if self.process.output_beautifier.stderr:find('Segmentation fault') then
+
+ local wstatus = self.process:wait()
+ local ok = wstatus.state == 'exited' and wstatus.exit_code == 0
+
+ if not ok and self.process.output_beautifier.stderr:find('Segmentation fault') then
error(
('Segmentation fault during process termination (alias: %s, workdir: %s, pid: %d)\n%s')
:format(
@@ -459,6 +459,9 @@ function Server:stop()
)
)
end
+ if not ok then
+ error(('wstatus: %s'):format(json.encode(wstatus)))
+ end
log.debug('Killed server process PID ' .. self.process.pid)
self.process = nil
end
(BTW, the built-in popen module performs all these things for processes started using popen.new() or popen.shell().)
It is interesting that I see the exit code 0, when tarantool is terminated by SIGTERM. It is a bit surprising, but I guess that it is implemented deliberately.