mirror of
https://github.com/ceph/ceph
synced 2025-03-06 08:20:12 +00:00
mds,qa: exit instead of respawn under valgrind
valgrind can't handle execve of /proc/self/exe:

    2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== execve(0x18546740(/proc/self/exe), 0x18546670, 0x133ef310) failed, errno 2
    2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== EXEC FAILED: I can't recover from execve() failing, so I'm dying.
    2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== Add more stringent tests in PRE(sys_execve), or work out how to recover.

So configure the MDS to just exit so it can be restarted by QA infra (the daemon watchdog).

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
parent
3e5e03d4d2
commit
5faf0ee0f3
@ -13,6 +13,8 @@ overrides:
|
||||
osd heartbeat grace: 60
|
||||
mds heartbeat grace: 60
|
||||
mds beacon grace: 60
|
||||
mds:
|
||||
mds valgrind exit: true
|
||||
mon:
|
||||
mon osd crush smoke test: false
|
||||
osd:
|
||||
@ -20,6 +22,8 @@ overrides:
|
||||
valgrind:
|
||||
mon: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
|
||||
mds: [--tool=memcheck]
|
||||
watchdog:
|
||||
daemon_restart: normal
|
||||
ceph-fuse:
|
||||
client.0:
|
||||
valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
|
||||
|
@ -17,14 +17,19 @@ class DaemonWatchdog(Greenlet):
|
||||
SIGTERM to all daemons. The duration of an extended failure is configurable
|
||||
with watchdog_daemon_timeout.
|
||||
|
||||
watchdog_daemon_timeout [default: 300]: number of seconds a daemon
|
||||
is allowed to be failed before the watchdog will bark.
|
||||
ceph:
|
||||
watchdog:
|
||||
daemon_restart [default: no]: restart daemon if "normal" exit (status==0).
|
||||
|
||||
daemon_timeout [default: 300]: number of seconds a daemon
|
||||
is allowed to be failed before the
|
||||
watchdog will bark.
|
||||
"""
|
||||
|
||||
def __init__(self, ctx, config, thrashers):
|
||||
super(DaemonWatchdog, self).__init__()
|
||||
self.config = ctx.config.get('watchdog', {})
|
||||
self.ctx = ctx
|
||||
self.config = config
|
||||
self.e = None
|
||||
self.logger = log.getChild('daemon_watchdog')
|
||||
self.cluster = config.get('cluster', 'ceph')
|
||||
@ -70,7 +75,8 @@ class DaemonWatchdog(Greenlet):
|
||||
|
||||
def watch(self):
|
||||
self.log("watchdog starting")
|
||||
daemon_timeout = int(self.config.get('watchdog_daemon_timeout', 300))
|
||||
daemon_timeout = int(self.config.get('daemon_timeout', 300))
|
||||
daemon_restart = self.config.get('daemon_restart', False)
|
||||
daemon_failure_time = {}
|
||||
while not self.stopping.is_set():
|
||||
bark = False
|
||||
@ -97,6 +103,9 @@ class DaemonWatchdog(Greenlet):
|
||||
self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
|
||||
if delta > daemon_timeout:
|
||||
bark = True
|
||||
if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
|
||||
self.log(f"attempting to restart daemon {name}")
|
||||
daemon.restart()
|
||||
|
||||
# If a daemon is no longer failed, remove it from tracking:
|
||||
for name in list(daemon_failure_time.keys()):
|
||||
|
@ -8054,6 +8054,10 @@ std::vector<Option> get_mds_options() {
|
||||
.set_flag(Option::FLAG_RUNTIME)
|
||||
.set_description("set the maximum length of alternate names for dentries"),
|
||||
|
||||
Option("mds_valgrind_exit", Option::TYPE_BOOL, Option::LEVEL_DEV)
|
||||
.set_default(false)
|
||||
.set_flag(Option::FLAG_RUNTIME),
|
||||
|
||||
Option("mds_numa_node", Option::TYPE_INT, Option::LEVEL_ADVANCED)
|
||||
.set_default(-1)
|
||||
.set_flag(Option::FLAG_STARTUP)
|
||||
|
@ -845,6 +845,11 @@ void MDSDaemon::respawn()
|
||||
* be removed from the MDSMap leading to respawn. */
|
||||
g_ceph_context->_log->dump_recent();
|
||||
|
||||
/* valgrind can't handle execve; just exit and let QA infra restart */
|
||||
if (g_conf().get_val<bool>("mds_valgrind_exit")) {
|
||||
_exit(0);
|
||||
}
|
||||
|
||||
char *new_argv[orig_argc+1];
|
||||
dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
|
||||
for (int i=0; i<orig_argc; i++) {
|
||||
|
Loading…
Reference in New Issue
Block a user