ceph/qa/tasks/daemonwatchdog.py

import logging
import signal
import time

from gevent import sleep
from gevent.greenlet import Greenlet
from gevent.event import Event

log = logging.getLogger(__name__)

class DaemonWatchdog(Greenlet):
    """
    DaemonWatchdog::

    Watch Ceph daemons for failures. If an extended failure is detected (i.e.
    not intentional), then the watchdog will unmount file systems and send
    SIGTERM to all daemons. The duration of an extended failure is configurable
    with the daemon_timeout option described below.

    ceph:
      watchdog:
        daemon_restart [default: no]: set to "normal" to restart a daemon
                                      whose process exited with status 0.

        daemon_timeout [default: 300]: number of seconds a daemon
                                              is allowed to be failed before the
                                              watchdog will bark.
    """

    # Roles polled for failures by watch(), in polling order.
    _WATCH_ROLES = ('osd', 'mon', 'mds', 'rgw', 'mgr')
    # Roles signalled by bark(), in the order SIGTERM is delivered.
    _BARK_ROLES = ('osd', 'mds', 'mon', 'rgw', 'mgr')
    # Seconds slept between two polls of the daemons.
    _POLL_INTERVAL = 5

    def __init__(self, ctx, config, thrashers):
        """
        :param ctx: test context; this class reads ctx.config, ctx.daemons
                    and ctx.mounts from it.
        :param config: task configuration; only the 'cluster' key is read
                       (default 'ceph').
        :param thrashers: iterable of objects exposing ``exception`` and
                          ``name``; a non-None ``exception`` triggers a bark.
        """
        super().__init__()
        # Watchdog settings come from the job-level config, not from `config`.
        self.config = ctx.config.get('watchdog', {})
        self.ctx = ctx
        # Exception raised by watch(), if any (set by _run()).
        self.e = None
        self.logger = log.getChild('daemon_watchdog')
        self.cluster = config.get('cluster', 'ceph')
        self.name = 'watchdog'
        self.stopping = Event()
        self.thrashers = thrashers

    def _run(self):
        """Greenlet body: run watch() and record, rather than raise, errors."""
        try:
            self.watch()
        except Exception as e:
            # See _run exception comment for MDSThrasher
            self.e = e
            self.logger.exception("exception:")
            # allow successful completion so gevent doesn't see an exception...

    def log(self, x):
        """Write data to logger"""
        self.logger.info(x)

    def stop(self):
        """Request that watch() exit the next time it checks the stop flag."""
        self.stopping.set()

    def _select_daemons(self, roles, finished):
        """
        Collect daemons of this cluster, role by role.

        :param roles: iterable of role names, visited in the given order.
        :param finished: True to select daemons whose process has finished,
                         False to select those whose process has not.
        :returns: list of daemons for which ``running()`` is true and whose
                  ``proc.finished`` truthiness equals ``finished``.
        """
        selected = []
        for role in roles:
            candidates = self.ctx.daemons.iter_daemons_of_role(
                role, cluster=self.cluster)
            selected.extend(
                daemon for daemon in candidates
                if daemon.running() and bool(daemon.proc.finished) == finished
            )
        return selected

    def bark(self):
        """
        Force-unmount every mount, then send SIGTERM to every daemon whose
        process has not finished.

        Both steps are best effort: an ordinary exception from one mount or
        daemon is logged and the remaining ones are still processed.
        """
        self.log("BARK! unmounting mounts and killing all daemons")
        for mount in self.ctx.mounts.values():
            try:
                mount.umount_wait(force=True)
            except Exception:
                # Not a bare except: BaseException (e.g. KeyboardInterrupt)
                # must still be able to interrupt the watchdog.
                self.logger.exception("ignoring exception:")

        for daemon in self._select_daemons(self._BARK_ROLES, finished=False):
            try:
                daemon.signal(signal.SIGTERM)
            except Exception:
                self.logger.exception("ignoring exception:")

    def watch(self):
        """
        Poll daemons and thrashers until stopped or until a bark is needed.

        A bark happens when a daemon has been seen failed for more than
        daemon_timeout seconds, or when any thrasher reports an exception.
        After barking the method returns without logging "watchdog finished".
        """
        self.log("watchdog starting")
        daemon_timeout = int(self.config.get('daemon_timeout', 300))
        daemon_restart = self.config.get('daemon_restart', False)
        # name -> (daemon, time the failure was first observed)
        daemon_failure_time = {}
        while not self.stopping.is_set():
            bark = False
            # Monotonic clock: only differences are used, and they must not
            # be distorted by wall-clock adjustments.
            now = time.monotonic()

            daemon_failures = self._select_daemons(
                self._WATCH_ROLES, finished=True)

            failed_names = set()
            for daemon in daemon_failures:
                name = daemon.role + '.' + daemon.id_
                failed_names.add(name)
                dt = daemon_failure_time.setdefault(name, (daemon, now))
                assert dt[0] is daemon
                delta = now - dt[1]
                self.log(f"daemon {name} is failed for ~{delta:.0f}s")
                if delta > daemon_timeout:
                    bark = True
                if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
                    self.log(f"attempting to restart daemon {name}")
                    daemon.restart()

            # If a daemon is no longer failed, remove it from tracking:
            for name in list(daemon_failure_time):
                if name not in failed_names:
                    self.log(f"daemon {name} has been restored")
                    del daemon_failure_time[name]

            for thrasher in self.thrashers:
                if thrasher.exception is not None:
                    self.log("{name} failed".format(name=thrasher.name))
                    bark = True

            if bark:
                self.bark()
                return

            sleep(self._POLL_INTERVAL)

        self.log("watchdog finished")
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`import logging`
			`import signal`
			`import time`

			`from gevent import sleep`
			`from gevent.greenlet import Greenlet`
			`from gevent.event import Event`

			`log = logging.getLogger(__name__)`

			`class DaemonWatchdog(Greenlet):`
			`"""`
			`DaemonWatchdog::`

			`Watch Ceph daemons for failures. If an extended failure is detected (i.e.`
			`not intentional), then the watchdog will unmount file systems and send`
			`SIGTERM to all daemons. The duration of an extended failure is configurable`
			`with watchdog_daemon_timeout.`

mds,qa: exit instead of respawn under valgrind valgrind can't handle execve of /proc/self/exe: 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== execve(0x18546740(/proc/self/exe), 0x18546670, 0x133ef310) failed, errno 2 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== EXEC FAILED: I can't recover from execve() failing, so I'm dying. 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== Add more stringent tests in PRE(sys_execve), or work out how to recover. So configure the MDS to just exit so it can be restarted by QA infra (the daemon watchdog). Signed-off-by: Patrick Donnelly <pdonnell@redhat.com> 2021-03-03 03:39:09 +00:00			`ceph:`
			`watchdog:`
			`daemon_restart [default: no]: restart daemon if "normal" exit (status==0).`

			`daemon_timeout [default: 300]: number of seconds a daemon`
			`is allowed to be failed before the`
			`watchdog will bark.`
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`"""`

qa/tasks: make watch and bark handle more daemons * make watch and bark handle more daemons * drop the manager parameter, as it wont be available when DaemonWatchdog starts * get the cluster from the config Fixes: http://tracker.ceph.com/issues/10369 Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-07 08:33:15 +00:00			`def __init__(self, ctx, config, thrashers):`
qa/tasks: introduce Thrasher base class * Introduced a Thrasher base class. * Updated thrashers to inherit from Thrasher. * Replaced the magic variable e with Thrasher.exception as per the discussion. Now the exception variable sets by default as the thrashers are inheriting from the Thrasher class. Fixes: https://github.com/ceph/ceph/pull/28378#discussion_r309337928 Fixes: https://tracker.ceph.com/issues/41133 Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-08-05 10:52:10 +00:00			`super(DaemonWatchdog, self).__init__()`
mds,qa: exit instead of respawn under valgrind valgrind can't handle execve of /proc/self/exe: 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== execve(0x18546740(/proc/self/exe), 0x18546670, 0x133ef310) failed, errno 2 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== EXEC FAILED: I can't recover from execve() failing, so I'm dying. 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== Add more stringent tests in PRE(sys_execve), or work out how to recover. So configure the MDS to just exit so it can be restarted by QA infra (the daemon watchdog). Signed-off-by: Patrick Donnelly <pdonnell@redhat.com> 2021-03-03 03:39:09 +00:00			`self.config = ctx.config.get('watchdog', {})`
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`self.ctx = ctx`
			`self.e = None`
			`self.logger = log.getChild('daemon_watchdog')`
qa/tasks: make watch and bark handle more daemons * make watch and bark handle more daemons * drop the manager parameter, as it wont be available when DaemonWatchdog starts * get the cluster from the config Fixes: http://tracker.ceph.com/issues/10369 Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-07 08:33:15 +00:00			`self.cluster = config.get('cluster', 'ceph')`
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`self.name = 'watchdog'`
			`self.stopping = Event()`
			`self.thrashers = thrashers`

			`def _run(self):`
			`try:`
			`self.watch()`
			`except Exception as e:`
			`# See _run exception comment for MDSThrasher`
			`self.e = e`
			`self.logger.exception("exception:")`
			`# allow successful completion so gevent doesn't see an exception...`

			`def log(self, x):`
			`"""Write data to logger"""`
			`self.logger.info(x)`

			`def stop(self):`
			`self.stopping.set()`

			`def bark(self):`
			`self.log("BARK! unmounting mounts and killing all daemons")`
			`for mount in self.ctx.mounts.values():`
			`try:`
			`mount.umount_wait(force=True)`
			`except:`
			`self.logger.exception("ignoring exception:")`
			`daemons = []`
qa/tasks: make watch and bark handle more daemons * make watch and bark handle more daemons * drop the manager parameter, as it wont be available when DaemonWatchdog starts * get the cluster from the config Fixes: http://tracker.ceph.com/issues/10369 Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-07 08:33:15 +00:00			`daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('osd', cluster=self.cluster)))`
			`daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.cluster)))`
			`daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.cluster)))`
			`daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('rgw', cluster=self.cluster)))`
			`daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mgr', cluster=self.cluster)))`

qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`for daemon in daemons:`
			`try:`
			`daemon.signal(signal.SIGTERM)`
			`except:`
			`self.logger.exception("ignoring exception:")`

			`def watch(self):`
			`self.log("watchdog starting")`
mds,qa: exit instead of respawn under valgrind valgrind can't handle execve of /proc/self/exe: 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== execve(0x18546740(/proc/self/exe), 0x18546670, 0x133ef310) failed, errno 2 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== EXEC FAILED: I can't recover from execve() failing, so I'm dying. 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== Add more stringent tests in PRE(sys_execve), or work out how to recover. So configure the MDS to just exit so it can be restarted by QA infra (the daemon watchdog). Signed-off-by: Patrick Donnelly <pdonnell@redhat.com> 2021-03-03 03:39:09 +00:00			`daemon_timeout = int(self.config.get('daemon_timeout', 300))`
			`daemon_restart = self.config.get('daemon_restart', False)`
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`daemon_failure_time = {}`
			`while not self.stopping.is_set():`
			`bark = False`
			`now = time.time()`

qa/tasks: make watch and bark handle more daemons * make watch and bark handle more daemons * drop the manager parameter, as it wont be available when DaemonWatchdog starts * get the cluster from the config Fixes: http://tracker.ceph.com/issues/10369 Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-07 08:33:15 +00:00			`osds = self.ctx.daemons.iter_daemons_of_role('osd', cluster=self.cluster)`
			`mons = self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.cluster)`
			`mdss = self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.cluster)`
			`rgws = self.ctx.daemons.iter_daemons_of_role('rgw', cluster=self.cluster)`
			`mgrs = self.ctx.daemons.iter_daemons_of_role('mgr', cluster=self.cluster)`
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00
			`daemon_failures = []`
qa/tasks: make watch and bark handle more daemons * make watch and bark handle more daemons * drop the manager parameter, as it wont be available when DaemonWatchdog starts * get the cluster from the config Fixes: http://tracker.ceph.com/issues/10369 Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-07 08:33:15 +00:00			`daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, osds))`
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mons))`
			`daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mdss))`
qa/tasks: make watch and bark handle more daemons * make watch and bark handle more daemons * drop the manager parameter, as it wont be available when DaemonWatchdog starts * get the cluster from the config Fixes: http://tracker.ceph.com/issues/10369 Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-07 08:33:15 +00:00			`daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, rgws))`
			`daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mgrs))`

qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`for daemon in daemon_failures:`
			`name = daemon.role + '.' + daemon.id_`
			`dt = daemon_failure_time.setdefault(name, (daemon, now))`
			`assert dt[0] is daemon`
			`delta = now-dt[1]`
			`self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))`
			`if delta > daemon_timeout:`
			`bark = True`
mds,qa: exit instead of respawn under valgrind valgrind can't handle execve of /proc/self/exe: 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== execve(0x18546740(/proc/self/exe), 0x18546670, 0x133ef310) failed, errno 2 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== EXEC FAILED: I can't recover from execve() failing, so I'm dying. 2021-02-27T05:52:37.813 INFO:tasks.ceph.mds.d.smithi073.stderr:==00:01:03:20.556 41218== Add more stringent tests in PRE(sys_execve), or work out how to recover. So configure the MDS to just exit so it can be restarted by QA infra (the daemon watchdog). Signed-off-by: Patrick Donnelly <pdonnell@redhat.com> 2021-03-03 03:39:09 +00:00			`if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:`
			`self.log(f"attempting to restart daemon {name}")`
			`daemon.restart()`
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00
			`# If a daemon is no longer failed, remove it from tracking:`
qa/tasks/daemonwatchdog: py3 compat Address error: RuntimeError: dictionary changed size during iteration Signed-off-by: Kyr Shatskyy <kyrylo.shatskyy@suse.com> 2019-12-17 01:26:12 +00:00			`for name in list(daemon_failure_time.keys()):`
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`if name not in [d.role + '.' + d.id_ for d in daemon_failures]:`
			`self.log("daemon {name} has been restored".format(name=name))`
			`del daemon_failure_time[name]`

			`for thrasher in self.thrashers:`
qa/tasks: introduce Thrasher base class * Introduced a Thrasher base class. * Updated thrashers to inherit from Thrasher. * Replaced the magic variable e with Thrasher.exception as per the discussion. Now the exception variable sets by default as the thrashers are inheriting from the Thrasher class. Fixes: https://github.com/ceph/ceph/pull/28378#discussion_r309337928 Fixes: https://tracker.ceph.com/issues/41133 Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-08-05 10:52:10 +00:00			`if thrasher.exception is not None:`
qa/tasks: Better handling of thrasher names and __init__ calls Fixes: https://tracker.ceph.com/issues/42062 Fixes: https://tracker.ceph.com/issues/42478 Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-09-27 03:39:00 +00:00			`self.log("{name} failed".format(name=thrasher.name))`
qa/tasks: move DaemonWatchdog to new file * Moved DaemonWatchdog class to a new file daemonwatchdog.py * Dropped the client watch Signed-off-by: Jos Collin <jcollin@redhat.com> 2019-05-06 11:27:09 +00:00			`bark = True`

			`if bark:`
			`self.bark()`
			`return`

			`sleep(5)`

			`self.log("watchdog finished")`