From c885ee7f0c9c7c232fda81bfd6f9eca0e182ee3d Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Sun, 14 Jul 2019 11:31:26 -0500
Subject: [PATCH] mgr/crash: raise RECENT_CRASH warning for recent (new)
 crashes

Signed-off-by: Sage Weil
---
 doc/rados/operations/health-checks.rst | 40 ++++++++++++++++++++++
 src/pybind/mgr/crash/module.py         | 51 ++++++++++++++++++++++++++-
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst
index 88ca193d8ad..e83cd479b66 100644
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -857,3 +857,43 @@ happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
 You can manually initiate a scrub of a clean PG with::
 
   ceph pg deep-scrub <pgid>
+
+
+Miscellaneous
+-------------
+
+RECENT_CRASH
+____________
+
+One or more Ceph daemons have crashed recently, and the crash(es) have
+not yet been archived (acknowledged) by the administrator. This may
+indicate a software bug, a hardware problem (e.g., a failing disk), or
+some other problem.
+
+New crashes can be listed with::
+
+  ceph crash ls-new
+
+Information about a specific crash can be examined with::
+
+  ceph crash info <crash-id>
+
+This warning can be silenced by "archiving" the crash (perhaps after
+it has been examined by an administrator) so that it no longer
+generates the warning::
+
+  ceph crash archive <crash-id>
+
+Similarly, all new crashes can be archived with::
+
+  ceph crash archive-all
+
+Archived crashes will still be visible via ``ceph crash ls`` but not
+``ceph crash ls-new``.
+
+The period of time that counts as "recent" is controlled by the option
+``mgr/crash/warn_recent_interval`` (default: two weeks).
+
+These warnings can be disabled entirely with::
+
+  ceph config set mgr mgr/crash/warn_recent_interval 0
diff --git a/src/pybind/mgr/crash/module.py b/src/pybind/mgr/crash/module.py
index dec4f50a557..133cd5d417b 100644
--- a/src/pybind/mgr/crash/module.py
+++ b/src/pybind/mgr/crash/module.py
@@ -11,9 +11,18 @@ from threading import Event
 DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
 OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
 
+MAX_WAIT = 600
+MIN_WAIT = 60
 
 class Module(MgrModule):
     MODULE_OPTIONS = [
+        {
+            'name': 'warn_recent_interval',
+            'type': 'secs',
+            'default': 60*60*24*14,
+            'desc': 'time interval in which to warn about recent crashes',
+            'runtime': True,
+        },
     ]
 
     def __init__(self, *args, **kwargs):
@@ -29,7 +38,9 @@ class Module(MgrModule):
     def serve(self):
         self.config_notify()
         while self.run:
-            self.event.wait(self.warn_recent_interval / 100)
+            self._refresh_health_checks()
+            wait = min(MAX_WAIT, max(self.warn_recent_interval / 100, MIN_WAIT))
+            self.event.wait(wait)
             self.event.clear()
 
     def config_notify(self):
@@ -44,6 +55,38 @@ class Module(MgrModule):
         raw = self.get_store_prefix('crash/')
         self.crashes = {k[6:]: json.loads(m) for (k, m) in raw.items()}
 
+    def _refresh_health_checks(self):
+        if not self.crashes:
+            self._load_crashes()
+        cutoff = datetime.datetime.utcnow() - datetime.timedelta(
+            seconds=self.warn_recent_interval)
+        recent = {
+            crashid: crash for crashid, crash in self.crashes.items()
+            if self.time_from_string(crash['timestamp']) > cutoff and 'archived' not in crash
+        }
+        num = len(recent)
+        health_checks = {}
+        if recent:
+            detail = [
+                '%s crashed on host %s at %s' % (
+                    crash.get('entity_name', 'unidentified daemon'),
+                    crash.get('utsname_hostname', '(unknown)'),
+                    crash.get('timestamp', 'unknown time'))
+                for (_, crash) in recent.items()]
+            if num > 30:
+                detail = detail[0:30]
+                detail.append('and %d more' % (num - 30))
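+            # at this point `detail` holds at most 30 entries plus one
+            # trailing "and N more" line, so a crash storm cannot flood
+            # `ceph health detail`; `num` still reflects the full count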
+            self.log.debug('detail %s' % detail)
+            health_checks['RECENT_CRASH'] = {
+                'severity': 'warning',
+                'summary': '%d daemons have recently crashed' % (num),
+                'detail': detail,
+            }
+        self.set_health_checks(health_checks)
+
     def handle_command(self, inbuf, command):
         if not self.crashes:
             self._load_crashes()
@@ -137,6 +180,7 @@ class Module(MgrModule):
         del self.crashes[crashid]
         key = 'crash/%s' % crashid
         self.set_store(key, None)  # removes key
+        self._refresh_health_checks()
         return 0, '', ''
 
     def do_prune(self, cmd, inbuf):
@@ -159,6 +203,9 @@ class Module(MgrModule):
                 del self.crashes[crashid]
                 key = 'crash/%s' % crashid
                 self.set_store(key, None)
+        # refreshing unconditionally is safe and avoids having to track
+        # whether any crash was actually removed above
+        self._refresh_health_checks()
 
     def do_archive(self, cmd, inbuf):
         crashid = cmd['id']
@@ -170,6 +217,7 @@ class Module(MgrModule):
         self.crashes[crashid] = crash
         key = 'crash/%s' % crashid
         self.set_store(key, json.dumps(crash))
+        self._refresh_health_checks()
         return 0, '', ''
 
     def do_archive_all(self, cmd, inbuf):
@@ -179,6 +227,7 @@ class Module(MgrModule):
         self.crashes[crashid] = crash
         key = 'crash/%s' % crashid
         self.set_store(key, json.dumps(crash))
+        self._refresh_health_checks()
         return 0, '', ''
 
     def do_stat(self, cmd, inbuf):
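
---

The new health check boils down to a cutoff filter over the stored
crash metadata. Here is a minimal standalone sketch of that filter
(the sample crash records are invented for illustration; the real
module loads them from the mgr key/value store under the ``crash/``
prefix)::

  import datetime

  DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
  WARN_RECENT_INTERVAL = 60*60*24*14  # module default: two weeks

  # invented sample records; real entries come from `ceph crash ls`
  crashes = {
      'crash-a': {'entity_name': 'osd.3',
                  'timestamp': datetime.datetime.utcnow().strftime(DATEFMT)},
      'crash-b': {'entity_name': 'mon.a',
                  'timestamp': '2018-01-01T00:00:00.000000',
                  'archived': '2018-01-02 03:04:05.678'},
  }

  # same cutoff logic as _refresh_health_checks() above: a crash is
  # "recent" if it is newer than the interval and not yet archived
  cutoff = datetime.datetime.utcnow() - datetime.timedelta(
      seconds=WARN_RECENT_INTERVAL)
  recent = {
      crashid: crash for crashid, crash in crashes.items()
      if datetime.datetime.strptime(crash['timestamp'], DATEFMT) > cutoff
      and 'archived' not in crash
  }
  print('%d daemons have recently crashed' % len(recent))  # prints 1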