mirror of
https://github.com/ceph/ceph
synced 2025-03-19 08:57:29 +00:00
mgr/crash: raise RECENT_CRASH warning for recent (new) crashes
Signed-off-by: Sage Weil <sage@redhat.com>
This commit is contained in:
parent
bebb1c3100
commit
c885ee7f0c
@ -857,3 +857,43 @@ happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
|
|||||||
You can manually initiate a scrub of a clean PG with::
|
You can manually initiate a scrub of a clean PG with::
|
||||||
|
|
||||||
ceph pg deep-scrub <pgid>
|
ceph pg deep-scrub <pgid>
|
||||||
|
|
||||||
|
|
||||||
|
Miscellaneous
|
||||||
|
-------------
|
||||||
|
|
||||||
|
RECENT_CRASH
|
||||||
|
____________
|
||||||
|
|
||||||
|
One or more Ceph daemons have crashed recently, and the crash has not
|
||||||
|
yet been archived (acknowledged) by the administrator. This may
|
||||||
|
indicate a software bug, a hardware problem (e.g., a failing disk), or
|
||||||
|
some other problem.
|
||||||
|
|
||||||
|
New crashes can be listed with::
|
||||||
|
|
||||||
|
ceph crash ls-new
|
||||||
|
|
||||||
|
Information about a specific crash can be examined with::
|
||||||
|
|
||||||
|
ceph crash info <crash-id>
|
||||||
|
|
||||||
|
This warning can be silenced by "archiving" the crash (perhaps after
|
||||||
|
being examined by an administrator) so that it does not generate this
|
||||||
|
warning::
|
||||||
|
|
||||||
|
ceph crash archive <crash-id>
|
||||||
|
|
||||||
|
Similarly, all new crashes can be archived with::
|
||||||
|
|
||||||
|
ceph crash archive-all
|
||||||
|
|
||||||
|
Archived crashes will still be visible via ``ceph crash ls`` but not
|
||||||
|
``ceph crash ls-new``.
|
||||||
|
|
||||||
|
The time period for what "recent" means is controlled by the option
|
||||||
|
``mgr/crash/warn_recent_interval`` (default: two weeks).
|
||||||
|
|
||||||
|
These warnings can be disabled entirely with::
|
||||||
|
|
||||||
|
ceph config set mgr mgr/crash/warn_recent_interval 0
|
||||||
|
@ -11,9 +11,18 @@ from threading import Event
|
|||||||
DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
|
DATEFMT = '%Y-%m-%dT%H:%M:%S.%f'
|
||||||
OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
|
OLD_DATEFMT = '%Y-%m-%d %H:%M:%S.%f'
|
||||||
|
|
||||||
|
MAX_WAIT = 600
|
||||||
|
MIN_WAIT = 60
|
||||||
|
|
||||||
class Module(MgrModule):
|
class Module(MgrModule):
|
||||||
MODULE_OPTIONS = [
|
MODULE_OPTIONS = [
|
||||||
|
{
|
||||||
|
'name': 'warn_recent_interval',
|
||||||
|
'type': 'secs',
|
||||||
|
'default': 60*60*24*14,
|
||||||
|
'desc': 'time interval in which to warn about recent crashes',
|
||||||
|
'runtime': True,
|
||||||
|
},
|
||||||
]
|
]
|
||||||
|
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
@ -29,7 +38,9 @@ class Module(MgrModule):
|
|||||||
def serve(self):
|
def serve(self):
|
||||||
self.config_notify()
|
self.config_notify()
|
||||||
while self.run:
|
while self.run:
|
||||||
self.event.wait(self.warn_recent_interval / 100)
|
self._refresh_health_checks()
|
||||||
|
wait = min(MAX_WAIT, max(self.warn_recent_interval / 100, MIN_WAIT))
|
||||||
|
self.event.wait(wait)
|
||||||
self.event.clear()
|
self.event.clear()
|
||||||
|
|
||||||
def config_notify(self):
|
def config_notify(self):
|
||||||
@ -44,6 +55,35 @@ class Module(MgrModule):
|
|||||||
raw = self.get_store_prefix('crash/')
|
raw = self.get_store_prefix('crash/')
|
||||||
self.crashes = {k[6:]: json.loads(m) for (k, m) in raw.items()}
|
self.crashes = {k[6:]: json.loads(m) for (k, m) in raw.items()}
|
||||||
|
|
||||||
|
def _refresh_health_checks(self):
    """Recompute the RECENT_CRASH health check and publish it.

    A crash counts as recent when its ``timestamp`` is newer than
    ``warn_recent_interval`` seconds before the current UTC time and it
    has no ``archived`` key.  When at least one crash qualifies, a
    ``RECENT_CRASH`` warning is built with one detail line per crash
    (at most 30, followed by an "and N more" line).  The resulting dict,
    which is empty when nothing qualifies, is passed to
    ``set_health_checks``.
    """
    # Upper bound on the number of per-crash lines listed in the detail.
    max_detail = 30

    if not self.crashes:
        self._load_crashes()

    # NOTE(review): utcnow() is naive; this presumably matches what
    # time_from_string() returns -- confirm before making either side
    # timezone-aware.
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(
        seconds=self.warn_recent_interval)

    # Only the crash records are needed below, not their ids.
    recent = [
        crash for crash in self.crashes.values()
        if self.time_from_string(crash['timestamp']) > cutoff
        and 'archived' not in crash
    ]
    num = len(recent)

    health_checks = {}
    if recent:
        # Format only the lines that will actually be reported.
        detail = [
            '%s crashed on host %s at %s' % (
                crash.get('entity_name', 'unidentified daemon'),
                crash.get('utsname_hostname', '(unknown)'),
                crash.get('timestamp', 'unknown time'))
            for crash in recent[:max_detail]
        ]
        if num > max_detail:
            detail.append('and %d more' % (num - max_detail))
        # Let the logger do the formatting so it is skipped when debug
        # output is disabled.
        self.log.debug('detail %s', detail)
        health_checks['RECENT_CRASH'] = {
            'severity': 'warning',
            'summary': '%d daemons have recently crashed' % (num),
            'detail': detail,
        }
    self.set_health_checks(health_checks)
|
||||||
|
|
||||||
def handle_command(self, inbuf, command):
|
def handle_command(self, inbuf, command):
|
||||||
if not self.crashes:
|
if not self.crashes:
|
||||||
self._load_crashes()
|
self._load_crashes()
|
||||||
@ -137,6 +177,7 @@ class Module(MgrModule):
|
|||||||
del self.crashes[crashid]
|
del self.crashes[crashid]
|
||||||
key = 'crash/%s' % crashid
|
key = 'crash/%s' % crashid
|
||||||
self.set_store(key, None) # removes key
|
self.set_store(key, None) # removes key
|
||||||
|
self._refresh_health_checks()
|
||||||
return 0, '', ''
|
return 0, '', ''
|
||||||
|
|
||||||
def do_prune(self, cmd, inbuf):
|
def do_prune(self, cmd, inbuf):
|
||||||
@ -159,6 +200,9 @@ class Module(MgrModule):
|
|||||||
del self.crashes[crashid]
|
del self.crashes[crashid]
|
||||||
key = 'crash/%s' % crashid
|
key = 'crash/%s' % crashid
|
||||||
self.set_store(key, None)
|
self.set_store(key, None)
|
||||||
|
removed_any = True
|
||||||
|
if removed_any:
|
||||||
|
self._refresh_health_checks()
|
||||||
|
|
||||||
def do_archive(self, cmd, inbuf):
|
def do_archive(self, cmd, inbuf):
|
||||||
crashid = cmd['id']
|
crashid = cmd['id']
|
||||||
@ -170,6 +214,7 @@ class Module(MgrModule):
|
|||||||
self.crashes[crashid] = crash
|
self.crashes[crashid] = crash
|
||||||
key = 'crash/%s' % crashid
|
key = 'crash/%s' % crashid
|
||||||
self.set_store(key, json.dumps(crash))
|
self.set_store(key, json.dumps(crash))
|
||||||
|
self._refresh_health_checks()
|
||||||
return 0, '', ''
|
return 0, '', ''
|
||||||
|
|
||||||
def do_archive_all(self, cmd, inbuf):
|
def do_archive_all(self, cmd, inbuf):
|
||||||
@ -179,6 +224,7 @@ class Module(MgrModule):
|
|||||||
self.crashes[crashid] = crash
|
self.crashes[crashid] = crash
|
||||||
key = 'crash/%s' % crashid
|
key = 'crash/%s' % crashid
|
||||||
self.set_store(key, json.dumps(crash))
|
self.set_store(key, json.dumps(crash))
|
||||||
|
self._refresh_health_checks()
|
||||||
return 0, '', ''
|
return 0, '', ''
|
||||||
|
|
||||||
def do_stat(self, cmd, inbuf):
|
def do_stat(self, cmd, inbuf):
|
||||||
|
Loading…
Reference in New Issue
Block a user