mirror of
https://github.com/ceph/ceph
synced 2025-03-30 07:19:14 +00:00
Merge PR #24493 into master
* refs/pull/24493/head:
    mgr/DaemonState: clean up device life_expectancy values
    mgr/devicehealth: warn based on life_expectancy_max
    mgr/devicehealth: warn on failing devices at 6 weeks

Reviewed-by: John Spray <john.spray@redhat.com>
This commit is contained in:
commit
36c3bce219
@ -43,9 +43,21 @@ void DeviceState::set_life_expectancy(utime_t from, utime_t to, utime_t now)
|
||||
{
|
||||
life_expectancy = make_pair(from, to);
|
||||
life_expectancy_stamp = now;
|
||||
metadata["life_expectancy_min"] = stringify(life_expectancy.first);
|
||||
metadata["life_expectancy_max"] = stringify(life_expectancy.second);
|
||||
metadata["life_expectancy_stamp"] = stringify(life_expectancy_stamp);
|
||||
if (from != utime_t()) {
|
||||
metadata["life_expectancy_min"] = from;
|
||||
} else {
|
||||
metadata["life_expectancy_min"] = "";
|
||||
}
|
||||
if (to != utime_t()) {
|
||||
metadata["life_expectancy_max"] = to;
|
||||
} else {
|
||||
metadata["life_expectancy_max"] = "";
|
||||
}
|
||||
if (now != utime_t()) {
|
||||
metadata["life_expectancy_stamp"] = stringify(now);
|
||||
} else {
|
||||
metadata["life_expectancy_stamp"] = "";
|
||||
}
|
||||
}
|
||||
|
||||
void DeviceState::rm_life_expectancy()
|
||||
|
@ -47,7 +47,7 @@ class Module(MgrModule):
|
||||
},
|
||||
{
|
||||
'name': 'warn_threshold',
|
||||
'default': str(86400 * 14 * 2),
|
||||
'default': str(86400 * 14 * 6),
|
||||
},
|
||||
{
|
||||
'name': 'self_heal',
|
||||
@ -425,21 +425,24 @@ class Module(MgrModule):
|
||||
assert osdmap is not None
|
||||
for dev in devs['devices']:
|
||||
devid = dev['devid']
|
||||
if 'life_expectancy_min' not in dev:
|
||||
if 'life_expectancy_max' not in dev:
|
||||
continue
|
||||
# ignore devices that are not consumed by any daemons
|
||||
if not dev['daemons']:
|
||||
continue
|
||||
if not dev['life_expectancy_max'] or \
|
||||
dev['life_expectancy_max'] == '0.000000':
|
||||
continue
|
||||
# life_expectancy_(min/max) is in the format of:
|
||||
# '%Y-%m-%d %H:%M:%S.%f', e.g.:
|
||||
# '2019-01-20 21:12:12.000000'
|
||||
life_expectancy_min = datetime.strptime(
|
||||
dev['life_expectancy_min'],
|
||||
life_expectancy_max = datetime.strptime(
|
||||
dev['life_expectancy_max'],
|
||||
'%Y-%m-%d %H:%M:%S.%f')
|
||||
self.log.debug('device %s expectancy min %s', dev,
|
||||
life_expectancy_min)
|
||||
self.log.debug('device %s expectancy max %s', dev,
|
||||
life_expectancy_max)
|
||||
|
||||
if life_expectancy_min - now <= mark_out_threshold_td:
|
||||
if life_expectancy_max - now <= mark_out_threshold_td:
|
||||
if self.self_heal:
|
||||
# dev['daemons'] == ["osd.0","osd.1","osd.2"]
|
||||
if dev['daemons']:
|
||||
@ -448,11 +451,11 @@ class Module(MgrModule):
|
||||
osd_ids = map(lambda x: x[4:], osds)
|
||||
for _id in osd_ids:
|
||||
if self.is_osd_in(osdmap, _id):
|
||||
osds_in[_id] = life_expectancy_min
|
||||
osds_in[_id] = life_expectancy_max
|
||||
else:
|
||||
osds_out[_id] = 1
|
||||
|
||||
if life_expectancy_min - now <= warn_threshold_td:
|
||||
if life_expectancy_max - now <= warn_threshold_td:
|
||||
# device can appear in more than one location in case
|
||||
# of SCSI multipath
|
||||
device_locations = map(lambda x: x['host'] + ':' + x['dev'],
|
||||
@ -462,11 +465,8 @@ class Module(MgrModule):
|
||||
% (dev['devid'],
|
||||
','.join(device_locations),
|
||||
','.join(dev.get('daemons', ['none'])),
|
||||
dev['life_expectancy_min'],
|
||||
dev['life_expectancy_max'],
|
||||
dev.get('life_expectancy_max', 'unknown')))
|
||||
# TODO: by default, dev['life_expectancy_max'] == '0.000000',
|
||||
# so dev.get('life_expectancy_max', 'unknown')
|
||||
# above should be altered.
|
||||
|
||||
# OSD might be marked 'out' (which means it has no
|
||||
# data), however PGs are still attached to it.
|
||||
|
Loading…
Reference in New Issue
Block a user