Merge PR #24493 into master

* refs/pull/24493/head:
	mgr/DaemonState: clean up device life_expectancy values
	mgr/devicehealth: warn based on life_expectancy_max
	mgr/devicehealth: warn on failing devices at 6 weeks

Reviewed-by: John Spray <john.spray@redhat.com>
This commit is contained in:
Sage Weil 2018-10-15 08:36:18 -05:00
commit 36c3bce219
2 changed files with 28 additions and 16 deletions

View File

@ -43,9 +43,21 @@ void DeviceState::set_life_expectancy(utime_t from, utime_t to, utime_t now)
{
  // Record the predicted life-expectancy window [from, to] together with the
  // time the prediction was made, then mirror all three values into
  // `metadata` as text.  A default-constructed utime_t is treated as
  // "not set" and is stored as an empty string.
  life_expectancy = make_pair(from, to);
  life_expectancy_stamp = now;

  // Every metadata value written here is text.  Always format through
  // stringify(): assigning a utime_t directly to the metadata entry would
  // rely on whatever implicit conversion happens to apply instead of the
  // formatted timestamp (NOTE(review): presumably a double -> char
  // narrowing; confirm against utime_t's conversion operators).
  auto to_text = [](const utime_t& t) {
    return t != utime_t() ? stringify(t) : std::string();
  };

  metadata["life_expectancy_min"] = to_text(from);
  metadata["life_expectancy_max"] = to_text(to);
  metadata["life_expectancy_stamp"] = to_text(now);
}
void DeviceState::rm_life_expectancy()

View File

@ -47,7 +47,7 @@ class Module(MgrModule):
},
{
'name': 'warn_threshold',
'default': str(86400 * 14 * 2),
'default': str(86400 * 14 * 6),
},
{
'name': 'self_heal',
@ -425,21 +425,24 @@ class Module(MgrModule):
assert osdmap is not None
for dev in devs['devices']:
devid = dev['devid']
if 'life_expectancy_min' not in dev:
if 'life_expectancy_max' not in dev:
continue
# ignore devices that are not consumed by any daemons
if not dev['daemons']:
continue
if not dev['life_expectancy_max'] or \
dev['life_expectancy_max'] == '0.000000':
continue
# life_expectancy_(min/max) is in the format of:
# '%Y-%m-%d %H:%M:%S.%f', e.g.:
# '2019-01-20 21:12:12.000000'
life_expectancy_min = datetime.strptime(
dev['life_expectancy_min'],
life_expectancy_max = datetime.strptime(
dev['life_expectancy_max'],
'%Y-%m-%d %H:%M:%S.%f')
self.log.debug('device %s expectancy min %s', dev,
life_expectancy_min)
self.log.debug('device %s expectancy max %s', dev,
life_expectancy_max)
if life_expectancy_min - now <= mark_out_threshold_td:
if life_expectancy_max - now <= mark_out_threshold_td:
if self.self_heal:
# dev['daemons'] == ["osd.0","osd.1","osd.2"]
if dev['daemons']:
@ -448,11 +451,11 @@ class Module(MgrModule):
osd_ids = map(lambda x: x[4:], osds)
for _id in osd_ids:
if self.is_osd_in(osdmap, _id):
osds_in[_id] = life_expectancy_min
osds_in[_id] = life_expectancy_max
else:
osds_out[_id] = 1
if life_expectancy_min - now <= warn_threshold_td:
if life_expectancy_max - now <= warn_threshold_td:
# device can appear in more than one location in case
# of SCSI multipath
device_locations = map(lambda x: x['host'] + ':' + x['dev'],
@ -462,11 +465,8 @@ class Module(MgrModule):
% (dev['devid'],
','.join(device_locations),
','.join(dev.get('daemons', ['none'])),
dev['life_expectancy_min'],
dev['life_expectancy_max'],
dev.get('life_expectancy_max', 'unknown')))
# TODO: by default, dev['life_expectancy_max'] == '0.000000',
# so dev.get('life_expectancy_max', 'unknown')
# above should be altered.
# OSD might be marked 'out' (which means it has no
# data), however PGs are still attached to it.