From dde10a0b6e5eacd93535ef107fb86b92fdcf54cd Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 24 Jul 2018 18:40:41 -0400 Subject: [PATCH] mgr/progress: no progress event on unmoved pgs PGs may not be moved on osd out, if there is no suitable location for them to move to. In this situation it doesn't make sense to have a progress event, as the health warnings adequately communicate the situation. Signed-off-by: John Spray --- src/pybind/mgr/progress/module.py | 37 ++++++++++++++++++++++++++++--- 1 file changed, 34 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py index b998b1bc1d7..72f73d1fe9f 100644 --- a/src/pybind/mgr/progress/module.py +++ b/src/pybind/mgr/progress/module.py @@ -143,7 +143,6 @@ class PgRecoveryEvent(Event): self._original_bytes_recovered = {} for pg in self._pgs: pg_str = str(pg) - log.debug(json.dumps(pg_to_state[pg_str], indent=2)) self._original_bytes_recovered[pg] = \ pg_to_state[pg_str]['stat_sum']['num_bytes_recovered'] @@ -260,17 +259,49 @@ class Module(MgrModule): def _osd_out(self, old_map, old_dump, new_map, osd_id): affected_pgs = [] + unmoved_pgs = [] for pool in old_dump['pools']: pool_id = pool['pool'] for ps in range(0, pool['pg_num']): + up_acting = old_map.pg_to_up_acting_osds(pool['pool'], ps) + + # Was this PG affected by the OSD going out? + old_osds = set(up_acting['up']) | set(up_acting['acting']) + was_on_out_osd = osd_id in old_osds + if not was_on_out_osd: + continue + self.log.debug("pool_id, ps = {0}, {1}".format( pool_id, ps )) - up_acting = old_map.pg_to_up_acting_osds(pool['pool'], ps) + self.log.debug( "up_acting: {0}".format(json.dumps(up_acting, indent=2))) - if osd_id in up_acting['up'] or osd_id in up_acting['acting']: + + new_up_acting = new_map.pg_to_up_acting_osds(pool['pool'], ps) + new_osds = set(new_up_acting['up']) | set(new_up_acting['acting']) + + # Has this PG been assigned a new location?
+ # (it might not be if there is no suitable place to move + # after an OSD failure) + is_relocated = len(new_osds - old_osds) > 0 + + self.log.debug( + "new_up_acting: {0}".format(json.dumps(new_up_acting, + indent=2))) + + if was_on_out_osd and is_relocated: + # This PG is now in motion, track its progress affected_pgs.append(PgId(pool_id, ps)) + elif not is_relocated: + # This PG didn't get a new location, we'll log it + unmoved_pgs.append(PgId(pool_id, ps)) + + # In the case that we ignored some PGs, log the reason why (we may + # not end up creating a progress event) + if len(unmoved_pgs): + self.log.warn("{0} PGs were on osd.{1}, but didn't get new locations".format( + len(unmoved_pgs), osd_id)) self.log.warn("{0} PGs affected by osd.{1} going out".format( len(affected_pgs), osd_id))