pybind/mgr/cephadm: upgrade MDS if no MDS is "up"

The upgrade process can get stuck if an MDS crashes. This should be rare
when straddling v16.2.5 where the compatset of the file system inherits
the FSMap "default". MDS daemons from pre-v16.2.5 releases do not yet
share a compatset with the mons, so the mons will not do any promotions,
causing the upgrade task to get stuck.

Fixes: https://tracker.ceph.com/issues/53074
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
Patrick Donnelly 2021-10-27 20:26:55 -04:00
parent 6d0e87ccaf
commit d771725664
No known key found for this signature in database
GPG Key ID: BE69BB7D36E459B4

View File

@ -441,23 +441,31 @@ class CephadmUpgrade:
continue_upgrade = False
continue
if not (mdsmap['in'] == [0] and len(mdsmap['up']) == 1):
if not (mdsmap['in'] == [0] and len(mdsmap['up']) <= 1):
self.mgr.log.info('Upgrade: Waiting for fs %s to scale down to reach 1 MDS' % (fs_name))
time.sleep(10)
continue_upgrade = False
continue
mdss = list(mdsmap['info'].values())
assert len(mdss) == 1
lone_mds = mdss[0]
if lone_mds['state'] != 'up:active':
self.mgr.log.info('Upgrade: Waiting for mds.%s to be up:active (currently %s)' % (
lone_mds['name'],
lone_mds['state'],
))
time.sleep(10)
continue_upgrade = False
continue
if len(mdsmap['up']) == 0:
self.mgr.log.warning("Upgrade: No mds is up; continuing upgrade procedure to poke things in the right direction")
# This can happen because the current version MDS have
# incompatible compatsets; the mons will not do any promotions.
# We must upgrade to continue.
elif len(mdsmap['up']) > 0:
mdss = list(mdsmap['info'].values())
assert len(mdss) == 1
lone_mds = mdss[0]
if lone_mds['state'] != 'up:active':
self.mgr.log.info('Upgrade: Waiting for mds.%s to be up:active (currently %s)' % (
lone_mds['name'],
lone_mds['state'],
))
time.sleep(10)
continue_upgrade = False
continue
else:
assert False
return continue_upgrade