qa: avoid infinite wait if no repl. can be made

The thrasher can enter an infinite loop waiting for an MDS to take a
certain rank when a replacement may not be possible. For example,
max_mds actives are already running.

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
Patrick Donnelly 2017-01-26 00:57:40 -05:00
parent 638bccb2bb
commit 1185326c45
No known key found for this signature in database
GPG Key ID: 3A2A7E25BEA8AADB

View File

@ -168,23 +168,26 @@ class MDSThrasher(Greenlet):
def wait_for_stable(self, rank = None, gid = None):
self.log('waiting for mds cluster to stabilize...')
status = self.fs.status()
itercount = 0
while True:
status = self.fs.status()
max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
ranks = list(status.get_ranks(self.fs.id))
actives = filter(lambda info: "up:active" == info['state'] and "laggy_since" not in info, ranks)
if rank is not None:
if len(actives) >= max_mds:
# no replacement can occur!
return status
try:
info = status.get_rank(self.fs.id, rank)
if info['gid'] != gid and "up:active" == info['state']:
self.log('mds.{name} has gained rank={rank}, replacing gid={gid}'.format(name = info['name'], rank = rank, gid = gid))
return status, info['name']
return status
except:
pass # no rank present
else:
ranks = filter(lambda info: "up:active" == info['state'] and "laggy_since" not in info, list(status.get_ranks(self.fs.id)))
count = len(ranks)
if count >= max_mds:
self.log('mds cluster has {count} alive and active, now stable!'.format(count = count))
if len(actives) >= max_mds:
self.log('mds cluster has {count} alive and active, now stable!'.format(count = len(actives)))
return status, None
itercount = itercount + 1
if itercount > 300/2: # 5 minutes
@ -192,7 +195,6 @@ class MDSThrasher(Greenlet):
elif itercount > 10:
self.log('mds map: {status}'.format(status=self.fs.status()))
time.sleep(2)
status = self.fs.status()
def do_thrash(self):
"""
@ -290,8 +292,7 @@ class MDSThrasher(Greenlet):
self.log('{label} down, removed from mdsmap'.format(label=label, since=last_laggy_since))
# wait for a standby mds to takeover and become active
status, takeover_mds = self.wait_for_stable(rank, gid)
self.log('New active mds is mds.{_id}'.format(_id=takeover_mds))
status = self.wait_for_stable(rank, gid)
# wait for a while before restarting old active to become new
# standby