ceph/qa/tasks/mon_recovery.py

81 lines
2.2 KiB
Python

"""
Monitor recovery
"""
import logging
from tasks import ceph_manager
from teuthology import misc as teuthology
log = logging.getLogger(__name__)
def task(ctx, config):
"""
Test monitor recovery.
"""
if config is None:
config = {}
assert isinstance(config, dict), \
'task only accepts a dict for configuration'
first_mon = teuthology.get_first_mon(ctx, config)
(mon,) = ctx.cluster.only(first_mon).remotes.keys()
manager = ceph_manager.CephManager(
mon,
ctx=ctx,
logger=log.getChild('ceph_manager'),
)
mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)]
log.info("mon ids = %s" % mons)
manager.wait_for_mon_quorum_size(len(mons))
log.info('verifying all monitors are in the quorum')
for m in mons:
s = manager.get_mon_status(m)
assert s['state'] == 'leader' or s['state'] == 'peon'
assert len(s['quorum']) == len(mons)
log.info('restarting each monitor in turn')
for m in mons:
# stop a monitor
manager.kill_mon(m)
manager.wait_for_mon_quorum_size(len(mons) - 1)
# restart
manager.revive_mon(m)
manager.wait_for_mon_quorum_size(len(mons))
# in forward and reverse order,
rmons = mons
rmons.reverse()
for mons in mons, rmons:
log.info('stopping all monitors')
for m in mons:
manager.kill_mon(m)
log.info('forming a minimal quorum for %s, then adding monitors' % mons)
qnum = (len(mons) // 2) + 1
num = 0
for m in mons:
manager.revive_mon(m)
num += 1
if num >= qnum:
manager.wait_for_mon_quorum_size(num)
# on both leader and non-leader ranks...
for rank in [0, 1]:
# take one out
log.info('removing mon %s' % mons[rank])
manager.kill_mon(mons[rank])
manager.wait_for_mon_quorum_size(len(mons) - 1)
log.info('causing some monitor log activity')
m = 30
for n in range(1, m):
manager.raw_cluster_cmd('log', '%d of %d' % (n, m))
log.info('adding mon %s back in' % mons[rank])
manager.revive_mon(mons[rank])
manager.wait_for_mon_quorum_size(len(mons))