mirror of
https://github.com/ceph/ceph
synced 2025-01-21 02:31:19 +00:00
ab1b78ae00
The helper gets a sequence number from the osd (or osds), and then polls the mon until that seq is reflected there. This is overkill in some cases, since many tests only require that the stats be reflected on the mgr (not the mon), but waiting for it to also reach the mon is sufficient! Signed-off-by: Sage Weil <sage@redhat.com>
159 lines
4.2 KiB
Python
159 lines
4.2 KiB
Python
"""
|
|
Dump_stuck command
|
|
"""
|
|
import logging
|
|
import re
|
|
import time
|
|
|
|
import ceph_manager
|
|
from teuthology import misc as teuthology
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Verify the number of stuck PGs reported for each stuck state.

    Asks the manager for the stuck PGs in the 'inactive', 'unclean' and
    'stale' states (in that order), logs the observed count next to the
    expected count for each, and then asserts that every observed count
    equals the expected one. The check passes if no assert fires.

    :param manager: Ceph manager; must provide get_stuck_pgs(state, timeout)
    :param num_inactive: expected number of stuck inactive PGs
    :param num_unclean: expected number of stuck unclean PGs
    :param num_stale: expected number of stuck stale PGs
    :param timeout: timeout value passed to each get_stuck_pgs call
    """
    # Fetch all three lists before looking at any of them, so that the
    # log line below is always emitted ahead of the first assertion.
    stuck_lists = [
        manager.get_stuck_pgs(state, timeout)
        for state in ('inactive', 'unclean', 'stale')
    ]
    found_inactive, found_unclean, found_stale = [
        len(pgs) for pgs in stuck_lists
    ]
    log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
             found_inactive, num_inactive,
             found_unclean, num_unclean,
             found_stale, num_stale)
    expectations = (
        (found_inactive, num_inactive),
        (found_unclean, num_unclean),
        (found_stale, num_stale),
    )
    for found, expected in expectations:
        assert found == expected
|
|
|
|
def _wait_for_stuck(manager, num_inactive, num_unclean, num_stale,
                    max_wait=900, interval=1):
    """
    Retry check_stuck until the expected stuck-PG counts are observed.

    :param manager: Ceph manager
    :param num_inactive: expected number of stuck inactive PGs
    :param num_unclean: expected number of stuck unclean PGs
    :param num_stale: expected number of stuck stale PGs
    :param max_wait: seconds to keep retrying; once exceeded, the
                     AssertionError raised by check_stuck is propagated
    :param interval: seconds to sleep between attempts
    """
    starttime = time.time()
    while True:
        try:
            check_stuck(
                manager,
                num_inactive=num_inactive,
                num_unclean=num_unclean,
                num_stale=num_stale,
                )
            return
        except AssertionError:
            # Counts do not match yet; give up only after max_wait.
            if time.time() - starttime > max_wait:
                raise
        # Pause between polls, like the revive loop in task() does,
        # instead of re-querying the cluster back to back.
        time.sleep(interval)


def task(ctx, config):
    """
    Test the dump_stuck command.

    Requires a cluster with exactly two osds and no task configuration.
    The test lowers the mon pg stuck threshold, then walks the cluster
    through a series of states (osd marked out, osd marked in, one osd
    down, both osds down, all osds revived) and after each step checks
    the number of stuck inactive/unclean/stale PGs via check_stuck.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    # keys() instead of the Python-2-only iterkeys(); the single-element
    # unpacking behaves the same on both Python 2 and Python 3.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
#                            '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    # Baseline: nothing is stuck on a clean cluster.
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

    # With osd 0 marked out, every PG is expected to be reported unclean.
    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.flush_pg_stats([1])
    manager.wait_for_recovery(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

    # Marking osd 0 back in should return the cluster to clean.
    manager.mark_in_osd(0)
    manager.flush_pg_stats([0, 1])
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    log.info('stopping first osd')
    manager.kill_osd(0)
    manager.mark_down_osd(0)

    log.info('waiting for all to be unclean')
    # wait up to 15 minutes (the helper's default max_wait)
    _wait_for_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

    log.info('stopping second osd')
    manager.kill_osd(1)
    manager.mark_down_osd(1)

    log.info('waiting for all to be stale')
    # wait up to 15 minutes (the helper's default max_wait)
    _wait_for_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=num_pgs,
        )

    log.info('reviving')
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    # flush_pg_stats raises until the revived osds respond; keep retrying.
    while True:
        try:
            manager.flush_pg_stats([0, 1])
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    # Final state: everything is clean again and nothing is stuck.
    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
|