# Source: ceph/teuthology/task/dump_stuck.py

import logging
import re
import time
import ceph_manager
from teuthology import misc as teuthology
# Module-level logger; task() derives a 'ceph_manager' child logger from it.
log = logging.getLogger(__name__)
def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Check that the cluster reports the expected number of stuck PGs.

    Each count is verified twice: first against the list returned by
    ``manager.get_stuck_pgs()`` for that state, then (for counts greater
    than zero) against the ``N pgs stuck <state>`` text in the output of
    the ``health`` cluster command.

    :param manager: object providing ``get_stuck_pgs(state, timeout)`` and
                    ``raw_cluster_cmd(*args)``
    :param num_inactive: expected number of PGs stuck inactive
    :param num_unclean: expected number of PGs stuck unclean
    :param num_stale: expected number of PGs stuck stale
    :param timeout: passed through as the second argument of
                    ``get_stuck_pgs``
    :raises AssertionError: if a count differs from the expectation, or if
                            the health output has no line for a state that
                            is expected to be non-zero
    """
    # Order matters only for keeping the same sequence of cluster queries:
    # inactive, then unclean, then stale.
    expected = (
        ('inactive', num_inactive),
        ('unclean', num_unclean),
        ('stale', num_stale),
    )

    for state, count in expected:
        stuck = manager.get_stuck_pgs(state, timeout)
        assert len(stuck) == count, \
            'expected %d pgs stuck %s, got %d' % (count, state, len(stuck))

    # check health output as well
    health = manager.raw_cluster_cmd('health')
    log.debug('ceph health is: %s', health)
    for state, count in expected:
        if count <= 0:
            continue
        m = re.search(r'(\d+) pgs stuck %s' % state, health)
        # A missing line must surface as AssertionError (not AttributeError
        # on None) so that callers retrying on AssertionError keep polling
        # while the health summary catches up.
        assert m is not None, \
            'health output does not mention pgs stuck %s: %r' % (state, health)
        assert int(m.group(1)) == count, \
            'health reports %s pgs stuck %s, expected %d' % (
                m.group(1), state, count)
def _flush_pg_stats(manager, osd_ids):
    """Send ``tell osd.<id> flush_pg_stats`` to each of the given OSD ids."""
    for osd_id in osd_ids:
        manager.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'flush_pg_stats')


def _wait_for_all_stale(manager, num_pgs, max_wait=900, poll_interval=1):
    """
    Retry check_stuck() until all ``num_pgs`` PGs are reported stuck stale.

    Re-raises the last AssertionError once more than ``max_wait`` seconds
    have elapsed since the first attempt.
    """
    starttime = time.time()
    while True:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=0,
                num_stale=num_pgs,
            )
            return
        except AssertionError:
            # wait up to max_wait seconds (15 minutes by default) to
            # become stale
            if time.time() - starttime > max_wait:
                raise
        # Pause between attempts instead of re-querying back to back.
        time.sleep(poll_interval)


def task(ctx, config):
    """
    Test the dump_stuck command.

    The ceph configuration should include::

        mon_osd_report_timeout = 90
        mon_pg_stuck_threshold = 10

    The test walks the cluster through four states and calls check_stuck()
    after each one: clean, osd.0 marked out (every PG expected stuck
    unclean), clean again, and all OSDs killed (every PG expected stuck
    stale), before reviving the OSDs and expecting a clean cluster.

    :param ctx: teuthology run context; ``ctx.cluster`` must contain
                exactly 2 osds
    :param config: must be None
    :raises AssertionError: on invalid configuration or when a stuck-PG
                            expectation is not met
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    # keys() rather than iterkeys(): works on both Python 2 and Python 3.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # Baseline: once the cluster is clean, nothing should be stuck.
    _flush_pg_stats(manager, (0, 1))
    manager.wait_for_clean(timeout)
    check_stuck(manager, num_inactive=0, num_unclean=0, num_stale=0)
    num_pgs = manager.get_num_pgs()

    # Mark osd.0 out; afterwards every PG is expected to be stuck unclean.
    manager.mark_out_osd(0)
    # presumably long enough for mon_pg_stuck_threshold to elapse -- confirm
    time.sleep(timeout)
    _flush_pg_stats(manager, (1,))
    manager.wait_for_recovery(timeout)
    check_stuck(manager, num_inactive=0, num_unclean=num_pgs, num_stale=0)

    # Mark osd.0 back in; the cluster should return to nothing stuck.
    manager.mark_in_osd(0)
    _flush_pg_stats(manager, (0, 1))
    manager.wait_for_clean(timeout)
    check_stuck(manager, num_inactive=0, num_unclean=0, num_stale=0)

    # Kill every OSD; all PGs are then expected to become stuck stale.
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.kill_osd(id_)
        manager.mark_down_osd(id_)
    _wait_for_all_stale(manager, num_pgs)

    # Revive the OSDs and verify the cluster recovers to nothing stuck.
    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    # NOTE(review): indentation was lost in the reviewed copy; this sleep is
    # placed after the loop so it runs once for all OSDs -- confirm.
    time.sleep(timeout)
    _flush_pg_stats(manager, (0, 1))
    manager.wait_for_clean(timeout)
    check_stuck(manager, num_inactive=0, num_unclean=0, num_stale=0)