ceph/tasks/dump_stuck.py

"""
Dump_stuck command
"""
import logging
import re
import time

import ceph_manager
from teuthology import misc as teuthology


log = logging.getLogger(__name__)

def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
    """
    Do checks.  Make sure get_stuck_pgs return the right amout of information, then
    extract health information from the raw_cluster_cmd and compare the results with
    values passed in.  This passes if all asserts pass.
 
    :param num_manager: Ceph manager
    :param num_inactive: number of inaactive pages that are stuck
    :param num_unclean: number of unclean pages that are stuck
    :paran num_stale: number of stale pages that are stuck
    :param timeout: timeout value for get_stuck_pgs calls
    """
    inactive = manager.get_stuck_pgs('inactive', timeout)
    assert len(inactive) == num_inactive
    unclean = manager.get_stuck_pgs('unclean', timeout)
    assert len(unclean) == num_unclean
    stale = manager.get_stuck_pgs('stale', timeout)
    assert len(stale) == num_stale

    # check health output as well
    health = manager.raw_cluster_cmd('health')
    log.debug('ceph health is: %s', health)
    if num_inactive > 0:
        m = re.search('(\d+) pgs stuck inactive', health)
        assert int(m.group(1)) == num_inactive
    if num_unclean > 0:
        m = re.search('(\d+) pgs stuck unclean', health)
        assert int(m.group(1)) == num_unclean
    if num_stale > 0:
        m = re.search('(\d+) pgs stuck stale', health)
        assert int(m.group(1)) == num_stale

def task(ctx, config):
    """
    Test the dump_stuck command.

    :param ctx: Context
    :param config: Configuration
    """
    assert config is None, \
        'dump_stuck requires no configuration'
    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
        'dump_stuck requires exactly 2 osds'

    timeout = 60
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
#                            '--mon-osd-report-timeout 90',
                            '--mon-pg-stuck-threshold 10')

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
    num_pgs = manager.get_num_pgs()

    manager.mark_out_osd(0)
    time.sleep(timeout)
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_recovery(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=num_pgs,
        num_stale=0,
        )

    manager.mark_in_osd(0)
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )

    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.kill_osd(id_)
        manager.mark_down_osd(id_)

    starttime = time.time()
    done = False
    while not done:
        try:
            check_stuck(
                manager,
                num_inactive=0,
                num_unclean=0,
                num_stale=num_pgs,
                )
            done = True
        except AssertionError:
            # wait up to 15 minutes to become stale
            if time.time() - starttime > 900:
                raise

    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
        manager.revive_osd(id_)
        manager.mark_in_osd(id_)
    while True:
        try:
            manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
            manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
            break
        except Exception:
            log.exception('osds must not be started yet, waiting...')
            time.sleep(1)
    manager.wait_for_clean(timeout)

    check_stuck(
        manager,
        num_inactive=0,
        num_unclean=0,
        num_stale=0,
        )
Added docstrings, and improved some of the comments on several tasks. 2013-10-12 08:28:27 +00:00			`"""`
			`Dump_stuck command`
			`"""`
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`import logging`
dump_stuck: verify that 'ceph health' mentions the right number of inactive/unclean/stale pgs 2012-02-28 21:55:46 +00:00			`import re`
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`import time`

			`import ceph_manager`
			`from teuthology import misc as teuthology`


			`log = logging.getLogger(__name__)`

			`def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):`
Added docstrings, and improved some of the comments on several tasks. 2013-10-12 08:28:27 +00:00			`"""`
			`Do checks. Make sure get_stuck_pgs return the right amout of information, then`
			`extract health information from the raw_cluster_cmd and compare the results with`
			`values passed in. This passes if all asserts pass.`

			`:param num_manager: Ceph manager`
			`:param num_inactive: number of inaactive pages that are stuck`
			`:param num_unclean: number of unclean pages that are stuck`
			`:paran num_stale: number of stale pages that are stuck`
			`:param timeout: timeout value for get_stuck_pgs calls`
			`"""`
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`inactive = manager.get_stuck_pgs('inactive', timeout)`
			`assert len(inactive) == num_inactive`
			`unclean = manager.get_stuck_pgs('unclean', timeout)`
			`assert len(unclean) == num_unclean`
			`stale = manager.get_stuck_pgs('stale', timeout)`
			`assert len(stale) == num_stale`

dump_stuck: verify that 'ceph health' mentions the right number of inactive/unclean/stale pgs 2012-02-28 21:55:46 +00:00			`# check health output as well`
			`health = manager.raw_cluster_cmd('health')`
			`log.debug('ceph health is: %s', health)`
			`if num_inactive > 0:`
			`m = re.search('(\d+) pgs stuck inactive', health)`
			`assert int(m.group(1)) == num_inactive`
			`if num_unclean > 0:`
			`m = re.search('(\d+) pgs stuck unclean', health)`
			`assert int(m.group(1)) == num_unclean`
			`if num_stale > 0:`
			`m = re.search('(\d+) pgs stuck stale', health)`
			`assert int(m.group(1)) == num_stale`

Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`def task(ctx, config):`
			`"""`
			`Test the dump_stuck command.`
dump_stuck: note required ceph configuration 2012-02-29 23:47:17 +00:00
Added docstrings, and improved some of the comments on several tasks. 2013-10-12 08:28:27 +00:00			`:param ctx: Context`
			`:param config: Configuration`
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`"""`
			`assert config is None, \`
			`'dump_stuck requires no configuration'`
			`assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \`
			`'dump_stuck requires exactly 2 osds'`

			`timeout = 60`
			`first_mon = teuthology.get_first_mon(ctx, config)`
Revert "Lines formerly of the form '(remote,) = ctx.cluster.only(role).remotes.keys()'" This reverts commit d693b3f8950ffd1f2492a4db0f8234fee31f00f0. 2014-03-27 16:35:28 +00:00			`(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()`
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00
			`manager = ceph_manager.CephManager(`
			`mon,`
			`ctx=ctx,`
			`logger=log.getChild('ceph_manager'),`
			`)`

dump_stuck: verify that 'ceph health' mentions the right number of inactive/unclean/stale pgs 2012-02-28 21:55:46 +00:00			`manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')`
			`manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')`
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`manager.wait_for_clean(timeout)`
dump_stuck: verify that 'ceph health' mentions the right number of inactive/unclean/stale pgs 2012-02-28 21:55:46 +00:00
dump_stuck: fix test The mon-osd-report-timeout setting shouldn't be there! We will set the other item explicitly, and remove both from the suite yaml. Fixes: #5440 2013-06-25 19:45:22 +00:00			`manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',`
ceph_manager, dump_stuck: fix injectargs args Signed-off-by: Sage Weil <sage@inktank.com> 2013-07-28 00:41:51 +00:00			`# '--mon-osd-report-timeout 90',`
			`'--mon-pg-stuck-threshold 10')`
dump_stuck: fix test The mon-osd-report-timeout setting shouldn't be there! We will set the other item explicitly, and remove both from the suite yaml. Fixes: #5440 2013-06-25 19:45:22 +00:00
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`check_stuck(`
			`manager,`
			`num_inactive=0,`
			`num_unclean=0,`
			`num_stale=0,`
			`)`
			`num_pgs = manager.get_num_pgs()`

			`manager.mark_out_osd(0)`
			`time.sleep(timeout)`
dump_stuck: flush stats before waiting for recovery/clean 2012-02-24 01:07:26 +00:00			`manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')`
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`manager.wait_for_recovery(timeout)`

			`check_stuck(`
			`manager,`
			`num_inactive=0,`
			`num_unclean=num_pgs,`
			`num_stale=0,`
			`)`

			`manager.mark_in_osd(0)`
dump_stuck: flush stats before waiting for recovery/clean 2012-02-24 01:07:26 +00:00			`manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')`
			`manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')`
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`manager.wait_for_clean(timeout)`

			`check_stuck(`
			`manager,`
			`num_inactive=0,`
			`num_unclean=0,`
			`num_stale=0,`
			`)`

			`for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):`
			`manager.kill_osd(id_)`
			`manager.mark_down_osd(id_)`

			`starttime = time.time()`
			`done = False`
			`while not done:`
			`try:`
			`check_stuck(`
			`manager,`
			`num_inactive=0,`
			`num_unclean=0,`
			`num_stale=num_pgs,`
			`)`
			`done = True`
			`except AssertionError:`
			`# wait up to 15 minutes to become stale`
			`if time.time() - starttime > 900:`
			`raise`

			`for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):`
			`manager.revive_osd(id_)`
			`manager.mark_in_osd(id_)`
dump_stuck: fix race with osd start Occasionally we don't wait long enough for the osd to start and mark itself up. Keep trying until flush succeeds. Fixes: #5431 Signed-off-by: Sage Weil <sage@inktank.com> 2013-06-23 23:21:45 +00:00			`while True:`
			`try:`
			`manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')`
			`manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')`
			`break`
Never use 'except:' without specifying an Exception. 2013-08-30 15:58:10 +00:00			`except Exception:`
			`log.exception('osds must not be started yet, waiting...')`
dump_stuck: fix race with osd start Occasionally we don't wait long enough for the osd to start and mark itself up. Keep trying until flush succeeds. Fixes: #5431 Signed-off-by: Sage Weil <sage@inktank.com> 2013-06-23 23:21:45 +00:00			`time.sleep(1)`
Add a task for testing stuck pg visibility. 2012-02-21 21:11:05 +00:00			`manager.wait_for_clean(timeout)`

			`check_stuck(`
			`manager,`
			`num_inactive=0,`
			`num_unclean=0,`
			`num_stale=0,`
			`)`