ceph/tasks/scrub_test.py

"""Scrub testing"""
from cStringIO import StringIO

import logging
import os
import time

import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

def task(ctx, config):
    """
    Test [deep] scrub

    Corrupts the copy of one object held by the first OSD in a PG's acting
    set -- first the object's data, then its omap -- and checks each time
    that a deep-scrub marks the PG inconsistent and that a subsequent repair
    clears the inconsistent state again.

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:

    :param ctx: teuthology context; ``ctx.cluster`` is used to locate the
                first monitor and the remote hosting the chosen OSD.
    :param config: task configuration dict, or None for an empty one.
    :raises AssertionError: if ``config`` is not a dict, if no object file
                            is found in the victim PG directory, or if the
                            PG is not in the expected state after a
                            deep-scrub or repair.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # wait until every OSD is reported up
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data; the exit status is only logged here, not checked
    p = manager.do_rados(
        mon,
        ['-p', 'rbd', 'bench', '--no-cleanup', '1', 'write', '-b', '4096'],
        )
    err = p.exitstatus
    log.info('err is %d' % err)

    # wait for some PG to have data that we can mess with
    victim = None
    osd = None
    while victim is None:
        for pg in manager.get_pg_stats():
            if pg['stat_sum']['num_bytes'] > 0:
                victim = pg['pgid']
                osd = pg['acting'][0]
                break
        else:
            # no PG reports any bytes yet; poll again shortly
            time.sleep(3)

    log.info('messing with PG %s on osd %d' % (victim, osd))

    (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
    data_path = os.path.join(
        '/var/lib/ceph/osd',
        'ceph-{id}'.format(id=osd),
        'current',
        '{pg}_head'.format(pg=victim)
        )

    # fuzz time: list the PG directory on the OSD host
    ls_fp = StringIO()
    try:
        osd_remote.run(
            args=['ls', data_path],
            stdout=ls_fp,
        )
        ls_out = ls_fp.getvalue()
    finally:
        ls_fp.close()

    # find an object file we can mess with
    osdfilename = None
    for line in ls_out.split('\n'):
        if 'object' in line:
            osdfilename = line
            break
    assert osdfilename is not None, \
        'no object file found in %s' % data_path

    # Get actual object name from osd stored filename: the name is the part
    # before the first '__', with each literal backslash-u sequence mapped
    # back to an underscore.
    objname = osdfilename.split('__')[0].replace('\\u', '_')
    log.info('fuzzing %s' % objname)

    # put a single \0 at the beginning of the file
    osd_remote.run(
        args=['sudo', 'dd',
              'if=/dev/zero',
              'of=%s' % os.path.join(data_path, osdfilename),
              'bs=1', 'count=1', 'conv=notrunc',
              ]
    )

    # scrub, verify inconsistent
    _run_pg_command_and_check(manager, victim, 'deep-scrub', True)
    # repair, verify no longer inconsistent
    _run_pg_command_and_check(manager, victim, 'repair', False)

    # Test deep-scrub with various omap modifications
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', objname, 'key', 'val'])
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', objname, 'hdr'])

    # Modify omap on specific osd
    log.info('fuzzing omap of %s' % objname)
    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
    manager.osd_admin_socket(
        osd, ['setomapval', 'rbd', objname, 'badkey', 'badval'])
    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])

    # scrub, verify inconsistent
    _run_pg_command_and_check(manager, victim, 'deep-scrub', True)
    # repair, verify no longer inconsistent
    _run_pg_command_and_check(manager, victim, 'repair', False)

    log.info('test successful!')


def _run_pg_command_and_check(manager, pgid, command, expect_inconsistent):
    """
    Issue ``pg <command> <pgid>`` and verify the PG's resulting state.

    :param manager: object providing ``raw_cluster_cmd`` and
                    ``get_single_pg_stats`` (the CephManager built in task).
    :param pgid: id of the placement group to operate on.
    :param command: pg sub-command to run, e.g. 'deep-scrub' or 'repair'.
    :param expect_inconsistent: whether '+inconsistent' must be present
                                (True) or absent (False) in the PG state
                                once it is no longer scrubbing.
    :raises AssertionError: if the final state does not match.
    """
    manager.raw_cluster_cmd('pg', command, pgid)
    # Give the operation a chance to start before polling; otherwise the
    # first poll could see the PG before it has entered a scrubbing state.
    time.sleep(60)

    # wait for the scrub to finish
    while True:
        state = manager.get_single_pg_stats(pgid)['state']
        if 'scrubbing' not in state:
            break
        time.sleep(3)

    inconsistent = '+inconsistent' in state
    assert inconsistent == expect_inconsistent, \
        'PG %s is in state %r after %s; expected inconsistent=%s' % (
            pgid, state, command, expect_inconsistent)
Added docstrings, and improved some of the comments on several tasks. 2013-10-12 08:28:27 +00:00			`"""Scrub testing"""`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00			`from cStringIO import StringIO`

			`import logging`
			`import os`
			`import time`

			`import ceph_manager`
			`from teuthology import misc as teuthology`

			`log = logging.getLogger(__name__)`

			`def task(ctx, config):`
			`"""`
			`Test [deep] scrub`
Fix scrub_test.py permission error Add description of yaml file including log-whitelist Add sudo to dd that corrupts data Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Warren Usui <warren.usui@inktank.com>:wq 2013-05-14 23:17:10 +00:00
			`tasks:`
			`- chef:`
			`- install:`
			`- ceph:`
			`log-whitelist:`
			`- '!= known digest'`
			`- '!= known omap_digest'`
			`- deep-scrub 0 missing, 1 inconsistent objects`
			`- deep-scrub 1 errors`
			`- repair 0 missing, 1 inconsistent objects`
			`- repair 1 errors, 1 fixed`
			`- scrub_test:`

task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00			`"""`
			`if config is None:`
			`config = {}`
			`assert isinstance(config, dict), \`
			`'scrub_test task only accepts a dict for configuration'`
			`first_mon = teuthology.get_first_mon(ctx, config)`
Revert "Lines formerly of the form '(remote,) = ctx.cluster.only(role).remotes.keys()'" This reverts commit d693b3f8950ffd1f2492a4db0f8234fee31f00f0. 2014-03-27 16:35:28 +00:00			`(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00
			`num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')`
			`log.info('num_osds is %s' % num_osds)`

			`manager = ceph_manager.CephManager(`
			`mon,`
			`ctx=ctx,`
			`logger=log.getChild('ceph_manager'),`
			`)`

			`while len(manager.get_osd_status()['up']) < num_osds:`
			`time.sleep(10)`

			`for i in range(num_osds):`
			`manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')`
			`manager.wait_for_clean()`

			`# write some data`
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1', 'write', '-b', '4096'])`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00			`err = p.exitstatus`
			`log.info('err is %d' % err)`

			`# wait for some PG to have data that we can mess with`
			`victim = None`
			`osd = None`
			`while victim is None:`
			`stats = manager.get_pg_stats()`
			`for pg in stats:`
			`size = pg['stat_sum']['num_bytes']`
			`if size > 0:`
			`victim = pg['pgid']`
			`osd = pg['acting'][0]`
			`break`

			`if victim is None:`
			`time.sleep(3)`

			`log.info('messing with PG %s on osd %d' % (victim, osd))`

Revert "Lines formerly of the form '(remote,) = ctx.cluster.only(role).remotes.keys()'" This reverts commit d693b3f8950ffd1f2492a4db0f8234fee31f00f0. 2014-03-27 16:35:28 +00:00			`(osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()`
ceph: use default data, keyring locations This required reordering the cluster setup so that we do the ceph-osd --mkfs --mkkey prior to gathering keys and initializing the monitors. Also, run daemons as root. Signed-off-by: Sage Weil <sage@inktank.com> 2013-02-17 06:32:16 +00:00			`data_path = os.path.join(`
			`'/var/lib/ceph/osd',`
			`'ceph-{id}'.format(id=osd),`
			`'current',`
			`'{pg}_head'.format(pg=victim)`
			`)`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00
			`# fuzz time`
			`ls_fp = StringIO()`
			`osd_remote.run(`
			`args=[ 'ls', data_path ],`
			`stdout=ls_fp,`
			`)`
			`ls_out = ls_fp.getvalue()`
			`ls_fp.close()`

			`# find an object file we can mess with`
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`osdfilename = None`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00			`for line in ls_out.split('\n'):`
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`if 'object' in line:`
			`osdfilename = line`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00			`break`
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`assert osdfilename is not None`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`# Get actual object name from osd stored filename`
			`tmp=osdfilename.split('__')`
			`objname=tmp[0]`
			`objname=objname.replace('\u', '_')`
			`log.info('fuzzing %s' % objname)`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00
			`# put a single \0 at the beginning of the file`
			`osd_remote.run(`
Fix scrub_test.py permission error Add description of yaml file including log-whitelist Add sudo to dd that corrupts data Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Warren Usui <warren.usui@inktank.com>:wq 2013-05-14 23:17:10 +00:00			`args=[ 'sudo', 'dd',`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00			`'if=/dev/zero',`
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`'of=%s' % os.path.join(data_path, osdfilename),`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00			`'bs=1', 'count=1', 'conv=notrunc'`
			`]`
			`)`

			`# scrub, verify inconsistent`
			`manager.raw_cluster_cmd('pg', 'deep-scrub', victim)`
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`# Give deep-scrub a chance to start`
			`time.sleep(60)`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00
			`while True:`
			`stats = manager.get_single_pg_stats(victim)`
			`state = stats['state']`

			`# wait for the scrub to finish`
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`if 'scrubbing' in state:`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00			`time.sleep(3)`
			`continue`

			`inconsistent = stats['state'].find('+inconsistent') != -1`
			`assert inconsistent`
			`break`


			`# repair, verify no longer inconsistent`
			`manager.raw_cluster_cmd('pg', 'repair', victim)`
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`# Give repair a chance to start`
			`time.sleep(60)`

			`while True:`
			`stats = manager.get_single_pg_stats(victim)`
			`state = stats['state']`

			`# wait for the scrub to finish`
			`if 'scrubbing' in state:`
			`time.sleep(3)`
			`continue`

			`inconsistent = stats['state'].find('+inconsistent') != -1`
			`assert not inconsistent`
			`break`

			`# Test deep-scrub with various omap modifications`
			`manager.do_rados(mon, ['-p', 'rbd', 'setomapval', objname, 'key', 'val'])`
			`manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', objname, 'hdr'])`

			`# Modify omap on specific osd`
			`log.info('fuzzing omap of %s' % objname)`
			`manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key']);`
			`manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname, 'badkey', 'badval']);`
			`manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr']);`

			`# scrub, verify inconsistent`
			`manager.raw_cluster_cmd('pg', 'deep-scrub', victim)`
			`# Give deep-scrub a chance to start`
			`time.sleep(60)`

			`while True:`
			`stats = manager.get_single_pg_stats(victim)`
			`state = stats['state']`

			`# wait for the scrub to finish`
			`if 'scrubbing' in state:`
			`time.sleep(3)`
			`continue`

			`inconsistent = stats['state'].find('+inconsistent') != -1`
			`assert inconsistent`
			`break`

			`# repair, verify no longer inconsistent`
			`manager.raw_cluster_cmd('pg', 'repair', victim)`
			`# Give repair a chance to start`
			`time.sleep(60)`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00
			`while True:`
			`stats = manager.get_single_pg_stats(victim)`
			`state = stats['state']`

			`# wait for the scrub to finish`
osd: Testing of deep-scrub omap changes Fix scrub_test.py and add omap corruption test Signed-off-by: David Zafman <david.zafman@inktank.com> Reviewed-by: Samuel Just <sam.just@inktank.com> 2013-01-19 01:11:09 +00:00			`if 'scrubbing' in state:`
task: verify scrub detects files whose contents changed Signed-off-by: Mike Ryan <mike.ryan@inktank.com> 2012-08-02 17:58:08 +00:00			`time.sleep(3)`
			`continue`

			`inconsistent = stats['state'].find('+inconsistent') != -1`
			`assert not inconsistent`
			`break`

			`log.info('test successful!')`