ceph/tasks/scrub_test.py

"""Scrub testing"""
from cStringIO import StringIO

import logging
import os
import time

import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

def task(ctx, config):
    """
    Test [deep] scrub

    tasks:
    - chef:
    - install:
    - ceph:
        log-whitelist:
        - '!= known digest'
        - '!= known omap_digest'
        - deep-scrub 0 missing, 1 inconsistent objects
        - deep-scrub 1 errors
        - repair 0 missing, 1 inconsistent objects
        - repair 1 errors, 1 fixed
    - scrub_test:

    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'scrub_test task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    for i in range(num_osds):
        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'flush_pg_stats')
    manager.wait_for_clean()

    # write some data
    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1', 'write', '-b', '4096'])
    err = p.exitstatus
    log.info('err is %d' % err)

    # wait for some PG to have data that we can mess with
    victim = None
    osd = None
    while victim is None:
        stats = manager.get_pg_stats()
        for pg in stats:
            size = pg['stat_sum']['num_bytes']
            if size > 0:
                victim = pg['pgid']
                osd = pg['acting'][0]
                break

        if victim is None:
            time.sleep(3)

    log.info('messing with PG %s on osd %d' % (victim, osd))

    (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
    data_path = os.path.join(
        '/var/lib/ceph/osd',
        'ceph-{id}'.format(id=osd),
        'current',
        '{pg}_head'.format(pg=victim)
        )

    # fuzz time
    ls_fp = StringIO()
    osd_remote.run(
        args=[ 'sudo', 'ls', data_path ],
        stdout=ls_fp,
    )
    ls_out = ls_fp.getvalue()
    ls_fp.close()

    # find an object file we can mess with
    osdfilename = None
    for line in ls_out.split('\n'):
        if 'object' in line:
            osdfilename = line
            break
    assert osdfilename is not None

    # Get actual object name from osd stored filename
    tmp=osdfilename.split('__')
    objname=tmp[0]
    objname=objname.replace('\u', '_')
    log.info('fuzzing %s' % objname)

    # put a single \0 at the beginning of the file
    osd_remote.run(
        args=[ 'sudo', 'dd',
               'if=/dev/zero',
               'of=%s' % os.path.join(data_path, osdfilename),
               'bs=1', 'count=1', 'conv=notrunc'
             ]
    )

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break


    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    # Test deep-scrub with various omap modifications
    manager.do_rados(mon, ['-p', 'rbd', 'setomapval', objname, 'key', 'val'])
    manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', objname, 'hdr'])

    # Modify omap on specific osd
    log.info('fuzzing omap of %s' % objname)
    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key']);
    manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname, 'badkey', 'badval']);
    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr']);

    # scrub, verify inconsistent
    manager.raw_cluster_cmd('pg', 'deep-scrub', victim)
    # Give deep-scrub a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert inconsistent
        break

    # repair, verify no longer inconsistent
    manager.raw_cluster_cmd('pg', 'repair', victim)
    # Give repair a chance to start
    time.sleep(60)

    while True:
        stats = manager.get_single_pg_stats(victim)
        state = stats['state']

        # wait for the scrub to finish
        if 'scrubbing' in state:
            time.sleep(3)
            continue

        inconsistent = stats['state'].find('+inconsistent') != -1
        assert not inconsistent
        break

    log.info('test successful!')