ceph/teuthology/task/osd_failsafe_enospc.py

from cStringIO import StringIO
import logging
import ceph_manager
from teuthology import misc as teuthology
import time
from ..orchestra import run

log = logging.getLogger(__name__)


def rados(testdir, remote, cmd, wait=True):
    """
    Run the 'rados' command line tool on a remote host.

    The command is wrapped with the enable-coredump helper and
    ceph-coverage, both addressed relative to testdir.

    :param testdir: test directory containing 'enable-coredump' and
                    'archive/coverage'
    :param remote: object whose run() method executes the command
    :param cmd: iterable of string arguments appended after 'rados'
    :param wait: passed through to remote.run()
    :returns: proc.exitstatus when wait is true, otherwise the object
              returned by remote.run()
    """
    joined = ' '.join(cmd)
    log.info("rados %s" % joined)

    wrapper = [
        '{0}/enable-coredump'.format(testdir),
        'ceph-coverage',
        '{0}/archive/coverage'.format(testdir),
        'rados',
    ]
    # check_status=False: the exit status is handed back to the caller
    # rather than being checked by remote.run().
    proc = remote.run(
        args=wrapper + list(cmd),
        check_status=False,
        wait=wait,
    )
    return proc.exitstatus if wait else proc

def _capture_cluster_log(mon, testdir, wait_secs, trigger=None):
    """
    Record the output of 'ceph -w' on mon for wait_secs seconds.

    The watcher is started before trigger is called so that messages
    produced as a consequence of trigger are captured.

    :param mon: remote on which 'ceph -w' is run
    :param testdir: test directory containing 'daemon-helper'
    :param wait_secs: seconds to sleep while the watcher collects output
    :param trigger: optional no-argument callable invoked once the
                    watcher has been started
    :returns: list of the lines written to the watcher's stdout
    """
    proc = mon.run(
        args=[
            '{tdir}/daemon-helper'.format(tdir=testdir),
            'kill',
            'ceph', '-w'
        ],
        stdin=run.PIPE,
        stdout=StringIO(),
        wait=False,
    )
    try:
        if trigger is not None:
            trigger()
        time.sleep(wait_secs)
    finally:
        # Closing stdin causes daemon-helper to send SIGKILL to ceph -w.
        # Done in a finally so that a failing trigger does not leave the
        # background watcher running.
        proc.stdin.close()
    proc.exitstatus.get()
    return proc.stdout.getvalue().split('\n')


def _count_matches(lines, text):
    """Return how many entries of lines contain the substring text."""
    return sum(1 for line in lines if text in line)


def task(ctx, config):
    """
    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
    configuration settings

    In order for test to pass must use log-whitelist as follows

        tasks:
            - chef:
            - install:
            - ceph:
                log-whitelist: ['OSD near full', 'OSD full dropping all updates']
            - osd_failsafe_enospc:

    The test marks every OSD except osd.0 out, creates pool "foo" with
    size 1, and then drives osd.0 through the failsafe states by injecting
    tiny ratios, checking the cluster log ('ceph -w') and the result of
    rados writes at each step.  On success the pool is removed and the
    OSDs are marked back in.

    :param ctx: teuthology run context; ctx.manager is set to the
                CephManager created here
    :param config: task configuration, a dict or None
    :raises AssertionError: if config is not a dict or any check fails
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'osd_failsafe_enospc task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()

    testdir = teuthology.get_testdir(ctx)
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
    ctx.manager = manager

    # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
    sleep_time = 50

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'

    # Cluster log fragments that are counted in each step
    near_msg = '[WRN] OSD near full'
    full_msg = '[ERR] OSD full dropping all updates'

    def inject_osd0(option):
        # Change a config option on the running osd.0
        manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', option)

    # create 1 pg pool with 1 rep which can only be on osd.0
    osds = manager.get_osd_dump()
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_out_osd(osd['osd'])

    log.info('creating pool foo')
    manager.create_pool("foo")
    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')

    # State NONE -> NEAR
    log.info('1. Verify warning messages when exceeding nearfull_ratio')

    lines = _capture_cluster_log(
        mon, testdir, sleep_time,
        trigger=lambda: inject_osd0('--osd_failsafe_nearfull_ratio .00001'))

    count = _count_matches(lines, near_msg)
    assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count
    count = _count_matches(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    # State NEAR -> FULL
    log.info('2. Verify error messages when exceeding full_ratio')

    lines = _capture_cluster_log(
        mon, testdir, sleep_time,
        trigger=lambda: inject_osd0('--osd_failsafe_full_ratio .00001'))

    count = _count_matches(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    log.info('3. Verify write failure when exceeding full_ratio')

    # Write data should fail
    ret = rados(testdir, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
    assert ret != 0, 'Expected write failure but it succeeded with exit status 0'

    # Put back default
    inject_osd0('--osd_failsafe_full_ratio .97')
    time.sleep(10)

    # State FULL -> NEAR
    log.info('4. Verify write success when NOT exceeding full_ratio')

    # Write should succeed
    ret = rados(testdir, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
    assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret

    log.info('5. Verify warning messages again when exceeding nearfull_ratio')

    # Nothing is injected here: nearfull_ratio is still the tiny value
    # set in step 1, so only the log is observed.
    lines = _capture_cluster_log(mon, testdir, sleep_time)

    count = _count_matches(lines, near_msg)
    assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count
    count = _count_matches(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    inject_osd0('--osd_failsafe_nearfull_ratio .90')
    time.sleep(10)

    # State NONE -> FULL
    log.info('6. Verify error messages again when exceeding full_ratio')

    lines = _capture_cluster_log(
        mon, testdir, sleep_time,
        trigger=lambda: inject_osd0('--osd_failsafe_full_ratio .00001'))

    count = _count_matches(lines, near_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_matches(lines, full_msg)
    assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count

    # State FULL -> NONE
    log.info('7. Verify no messages settings back to default')

    inject_osd0('--osd_failsafe_full_ratio .97')
    time.sleep(10)

    lines = _capture_cluster_log(mon, testdir, sleep_time)

    count = _count_matches(lines, near_msg)
    assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count
    count = _count_matches(lines, full_msg)
    assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count

    log.info('Test Passed')

    # Bring all OSDs back in
    manager.remove_pool("foo")
    for osd in osds:
        if osd['osd'] != 0:
            manager.mark_in_osd(osd['osd'])
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`from cStringIO import StringIO`
			`import logging`
			`import ceph_manager`
			`from teuthology import misc as teuthology`
			`import time`
			`from ..orchestra import run`

			`log = logging.getLogger(__name__)`


			`def rados(testdir, remote, cmd, wait=True):`
			`log.info("rados %s" % ' '.join(cmd))`
			`pre = [`
			`'{tdir}/enable-coredump'.format(tdir=testdir),`
			`'ceph-coverage',`
			`'{tdir}/archive/coverage'.format(tdir=testdir),`
			`'rados',`
			`];`
			`pre.extend(cmd)`
			`proc = remote.run(`
			`args=pre,`
			`check_status=False,`
			`wait=wait`
			`)`
			`if wait:`
			`return proc.exitstatus`
			`else:`
			`return proc`

			`def task(ctx, config):`
			`"""`
			`Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio`
			`configuration settings`

			`In order for test to pass must use log-whitelist as follows`

			`tasks:`
			`- chef:`
			`- install:`
			`- ceph:`
			`log-whitelist: ['OSD near full', 'OSD full dropping all updates']`
			`- osd_failsafe_enospc:`

			`"""`
			`if config is None:`
			`config = {}`
			`assert isinstance(config, dict), \`
			`'osd_failsafe_enospc task only accepts a dict for configuration'`
			`first_mon = teuthology.get_first_mon(ctx, config)`
			`(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()`

			`testdir = teuthology.get_testdir(ctx)`
			`manager = ceph_manager.CephManager(`
			`mon,`
			`ctx=ctx,`
			`logger=log.getChild('ceph_manager'),`
			`)`
			`ctx.manager = manager`

			`# Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding`
			`sleep_time = 50`

			`# something that is always there`
			`dummyfile = '/etc/fstab'`
			`dummyfile2 = '/etc/resolv.conf'`

			`# create 1 pg pool with 1 rep which can only be on osd.0`
			`osds = manager.get_osd_dump()`
			`for osd in osds:`
			`if osd['osd'] != 0:`
			`manager.mark_out_osd(osd['osd'])`

			`log.info('creating pool foo')`
			`manager.create_pool("foo")`
			`manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')`

			`# State NONE -> NEAR`
			`log.info('1. Verify warning messages when exceeding nearfull_ratio')`

			`proc = mon.run(`
			`args=[`
			`'{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)),`
			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))`
			`assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count`
			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count`

			`# State NEAR -> FULL`
			`log.info('2. Verify error messages when exceeding full_ratio')`

			`proc = mon.run(`
			`args=[`
			`'{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)),`
			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count`

			`log.info('3. Verify write failure when exceeding full_ratio')`

			`# Write data should fail`
			`ret = rados(testdir, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])`
			`assert ret != 0, 'Expected write failure but it succeeded with exit status 0'`

			`# Put back default`
			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')`
			`time.sleep(10)`

			`# State FULL -> NEAR`
			`log.info('4. Verify write success when NOT exceeding full_ratio')`

			`# Write should succeed`
			`ret = rados(testdir, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])`
			`assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret`

			`log.info('5. Verify warning messages again when exceeding nearfull_ratio')`

			`proc = mon.run(`
			`args=[`
			`'{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)),`
			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))`
			`assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count`
			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')`
			`time.sleep(10)`

			`# State NONE -> FULL`
			`log.info('6. Verify error messages again when exceeding full_ratio')`

			`proc = mon.run(`
			`args=[`
			`'{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)),`
			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))`
			`assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count`
			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count`

			`# State FULL -> NONE`
			`log.info('7. Verify no messages settings back to default')`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')`
			`time.sleep(10)`

			`proc = mon.run(`
			`args=[`
			`'{tdir}/daemon-helper'.format(tdir=teuthology.get_testdir(ctx)),`
			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))`
			`assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count`
			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count`

			`log.info('Test Passed')`

			`# Bring all OSDs back in`
			`manager.remove_pool("foo")`
			`for osd in osds:`
			`if osd['osd'] != 0:`
			`manager.mark_in_osd(osd['osd'])`