ceph/teuthology/task/osd_failsafe_enospc.py

"""
Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
"""
from cStringIO import StringIO
import logging
import time

import ceph_manager
from ..orchestra import run
from teuthology.task_util.rados import rados
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

# Cluster-log fragments that the checks below count in the 'ceph -w' output.
_NEARFULL_WARNING = '[WRN] OSD near full'
_FULL_ERROR = '[ERR] OSD full dropping all updates'


def _watch_cluster_log(mon, manager, duration, injectargs=None):
    """
    Capture the output of 'ceph -w' run on mon for duration seconds.

    :param mon: remote on which 'ceph -w' is started
    :param manager: CephManager used to send the optional injectargs
    :param duration: number of seconds to keep the watcher running
    :param injectargs: if not None, option string injected into osd.0
                       right after the watcher has been started
    :returns: the captured output as a list of lines
    """
    proc = mon.run(
        args=[
            'daemon-helper',
            'kill',
            'ceph', '-w'
        ],
        stdin=run.PIPE,
        stdout=StringIO(),
        wait=False,
    )
    try:
        if injectargs is not None:
            manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', injectargs)
        time.sleep(duration)
    finally:
        # Always stop the watcher, even when injectargs failed, so that no
        # 'ceph -w' is left running behind a failed test.
        proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w
        proc.exitstatus.get()
    return proc.stdout.getvalue().split('\n')


def _assert_message_count(lines, needle, kind, expected):
    """
    Assert that the number of lines containing needle is acceptable.

    :param lines: list of captured log lines
    :param needle: substring identifying the message
    :param kind: word used in the failure message ('warning' or 'error')
    :param expected: tuple of acceptable counts
    :raises AssertionError: if the count is not one of expected
    """
    # Generator based count works with both the list returned by Python 2's
    # filter() and the iterator returned by Python 3's.
    count = sum(1 for line in lines if needle in line)
    assert count in expected, \
        'Incorrect number of %s messages expected %s got %d' % (
            kind, ' or '.join(str(num) for num in expected), count)


def _expect_warnings(lines, *expected):
    """
    Assert on the number of 'OSD near full' warnings found in lines.
    """
    _assert_message_count(lines, _NEARFULL_WARNING, 'warning', expected)


def _expect_errors(lines, *expected):
    """
    Assert on the number of 'OSD full dropping all updates' errors in lines.
    """
    _assert_message_count(lines, _FULL_ERROR, 'error', expected)


def task(ctx, config):
    """
    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
    configuration settings

    In order for test to pass must use log-whitelist as follows

        tasks:
            - chef:
            - install:
            - ceph:
                log-whitelist: ['OSD near full', 'OSD full dropping all updates']
            - osd_failsafe_enospc:

    The test marks every OSD except osd.0 out, creates pool "foo" with
    size 1 and then walks osd.0 through the failsafe states by injecting
    ratios, counting the resulting messages in the 'ceph -w' output and
    checking whether writes to the pool fail or succeed.  The pool is
    removed and the OSDs are marked back in when the test ends, whether
    it passed or failed.

    :param ctx: teuthology context; ctx.manager is set to the CephManager
                created here
    :param config: task configuration, a dict or None
    :raises AssertionError: if config is not a dict or if one of the
                            checks fails
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'osd_failsafe_enospc task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    mon = teuthology.get_single_remote_value(ctx, first_mon)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
    ctx.manager = manager

    # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
    sleep_time = 50

    # something that is always there
    dummyfile = '/etc/fstab'
    dummyfile2 = '/etc/resolv.conf'

    osds = manager.get_osd_dump()

    # Remember what has actually been changed so that the cleanup only
    # undoes steps that were really carried out.
    marked_out = []
    pool_created = False

    try:
        # create 1 pg pool with 1 rep which can only be on osd.0
        for osd in osds:
            if osd['osd'] != 0:
                manager.mark_out_osd(osd['osd'])
                marked_out.append(osd['osd'])

        log.info('creating pool foo')
        manager.create_pool("foo")
        pool_created = True
        manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')

        # State NONE -> NEAR
        log.info('1. Verify warning messages when exceeding nearfull_ratio')

        lines = _watch_cluster_log(mon, manager, sleep_time,
                                   '--osd_failsafe_nearfull_ratio .00001')
        _expect_warnings(lines, 2)
        _expect_errors(lines, 0)

        # State NEAR -> FULL
        log.info('2. Verify error messages when exceeding full_ratio')

        lines = _watch_cluster_log(mon, manager, sleep_time,
                                   '--osd_failsafe_full_ratio .00001')
        _expect_errors(lines, 2)

        log.info('3. Verify write failure when exceeding full_ratio')

        # Write data should fail
        ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
        assert ret != 0, 'Expected write failure but it succeeded with exit status 0'

        # Put back default
        manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
        time.sleep(10)

        # State FULL -> NEAR
        log.info('4. Verify write success when NOT exceeding full_ratio')

        # Write should succeed
        ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
        assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret

        log.info('5. Verify warning messages again when exceeding nearfull_ratio')

        # nearfull_ratio is still at the tiny value injected in step 1, so
        # nothing new has to be injected here.
        lines = _watch_cluster_log(mon, manager, sleep_time)
        _expect_warnings(lines, 1, 2)
        _expect_errors(lines, 0)

        manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
        time.sleep(10)

        # State NONE -> FULL
        log.info('6. Verify error messages again when exceeding full_ratio')

        lines = _watch_cluster_log(mon, manager, sleep_time,
                                   '--osd_failsafe_full_ratio .00001')
        _expect_warnings(lines, 0)
        _expect_errors(lines, 2)

        # State FULL -> NONE
        log.info('7. Verify no messages settings back to default')

        manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
        time.sleep(10)

        lines = _watch_cluster_log(mon, manager, sleep_time)
        _expect_warnings(lines, 0)
        _expect_errors(lines, 0)

        log.info('Test Passed')
    except Exception:
        # Record the real failure before cleaning up: an error raised by the
        # cleanup below would otherwise replace it.
        log.exception('osd_failsafe_enospc failed, restoring pool and OSD state')
        raise
    finally:
        # Bring all OSDs back in
        if pool_created:
            manager.remove_pool("foo")
        for osd_id in marked_out:
            manager.mark_in_osd(osd_id)
Added docstrings, and improved some of the comments on several tasks. 2013-10-12 08:28:27 +00:00			`"""`
			`Handle osdfailsafe configuration settings (nearfull ratio and full ratio)`
			`"""`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`from cStringIO import StringIO`
			`import logging`
			`import time`
task_util: move rados command here Six copies are replaced with one, with an added option to check status automatically. This should probably be used in a few places where the return code is ignored. Signed-off-by: Josh Durgin <josh.durgin@inktank.com> 2013-07-22 21:21:51 +00:00
			`import ceph_manager`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`from ..orchestra import run`
task_util: move rados command here Six copies are replaced with one, with an added option to check status automatically. This should probably be used in a few places where the return code is ignored. Signed-off-by: Josh Durgin <josh.durgin@inktank.com> 2013-07-22 21:21:51 +00:00			`from teuthology.task_util.rados import rados`
			`from teuthology import misc as teuthology`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00
			`log = logging.getLogger(__name__)`

			`def task(ctx, config):`
			`"""`
			`Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio`
			`configuration settings`

			`In order for test to pass must use log-whitelist as follows`

			`tasks:`
			`- chef:`
			`- install:`
			`- ceph:`
			`log-whitelist: ['OSD near full', 'OSD full dropping all updates']`
			`- osd_failsafe_enospc:`

			`"""`
			`if config is None:`
			`config = {}`
			`assert isinstance(config, dict), \`
			`'osd_failsafe_enospc task only accepts a dict for configuration'`
			`first_mon = teuthology.get_first_mon(ctx, config)`
Lines formerly of the form '(remote,) = ctx.cluster.only(role).remotes.keys()' and '(remote,) = ctx.cluster.only(role).remotes.iterkeys()' would fail with ValueError and no message if there were less than 0 or more than 1 key. Now a new function, get_single_remote_value() is called which prints out more understandable messages. Fixes: 7510 Reviewed-by: Josh Durgin <josh.durgin@inktank.com> Signed-off-by: Warren Usui <warren.usui@inktank.com> 2014-03-01 03:13:40 +00:00			`mon = teuthology.get_single_remote_value(ctx, first_mon)`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00
			`manager = ceph_manager.CephManager(`
			`mon,`
			`ctx=ctx,`
			`logger=log.getChild('ceph_manager'),`
			`)`
			`ctx.manager = manager`

			`# Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding`
			`sleep_time = 50`

			`# something that is always there`
			`dummyfile = '/etc/fstab'`
			`dummyfile2 = '/etc/resolv.conf'`

			`# create 1 pg pool with 1 rep which can only be on osd.0`
			`osds = manager.get_osd_dump()`
			`for osd in osds:`
			`if osd['osd'] != 0:`
			`manager.mark_out_osd(osd['osd'])`

			`log.info('creating pool foo')`
			`manager.create_pool("foo")`
			`manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')`

			`# State NONE -> NEAR`
			`log.info('1. Verify warning messages when exceeding nearfull_ratio')`

			`proc = mon.run(`
			`args=[`
Helper scripts live in /usr/local/bin now! 2013-09-06 20:08:01 +00:00			`'daemon-helper',`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .00001')`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))`
			`assert count == 2, 'Incorrect number of warning messages expected 2 got %d' % count`
			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count`

			`# State NEAR -> FULL`
			`log.info('2. Verify error messages when exceeding full_ratio')`

			`proc = mon.run(`
			`args=[`
Helper scripts live in /usr/local/bin now! 2013-09-06 20:08:01 +00:00			`'daemon-helper',`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count`

			`log.info('3. Verify write failure when exceeding full_ratio')`

			`# Write data should fail`
task_util: move rados command here Six copies are replaced with one, with an added option to check status automatically. This should probably be used in a few places where the return code is ignored. Signed-off-by: Josh Durgin <josh.durgin@inktank.com> 2013-07-22 21:21:51 +00:00			`ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`assert ret != 0, 'Expected write failure but it succeeded with exit status 0'`

			`# Put back default`
			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')`
			`time.sleep(10)`

			`# State FULL -> NEAR`
			`log.info('4. Verify write success when NOT exceeding full_ratio')`

			`# Write should succeed`
task_util: move rados command here Six copies are replaced with one, with an added option to check status automatically. This should probably be used in a few places where the return code is ignored. Signed-off-by: Josh Durgin <josh.durgin@inktank.com> 2013-07-22 21:21:51 +00:00			`ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret`

			`log.info('5. Verify warning messages again when exceeding nearfull_ratio')`

			`proc = mon.run(`
			`args=[`
Helper scripts live in /usr/local/bin now! 2013-09-06 20:08:01 +00:00			`'daemon-helper',`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))`
			`assert count == 1 or count == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % count`
			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')`
			`time.sleep(10)`

			`# State NONE -> FULL`
			`log.info('6. Verify error messages again when exceeding full_ratio')`

			`proc = mon.run(`
			`args=[`
Helper scripts live in /usr/local/bin now! 2013-09-06 20:08:01 +00:00			`'daemon-helper',`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .00001')`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))`
			`assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count`
			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 2, 'Incorrect number of error messages expected 2 got %d' % count`

			`# State FULL -> NONE`
			`log.info('7. Verify no messages settings back to default')`

			`manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')`
			`time.sleep(10)`

			`proc = mon.run(`
			`args=[`
Helper scripts live in /usr/local/bin now! 2013-09-06 20:08:01 +00:00			`'daemon-helper',`
osd: data loss: low space handling Automated test cases for feature #4197 Signed-off-by: David Zafman <david.zafman@inktank.com> Reported-by: Sam Just <sam.just@inktank.com> 2013-03-15 04:53:44 +00:00			`'kill',`
			`'ceph', '-w'`
			`],`
			`stdin=run.PIPE,`
			`stdout=StringIO(),`
			`wait=False,`
			`)`

			`time.sleep(sleep_time)`
			`proc.stdin.close() # causes daemon-helper send SIGKILL to ceph -w`
			`proc.exitstatus.get()`

			`lines = proc.stdout.getvalue().split('\n')`

			`count = len(filter(lambda line: '[WRN] OSD near full' in line, lines))`
			`assert count == 0, 'Incorrect number of warning messages expected 0 got %d' % count`
			`count = len(filter(lambda line: '[ERR] OSD full dropping all updates' in line, lines))`
			`assert count == 0, 'Incorrect number of error messages expected 0 got %d' % count`

			`log.info('Test Passed')`

			`# Bring all OSDs back in`
			`manager.remove_pool("foo")`
			`for osd in osds:`
			`if osd['osd'] != 0:`
			`manager.mark_in_osd(osd['osd'])`