ceph/teuthology/task/internal.py

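"""
Internal tasks that wrap every teuthology run: machine locking and lock
checking, test directory setup and teardown, SSH connection setup, and
archiving of logs, coredumps, and syslog output.
"""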
from cStringIO import StringIO
import contextlib
import gevent
import logging
import os
import tarfile
import time
import yaml
from teuthology import lock
from teuthology import misc as teuthology
from teuthology import safepath
from ..orchestra import run

log = logging.getLogger(__name__)


@contextlib.contextmanager
def base(ctx, config):
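    """
    Create the /tmp/cephtest base directory on every remote, then remove
    it (it must already be empty) once the test has finished.
    """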
    log.info('Creating base directory...')
    run.wait(
        ctx.cluster.run(
            args=[
                'mkdir', '-m0755', '--',
                '/tmp/cephtest',
            ],
            wait=False,
        )
    )
    try:
        yield
    finally:
        log.info('Tidying up after the test...')
        # if this fails, one of the earlier cleanups is flawed; don't
        # just cram an rm -rf here
        run.wait(
            ctx.cluster.run(
                args=[
                    'rmdir',
                    '--',
                    '/tmp/cephtest',
                ],
                wait=False,
            ),
        )


@contextlib.contextmanager
def lock_machines(ctx, config):
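    """
    Lock the requested number of machines (config is an integer count)
    and record them in ctx.config['targets'].  Machines are only
    unlocked afterwards if the run succeeded.
    """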
    log.info('Locking machines...')
    assert isinstance(config, int), 'config must be an integer'
    while True:
        # make sure there are enough machines up
        machines = lock.list_locks(ctx)
        if machines is None:
            if ctx.block:
                log.warn('error listing machines, trying again')
                time.sleep(20)
                continue
            else:
                assert 0, 'error listing machines'
        num_up = len(filter(lambda machine: machine['up'], machines))
        assert num_up >= config, 'not enough machines are up'
        # make sure there are machines for non-automated jobs to run
        num_free = len(filter(
            lambda machine: machine['up'] and machine['locked'] == 0,
            machines
        ))
        if num_free < 6 and ctx.owner.startswith('scheduled'):
            if ctx.block:
                log.info('waiting for more machines to be free...')
                time.sleep(10)
                continue
            else:
                assert 0, 'not enough machines free'
        newly_locked = lock.lock_many(ctx, config, ctx.owner)
        if len(newly_locked) == config:
            ctx.config['targets'] = newly_locked
            log.info('\n '.join(
                ['Locked targets:', ] +
                yaml.safe_dump(
                    ctx.config['targets'],
                    default_flow_style=False,
                ).splitlines()
            ))
            break
        elif not ctx.block:
            assert 0, 'not enough machines are available'
        log.warn('Could not lock enough machines, waiting...')
        time.sleep(10)
    try:
        yield
    finally:
        if ctx.summary.get('success', False):
            log.info('Unlocking machines...')
            for machine in ctx.config['targets'].iterkeys():
                lock.unlock(ctx, machine, ctx.owner)
def save_config(ctx, config):
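    """
    Dump the job configuration to config.yaml in the archive directory.
    """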
    log.info('Saving configuration')
    if ctx.archive is not None:
        with file(os.path.join(ctx.archive, 'config.yaml'), 'w') as f:
            yaml.safe_dump(ctx.config, f, default_flow_style=False)
def check_lock(ctx, config):
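    """
    Verify that every target machine is up and locked by the job owner.
    """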
    if ctx.config.get('check-locks') == False:
        log.info('Lock checking disabled.')
        return
    log.info('Checking locks...')
    for machine in ctx.config['targets'].iterkeys():
        status = lock.get_status(ctx, machine)
        log.debug('machine status is %s', repr(status))
        assert status is not None, \
            'could not read lock status for {name}'.format(name=machine)
        assert status['up'], 'machine {name} is marked down'.format(name=machine)
        assert status['locked'], \
            'machine {name} is not locked'.format(name=machine)
        assert status['locked_by'] == ctx.owner, \
            'machine {name} is locked by {user}, not {owner}'.format(
                name=machine,
                user=status['locked_by'],
                owner=ctx.owner,
            )


@contextlib.contextmanager
def timer(ctx, config):
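    """
    Record the wall-clock duration of the nested tasks in ctx.summary.
    """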
    log.info('Starting timer...')
    start = time.time()
    try:
        yield
    finally:
        duration = time.time() - start
        log.info('Duration was %f seconds', duration)
        ctx.summary['duration'] = duration
def connect(ctx, config):
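    """
    Open an SSH connection to each target and build ctx.cluster, mapping
    each remote to its configured roles.
    """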
    log.info('Opening connections...')
    from ..orchestra import connection, remote
    from ..orchestra import cluster
    remotes = []
    for t, key in ctx.config['targets'].iteritems():
        log.debug('connecting to %s', t)
        remotes.append(
            remote.Remote(
                name=t,
                ssh=connection.connect(
                    user_at_host=t,
                    host_key=key,
                    keep_alive=True,
                ),
            )
        )
    ctx.cluster = cluster.Cluster()
    if 'roles' in ctx.config:
        for rem, roles in zip(remotes, ctx.config['roles']):
            assert all(isinstance(role, str) for role in roles), \
                "Roles in config must be strings: %r" % roles
            ctx.cluster.add(rem, roles)
            log.info('roles: %s - %s' % (rem, roles))
    else:
        for rem in remotes:
            ctx.cluster.add(rem, rem.name)
def check_conflict(ctx, config):
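    """
    Abort early if a stale /tmp/cephtest directory from a previous run
    is still present on any target.
    """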
    log.info('Checking for old test directory...')
    processes = ctx.cluster.run(
        args=[
            'test', '!', '-e', '/tmp/cephtest',
        ],
        wait=False,
    )
    failed = False
    for proc in processes:
        assert isinstance(proc.exitstatus, gevent.event.AsyncResult)
        try:
            proc.exitstatus.get()
        except run.CommandFailedError:
            log.error('Host %s has stale cephtest directory, check your lock and reboot to clean up.', proc.remote.shortname)
            failed = True
    if failed:
        raise RuntimeError('Stale jobs detected, aborting.')


@contextlib.contextmanager
def archive(ctx, config):
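    """
    Create /tmp/cephtest/archive on every remote; on teardown, pull its
    contents back into the local archive directory and then remove it.
    """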
    log.info('Creating archive directory...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '/tmp/cephtest/archive',
            ],
            wait=False,
        )
    )
    try:
        yield
    finally:
        if ctx.archive is not None:
            log.info('Transferring archived files...')
            logdir = os.path.join(ctx.archive, 'remote')
            os.mkdir(logdir)
            for remote in ctx.cluster.remotes.iterkeys():
                path = os.path.join(logdir, remote.shortname)
                os.mkdir(path)
                log.debug('Transferring archived files from %s to %s', remote.shortname, path)
                proc = remote.run(
                    args=[
                        'tar',
                        'c',
                        '-f', '-',
                        '-C', '/tmp/cephtest/archive',
                        '--',
                        '.',
                    ],
                    stdout=run.PIPE,
                    wait=False,
                )
                tar = tarfile.open(mode='r|', fileobj=proc.stdout)
                while True:
                    ti = tar.next()
                    if ti is None:
                        break
                    if ti.isdir():
                        # ignore silently; easier to just create leading dirs below
                        pass
                    elif ti.isfile():
                        sub = safepath.munge(ti.name)
                        safepath.makedirs(root=path, path=os.path.dirname(sub))
                        tar.makefile(ti, targetpath=os.path.join(path, sub))
                    else:
                        if ti.isdev():
                            type_ = 'device'
                        elif ti.issym():
                            type_ = 'symlink'
                        elif ti.islnk():
                            type_ = 'hard link'
                        else:
                            type_ = 'unknown'
                        log.info('Ignoring tar entry: %r type %r', ti.name, type_)
                        continue
                proc.exitstatus.get()
        log.info('Removing archive directory...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'rm',
                    '-rf',
                    '--',
                    '/tmp/cephtest/archive',
                ],
                wait=False,
            ),
        )


@contextlib.contextmanager
def coredump(ctx, config):
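    """
    Point kernel.core_pattern at the archive's coredump directory; on
    teardown restore the default pattern and flag the run as failed if
    any cores were collected.
    """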
    log.info('Enabling coredump saving...')
    run.wait(
        ctx.cluster.run(
            args=[
                'install', '-d', '-m0755', '--',
                '/tmp/cephtest/archive/coredump',
                run.Raw('&&'),
                'sudo', 'sysctl', '-w', 'kernel.core_pattern=/tmp/cephtest/archive/coredump/%t.%p.core',
            ],
            wait=False,
        )
    )
    try:
        yield
    finally:
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo', 'sysctl', '-w', 'kernel.core_pattern=core',
                    run.Raw('&&'),
                    # don't litter the archive dir if there were no cores dumped
                    'rmdir',
                    '--ignore-fail-on-non-empty',
                    '--',
                    '/tmp/cephtest/archive/coredump',
                ],
                wait=False,
            )
        )
        # set success=false if the dir is still there = coredumps were
        # seen
        for remote in ctx.cluster.remotes.iterkeys():
            r = remote.run(
                args=[
                    'if', 'test', '!', '-e', '/tmp/cephtest/archive/coredump', run.Raw(';'), 'then',
                    'echo', 'OK', run.Raw(';'),
                    'fi',
                ],
                stdout=StringIO(),
            )
            if r.stdout.getvalue() != 'OK\n':
                log.warning('Found coredumps on %s, flagging run as failed', remote)
                ctx.summary['success'] = False
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        'Found coredumps on {remote}'.format(remote=remote)


@contextlib.contextmanager
def syslog(ctx, config):
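    """
    Capture kernel and misc syslog output into the archive via a
    temporary rsyslog rule; on teardown scan the logs for errors and
    gzip them.
    """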
    if ctx.archive is None:
        # disable this whole feature if we're not going to archive the data anyway
        yield
        return
    log.info('Starting syslog monitoring...')
    run.wait(
        ctx.cluster.run(
            args=[
                'mkdir', '-m0755', '--',
                '/tmp/cephtest/archive/syslog',
            ],
            wait=False,
        )
    )
    CONF = '/etc/rsyslog.d/80-cephtest.conf'
    conf_fp = StringIO("""
kern.* -/tmp/cephtest/archive/syslog/kern.log;RSYSLOG_FileFormat
*.*;kern.none -/tmp/cephtest/archive/syslog/misc.log;RSYSLOG_FileFormat
""")
    try:
        for rem in ctx.cluster.remotes.iterkeys():
            teuthology.sudo_write_file(
                remote=rem,
                path=CONF,
                data=conf_fp,
            )
            conf_fp.seek(0)
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'initctl',
                    # a mere reload (SIGHUP) doesn't seem to make
                    # rsyslog open the files
                    'restart',
                    'rsyslog',
                ],
                wait=False,
            ),
        )
        yield
    finally:
        log.info('Shutting down syslog monitoring...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'sudo',
                    'rm',
                    '-f',
                    '--',
                    CONF,
                    run.Raw('&&'),
                    'sudo',
                    'initctl',
                    'restart',
                    'rsyslog',
                ],
                wait=False,
            ),
        )
        # race condition: nothing actually says rsyslog had time to
        # flush the file fully. oh well.
        log.info('Checking logs for errors...')
        for remote in ctx.cluster.remotes.iterkeys():
            log.debug('Checking %s', remote.name)
            r = remote.run(
                args=[
                    'egrep',
                    'BUG|INFO|DEADLOCK',
                    run.Raw('/tmp/cephtest/archive/syslog/*.log'),
                    run.Raw('|'),
                    'grep', '-v', 'task .* blocked for more than .* seconds',
                    run.Raw('|'),
                    'grep', '-v', 'lockdep is turned off',
                    run.Raw('|'),
                    'grep', '-v', 'trying to register non-static key',
                    run.Raw('|'),
                    'head', '-n', '1',
                ],
                stdout=StringIO(),
            )
            stdout = r.stdout.getvalue()
            if stdout != '':
                log.error('Error in syslog on %s: %s', remote.name, stdout)
                ctx.summary['success'] = False
                if 'failure_reason' not in ctx.summary:
                    ctx.summary['failure_reason'] = \
                        "'{error}' in syslog".format(error=stdout)
        log.info('Compressing syslogs...')
        run.wait(
            ctx.cluster.run(
                args=[
                    'find',
                    '/tmp/cephtest/archive/syslog',
                    '-name',
                    '*.log',
                    '-print0',
                    run.Raw('|'),
                    'xargs',
                    '-0',
                    '--no-run-if-empty',
                    '--',
                    'gzip',
                    '--',
                ],
                wait=False,
            ),
        )