ceph/teuthology/task/lockfile.py

import logging
import os

from orchestra import run
import time
import gevent


log = logging.getLogger(__name__)

def task(ctx, config):
    """
    This task is designed to test locking. It runs an executable
    for each lock attempt you specify, at 0.01 second intervals (to
    preserve ordering of the locks).
    You can also introduce longer intervals by setting an entry
    as a number of seconds, rather than the lock dictionary.
    The config is a list of dictionaries. For each entry in the list, you
    must name the "client" to run on, the "file" to lock, and
    the "holdtime" to hold the lock.
    Optional entries are the "offset" and "length" of the lock. You can also specify a
    "maxwait" timeout period which fails if the executable takes longer
    to complete, and an "expectfail".
    An example:
    tasks:
    - ceph:
    - cfuse: [client.0, client.1]
    - lockfile:
      [{client:client.0, file:testfile, holdtime:10},
      {client:client.1, file:testfile, holdtime:0, maxwait:0, expectfail:true},
      {client:client.1, file:testfile, holdtime:0, maxwait:15, expectfail:false},
      10,
      {client: client.1, lockfile: testfile, holdtime: 5},
      {client: client.2, lockfile: testfile, holdtime: 5, maxwait: 1, expectfail: True}]


    In the past this test would have failed; there was a bug where waitlocks weren't
    cleaned up if the process failed. More involved scenarios are also possible.
    """
    log.info('Starting lockfile')
    try:
        assert isinstance(config, list), \
            "task lockfile got invalid config"

        log.info("building executable on each host")
        buildprocs = list()
        # build the locker executable on each client
        clients = list()
        files = list()
        for op in config:
            if not isinstance(op, dict):
                continue
            log.info("got an op")
            log.info("op['client'] = %s", op['client'])
            clients.append(op['client'])
            files.append(op['lockfile'])
            if not "expectfail" in op:
                op["expectfail"] = False
            badconfig = False
            if not "client" in op:
                badconfig = True
            if not "lockfile" in op:
                badconfig = True
            if not "holdtime" in op:
                badconfig = True
            if badconfig:
                raise KeyError("bad config {op_}".format(op_=op))

        clients = set(clients)
        files = set(files)
        lock_procs = list()
        for client in clients:
            (client_remote,) = ctx.cluster.only(client).remotes.iterkeys()
            log.info("got a client remote")
            (_, _, client_id) = client.partition('.')
            filepath = os.path.join('/tmp/cephtest', 'mnt.{id}'.format(id=client_id), op["lockfile"])

            proc = client_remote.run(
                args=[
                    'mkdir', '-p', '/tmp/cephtest/archive/lockfile',
                    run.Raw('&&'),
                    'mkdir', '-p', '/tmp/cephtest/lockfile',
                    run.Raw('&&'),
                    'wget',
                    '-nv',
                    '--no-check-certificate',
                    'https://raw.github.com/gregsfortytwo/FileLocker/master/sclockandhold.cpp',
                    '-O', '/tmp/cephtest/lockfile/sclockandhold.cpp',
                    run.Raw('&&'),
                    'g++', '/tmp/cephtest/lockfile/sclockandhold.cpp',
                    '-o', '/tmp/cephtest/lockfile/sclockandhold'
                    ],
                logger=log.getChild('lockfile_client.{id}'.format(id=client_id)),
                wait=False
                )
            log.info('building sclockandhold on client{id}'.format(id=client_id))
            buildprocs.append(proc)

        # wait for builds to finish
        run.wait(buildprocs)
        log.info('finished building sclockandhold on all clients')

        # create the files to run these locks on
        client = clients.pop()
        clients.add(client)
        (client_remote,) = ctx.cluster.only(client).remotes.iterkeys()
        (_, _, client_id) = client.partition('.')
        file_procs = list()
        for lockfile in files:
            filepath = os.path.join('/tmp/cephtest', 'mnt.{id}'.format(id=client_id), lockfile)
            proc = client_remote.run(
                args=[
                    'sudo',
                    'touch',
                    filepath,
                    ],
                logger=log.getChild('lockfile_createfile'),
                wait=False
                )
            file_procs.append(proc)
        run.wait(file_procs)
        file_procs = list()
        for lockfile in files:
            filepath = os.path.join('/tmp/cephtest', 'mnt.{id}'.format(id=client_id), lockfile)
            proc = client_remote.run(
                args=[
                    'sudo', 'chown', 'ubuntu.ubuntu', filepath
                    ],
                logger=log.getChild('lockfile_createfile'),
                wait=False
                )
            file_procs.append(proc)
        run.wait(file_procs)
        log.debug('created files to lock')

        # now actually run the locktests
        for op in config:
            if not isinstance(op, dict):
                assert isinstance(op, int) or isinstance(op, float)
                log.info("sleeping for {sleep} seconds".format(sleep=op))
                time.sleep(op)
                continue
            greenlet = gevent.spawn(lock_one, op, ctx)
            lock_procs.append((greenlet, op))
            time.sleep(0.01) # to provide proper ordering
        #for op in config

        for (greenlet, op) in lock_procs:
            log.debug('checking lock for op {op_}'.format(op_=op))
            result = greenlet.get()
            if not result:
                raise Exception("Got wrong result for op {op_}".format(op_=op))
        # for (greenlet, op) in lock_procs

    finally:
        #cleanup!
        for (greenlet, op) in lock_procs:
            log.debug('closing proc for op {op_}'.format(op_=op))
            greenlet.kill(block=True)

        for client in clients:
            (client_remote,)  = ctx.cluster.only(client).remotes.iterkeys()
            (_, _, client_id) = client.partition('.')
            filepath = os.path.join('/tmp/cephtest', 'mnt.{id}'.format(id=client_id), op["lockfile"])
            proc = client_remote.run(
                args=[
                    'rm', '-rf', '/tmp/cephtest/lockfile',
                    run.Raw(';'),
                    'sudo', 'rm', '-rf', filepath
                    ],
                wait=True
                ) #proc
    #done!
# task

def lock_one(op, ctx):
    log.debug('spinning up locker with op={op_}'.format(op_=op))
    timeout = None
    proc = None
    result = None
    (client_remote,)  = ctx.cluster.only(op['client']).remotes.iterkeys()
    (_, _, client_id) = op['client'].partition('.')
    filepath = os.path.join('/tmp/cephtest', 'mnt.{id}'.format(id=client_id), op["lockfile"])

    if "maxwait" in op:
        timeout = gevent.Timeout(seconds=float(op["maxwait"]))
        timeout.start()
    try:
        proc = client_remote.run(
            args=[
                '/tmp/cephtest/enable-coredump',
                '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                '/tmp/cephtest/archive/coverage',
                '/tmp/cephtest/daemon-helper',
                'kill',
                '/tmp/cephtest/lockfile/sclockandhold',
                filepath,
                '{holdtime}'.format(holdtime=op["holdtime"]),
                '{offset}'.format(offset=op.get("offset", '0')),
                '{length}'.format(length=op.get("length", '1')),
                ],
            logger=log.getChild('lockfile_client.{id}'.format(id=client_id)),
            wait=False,
            stdin=run.PIPE,
            check_status=False
            )
        result = proc.exitstatus.get()
    except gevent.Timeout as tout:
        if tout is not timeout:
            raise
        if bool(op["expectfail"]):
            result = 1
        if result is 1:
            if bool(op["expectfail"]):
                log.info("failed as expected for op {op_}".format(op_=op))
            else:
                raise Exception("Unexpectedly failed to lock {op_} within given timeout!".format(op_=op))
    finally: #clean up proc
        if timeout is not None:
            timeout.cancel()
        if proc is not None:
            proc.stdin.close()

    ret = (result == 0 and not bool(op["expectfail"])) or (result == 1 and bool(op["expectfail"]))

    return ret  #we made it through