task/restart: Restart task for testing daemon kill

The ceph daemons support being killed at a specific code point
with a config option.  In some cases, we want to test a kill point
only once for a given daemon run (such as replay that only occurs
during daemon startup).  This task allows running a script or executable
and (when the script sends a command to the task) restarting it with
a temporary config that has the appropriate kill point set.  Once
the daemon asserts and gets restarted, the original config is used.

Adds a specific restart_with_args() method to the DaemonState in the
ceph task.

Right now this task follows the workunit task closely, but uses stdout/stdin
to specify when to restart a daemon.

Signed-off-by: Sam Lang <sam.lang@inktank.com>
This commit is contained in:
Sam Lang 2013-03-11 13:22:10 -05:00
parent 9e81ff58e5
commit 6fe1deadbf
3 changed files with 179 additions and 3 deletions

View File

@ -10,16 +10,17 @@ log = logging.getLogger(__name__)
class RemoteProcess(object):
__slots__ = [
'command', 'stdin', 'stdout', 'stderr', 'exitstatus',
'command', 'stdin', 'stdout', 'stderr', 'exitstatus', 'exited',
# for orchestra.remote.Remote to place a backreference
'remote',
]
def __init__(self, command, stdin, stdout, stderr, exitstatus):
def __init__(self, command, stdin, stdout, stderr, exitstatus, exited):
self.command = command
self.stdin = stdin
self.stdout = stdout
self.stderr = stderr
self.exitstatus = exitstatus
self.exited = exited
class Raw(object):
def __init__(self, value):
@ -65,6 +66,9 @@ def execute(client, args):
status = None
return status
def exitstatus_ready():
return out.channel.exit_status_ready()
r = RemoteProcess(
command=cmd,
stdin=in_,
@ -73,6 +77,7 @@ def execute(client, args):
# this is a callable that will block until the status is
# available
exitstatus=get_exitstatus,
exited=exitstatus_ready,
)
return r

View File

@ -48,12 +48,32 @@ class DaemonState(object):
self.proc = self.remote.run(*cmd_args, **cmd_kwargs)
self.log.info('Started')
def restart_with_args(self, extra_args):
    """
    Restart the daemon once with additional command-line arguments.

    Any process currently tracked in ``self.proc`` is stopped first.  The
    stored ``command_args`` / ``command_kwargs`` are left untouched: the
    extra arguments are appended to a private copy of the ``args`` list,
    so only this one launch sees them.

    :param extra_args: iterable of arguments to append to the daemon's
                       normal argument list for this launch only
    """
    from copy import deepcopy

    self.log.info('Restarting')
    if self.proc is not None:
        self.log.debug('stopping old one...')
        self.stop()

    positional = list(self.command_args)
    # Shallow-copy the kwargs dict, but deep-copy the args list it holds so
    # that extending it cannot leak into self.command_kwargs.
    keywords = self.command_kwargs.copy()
    one_shot_args = deepcopy(self.command_kwargs['args'])
    one_shot_args.extend(extra_args)
    keywords['args'] = one_shot_args

    self.proc = self.remote.run(*positional, **keywords)
    self.log.info('Started')
def running(self):
return self.proc is not None
return self.proc is not None and not self.proc.exited
def reset(self):
    """
    Forget the tracked process by clearing ``self.proc``.

    Nothing is stopped or waited for here; only the reference is dropped.
    """
    self.proc = None
def wait_for_exit(self):
    """
    Block until the tracked process exits, then forget it.

    Does nothing when no process is tracked.  Any exception raised by
    ``run.wait`` propagates to the caller, in which case ``self.proc`` is
    left as it was.
    """
    if not self.proc:
        return
    run.wait([self.proc])
    self.proc = None
class CephState(object):
def __init__(self):

151
teuthology/task/restart.py Normal file
View File

@ -0,0 +1,151 @@
import logging
import pipes
from teuthology import misc as teuthology
from teuthology.orchestra import run as tor
from ..orchestra import run
log = logging.getLogger(__name__)
def restart_daemon(ctx, config, role, id_, *args):
    """
    Wait for daemon <role>.<id_> to exit on its own, then start it again.

    The daemon is looked up through ``ctx.daemons``.  A CommandFailedError
    raised while waiting is logged and otherwise ignored.

    :param ctx: task context providing ``daemons.get_daemon``
    :param config: task configuration (not used here)
    :param role: daemon role
    :param id_: daemon id within that role
    :param args: flat sequence ``key1, value1, key2, value2, ...``; each pair
                 becomes a ``--key=value`` argument for this one restart.
                 A trailing key without a value is dropped by the pairing.
                 With no args the daemon is restarted normally.
    """
    log.info('Restarting {r}.{i} daemon...'.format(r=role, i=id_))
    daemon = ctx.daemons.get_daemon(role, id_)
    log.debug('Waiting for exit of {r}.{i} daemon...'.format(r=role, i=id_))
    try:
        daemon.wait_for_exit()
    except tor.CommandFailedError as e:
        log.debug('Command Failed: {e}'.format(e=e))

    if not args:
        log.debug('Doing restart of {r}.{i} daemon...'.format(r=role, i=id_))
        daemon.restart()
        return

    # Even positions are option names, odd positions are their values.
    names = args[0::2]
    values = args[1::2]
    confargs = []
    for name, value in zip(names, values):
        confargs.append('--{k}={v}'.format(k=name, v=value))
    log.debug('Doing restart of {r}.{i} daemon with args: {a}...'.format(r=role, i=id_, a=confargs))
    daemon.restart_with_args(confargs)
def get_tests(ctx, config, role, remote, testdir):
    """
    Fetch qa/workunits onto ``remote`` and list the executables it contains.

    The git ref is taken from the first of the config keys 'branch', 'sha1'
    and 'tag' that is set, falling back to 'HEAD'.  The tree is unpacked
    into ``<testdir>/restart.<role>``, ``make`` is run there if a Makefile
    exists, and the relative paths of all executable files are written to
    ``<testdir>/restarts.list``.

    :param ctx: task context (not used here)
    :param config: task configuration dict
    :param role: role name, used for the directory name and the child logger
    :param remote: remote on which the commands are run
    :param testdir: base test directory on the remote
    :returns: tuple ``(srcdir, restarts)`` where ``restarts`` is the sorted
              list of entries read back from the list file
    """
    srcdir = '{tdir}/restart.{role}'.format(tdir=testdir, role=role)
    list_path = '{tdir}/restarts.list'.format(tdir=testdir)

    refspec = 'HEAD'
    for key in ('branch', 'sha1', 'tag'):
        candidate = config.get(key)
        if candidate is not None:
            refspec = candidate
            break
    log.info('Pulling restart qa/workunits from ref %s', refspec)

    # The remote command is one shell pipeline, built here in stages.
    fetch = [
        'mkdir', '--', srcdir,
        run.Raw('&&'),
        'git',
        'archive',
        '--remote=git://ceph.newdream.net/git/ceph.git',
        '%s:qa/workunits' % refspec,
        run.Raw('|'),
        'tar',
        '-C', srcdir,
        '-x',
        '-f-',
    ]
    build = [
        'cd', '--', srcdir,
        run.Raw('&&'),
        'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
    ]
    # Each path is written NUL-terminated and relative to srcdir.
    enumerate_scripts = [
        'find', '-executable', '-type', 'f', '-printf', r'%P\0',
        run.Raw('>{tdir}/restarts.list'.format(tdir=testdir)),
    ]
    remote.run(
        logger=log.getChild(role),
        args=fetch + [run.Raw('&&')] + build + [run.Raw('&&')] + enumerate_scripts,
    )

    listing = teuthology.get_file(remote, list_path)
    # NOTE(review): presumably get_file returns the raw file contents, in
    # which case the final NUL leaves one empty string among the entries.
    restarts = sorted(listing.split('\0'))
    return (srcdir, restarts)
def task(ctx, config):
    """
    Execute commands and allow daemon restart with config options.

    Each process executed can output to stdout restart commands of the form:

        restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>

    This will restart the daemon <role>.<id> once, with every key/value pair
    passed as an extra ``--<conf_key>=<conf_value>`` command-line argument.
    The daemon's stored command line is not modified, so later restarts use
    the original arguments again.  A bare ``restart <role> <id>`` performs a
    normal restart.  After each restart the line ``restarted`` is written to
    the script's stdin.

    A script may print ``done`` to tell the task to stop listening; closing
    stdout has the same effect.  The task then waits for the script to exit.

    This task does not kill a running daemon, it assumes the daemon will
    abort on an assert specified in the config.

    tasks:
    - install:
    - ceph:
    - restart:
        exec:
          client.0:
            - test_backtraces.py

    For each role, the first entry under it is used as a substring filter
    over the executables found in qa/workunits.  An optional top-level
    ``env`` dict in the config is exported to every script.

    :param ctx: task context
    :param config: task configuration dict; must contain an ``exec`` key
    :raises AssertionError: on invalid config or an unknown script command
    :raises Exception: when a script exits with a non-zero status
    """
    assert isinstance(config, dict), "task kill got invalid config"

    testdir = teuthology.get_testdir(ctx)

    assert 'exec' in config, "config requires exec key with <role>: <command> entries"
    for role, commands in config['exec'].iteritems():
        log.info('restart for role {r}'.format(r=role))
        (remote,) = ctx.cluster.only(role).remotes.iterkeys()
        srcdir, restarts = get_tests(ctx, config, role, remote, testdir)
        log.info('Running command on role %s host %s', role, remote.name)
        spec = '{spec}'.format(spec=commands[0])
        log.info('Restarts list: %s', restarts)
        log.info('Spec is %s', spec)
        to_run = [w for w in restarts if w == commands or w.find(spec) != -1]
        log.info('To run: %s', to_run)
        for c in to_run:
            log.info('Running restart script %s...', c)
            args = [
                run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
                run.Raw('PYTHONPATH="$PYTHONPATH:{tdir}/binary/usr/local/lib/python2.7/dist-packages:{tdir}/binary/usr/local/lib/python2.6/dist-packages"'.format(tdir=testdir)),
            ]
            env = config.get('env')
            if env is not None:
                for var, val in env.iteritems():
                    quoted_val = pipes.quote(val)
                    env_arg = '{var}={val}'.format(var=var, val=quoted_val)
                    args.append(run.Raw(env_arg))
            args.extend([
                '{tdir}/enable-coredump'.format(tdir=testdir),
                'ceph-coverage',
                '{tdir}/archive/coverage'.format(tdir=testdir),
                '{srcdir}/{c}'.format(
                    srcdir=srcdir,
                    c=c,
                ),
            ])
            # stdin/stdout are pipes so the script and this task can
            # exchange restart requests and acknowledgements.
            proc = remote.run(
                args=args,
                stdout=tor.PIPE,
                stdin=tor.PIPE,
                stderr=log,
                wait=False,
            )
            log.info('waiting for a command from script')
            while True:
                line = proc.stdout.readline()
                if not line:
                    # EOF: the script closed its stdout
                    break
                log.debug('script command: {c}'.format(c=line))
                request = line.strip()
                # Compare the stripped line itself: the token list produced
                # by split() below can never be equal to a string.
                if request == 'done':
                    break
                cmd = request.split(' ')
                assert cmd[0] == 'restart', "script sent invalid command request to kill task"
                # cmd should be: restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>
                # or to clear, just: restart <role> <id>
                restart_daemon(ctx, config, cmd[1], cmd[2], *cmd[3:])
                proc.stdin.writelines(['restarted\n'])
                proc.stdin.flush()
            tor.wait([proc])
            # NOTE(review): assumes proc.exitstatus is a plain integer once
            # tor.wait() has returned -- confirm against orchestra.run.
            e = proc.exitstatus
            if e != 0:
                raise Exception('restart task got non-zero exit status {d} from script: {s}'.format(d=e, s=c))