Merge branch 'wip-nuke'

Conflicts:
	teuthology/task/kernel.py
This commit is contained in:
Greg Farnum 2011-08-10 16:16:11 -07:00
commit 0139323e51
5 changed files with 252 additions and 103 deletions

View File

@ -25,7 +25,7 @@ setup(
entry_points={ entry_points={
'console_scripts': [ 'console_scripts': [
'teuthology = teuthology.run:main', 'teuthology = teuthology.run:main',
'teuthology-nuke = teuthology.run:nuke', 'teuthology-nuke = teuthology.nuke:main',
'teuthology-suite = teuthology.suite:main', 'teuthology-suite = teuthology.suite:main',
'teuthology-ls = teuthology.suite:ls', 'teuthology-ls = teuthology.suite:ls',
'teuthology-worker = teuthology.queue:worker', 'teuthology-worker = teuthology.queue:worker',

View File

@ -279,6 +279,55 @@ def wait_until_fuse_mounted(remote, fuse, mountpoint):
time.sleep(5) time.sleep(5)
log.info('cfuse is mounted on %s', mountpoint) log.info('cfuse is mounted on %s', mountpoint)
def reconnect(ctx, timeout):
    """
    Connect to all the machines in ctx.cluster.

    Presumably, some of them won't be up. Handle this
    by waiting for them, unless the wait time exceeds
    the specified timeout.

    ctx needs to contain the cluster of machines you
    wish it to try and connect to, as well as a config
    holding the ssh keys for each of them. As long as it
    contains this data, you can construct a context
    that is a subset of your full cluster.

    :param ctx: object exposing ``cluster.remotes`` (keyed by remote) and
                ``config['targets']`` (host key per remote name).
    :param timeout: maximum number of seconds, measured from the start of
                    this call, to keep retrying unreachable hosts.
    :raises socket.error: on an unrecognised socket error, or when a host
                          is still unreachable (refused, unreachable, down
                          or timing out) once ``timeout`` has elapsed.
    """
    import errno
    # Imported once up front instead of on every retry of every host.
    from orchestra import connection

    log.info('Re-opening connections...')
    starttime = time.time()
    # Take a private list: hosts are removed from it as they come back.
    need_reconnect = list(ctx.cluster.remotes.keys())
    while True:
        for remote in list(need_reconnect):
            try:
                remote.ssh = connection.connect(
                    user_at_host=remote.name,
                    host_key=ctx.config['targets'][remote.name],
                    )
            except socket.timeout:
                # A connect timeout is retried like the other "host is not
                # up yet" errors, but it must honour the deadline too;
                # otherwise a host that never answers loops forever.
                if time.time() - starttime > timeout:
                    log.exception('timed out waiting for %s', remote.name)
                    raise
            except socket.error as e:
                if hasattr(e, '__getitem__'):
                    if e[0] not in [errno.ECONNREFUSED, errno.ETIMEDOUT,
                                    errno.EHOSTUNREACH, errno.EHOSTDOWN]:
                        log.exception('unknown socket error: %s', repr(e))
                        raise
                    elif time.time() - starttime > timeout:
                        log.exception('timed out waiting for %s', remote.name)
                        raise
                else:
                    log.exception('weird socket error without error code')
                    raise
            else:
                need_reconnect.remove(remote)

        if not need_reconnect:
            break
        log.debug('waited {elapsed}'.format(elapsed=str(time.time() - starttime)))
        time.sleep(1)
def write_secret_file(remote, role, filename): def write_secret_file(remote, role, filename):
remote.run( remote.run(
args=[ args=[

201
teuthology/nuke.py Normal file
View File

@ -0,0 +1,201 @@
import argparse
import yaml
def parse_args():
    """
    Parse the command line of the machine-reset tool.

    Accepts ``-v/--verbose``, one or more ``CONFFILE`` positionals (each
    converted by ``teuthology.run.config_file`` and combined by the
    ``teuthology.run.MergeConfig`` action), ``--archive DIR`` and
    ``--owner``.

    :returns: the ``argparse`` namespace with ``verbose``, ``config``,
              ``archive`` and ``owner`` attributes.
    """
    from teuthology.run import config_file
    from teuthology.run import MergeConfig

    cli = argparse.ArgumentParser(description='Reset test machines')

    cli.add_argument(
        '-v', '--verbose',
        help='be more verbose',
        action='store_true',
        default=None,
    )
    cli.add_argument(
        'config',
        help='config file to read',
        metavar='CONFFILE',
        nargs='+',
        type=config_file,
        action=MergeConfig,
        default={},
    )
    cli.add_argument('--archive', metavar='DIR', help='path to archive results in')
    cli.add_argument('--owner', help='job owner')

    return cli.parse_args()
def shutdown_daemons(ctx, log):
    """
    Unmount cfuse, kill the test daemons and drop the test rsyslog config
    on every remote in ``ctx.cluster``.

    One shell command line is started per remote without waiting; once all
    are launched, each one's exit status is collected in turn.

    :param ctx: context whose ``cluster.remotes`` keys are the remotes.
    :param log: logger used for progress messages.
    """
    from orchestra import run

    def cleanup_command():
        # Step 1: if /etc/mtab mentions cfuse, fusermount -u its mountpoints.
        unmount_cfuse = [
            'if', 'grep', '-q', 'cfuse', '/etc/mtab', run.Raw(';'),
            'then',
            'grep', 'cfuse', '/etc/mtab', run.Raw('|'),
            'grep', '-o', " /.* fuse", run.Raw('|'),
            'grep', '-o', "/.* ", run.Raw('|'),
            'xargs', 'sudo', 'fusermount', '-u', run.Raw(';'),
            'fi',
            run.Raw(';'),
        ]
        # Step 2: kill the daemons installed under /tmp/cephtest.
        kill_daemons = [
            'killall',
            '--quiet',
            '/tmp/cephtest/binary/usr/local/bin/cmon',
            '/tmp/cephtest/binary/usr/local/bin/cosd',
            '/tmp/cephtest/binary/usr/local/bin/cmds',
            '/tmp/cephtest/binary/usr/local/bin/cfuse',
            run.Raw(';'),
        ]
        # Step 3: remove the test rsyslog snippet, restarting rsyslog only
        # when the removal succeeded.
        reset_rsyslog = [
            'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf',
            run.Raw(';'),
            'then',
            'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf',
            run.Raw('&&'),
            'sudo', 'initctl', 'restart', 'rsyslog',
            run.Raw(';'),
            'fi',
            run.Raw(';'),
        ]
        return unmount_cfuse + kill_daemons + reset_rsyslog

    running = {}
    for remote in ctx.cluster.remotes.iterkeys():
        running[remote.name] = remote.run(args=cleanup_command(), wait=False)

    for host, proc in running.iteritems():
        log.info('Waiting for %s to finish shutdowns...', host)
        proc.exitstatus.get()
def find_kernel_mounts(ctx, log):
    """
    Return the remotes in ``ctx.cluster`` whose /etc/mtab contains " ceph ".

    A ``grep -q`` is started on every remote without waiting. A remote is
    reported when its grep exits successfully; a ``CommandFailedError``
    from the grep is taken to mean "no mounts".

    :param ctx: context whose ``cluster.remotes`` keys are the remotes.
    :param log: logger used for progress messages.
    :returns: list of remotes on which the grep matched.
    """
    from orchestra import run

    log.info('Looking for kernel mounts to handle...')
    probes = {}
    for remote in ctx.cluster.remotes.iterkeys():
        probes[remote] = remote.run(
            args=['grep', '-q', " ceph ", '/etc/mtab'],
            wait=False,
        )

    mounted = []
    for remote, probe in probes.iteritems():
        try:
            probe.exitstatus.get()
        except run.CommandFailedError:
            # grep found nothing: no mounts on this remote.
            log.debug('no kernel mount on %s', remote.name)
            continue
        log.debug('kernel mount exists on %s', remote.name)
        mounted.append(remote)
    return mounted
def remove_kernel_mounts(ctx, kernel_mounts, log):
    """
    properly we should be able to just do a forced unmount,
    but that doesn't seem to be working, so you should reboot instead

    Starts a forced ``umount`` pipeline on every remote in
    ``kernel_mounts`` without waiting, then collects each exit status.

    :param ctx: unused here; kept for a signature parallel to the other
                cleanup steps.
    :param kernel_mounts: iterable of remotes to unmount on.
    :param log: logger used for progress messages.
    """
    from orchestra import run

    nodes = {}
    for remote in kernel_mounts:
        log.info('clearing kernel mount from %s', remote.name)
        # NOTE(review): "on /.* type" looks like the output format of the
        # `mount` command rather than the columns of /etc/mtab -- confirm
        # this pattern can actually match before relying on this function.
        proc = remote.run(
            args=[
                'grep', 'ceph', '/etc/mtab', run.Raw('|'),
                'grep', '-o', "on /.* type", run.Raw('|'),
                'grep', '-o', "/.* ", run.Raw('|'),
                # The pipeline is not wrapped in an `if`, so it must not be
                # followed by a closing `fi` (that is a shell syntax error).
                'xargs', 'sudo', 'umount', '-f',
            ],
            wait=False
        )
        nodes[remote] = proc

    # Iterate the started processes; looping over the dict itself would
    # yield only the remote keys and fail to unpack.
    for proc in nodes.values():
        proc.exitstatus.get()
def reboot_kernel_mounts(ctx, kernel_mounts, log):
    """
    Force-reboot every remote in ``kernel_mounts`` and reconnect afterwards.

    When at least one remote was rebooted, pause briefly and then call
    ``teuthology.misc.reconnect`` on ``ctx`` with a 300 second limit.

    :param ctx: context handed to ``reconnect``.
    :param kernel_mounts: remotes to reboot; nothing further happens when
                          it is empty.
    :param log: logger used for progress messages.
    """
    from orchestra import run
    import time

    rebooting = {}
    for node in kernel_mounts:
        log.info('rebooting %s', node.name)
        # note use of -n to force a no-sync reboot
        rebooting[node] = node.run(
            args=['sudo', 'reboot', '-f', '-n'],
            wait=False
        )
    # The started procs are deliberately never waited on: reboot -f
    # doesn't actually send anything back to the ssh client!

    from teuthology.misc import reconnect
    if not kernel_mounts:
        return

    log.info('waiting for nodes to reboot')
    # if we try and reconnect too quickly, it succeeds!
    time.sleep(5)
    # allow 5 minutes for the reboots
    reconnect(ctx, 300)
def remove_testing_tree(ctx, log):
    """
    Delete /tmp/cephtest on every remote in ``ctx.cluster``.

    The ``sudo rm -rf`` is started on all remotes without waiting, then
    each exit status is collected in turn.

    :param ctx: context whose ``cluster.remotes`` keys are the remotes.
    :param log: logger used for progress messages.
    """
    from orchestra import run

    removals = {}
    for remote in ctx.cluster.remotes.iterkeys():
        removals[remote.name] = remote.run(
            args=['sudo', 'rm', '-rf', '/tmp/cephtest'],
            wait=False,
        )

    for host, proc in removals.iteritems():
        log.info('Waiting for %s to clear filesystem...', host)
        proc.exitstatus.get()
def main():
    """
    Entry point of the machine-reset tool.

    Applies the gevent and orchestra monkey patches, parses the command
    line, configures logging, reads the config, checks locks and connects
    to the targets, then runs the cleanup steps in order: shut down
    daemons, reboot nodes that have kernel mounts, remove the test tree.
    """
    # Monkey-patching comes first, before logging and the teuthology
    # imports below.
    from gevent import monkey; monkey.patch_all()
    from orchestra import monkey; monkey.patch_all()

    import logging

    log = logging.getLogger(__name__)
    ctx = parse_args()

    logging.basicConfig(
        level=logging.DEBUG if ctx.verbose else logging.INFO,
    )

    from teuthology.misc import read_config
    read_config(ctx)

    target_lines = yaml.safe_dump(
        ctx.config['targets'],
        default_flow_style=False,
    ).splitlines()
    log.info('\n '.join(['targets:', ] + target_lines))

    if ctx.owner is None:
        from teuthology.misc import get_user
        ctx.owner = get_user()

    from teuthology.task.internal import check_lock, connect
    check_lock(ctx, None)
    connect(ctx, None)

    log.info('Unmount cfuse and killing daemons...')
    shutdown_daemons(ctx, log)
    log.info('All daemons killed.')

    log.info('Dealing with any kernel mounts...')
    mounted_remotes = find_kernel_mounts(ctx, log)
    # Nodes with kernel mounts are rebooted; the forced-unmount variant
    # (remove_kernel_mounts) is intentionally not called here.
    reboot_kernel_mounts(ctx, mounted_remotes, log)
    log.info('All kernel mounts gone.')

    log.info('Clearing filesystem of test data...')
    remove_testing_tree(ctx, log)
    log.info('Filesystem Cleared.')

View File

@ -159,74 +159,6 @@ def main():
with file(os.path.join(ctx.archive, 'summary.yaml'), 'w') as f: with file(os.path.join(ctx.archive, 'summary.yaml'), 'w') as f:
yaml.safe_dump(ctx.summary, f, default_flow_style=False) yaml.safe_dump(ctx.summary, f, default_flow_style=False)
def nuke():
    """
    Kill test daemons, unmount fuse mounts and delete test data on every
    target.

    Applies the gevent and orchestra monkey patches, parses the command
    line, configures logging, reads the config, checks locks and connects.
    It then starts one cleanup command line per remote without waiting
    and collects every exit status.
    """
    from gevent import monkey; monkey.patch_all()
    from orchestra import monkey; monkey.patch_all()

    import logging

    log = logging.getLogger(__name__)
    ctx = parse_args()

    logging.basicConfig(
        level=logging.DEBUG if ctx.verbose else logging.INFO,
    )

    from teuthology.misc import read_config
    read_config(ctx)

    target_lines = yaml.safe_dump(
        ctx.config['targets'],
        default_flow_style=False,
    ).splitlines()
    log.info('\n '.join(['targets:', ] + target_lines))

    if ctx.owner is None:
        from teuthology.misc import get_user
        ctx.owner = get_user()

    from teuthology.task.internal import check_lock, connect
    check_lock(ctx, None)
    connect(ctx, None)

    log.info('Killing daemons, unmounting, and removing data...')

    from orchestra import run

    def nuke_command():
        # Kill the daemons installed under /tmp/cephtest.
        kill_daemons = [
            'killall',
            '--quiet',
            '/tmp/cephtest/binary/usr/local/bin/cmon',
            '/tmp/cephtest/binary/usr/local/bin/cosd',
            '/tmp/cephtest/binary/usr/local/bin/cmds',
            '/tmp/cephtest/binary/usr/local/bin/cfuse',
            run.Raw(';'),
        ]
        # Remove the test rsyslog snippet, restarting rsyslog only when
        # the removal succeeded.
        reset_rsyslog = [
            'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf',
            run.Raw(';'),
            'then',
            'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf',
            run.Raw('&&'),
            'sudo', 'initctl', 'restart', 'rsyslog',
            run.Raw(';'),
            'fi',
            run.Raw(';'),
        ]
        # The plain ';' string terminates find's -execdir; the Raw ';'
        # after it separates shell commands.
        unmount_and_delete = [
            'find', '/tmp/cephtest', '-maxdepth', '1', '-name', 'mnt.*',
            '-execdir', 'fusermount', '-u', '{}', ';',
            run.Raw(';'),
            'sudo', 'rm', '-rf', '/tmp/cephtest',
        ]
        return kill_daemons + reset_rsyslog + unmount_and_delete

    running = {}
    for remote in ctx.cluster.remotes.iterkeys():
        running[remote.name] = remote.run(args=nuke_command(), wait=False)

    for host, proc in running.iteritems():
        log.info('Waiting for %s to be nuked...', host)
        proc.exitstatus.get()

    log.info('Done.')
def schedule(): def schedule():
parser = argparse.ArgumentParser(description='Schedule ceph integration tests') parser = argparse.ArgumentParser(description='Schedule ceph integration tests')
parser.add_argument( parser.add_argument(

View File

@ -1,7 +1,6 @@
from cStringIO import StringIO from cStringIO import StringIO
import logging import logging
import errno
import socket import socket
import time import time
@ -180,38 +179,6 @@ def install_and_reboot(ctx, config):
proc.exitstatus.get() proc.exitstatus.get()
def reconnect(ctx, timeout):
    """
    Re-open the ssh connection of every remote in ``ctx.cluster``.

    Each remote still pending is retried once per pass, with a one second
    pause between passes. Socket timeouts are always retried. Connection
    refused, timed out, host unreachable and host down errors are retried
    until ``timeout`` seconds have passed since the call started. Any
    other socket error is logged and re-raised immediately.

    :param ctx: object exposing ``cluster.remotes`` and
                ``config['targets']`` (host key per remote name).
    :param timeout: seconds after which a retryable socket error is
                    re-raised instead of retried.
    """
    log.info('Re-opening connections...')
    began = time.time()
    pending = ctx.cluster.remotes.keys()
    while True:
        for remote in list(pending):
            try:
                remote.ssh = connection.connect(
                    user_at_host=remote.name,
                    host_key=ctx.config['targets'][remote.name],
                    )
            except socket.timeout:
                # Still not answering; try again on the next pass.
                continue
            except socket.error as e:
                if not hasattr(e, '__getitem__'):
                    log.exception('weird socket error without error code')
                    raise
                if (e[0] in [errno.ECONNREFUSED, errno.ETIMEDOUT,
                             errno.EHOSTUNREACH, errno.EHOSTDOWN]
                        and not (time.time() - began > timeout)):
                    # Host is not up yet and there is still time to wait.
                    continue
                log.exception('unknown socket error: %s', repr(e))
                raise
            # Connected: this remote needs no further attempts.
            pending.remove(remote)

        if not pending:
            break
        log.debug('waited {elapsed}'.format(elapsed=str(time.time() - began)))
        time.sleep(1)
def task(ctx, config): def task(ctx, config):
""" """
Make sure the specified kernel is installed. Make sure the specified kernel is installed.
@ -280,7 +247,7 @@ def task(ctx, config):
if len(need_install) > 0: if len(need_install) > 0:
install_and_reboot(ctx, need_install) install_and_reboot(ctx, need_install)
reconnect(ctx, timeout) teuthology.reconnect(ctx, timeout)
for client, sha1 in need_install.iteritems(): for client, sha1 in need_install.iteritems():
log.info('Checking client {client} for new kernel version...'.format(client=client)) log.info('Checking client {client} for new kernel version...'.format(client=client))