mirror of https://github.com/ceph/ceph
Merge branch 'wip-nuke'
Conflicts: teuthology/task/kernel.py
commit 0139323e51

setup.py (2 changed lines)
@@ -25,7 +25,7 @@ setup(
     entry_points={
         'console_scripts': [
             'teuthology = teuthology.run:main',
-            'teuthology-nuke = teuthology.run:nuke',
+            'teuthology-nuke = teuthology.nuke:main',
             'teuthology-suite = teuthology.suite:main',
             'teuthology-ls = teuthology.suite:ls',
             'teuthology-worker = teuthology.queue:worker',
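For context (not part of the diff): a setuptools console_scripts entry of the form 'name = package.module:func' makes setuptools generate a small wrapper executable that imports the module and calls the named function. A minimal sketch of what the regenerated teuthology-nuke wrapper effectively does after this change (the wrapper body itself is illustrative; only the module and function names come from the diff):

import sys

def run_teuthology_nuke():
    # setuptools generates an equivalent shim at install time for
    # 'teuthology-nuke = teuthology.nuke:main'.
    from teuthology.nuke import main
    sys.exit(main())

if __name__ == '__main__':
    run_teuthology_nuke()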
@@ -279,6 +279,55 @@ def wait_until_fuse_mounted(remote, fuse, mountpoint):
        time.sleep(5)
    log.info('cfuse is mounted on %s', mountpoint)

def reconnect(ctx, timeout):
    """
    Connect to all the machines in ctx.cluster.

    Presumably, some of them won't be up. Handle this
    by waiting for them, unless the wait time exceeds
    the specified timeout.

    ctx needs to contain the cluster of machines you
    wish it to try and connect to, as well as a config
    holding the ssh keys for each of them. As long as it
    contains this data, you can construct a context
    that is a subset of your full cluster.
    """
    import errno
    log.info('Re-opening connections...')
    starttime = time.time()
    need_reconnect = ctx.cluster.remotes.keys()
    while True:
        for remote in list(need_reconnect):
            try:
                from orchestra import connection
                remote.ssh = connection.connect(
                    user_at_host=remote.name,
                    host_key=ctx.config['targets'][remote.name],
                    )
            except socket.timeout:
                pass
            except socket.error as e:
                if hasattr(e, '__getitem__'):
                    if e[0] not in [errno.ECONNREFUSED, errno.ETIMEDOUT,
                                    errno.EHOSTUNREACH, errno.EHOSTDOWN]:
                        log.exception('unknown socket error: %s', repr(e))
                        raise
                    else:
                        if time.time() - starttime > timeout:
                            log.exception('timed out waiting for %s', remote.name)
                            raise
                else:
                    log.exception('weird socket error without error code')
                    raise
            else:
                need_reconnect.remove(remote)

        if not need_reconnect:
            break
        log.debug('waited {elapsed}'.format(elapsed=str(time.time() - starttime)))
        time.sleep(1)

def write_secret_file(remote, role, filename):
    remote.run(
        args=[
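For context (not part of the diff): as the docstring above says, reconnect() only touches ctx.cluster.remotes and ctx.config['targets'], so a caller can hand it a trimmed-down context covering just the machines it cares about. A hypothetical sketch of that idea (the Cluster import path, the add() call, and the Namespace construction are assumptions, not taken from this diff):

import argparse

from orchestra.cluster import Cluster   # assumed location of the Cluster helper
from teuthology.misc import reconnect

def reconnect_subset(full_ctx, wanted_remotes, timeout=300):
    """Re-open SSH connections for just the given remotes."""
    sub_cluster = Cluster()
    for remote, roles in full_ctx.cluster.remotes.iteritems():
        if remote in wanted_remotes:
            sub_cluster.add(remote, roles)   # assumed Cluster API
    sub_ctx = argparse.Namespace(
        cluster=sub_cluster,
        config={'targets': full_ctx.config['targets']},
        )
    reconnect(sub_ctx, timeout)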
teuthology/nuke.py (new file, 201 lines)
@@ -0,0 +1,201 @@
import argparse
import yaml

def parse_args():
    from teuthology.run import config_file
    from teuthology.run import MergeConfig

    parser = argparse.ArgumentParser(description='Reset test machines')
    parser.add_argument(
        '-v', '--verbose',
        action='store_true', default=None,
        help='be more verbose'
        )
    parser.add_argument(
        'config',
        metavar='CONFFILE',
        nargs='+',
        type=config_file,
        action=MergeConfig,
        default={},
        help='config file to read',
        )
    parser.add_argument(
        '--archive',
        metavar='DIR',
        help='path to archive results in',
        )
    parser.add_argument(
        '--owner',
        help='job owner',
        )
    args = parser.parse_args()
    return args

def shutdown_daemons(ctx, log):
    from orchestra import run
    nodes = {}
    for remote in ctx.cluster.remotes.iterkeys():
        proc = remote.run(
            args=[
                'if', 'grep', '-q', 'cfuse', '/etc/mtab', run.Raw(';'),
                'then',
                'grep', 'cfuse', '/etc/mtab', run.Raw('|'),
                'grep', '-o', " /.* fuse", run.Raw('|'),
                'grep', '-o', "/.* ", run.Raw('|'),
                'xargs', 'sudo', 'fusermount', '-u', run.Raw(';'),
                'fi',
                run.Raw(';'),
                'killall',
                '--quiet',
                '/tmp/cephtest/binary/usr/local/bin/cmon',
                '/tmp/cephtest/binary/usr/local/bin/cosd',
                '/tmp/cephtest/binary/usr/local/bin/cmds',
                '/tmp/cephtest/binary/usr/local/bin/cfuse',
                run.Raw(';'),
                'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf',
                run.Raw(';'),
                'then',
                'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf',
                run.Raw('&&'),
                'sudo', 'initctl', 'restart', 'rsyslog',
                run.Raw(';'),
                'fi',
                run.Raw(';'),
                ],
            wait=False,
            )
        nodes[remote.name] = proc

    for name, proc in nodes.iteritems():
        log.info('Waiting for %s to finish shutdowns...', name)
        proc.exitstatus.get()

def find_kernel_mounts(ctx, log):
    from orchestra import run
    nodes = {}
    log.info('Looking for kernel mounts to handle...')
    for remote in ctx.cluster.remotes.iterkeys():
        proc = remote.run(
            args=[
                'grep', '-q', " ceph ", '/etc/mtab'
                ],
            wait=False,
            )
        nodes[remote] = proc
    kernel_mounts = list()
    for remote, proc in nodes.iteritems():
        try:
            proc.exitstatus.get()
            log.debug('kernel mount exists on %s', remote.name)
            kernel_mounts.append(remote)
        except run.CommandFailedError:  # no mounts!
            log.debug('no kernel mount on %s', remote.name)

    return kernel_mounts

def remove_kernel_mounts(ctx, kernel_mounts, log):
    """
    properly we should be able to just do a forced unmount,
    but that doesn't seem to be working, so you should reboot instead
    """
    from orchestra import run
    nodes = {}
    for remote in kernel_mounts:
        log.info('clearing kernel mount from %s', remote.name)
        proc = remote.run(
            args=[
                'grep', 'ceph', '/etc/mtab', run.Raw('|'),
                'grep', '-o', "on /.* type", run.Raw('|'),
                'grep', '-o', "/.* ", run.Raw('|'),
                'xargs', 'sudo', 'umount', '-f', run.Raw(';'),
                'fi'
                ],
            wait=False
            )
        nodes[remote] = proc

    for remote, proc in nodes:
        proc.exitstatus.get()

def reboot_kernel_mounts(ctx, kernel_mounts, log):
    from orchestra import run
    import time
    nodes = {}
    for remote in kernel_mounts:
        log.info('rebooting %s', remote.name)
        proc = remote.run(  # note use of -n to force a no-sync reboot
            args=['sudo', 'reboot', '-f', '-n'],
            wait=False
            )
        nodes[remote] = proc
    # we just ignore these procs because reboot -f doesn't actually
    # send anything back to the ssh client!
    #for remote, proc in nodes.iteritems():
    #proc.exitstatus.get()
    from teuthology.misc import reconnect
    if kernel_mounts:
        log.info('waiting for nodes to reboot')
        time.sleep(5)  # if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 300)  # allow 5 minutes for the reboots

def remove_testing_tree(ctx, log):
    from orchestra import run
    nodes = {}
    for remote in ctx.cluster.remotes.iterkeys():
        proc = remote.run(
            args=[
                'sudo', 'rm', '-rf', '/tmp/cephtest',
                ],
            wait=False,
            )
        nodes[remote.name] = proc

    for name, proc in nodes.iteritems():
        log.info('Waiting for %s to clear filesystem...', name)
        proc.exitstatus.get()

def main():
    from gevent import monkey; monkey.patch_all()
    from orchestra import monkey; monkey.patch_all()

    import logging

    log = logging.getLogger(__name__)

    ctx = parse_args()

    loglevel = logging.INFO
    if ctx.verbose:
        loglevel = logging.DEBUG

    logging.basicConfig(
        level=loglevel,
        )

    from teuthology.misc import read_config
    read_config(ctx)

    log.info('\n '.join(['targets:', ] + yaml.safe_dump(ctx.config['targets'], default_flow_style=False).splitlines()))

    if ctx.owner is None:
        from teuthology.misc import get_user
        ctx.owner = get_user()

    from teuthology.task.internal import check_lock, connect
    check_lock(ctx, None)
    connect(ctx, None)

    log.info('Unmount cfuse and killing daemons...')
    shutdown_daemons(ctx, log)
    log.info('All daemons killed.')

    log.info('Dealing with any kernel mounts...')
    kernel_mounts = find_kernel_mounts(ctx, log)
    #remove_kernel_mounts(ctx, kernel_mounts, log)
    reboot_kernel_mounts(ctx, kernel_mounts, log)
    log.info('All kernel mounts gone.')

    log.info('Clearing filesystem of test data...')
    remove_testing_tree(ctx, log)
    log.info('Filesystem Cleared.')
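For reference (not part of the commit): main() above consumes the same YAML config files as a teuthology run and only really needs the targets mapping of user@host to ssh host key. A hypothetical end-to-end sketch, with made-up host names and keys:

import subprocess
import yaml

# Placeholder targets; real entries map 'user@host' to that host's ssh host key.
targets = {
    'ubuntu@testnode01.example.com': 'ssh-rsa AAAA...placeholder...',
    'ubuntu@testnode02.example.com': 'ssh-rsa AAAA...placeholder...',
}

with open('targets.yaml', 'w') as f:
    yaml.safe_dump({'targets': targets}, f, default_flow_style=False)

# Equivalent to running the new console script by hand:
#   teuthology-nuke -v targets.yaml --owner you@yourworkstation
subprocess.check_call(
    ['teuthology-nuke', '-v', 'targets.yaml', '--owner', 'you@yourworkstation'])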
@@ -159,74 +159,6 @@ def main():
        with file(os.path.join(ctx.archive, 'summary.yaml'), 'w') as f:
            yaml.safe_dump(ctx.summary, f, default_flow_style=False)


def nuke():
    from gevent import monkey; monkey.patch_all()
    from orchestra import monkey; monkey.patch_all()

    import logging

    log = logging.getLogger(__name__)
    ctx = parse_args()

    loglevel = logging.INFO
    if ctx.verbose:
        loglevel = logging.DEBUG

    logging.basicConfig(
        level=loglevel,
        )

    from teuthology.misc import read_config
    read_config(ctx)

    log.info('\n '.join(['targets:', ] + yaml.safe_dump(ctx.config['targets'], default_flow_style=False).splitlines()))

    if ctx.owner is None:
        from teuthology.misc import get_user
        ctx.owner = get_user()

    from teuthology.task.internal import check_lock, connect
    check_lock(ctx, None)
    connect(ctx, None)

    log.info('Killing daemons, unmounting, and removing data...')

    from orchestra import run
    nodes = {}
    for remote in ctx.cluster.remotes.iterkeys():
        proc = remote.run(
            args=[
                'killall',
                '--quiet',
                '/tmp/cephtest/binary/usr/local/bin/cmon',
                '/tmp/cephtest/binary/usr/local/bin/cosd',
                '/tmp/cephtest/binary/usr/local/bin/cmds',
                '/tmp/cephtest/binary/usr/local/bin/cfuse',
                run.Raw(';'),
                'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf',
                run.Raw(';'),
                'then',
                'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf',
                run.Raw('&&'),
                'sudo', 'initctl', 'restart', 'rsyslog',
                run.Raw(';'),
                'fi',
                run.Raw(';'),
                'find', '/tmp/cephtest', '-maxdepth', '1', '-name', 'mnt.*',
                '-execdir', 'fusermount', '-u', '{}', ';',
                run.Raw(';'),
                'sudo', 'rm', '-rf', '/tmp/cephtest',
                ],
            wait=False,
            )
        nodes[remote.name] = proc

    for name, proc in nodes.iteritems():
        log.info('Waiting for %s to be nuked...', name)
        proc.exitstatus.get()
    log.info('Done.')

def schedule():
    parser = argparse.ArgumentParser(description='Schedule ceph integration tests')
    parser.add_argument(
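For context (not part of the diff): shutdown_daemons() and remove_testing_tree() in the new teuthology/nuke.py, as well as the removed nuke() body above, all follow the same fan-out/join pattern: start remote.run(..., wait=False) on every node, collect the returned procs, then block on each proc.exitstatus.get(). A distilled sketch of that pattern (the helper name is illustrative; the remote/proc API is as used above):

def run_on_all(remotes, args, log):
    # Kick the command off on every node without blocking.
    procs = {}
    for remote in remotes:
        procs[remote.name] = remote.run(args=args, wait=False)
    # Then wait for each one; exitstatus.get() blocks until the remote
    # command finishes and raises (e.g. run.CommandFailedError) on failure.
    for name, proc in procs.iteritems():
        log.info('Waiting for %s...', name)
        proc.exitstatus.get()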
@@ -1,7 +1,6 @@
from cStringIO import StringIO

import logging
import errno
import socket
import time

@@ -180,38 +179,6 @@ def install_and_reboot(ctx, config):
        proc.exitstatus.get()


def reconnect(ctx, timeout):
    log.info('Re-opening connections...')
    starttime = time.time()
    need_reconnect = ctx.cluster.remotes.keys()
    while True:
        for remote in list(need_reconnect):
            try:
                remote.ssh = connection.connect(
                    user_at_host=remote.name,
                    host_key=ctx.config['targets'][remote.name],
                    )
            except socket.timeout:
                pass
            except socket.error as e:
                if hasattr(e, '__getitem__'):
                    if e[0] not in [errno.ECONNREFUSED, errno.ETIMEDOUT,
                                    errno.EHOSTUNREACH, errno.EHOSTDOWN] or \
                            time.time() - starttime > timeout:
                        log.exception('unknown socket error: %s', repr(e))
                        raise
                else:
                    log.exception('weird socket error without error code')
                    raise
            else:
                need_reconnect.remove(remote)

        if not need_reconnect:
            break
        log.debug('waited {elapsed}'.format(elapsed=str(time.time() - starttime)))
        time.sleep(1)


def task(ctx, config):
    """
    Make sure the specified kernel is installed.
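For context (not part of the diff): both the reconnect() removed here and the one added to teuthology/misc.py above implement the same retry loop: keep attempting a connection while only "host not up yet" socket errors occur, re-raise anything unexpected, and give up after a timeout. A standalone sketch of that pattern (function and parameter names here are illustrative):

import errno
import socket
import time

RETRYABLE = (errno.ECONNREFUSED, errno.ETIMEDOUT,
             errno.EHOSTUNREACH, errno.EHOSTDOWN)

def wait_for_ssh(connect_once, timeout, interval=1):
    """Call connect_once() until it succeeds, a fatal error occurs,
    or timeout seconds have elapsed."""
    start = time.time()
    while True:
        try:
            return connect_once()
        except socket.timeout:
            pass                      # host not answering yet; retry
        except socket.error as e:
            if e.errno not in RETRYABLE:
                raise                 # unexpected error; surface it
        if time.time() - start > timeout:
            raise RuntimeError('timed out waiting for host')
        time.sleep(interval)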
@@ -280,7 +247,7 @@ def task(ctx, config):

    if len(need_install) > 0:
        install_and_reboot(ctx, need_install)
-        reconnect(ctx, timeout)
+        teuthology.reconnect(ctx, timeout)

    for client, sha1 in need_install.iteritems():
        log.info('Checking client {client} for new kernel version...'.format(client=client))