ceph/teuthology/nuke.py
Sandon Van Ness d54932cbc8 Fix VM issues.
Fix of #5494 although bad description. Instead of adding a wait
the code used to detect if the guest was back up is fixed. The
previous code appeared to assume only one machine and broke
when it was waiting for multiple machines if the guests did not
come up within 10 seconds of each other

Make nuke not do the normal stuff if the machine is a VPS as we
just destroy them when they get unlocked.

Instead of getting downburst options from ~/.teuthology.yaml get
it from the yaml given to teuthology for the test/task instead.

Fixed an error that would make all the default downburst values
not take effect if any of them were set via a yaml.

Signed-off-by: Sandon Van Ness <sandon@inktank.com>
Reviewed-by: Warren Usui <warren.usui@inktank.com>
2013-07-03 19:07:35 -07:00

476 lines
15 KiB
Python

import argparse
import yaml
def parse_args():
from teuthology.run import config_file
from teuthology.run import MergeConfig
parser = argparse.ArgumentParser(description='Reset test machines')
parser.add_argument(
'-v', '--verbose',
action='store_true', default=None,
help='be more verbose'
)
parser.add_argument(
'-t', '--targets',
nargs='+',
type=config_file,
action=MergeConfig,
default={},
dest='config',
help='yaml config containing machines to nuke',
)
parser.add_argument(
'-a', '--archive',
metavar='DIR',
help='archive path for a job to kill and nuke',
)
parser.add_argument(
'--owner',
help='job owner',
)
parser.add_argument(
'-p','--pid',
type=int,
default=False,
help='pid of the process to be killed',
)
parser.add_argument(
'-r', '--reboot-all',
action='store_true',
default=False,
help='reboot all machines',
)
parser.add_argument(
'-s', '--synch-clocks',
action='store_true',
default=False,
help='synchronize clocks on all machines',
)
parser.add_argument(
'-u', '--unlock',
action='store_true',
default=False,
help='Unlock each successfully nuked machine, and output targets that'
'could not be nuked.'
)
parser.add_argument(
'-n', '--name',
metavar='NAME',
help='Name of run to cleanup'
)
parser.add_argument(
'-i', '--noipmi',
action='store_true', default=False,
help='Skip ipmi checking'
)
args = parser.parse_args()
return args
def shutdown_daemons(ctx, log):
from .orchestra import run
nodes = {}
for remote in ctx.cluster.remotes.iterkeys():
proc = remote.run(
args=[
'if', 'grep', '-q', 'ceph-fuse', '/etc/mtab', run.Raw(';'),
'then',
'grep', 'ceph-fuse', '/etc/mtab', run.Raw('|'),
'grep', '-o', " /.* fuse", run.Raw('|'),
'grep', '-o', "/.* ", run.Raw('|'),
'xargs', 'sudo', 'fusermount', '-u', run.Raw(';'),
'fi',
run.Raw(';'),
'sudo',
'killall',
'--quiet',
'ceph-mon',
'ceph-osd',
'ceph-mds',
'ceph-fuse',
'radosgw',
'ceph_test_rados',
'rados',
'apache2',
run.Raw('||'),
'true', # ignore errors from ceph binaries not being found
],
wait=False,
)
nodes[remote.name] = proc
for name, proc in nodes.iteritems():
log.info('Waiting for %s to finish shutdowns...', name)
proc.exitstatus.get()
def find_kernel_mounts(ctx, log):
from .orchestra import run
nodes = {}
log.info('Looking for kernel mounts to handle...')
for remote in ctx.cluster.remotes.iterkeys():
proc = remote.run(
args=[
'grep', '-q', ' ceph ' , '/etc/mtab',
run.Raw('||'),
'grep', '-q', '^/dev/rbd' , '/etc/mtab',
],
wait=False,
)
nodes[remote] = proc
kernel_mounts = list()
for remote, proc in nodes.iteritems():
try:
proc.exitstatus.get()
log.debug('kernel mount exists on %s', remote.name)
kernel_mounts.append(remote)
except run.CommandFailedError: # no mounts!
log.debug('no kernel mount on %s', remote.name)
return kernel_mounts
def remove_kernel_mounts(ctx, kernel_mounts, log):
"""
properly we should be able to just do a forced unmount,
but that doesn't seem to be working, so you should reboot instead
"""
from .orchestra import run
nodes = {}
for remote in kernel_mounts:
log.info('clearing kernel mount from %s', remote.name)
proc = remote.run(
args=[
'grep', 'ceph', '/etc/mtab', run.Raw('|'),
'grep', '-o', "on /.* type", run.Raw('|'),
'grep', '-o', "/.* ", run.Raw('|'),
'xargs', '-r',
'sudo', 'umount', '-f', run.Raw(';'),
'fi'
],
wait=False
)
nodes[remote] = proc
for remote, proc in nodes:
proc.exitstatus.get()
def remove_osd_mounts(ctx, log):
"""
unmount any osd data mounts (scratch disks)
"""
from .orchestra import run
ctx.cluster.run(
args=[
'grep',
'/var/lib/ceph/osd/',
'/etc/mtab',
run.Raw('|'),
'awk', '{print $2}', run.Raw('|'),
'xargs', '-r',
'sudo', 'umount', run.Raw(';'),
'true'
],
)
def remove_osd_tmpfs(ctx, log):
"""
unmount tmpfs mounts
"""
from .orchestra import run
ctx.cluster.run(
args=[
'egrep', 'tmpfs\s+/mnt', '/etc/mtab', run.Raw('|'),
'awk', '{print $2}', run.Raw('|'),
'xargs', '-r',
'sudo', 'umount', run.Raw(';'),
'true'
],
)
def reboot(ctx, remotes, log):
import time
nodes = {}
for remote in remotes:
log.info('rebooting %s', remote.name)
proc = remote.run( # note use of -n to force a no-sync reboot
args=['sudo', 'reboot', '-f', '-n'],
wait=False
)
nodes[remote] = proc
# we just ignore these procs because reboot -f doesn't actually
# send anything back to the ssh client!
#for remote, proc in nodes.iteritems():
#proc.exitstatus.get()
from teuthology.misc import reconnect
if remotes:
log.info('waiting for nodes to reboot')
time.sleep(5) #if we try and reconnect too quickly, it succeeds!
reconnect(ctx, 480) #allow 8 minutes for the reboots
def reset_syslog_dir(ctx, log):
from .orchestra import run
nodes = {}
for remote in ctx.cluster.remotes.iterkeys():
proc = remote.run(
args=[
'if', 'test', '-e', '/etc/rsyslog.d/80-cephtest.conf',
run.Raw(';'),
'then',
'sudo', 'rm', '-f', '--', '/etc/rsyslog.d/80-cephtest.conf',
run.Raw('&&'),
'sudo', 'service', 'rsyslog', 'restart',
run.Raw(';'),
'fi',
run.Raw(';'),
],
wait=False,
)
nodes[remote.name] = proc
for name, proc in nodes.iteritems():
log.info('Waiting for %s to restart syslog...', name)
proc.exitstatus.get()
def dpkg_configure(ctx, log):
from .orchestra import run
nodes = {}
for remote in ctx.cluster.remotes.iterkeys():
proc = remote.run(
args=[
'sudo', 'dpkg', '--configure', '-a',
run.Raw('&&'),
'sudo', 'apt-get', '-f', 'install',
run.Raw('||'),
':',
],
wait=False,
)
nodes[remote.name] = proc
for name, proc in nodes.iteritems():
log.info('Waiting for %s to dpkg --configure -a and apt-get -f install...', name)
proc.exitstatus.get()
def remove_installed_packages(ctx, log):
from teuthology.task import install as install_task
dpkg_configure(ctx, log)
config = {'project': 'ceph'}
install_task.remove_packages(ctx, config,
{"deb": install_task.deb_packages['ceph'],
"rpm": install_task.rpm_packages['ceph']})
install_task.remove_sources(ctx, config)
install_task.purge_data(ctx)
def remove_testing_tree(ctx, log):
from teuthology.misc import get_testdir_base
from .orchestra import run
nodes = {}
for remote in ctx.cluster.remotes.iterkeys():
proc = remote.run(
args=[
'sudo', 'rm', '-rf', get_testdir_base(ctx),
# just for old time's sake
run.Raw('&&'),
'sudo', 'rm', '-rf', '/tmp/cephtest',
run.Raw('&&'),
'sudo', 'rm', '-rf', '/home/ubuntu/cephtest',
run.Raw('&&'),
'sudo', 'rm', '-rf', '/etc/ceph',
],
wait=False,
)
nodes[remote.name] = proc
for name, proc in nodes.iteritems():
log.info('Waiting for %s to clear filesystem...', name)
proc.exitstatus.get()
def synch_clocks(remotes, log):
from .orchestra import run
nodes = {}
for remote in remotes:
proc = remote.run(
args=[
'sudo', 'service', 'ntp', 'stop',
run.Raw('&&'),
'sudo', 'ntpdate-debian',
run.Raw('&&'),
'sudo', 'hwclock', '--systohc', '--utc',
run.Raw('&&'),
'sudo', 'service', 'ntp', 'start',
run.Raw('||'),
'true', # ignore errors; we may be racing with ntpd startup
],
wait=False,
)
nodes[remote.name] = proc
for name, proc in nodes.iteritems():
log.info('Waiting for clock to synchronize on %s...', name)
proc.exitstatus.get()
def main():
from gevent import monkey; monkey.patch_all(dns=False)
from .orchestra import monkey; monkey.patch_all()
from teuthology.run import config_file
import logging
log = logging.getLogger(__name__)
ctx = parse_args()
loglevel = logging.INFO
if ctx.verbose:
loglevel = logging.DEBUG
logging.basicConfig(
level=loglevel,
)
if ctx.archive:
ctx.config = config_file(ctx.archive + '/config.yaml')
if not ctx.pid:
ctx.pid = int(open(ctx.archive + '/pid').read().rstrip('\n'))
if not ctx.owner:
ctx.owner = open(ctx.archive + '/owner').read().rstrip('\n')
from teuthology.misc import read_config
read_config(ctx)
log.info('\n '.join(['targets:', ] + yaml.safe_dump(ctx.config['targets'], default_flow_style=False).splitlines()))
if ctx.owner is None:
from teuthology.misc import get_user
ctx.owner = get_user()
if ctx.pid:
if ctx.archive:
import os
log.info('Killing teuthology process at pid %d', ctx.pid)
os.system('grep -q %s /proc/%d/cmdline && sudo kill %d' % (
ctx.archive,
ctx.pid,
ctx.pid))
else:
import subprocess
subprocess.check_call(["kill", "-9", str(ctx.pid)]);
nuke(ctx, log, ctx.unlock, ctx.synch_clocks, ctx.reboot_all, ctx.noipmi)
def nuke(ctx, log, should_unlock, sync_clocks=True, reboot_all=True,
noipmi=False):
from teuthology.parallel import parallel
total_unnuked = {}
with parallel() as p:
for target, hostkey in ctx.config['targets'].iteritems():
p.spawn(
nuke_one,
ctx,
{target: hostkey},
log,
should_unlock,
sync_clocks,
reboot_all,
ctx.config.get('check-locks', True),
noipmi,
)
for unnuked in p:
if unnuked:
total_unnuked.update(unnuked)
if total_unnuked:
log.error('Could not nuke the following targets:\n' + '\n '.join(['targets:', ] + yaml.safe_dump(total_unnuked, default_flow_style=False).splitlines()))
def nuke_one(ctx, targets, log, should_unlock, synch_clocks, reboot_all,
check_locks, noipmi):
from teuthology.lock import unlock
ret = None
ctx = argparse.Namespace(
config=dict(targets=targets),
owner=ctx.owner,
check_locks=check_locks,
synch_clocks=synch_clocks,
reboot_all=reboot_all,
teuthology_config=ctx.teuthology_config,
name=ctx.name,
noipmi=noipmi,
)
try:
nuke_helper(ctx, log)
except:
log.exception('Could not nuke all targets in %s', targets)
# not re-raising the so that parallel calls aren't killed
ret = targets
else:
if should_unlock:
for target in targets.keys():
unlock(ctx, target, ctx.owner)
return ret
def nuke_helper(ctx, log):
# ensure node is up with ipmi
from teuthology.orchestra import remote
(target,) = ctx.config['targets'].keys()
host = target.split('@')[-1]
shortname = host.split('.')[0]
if 'vpm' in shortname:
return
log.debug('shortname: %s' % shortname)
log.debug('{ctx}'.format(ctx=ctx))
if not ctx.noipmi and 'ipmi_user' in ctx.teuthology_config:
console = remote.getRemoteConsole(name=host,
ipmiuser=ctx.teuthology_config['ipmi_user'],
ipmipass=ctx.teuthology_config['ipmi_password'],
ipmidomain=ctx.teuthology_config['ipmi_domain'])
cname = '{host}.{domain}'.format(host=shortname, domain=ctx.teuthology_config['ipmi_domain'])
log.info('checking console status of %s' % cname)
if not console.check_status():
# not powered on or can't get IPMI status. Try to power on
console.power_on()
# try to get status again, waiting for login prompt this time
log.info('checking console status of %s' % cname)
if not console.check_status(100):
log.error('Failed to get console status for %s, disabling console...' % cname)
log.info('console ready on %s' % cname)
else:
log.info('console ready on %s' % cname)
from teuthology.task.internal import check_lock, connect
if ctx.check_locks:
check_lock(ctx, None)
connect(ctx, None)
log.info('Unmount ceph-fuse and killing daemons...')
shutdown_daemons(ctx, log)
log.info('All daemons killed.')
need_reboot = find_kernel_mounts(ctx, log)
# no need to unmount anything if we're rebooting
if ctx.reboot_all:
need_reboot = ctx.cluster.remotes.keys()
else:
log.info('Unmount any osd data directories...')
remove_osd_mounts(ctx, log)
log.info('Unmount any osd tmpfs dirs...')
remove_osd_tmpfs(ctx, log)
#log.info('Dealing with any kernel mounts...')
#remove_kernel_mounts(ctx, need_reboot, log)
if need_reboot:
reboot(ctx, need_reboot, log)
log.info('All kernel mounts gone.')
log.info('Synchronizing clocks...')
if ctx.synch_clocks:
need_reboot = ctx.cluster.remotes.keys()
synch_clocks(need_reboot, log)
log.info('Reseting syslog output locations...')
reset_syslog_dir(ctx, log)
log.info('Clearing filesystem of test data...')
remove_testing_tree(ctx, log)
log.info('Filesystem Cleared.')
remove_installed_packages(ctx, log)
log.info('Installed packages removed.')