ceph/teuthology/nuke.py

import argparse
import logging
import os
import subprocess
import time
import yaml

import teuthology
from . import orchestra
import orchestra.remote
from .orchestra import run
from .lock import list_locks
from .lock import unlock_one
from .misc import config_file
from .misc import get_testdir
from .misc import get_user
from .misc import read_config
from .misc import reconnect
from .parallel import parallel
from .task import install as install_task
from .task.internal import check_lock
from .task.internal import connect

log = logging.getLogger(__name__)


def shutdown_daemons(ctx):
    """
    Unmount ceph-fuse mounts and kill ceph-related processes on every remote.

    The command is launched on each remote in ``ctx.cluster.remotes`` with
    ``wait=False``; only after all of them have been launched is each one
    waited on.
    """
    # If /etc/mtab mentions ceph-fuse, pull out each mountpoint and hand it
    # to ``fusermount -u`` one at a time.
    unmount_fuse = [
        'if', 'grep', '-q', 'ceph-fuse', '/etc/mtab', run.Raw(';'),
        'then',
        'grep', 'ceph-fuse', '/etc/mtab', run.Raw('|'),
        'grep', '-o', " /.* fuse", run.Raw('|'),
        'grep', '-o', "/.* ", run.Raw('|'),
        'xargs', '-n', '1', 'sudo', 'fusermount', '-u', run.Raw(';'),
        'fi',
        run.Raw(';'),
    ]
    process_names = [
        'ceph-mon',
        'ceph-osd',
        'ceph-mds',
        'ceph-fuse',
        'ceph-disk',
        'radosgw',
        'ceph_test_rados',
        'rados',
        'apache2',
    ]
    kill_processes = ['sudo', 'killall', '--quiet'] + process_names + [
        run.Raw('||'),
        'true',  # ignore errors from ceph binaries not being found
    ]
    command = unmount_fuse + kill_processes

    pending = {}
    for remote in ctx.cluster.remotes.iterkeys():
        pending[remote.name] = remote.run(args=command, wait=False)

    for host, handle in pending.iteritems():
        log.info('Waiting for %s to finish shutdowns...', host)
        handle.wait()


def find_kernel_mounts(ctx):
    """
    Return the list of remotes whose /etc/mtab shows a kernel mount.

    A remote counts as having a kernel mount when its /etc/mtab contains
    ``' ceph '`` or a line starting with ``/dev/rbd``. The grep is started on
    every remote before any result is collected.
    """
    log.info('Looking for kernel mounts to handle...')
    probe = [
        'grep', '-q', ' ceph ', '/etc/mtab',
        run.Raw('||'),
        'grep', '-q', '^/dev/rbd', '/etc/mtab',
    ]

    checks = {}
    for remote in ctx.cluster.remotes.iterkeys():
        checks[remote] = remote.run(args=probe, wait=False)

    mounted = []
    for remote, handle in checks.iteritems():
        try:
            handle.wait()
        except run.CommandFailedError:
            # grep found nothing, so there is no mount on this remote
            log.debug('no kernel mount on %s', remote.name)
            continue
        log.debug('kernel mount exists on %s', remote.name)
        mounted.append(remote)

    return mounted


def remove_kernel_mounts(ctx, kernel_mounts):
    """
    Force-unmount ceph mounts on the given remotes.

    properly we should be able to just do a forced unmount,
    but that doesn't seem to be working, so you should reboot instead

    :param ctx: not used by this function
    :param kernel_mounts: iterable of remotes on which to run the unmount
    """
    nodes = {}
    for remote in kernel_mounts:
        log.info('clearing kernel mount from %s', remote.name)
        # NOTE(review): the "on /.* type" filter looks like it was written
        # for `mount` output rather than /etc/mtab lines -- confirm that it
        # ever matches before relying on this function.
        proc = remote.run(
            args=[
                'grep', 'ceph', '/etc/mtab', run.Raw('|'),
                'grep', '-o', "on /.* type", run.Raw('|'),
                'grep', '-o', "/.* ", run.Raw('|'),
                'xargs', '-r',
                # No trailing `; fi` here: there is no opening `if`, so it
                # would make the whole command a shell syntax error.
                'sudo', 'umount', '-f',
            ],
            wait=False
        )
        nodes[remote] = proc

    # Iterate the process handles themselves; iterating the dict directly
    # yields only the remotes and cannot be unpacked into (remote, proc).
    for proc in nodes.values():
        proc.wait()


def remove_osd_mounts(ctx):
    """
    unmount any osd data mounts (scratch disks)

    Every /etc/mtab entry mentioning /var/lib/ceph/osd/ has its second field
    passed to ``sudo umount``; the trailing ``true`` makes the command
    succeed regardless of the umount result.
    """
    list_mountpoints = [
        'grep', '/var/lib/ceph/osd/', '/etc/mtab', run.Raw('|'),
        'awk', '{print $2}', run.Raw('|'),
    ]
    unmount_each = [
        'xargs', '-r', 'sudo', 'umount', run.Raw(';'),
        'true',
    ]
    ctx.cluster.run(args=list_mountpoints + unmount_each)


def remove_osd_tmpfs(ctx):
    """
    unmount tmpfs mounts

    Every /etc/mtab entry matching ``tmpfs\\s+/mnt`` has its second field
    passed to ``sudo umount``; the trailing ``true`` makes the command
    succeed regardless of the umount result.
    """
    list_mountpoints = [
        'egrep', r'tmpfs\s+/mnt', '/etc/mtab', run.Raw('|'),
        'awk', '{print $2}', run.Raw('|'),
    ]
    unmount_each = [
        'xargs', '-r', 'sudo', 'umount', run.Raw(';'),
        'true',
    ]
    ctx.cluster.run(args=list_mountpoints + unmount_each)


def reboot(ctx, remotes):
    """
    Force-reboot the given remotes, then wait for them to come back.

    :param ctx: passed through to ``reconnect``
    :param remotes: collection of remotes to reboot; when it is empty nothing
                    is run and no waiting is done
    """
    # The process handles are stored but never waited on: reboot -f doesn't
    # actually send anything back to the ssh client.
    # NOTE(review): presumably the dict also keeps the handles referenced
    # until this function returns -- confirm before removing it.
    nodes = {}
    for remote in remotes:
        log.info('rebooting %s', remote.name)
        try:
            proc = remote.run(  # note use of -n to force a no-sync reboot
                args=[
                    'sync',
                    run.Raw('&'),
                    'sleep', '5',
                    run.Raw(';'),
                    'sudo', 'reboot', '-f', '-n'
                    ],
                wait=False
                )
        except Exception:
            log.exception('ignoring exception during reboot command')
            # No handle was produced for this remote; recording one here
            # would raise NameError or reuse the previous remote's handle.
            continue
        nodes[remote] = proc
    if remotes:
        log.info('waiting for nodes to reboot')
        time.sleep(8)  # if we try and reconnect too quickly, it succeeds!
        reconnect(ctx, 480)  # allow 8 minutes for the reboots


def reset_syslog_dir(ctx):
    """
    Remove the cephtest rsyslog configuration from every remote.

    Where /etc/rsyslog.d/80-cephtest.conf exists it is deleted and, if the
    deletion succeeded, the rsyslog service is restarted. The command is
    launched on all remotes before any of them is waited on.
    """
    conf_path = '/etc/rsyslog.d/80-cephtest.conf'
    command = [
        'if', 'test', '-e', conf_path,
        run.Raw(';'),
        'then',
        'sudo', 'rm', '-f', '--', conf_path,
        run.Raw('&&'),
        'sudo', 'service', 'rsyslog', 'restart',
        run.Raw(';'),
        'fi',
        run.Raw(';'),
    ]

    pending = {}
    for remote in ctx.cluster.remotes.iterkeys():
        pending[remote.name] = remote.run(args=command, wait=False)

    for host, handle in pending.iteritems():
        log.info('Waiting for %s to restart syslog...', host)
        handle.wait()


def dpkg_configure(ctx):
    """
    Run ``dpkg --configure -a`` followed by ``apt-get -f install`` everywhere.

    Failures are swallowed by the trailing ``|| :``. The command is launched
    on all remotes before any of them is waited on.
    """
    command = [
        'sudo', 'dpkg', '--configure', '-a',
        run.Raw('&&'),
        'sudo', 'apt-get', '-f', 'install',
        run.Raw('||'),
        ':',
    ]

    pending = {}
    for remote in ctx.cluster.remotes.iterkeys():
        pending[remote.name] = remote.run(args=command, wait=False)

    for host, handle in pending.iteritems():
        log.info(
            'Waiting for %s to dpkg --configure -a and apt-get -f install...',
            host)
        handle.wait()


def remove_installed_packages(ctx):
    """
    Remove the ceph packages, their package sources and leftover data.

    Runs ``dpkg_configure`` first, then delegates to the install task's
    ``remove_packages``, ``remove_sources`` and ``purge_data``. The deb list
    additionally includes salt-common and salt-minion.
    """
    dpkg_configure(ctx)

    project_config = dict(project='ceph')
    deb_names = install_task.deb_packages['ceph'] + [
        'salt-common',
        'salt-minion',
    ]
    rpm_names = install_task.rpm_packages['ceph']

    install_task.remove_packages(
        ctx, project_config, {"deb": deb_names, "rpm": rpm_names})
    install_task.remove_sources(ctx, project_config)
    install_task.purge_data(ctx)


def remove_testing_tree(ctx):
    """
    Delete the test directory and related paths on every remote.

    The paths are removed with ``sudo rm -rf`` chained by ``&&``, in this
    order: the directory returned by ``get_testdir(ctx)``, /tmp/cephtest,
    /home/ubuntu/cephtest and /etc/ceph. The command is launched on all
    remotes before any of them is waited on.
    """
    pending = {}
    for remote in ctx.cluster.remotes.iterkeys():
        doomed = [
            get_testdir(ctx),
            # just for old time's sake
            '/tmp/cephtest',
            '/home/ubuntu/cephtest',
            '/etc/ceph',
        ]
        command = []
        for path in doomed:
            if command:
                command.append(run.Raw('&&'))
            command.extend(['sudo', 'rm', '-rf', path])
        pending[remote.name] = remote.run(args=command, wait=False)

    for host, handle in pending.iteritems():
        log.info('Waiting for %s to clear filesystem...', host)
        handle.wait()


def synch_clocks(remotes):
    """
    Resynchronize the clock on each of the given remotes.

    On every remote the ntp service is stopped, ``ntpdate-debian`` is run,
    the system time is written to the hardware clock as UTC, and ntp is
    started again. Each step runs only if the previous one succeeded, and
    the overall result is forced to success. The command is launched on all
    remotes before any of them is waited on.
    """
    steps = [
        ['sudo', 'service', 'ntp', 'stop'],
        ['sudo', 'ntpdate-debian'],
        ['sudo', 'hwclock', '--systohc', '--utc'],
        ['sudo', 'service', 'ntp', 'start'],
    ]
    command = []
    for step in steps:
        if command:
            command.append(run.Raw('&&'))
        command.extend(step)
    command.extend([
        run.Raw('||'),
        'true',    # ignore errors; we may be racing with ntpd startup
    ])

    pending = {}
    for remote in remotes:
        pending[remote.name] = remote.run(args=command, wait=False)
    for host, handle in pending.iteritems():
        log.info('Waiting for clock to synchronize on %s...', host)
        handle.wait()


def _read_stripped(path):
    """Return the contents of ``path`` minus trailing newlines, closing it."""
    with open(path) as fd:
        return fd.read().rstrip('\n')


def main(ctx):
    """
    Entry point: resolve the run's settings, kill its process, nuke targets.

    When ``ctx.archive`` is set, the config is loaded from
    ``<archive>/config.yaml``, and a missing ``ctx.pid`` / ``ctx.owner`` is
    filled in from ``<archive>/info.yaml`` or, failing that, from the
    ``<archive>/pid`` and ``<archive>/owner`` files. If a pid is known the
    process is killed, and finally ``nuke`` is called with the options held
    on ``ctx``.

    :param ctx: namespace of options; this function reads ``verbose``,
                ``archive``, ``pid``, ``owner``, ``unlock``,
                ``synch_clocks``, ``reboot_all`` and ``noipmi``, and sets
                ``config``, ``pid`` and ``owner``
    """
    if ctx.verbose:
        teuthology.log.setLevel(logging.DEBUG)

    info = {}
    if ctx.archive:
        ctx.config = config_file(ctx.archive + '/config.yaml')
        ifn = os.path.join(ctx.archive, 'info.yaml')
        if os.path.exists(ifn):
            with open(ifn, 'r') as fd:
                # NOTE(review): yaml.load can construct arbitrary Python
                # objects; consider yaml.safe_load if info.yaml only ever
                # holds plain data -- confirm against whatever writes it.
                # An empty file loads as None, so fall back to a dict.
                info = yaml.load(fd.read()) or {}
        if not ctx.pid:
            ctx.pid = info.get('pid')
            if not ctx.pid:
                ctx.pid = int(_read_stripped(ctx.archive + '/pid'))
        if not ctx.owner:
            ctx.owner = info.get('owner')
            if not ctx.owner:
                ctx.owner = _read_stripped(ctx.archive + '/owner')

    read_config(ctx)

    log.info(
        '\n  '.join(
            ['targets:', ] + yaml.safe_dump(
                ctx.config['targets'],
                default_flow_style=False).splitlines()))

    if ctx.owner is None:
        ctx.owner = get_user()

    if ctx.pid:
        if ctx.archive:
            log.info('Killing teuthology process at pid %d', ctx.pid)
            # Only kill the pid if its command line mentions the archive
            # path, so an unrelated process that reused the pid is spared.
            os.system('grep -q %s /proc/%d/cmdline && sudo kill %d' % (
                ctx.archive,
                ctx.pid,
                ctx.pid))
        else:
            subprocess.check_call(["kill", "-9", str(ctx.pid)])

    nuke(ctx, ctx.unlock, ctx.synch_clocks, ctx.reboot_all, ctx.noipmi)


def nuke(ctx, should_unlock, sync_clocks=True, reboot_all=True, noipmi=False):
    """
    Nuke every target in ``ctx.config['targets']`` via ``parallel``.

    When ``ctx.name`` is set, targets whose lock description does not contain
    that name are first removed from ``ctx.config['targets']``. Each
    remaining target is handed to ``nuke_one`` on its own; whatever those
    calls return as not nuked is collected and reported through
    ``log.error``.

    :param should_unlock: forwarded to ``nuke_one``
    :param sync_clocks:   forwarded to ``nuke_one``
    :param reboot_all:    forwarded to ``nuke_one``
    :param noipmi:        forwarded to ``nuke_one``
    """
    # Scan a copy so entries can be deleted from the live dict meanwhile.
    snapshot = dict(ctx.config['targets'])
    if ctx.name:
        log.info('Checking targets against current locks')
        for lock in list_locks():
            for target in snapshot:
                if target != lock['name']:
                    continue
                if ctx.name in lock['description']:
                    continue
                # Locked under a different description: leave it alone.
                del ctx.config['targets'][lock['name']]
                log.info(
                    "Not nuking %s because description doesn't match",
                    lock['name'])

    failed = {}
    with parallel() as p:
        for target, hostkey in ctx.config['targets'].iteritems():
            single_target = {target: hostkey}
            check_locks = ctx.config.get('check-locks', True)
            p.spawn(nuke_one, ctx, single_target, should_unlock,
                    sync_clocks, reboot_all, check_locks, noipmi)
        for result in p:
            if result:
                failed.update(result)

    if not failed:
        return
    report = yaml.safe_dump(failed, default_flow_style=False).splitlines()
    log.error('Could not nuke the following targets:\n' +
              '\n  '.join(['targets:', ] + report))


def nuke_one(ctx, targets, should_unlock, synch_clocks, reboot_all,
             check_locks, noipmi):
    """
    Nuke ``targets`` using a fresh context built from ``ctx``.

    :returns: ``targets`` if ``nuke_helper`` raised, otherwise ``None``.
              On success, and when ``should_unlock`` is true, every target
              is passed to ``unlock_one`` first.
    """
    settings = dict(
        config=dict(targets=targets),
        owner=ctx.owner,
        check_locks=check_locks,
        synch_clocks=synch_clocks,
        reboot_all=reboot_all,
        teuthology_config=ctx.teuthology_config,
        name=ctx.name,
        noipmi=noipmi,
    )
    ctx = argparse.Namespace(**settings)

    try:
        nuke_helper(ctx)
    except Exception:
        log.exception('Could not nuke all targets in %s' % targets)
        # Swallow the error so that the other parallel calls aren't killed;
        # the caller learns about the failure from the return value.
        return targets

    if should_unlock:
        for target in targets.keys():
            unlock_one(ctx, target, ctx.owner)
    return None


def nuke_helper(ctx):
    """
    Clean up the single target held in ``ctx.config['targets']``.

    Steps, in order: optionally check the IPMI console and power the node on,
    check the lock (if ``ctx.check_locks``) and connect, kill daemons, reboot
    or unmount, synchronize clocks, remove the firmware git index lock, reset
    syslog, delete test data and remove installed packages.

    Returns early, doing nothing, when the host's short name contains 'vpm'.

    :param ctx: namespace as built by ``nuke_one``; ``config['targets']``
                must hold exactly one entry
    """
    # ensure node is up with ipmi

    (target,) = ctx.config['targets'].keys()
    host = target.split('@')[-1]
    shortname = host.split('.')[0]
    if 'vpm' in shortname:
        # NOTE(review): presumably these are virtual machines that must not
        # be nuked this way -- confirm.
        return
    log.debug('shortname: %s' % shortname)
    log.debug('{ctx}'.format(ctx=ctx))
    if not ctx.noipmi and 'ipmi_user' in ctx.teuthology_config:
        console = orchestra.remote.getRemoteConsole(
            name=host,
            ipmiuser=ctx.teuthology_config['ipmi_user'],
            ipmipass=ctx.teuthology_config['ipmi_password'],
            ipmidomain=ctx.teuthology_config['ipmi_domain'])
        cname = '{host}.{domain}'.format(
            host=shortname,
            domain=ctx.teuthology_config['ipmi_domain'])
        log.info('checking console status of %s' % cname)
        if console.check_status():
            log.info('console ready on %s' % cname)
        else:
            # not powered on or can't get IPMI status.  Try to power on
            console.power_on()
            # try to get status again, waiting for login prompt this time
            log.info('checking console status of %s' % cname)
            if console.check_status(100):
                log.info('console ready on %s' % cname)
            else:
                # Pass cname as a logging argument: applying % to only the
                # second half of a concatenated literal raises TypeError.
                log.error('Failed to get console status for %s, '
                          'disabling console...', cname)

    if ctx.check_locks:
        check_lock(ctx, None)
    connect(ctx, None)

    log.info('Unmount ceph-fuse and killing daemons...')
    shutdown_daemons(ctx)
    log.info('All daemons killed.')

    need_reboot = find_kernel_mounts(ctx)

    # no need to unmount anything if we're rebooting
    if ctx.reboot_all:
        need_reboot = ctx.cluster.remotes.keys()
    else:
        log.info('Unmount any osd data directories...')
        remove_osd_mounts(ctx)
        log.info('Unmount any osd tmpfs dirs...')
        remove_osd_tmpfs(ctx)
        # log.info('Dealing with any kernel mounts...')
        # remove_kernel_mounts(ctx, need_reboot)

    if need_reboot:
        reboot(ctx, need_reboot)
    log.info('All kernel mounts gone.')

    log.info('Synchronizing clocks...')
    # NOTE(review): with synch_clocks off, the clocks of the nodes that were
    # just rebooted are still synchronized -- confirm this is intended.
    if ctx.synch_clocks:
        need_reboot = ctx.cluster.remotes.keys()
    synch_clocks(need_reboot)

    log.info('Making sure firmware.git is not locked...')
    ctx.cluster.run(args=['sudo', 'rm', '-f',
                          '/lib/firmware/updates/.git/index.lock', ])

    log.info('Reseting syslog output locations...')
    reset_syslog_dir(ctx)
    log.info('Clearing filesystem of test data...')
    remove_testing_tree(ctx)
    log.info('Filesystem Cleared.')
    remove_installed_packages(ctx)
    log.info('Installed packages removed.')