ceph/teuthology/lock.py

import argparse
import json
import logging
import subprocess
import urllib
import yaml
import re
import collections
import tempfile
import os
import time

import teuthology
from .config import config
from . import lockstatus as ls
from . import misc
from teuthology.misc import get_distro
from teuthology.misc import get_distro_version

log = logging.getLogger(__name__)


def lock_many(ctx, num, machinetype, user=None, description=None):
    """
    Ask the lock server to lock ``num`` machines of the given type(s).

    ``machinetype`` is expanded with ``misc.get_multi_machine_types`` and
    each resulting type is tried in turn; the first type for which the
    server reports success wins and the function returns immediately.

    For the ``vps`` type every locked machine is additionally created with
    ``create_if_vm``.  Machines whose creation fails are unlocked again and
    left out of the result, so the caller may receive fewer than ``num``.

    :param ctx:         context object handed through to the VM helpers
    :param num:         number of machines requested
    :param machinetype: machine type specification
    :param user:        lock owner; defaults to ``misc.get_user()``
    :param description: description sent to the server as ``desc``
    :returns: the decoded JSON reply (a dict keyed by machine name) on
              success, or an empty list when no type could be locked
    """
    machinetypes = misc.get_multi_machine_types(machinetype)
    if user is None:
        user = misc.get_user()
    for machinetype in machinetypes:
        success, content, status = ls.send_request(
            'POST',
            config.lock_server,
            urllib.urlencode(
                dict(
                    user=user,
                    num=num,
                    machinetype=machinetype,
                    desc=description,
                )))
        if success:
            machines = json.loads(content)
            log.debug('locked %s', ', '.join(machines.keys()))
            if machinetype != 'vps':
                return machines
            ok_machs = {}
            for machine in machines:
                if create_if_vm(ctx, machine):
                    ok_machs[machine] = machines[machine]
                else:
                    log.error('Unable to create virtual machine: %s',
                              machine)
                    # Release the lock as the same user that took it;
                    # without this the unlock request is sent for
                    # misc.get_user(), which may differ from ``user``.
                    unlock_one(ctx, machine, user)
            return ok_machs
        if status == 503:
            log.error('Insufficient nodes available to lock %d %s nodes.',
                      num, machinetype)
        else:
            log.error('Could not lock %d %s nodes, reason: unknown.',
                      num, machinetype)
    return []


def lock_one(ctx, name, user=None, description=None):
    """
    Lock the single machine ``name`` on the lock server.

    :param ctx:         unused here; kept for a uniform helper signature
    :param name:        machine name appended to the lock server URL
    :param user:        lock owner; defaults to ``misc.get_user()``
    :param description: description sent to the server as ``desc``
    :returns: the success flag reported by ``ls.send_request``
    """
    owner = misc.get_user() if user is None else user
    url = config.lock_server + '/' + name
    payload = urllib.urlencode(dict(user=owner, desc=description))
    locked, _content, _status = ls.send_request('POST', url, payload)
    if not locked:
        log.error('failed to lock %s', name)
    else:
        log.debug('locked %s as %s', name, owner)
    return locked


def unlock_one(ctx, name, user=None):
    """
    Unlock the single machine ``name`` and, on success, tear down its VM.

    :param ctx:  context object handed to ``destroy_if_vm``
    :param name: machine name appended to the lock server URL
    :param user: lock owner sent with the request; defaults to
                 ``misc.get_user()``
    :returns: the success flag reported by ``ls.send_request`` (a failed
              VM destroy is logged but does not change the result)
    """
    owner = user if user is not None else misc.get_user()
    target = config.lock_server + '/' + name + '?'
    target += urllib.urlencode(dict(user=owner))
    unlocked, _content, _status = ls.send_request('DELETE', target)
    if not unlocked:
        log.error('failed to unlock %s', name)
        return unlocked

    log.debug('unlocked %s', name)
    vm_gone = destroy_if_vm(ctx, name)
    if not vm_gone:
        log.error('downburst destroy failed for %s', name)
        log.info('%s is not locked' % name)
    return unlocked


def list_locks():
    """
    Fetch the full lock table from the lock server.

    :returns: the decoded JSON reply, or None when the request failed
    """
    ok, body, _status = ls.send_request('GET', config.lock_server)
    return json.loads(body) if ok else None


def update_lock(ctx, name, description=None, status=None, sshpubkey=None):
    """
    Update the description, status and/or ssh public key of ``name``.

    If the lock server reports a ``vpshost`` for the machine, this first
    blocks until ``keyscan_check`` returns non-empty output for it,
    polling every 10 seconds.  NOTE: that wait has no timeout.

    :param ctx:         context object handed to the status/keyscan helpers
    :param name:        machine name appended to the lock server URL
    :param description: new description (sent as ``desc``) or None
    :param status:      new status or None
    :param sshpubkey:   new ssh public key or None
    :returns: the success flag of the PUT request, or True when there was
              nothing to update
    """
    status_info = ls.get_status(ctx, name)
    # get_status can come back empty for a machine the server does not
    # know; treat that as "not a VM" instead of failing on the subscript.
    phys_host = status_info['vpshost'] if status_info else None
    if phys_host:
        keyscan_out = ''
        while not keyscan_out:
            time.sleep(10)
            keyscan_out, _ = keyscan_check(ctx, [name])
    updated = {}
    if description is not None:
        updated['desc'] = description
    if status is not None:
        updated['status'] = status
    if sshpubkey is not None:
        updated['sshpubkey'] = sshpubkey

    if not updated:
        return True

    success, _, _ = ls.send_request(
        'PUT',
        config.lock_server + '/' + name,
        body=urllib.urlencode(updated),
        headers={'Content-type': 'application/x-www-form-urlencoded'})
    return success


def canonicalize_hostname(s):
    """
    Expand a machine name to ``ubuntu@<host>.front.sepia.ceph.com``.

    Names that already match the canonical pattern are returned unchanged.
    Otherwise only the missing part(s) are added, so a name that already
    carries the ``ubuntu@`` prefix or the domain suffix does not get it
    twice.

    :param s: short or (partially) qualified machine name
    :returns: the canonical name
    """
    if re.match(r'ubuntu@.*\.front\.sepia\.ceph\.com', s) is not None:
        return s
    user_prefix = 'ubuntu@'
    domain_suffix = '.front.sepia.ceph.com'
    if not s.startswith(user_prefix):
        s = user_prefix + s
    if not s.endswith(domain_suffix):
        s = s + domain_suffix
    return s


def main(ctx):
    """
    Entry point of the lock command: validate the parsed options in
    ``ctx`` and run exactly one operation.

    Operations, in order of precedence:

    * ``ctx.list`` / ``ctx.list_targets`` -- print lock statuses (JSON or
      brief text) or a YAML ``targets`` fragment, after applying the
      machine-type / owner / status / locked / description filters
    * ``ctx.summary``     -- print the table produced by ``do_summary``
    * ``ctx.lock``        -- lock each named machine (creating VMs)
    * ``ctx.unlock``      -- unlock each named machine
    * ``ctx.num_to_lock`` -- lock that many machines of ``ctx.machine_type``
    * ``ctx.update``      -- update description/status of named machines

    After lock/unlock/update, any ``ctx.desc`` / ``ctx.status`` is applied
    to the machines that were handled successfully.

    :param ctx: parsed command line options
    :returns: 0 on success, 1 if any step failed
    :raises AssertionError: on invalid option combinations
    :raises argparse.ArgumentTypeError: if the targets file can't be read
    """
    if ctx.verbose:
        teuthology.log.setLevel(logging.DEBUG)

    misc.read_config(ctx)

    ret = 0
    user = ctx.owner
    machines = [canonicalize_hostname(m) for m in ctx.machines]
    machines_to_update = []

    if ctx.targets:
        try:
            with open(ctx.targets) as f:
                for new in yaml.safe_load_all(f):
                    # Empty YAML documents load as None; only mappings
                    # can carry a 'targets' section.
                    if isinstance(new, dict) and new.get('targets'):
                        machines.extend(new['targets'])
        except IOError as e:
            raise argparse.ArgumentTypeError(str(e))

    if ctx.f:
        assert ctx.lock or ctx.unlock, \
            '-f is only supported by --lock and --unlock'
    if machines:
        assert ctx.lock or ctx.unlock or ctx.list or ctx.list_targets \
            or ctx.update, \
            'machines cannot be specified with that operation'
    else:
        assert ctx.num_to_lock or ctx.list or ctx.list_targets or ctx.summary,\
            'machines must be specified for that operation'
    if ctx.all:
        assert ctx.list or ctx.list_targets, \
            '--all can only be used with --list and --list-targets'
        assert ctx.owner is None, \
            '--all and --owner are mutually exclusive'
        assert not machines, \
            '--all and listing specific machines are incompatible'
    if ctx.num_to_lock:
        assert ctx.machine_type, \
            'must specify machine type to lock'

    if ctx.brief:
        assert ctx.list, '--brief only applies to --list'

    if ctx.list or ctx.list_targets:
        assert ctx.desc is None, '--desc does nothing with --list'

        if machines:
            statuses = []
            for machine in machines:
                status = ls.get_status(ctx, machine)
                if status:
                    statuses.append(status)
                else:
                    log.error("Lockserver doesn't know about machine: %s" %
                              machine)
        else:
            # May be None if the request failed; reported further down.
            statuses = list_locks()

        vmachines = [vmachine['name'] for vmachine in (statuses or [])
                     if vmachine['vpshost'] and vmachine['locked']]
        # Avoid ssh-keyscans for everybody when listing all machines.
        # Listing specific machines will update the keys.
        if vmachines and machines:
            scan_for_locks(ctx, vmachines)
            # Re-read the statuses to pick up refreshed keys, again
            # dropping machines the lock server does not know about.
            refreshed = [ls.get_status(ctx, machine)
                         for machine in machines]
            statuses = [status for status in refreshed if status]
        if statuses:
            if ctx.machine_type:
                statuses = [status for status in statuses
                            if status['type'] == ctx.machine_type]
            if not machines and ctx.owner is None and not ctx.all:
                ctx.owner = misc.get_user()
            if ctx.owner is not None:
                statuses = [status for status in statuses
                            if status['locked_by'] == ctx.owner]
            if ctx.status is not None:
                statuses = [status for status in statuses
                            if status['up'] == (ctx.status == 'up')]
            if ctx.locked is not None:
                statuses = [status for status in statuses
                            if status['locked'] == (ctx.locked == 'true')]
            if ctx.desc is not None:
                statuses = [status for status in statuses
                            if status['description'] == ctx.desc]
            if ctx.desc_pattern is not None:
                statuses = [status for status in statuses
                            if status['description'] is not None and
                            status['description'].find(ctx.desc_pattern) >= 0]
            if ctx.list:
                if ctx.brief:
                    for s in statuses:
                        locked = "un" if s['locked'] == 0 else "  "
                        mo = re.match(r'\w+@(\w+?)\..*', s['name'])
                        host = mo.group(1) if mo else s['name']
                        print('{host} {locked}locked {owner} "{desc}"'.format(
                            locked=locked, host=host,
                            owner=s['locked_by'], desc=s['description']))
                else:
                    print(json.dumps(statuses, indent=4))
            else:
                frag = {'targets': {}}
                for f in statuses:
                    frag['targets'][f['name']] = f['sshpubkey']
                print(yaml.safe_dump(frag, default_flow_style=False))
        else:
            log.error('error retrieving lock statuses')
            ret = 1

    elif ctx.summary:
        do_summary(ctx)
        return 0

    elif ctx.lock:
        for machine in machines:
            if not lock_one(ctx, machine, user):
                ret = 1
                if not ctx.f:
                    return ret
            else:
                machines_to_update.append(machine)
                create_if_vm(ctx, machine)
    elif ctx.unlock:
        for machine in machines:
            if not unlock_one(ctx, machine, user):
                ret = 1
                if not ctx.f:
                    return ret
            else:
                machines_to_update.append(machine)
    elif ctx.num_to_lock:
        result = lock_many(ctx, ctx.num_to_lock, ctx.machine_type, user)
        if not result:
            ret = 1
        else:
            machines_to_update = list(result)
            if ctx.machine_type == 'vps':
                shortnames = ' '.join(
                    [name.split('@')[1].split('.')[0]
                        for name in result]
                )
                if len(result) < ctx.num_to_lock:
                    log.error("Locking failed.")
                    for machn in result:
                        # Unlock as the same user the lock was taken for.
                        unlock_one(ctx, machn, user)
                    ret = 1
                    # These machines were just released; they must not
                    # receive a description/status update below.
                    machines_to_update = []
                else:
                    log.info("Successfully Locked:\n%s\n" % shortnames)
                    log.info(
                        "Unable to display keys at this time (virtual " +
                        "machines are booting).")
                    log.info(
                        "Please run teuthology-lock --list-targets %s once " +
                        "these machines come up.",
                        shortnames)
            else:
                print(yaml.safe_dump(
                    dict(targets=result),
                    default_flow_style=False))
    elif ctx.update:
        assert ctx.desc is not None or ctx.status is not None, \
            'you must specify description or status to update'
        assert ctx.owner is None, 'only description and status may be updated'
        machines_to_update = machines

    if ctx.desc is not None or ctx.status is not None:
        for machine in machines_to_update:
            if not update_lock(ctx, machine, ctx.desc, ctx.status):
                log.error('failed to update %s', machine)
                ret = 1

    return ret


def updatekeys(ctx):
    """
    Entry point for refreshing stored ssh public keys.

    Collects machine names from ``ctx.machines`` (canonicalized) and from
    the ``targets`` sections of the YAML file ``ctx.targets`` (taken as
    written), then hands them to ``scan_for_locks``.

    :param ctx: parsed command line options
    :returns: the result of ``scan_for_locks`` (0 on success, 1 if any
              key update failed)
    :raises argparse.ArgumentTypeError: if the targets file can't be read
    """
    loglevel = logging.DEBUG if ctx.verbose else logging.INFO
    logging.basicConfig(
        level=loglevel,
    )

    misc.read_config(ctx)

    machines = [canonicalize_hostname(m) for m in ctx.machines]

    if ctx.targets:
        try:
            with open(ctx.targets) as f:
                for new in yaml.safe_load_all(f):
                    # Empty YAML documents load as None; only mappings
                    # can carry a 'targets' section.
                    if isinstance(new, dict) and new.get('targets'):
                        machines.extend(new['targets'])
        except IOError as e:
            raise argparse.ArgumentTypeError(str(e))

    return scan_for_locks(ctx, machines)


def keyscan_check(ctx, machines):
    """
    Run ``ssh-keyscan -t rsa`` against ``machines``.

    If ``ctx`` has a true ``all`` attribute, every machine in the lock
    table is scanned instead of ``machines``.  Any ``user@`` prefix is
    stripped before the names are passed to ssh-keyscan.  The caller's
    list is not modified.

    :param ctx:      context object; only its optional ``all`` flag is read
    :param machines: iterable of machine names, with or without ``user@``
    :returns: tuple ``(out, current_locks)`` where ``out`` is the stdout
              of ssh-keyscan and ``current_locks`` maps machine name to
              its lock table entry
    """
    locks = list_locks()
    if locks is None:
        log.error('error retrieving lock statuses')
        locks = []
    current_locks = dict((lock['name'], lock) for lock in locks)

    if getattr(ctx, 'all', False):
        machines = current_locks.keys()

    # Keep only the part after the last '@'; names without one pass
    # through unchanged.
    hosts = [machine.rsplit('@', 1)[-1] for machine in machines]
    if not hosts:
        # Nothing to scan; skip the subprocess.
        return ('', current_locks)

    args = ['ssh-keyscan', '-t', 'rsa']
    args.extend(hosts)
    p = subprocess.Popen(
        args=args,
        stdout=subprocess.PIPE,
    )
    out, _ = p.communicate()
    return (out, current_locks)


def update_keys(ctx, out, current_locks):
    """
    Compare scanned ssh keys against the lock table and store changes.

    Each line of ``out`` is split at the first space into a hostname and
    a public key; the hostname is looked up as ``ubuntu@<hostname>``.

    :param ctx:           context object handed to ``update_lock``
    :param out:           keyscan output, one ``<host> <key>`` per line
    :param current_locks: mapping of machine name to lock table entry
    :returns: 0 if every changed key was stored, 1 otherwise
    :raises AssertionError: if a scanned host is not in ``current_locks``
    """
    exit_code = 0
    for line in out.splitlines():
        host, scanned_key = line.split(' ', 1)
        # TODO: separate out user
        qualified = 'ubuntu@{host}'.format(host=host)
        log.info('Checking %s', qualified)
        assert qualified in current_locks, 'host is not in the database!'
        if current_locks[qualified]['sshpubkey'] == scanned_key:
            continue
        log.info('New key found. Updating...')
        if update_lock(ctx, qualified, sshpubkey=scanned_key):
            continue
        log.error('failed to update %s!', qualified)
        exit_code = 1
    return exit_code


def scan_for_locks(ctx, machines):
    """
    Key-scan ``machines`` and store any changed keys in the lock table.

    :returns: the result of ``update_keys``
    """
    scan_result = keyscan_check(ctx, machines)
    scanned_output, lock_table = scan_result
    return update_keys(ctx, scanned_output, lock_table)


def do_summary(ctx):
    """
    Print a table of machine counts grouped by (owner, machine type).

    Unlocked machines are grouped under the owner ``(free)``.  When
    ``ctx.machine_type`` is set, only machines of that type are counted.
    Rows are sorted by machine type, then by count, and followed by a
    totals line.  Nothing is printed if the lock table can't be fetched.

    :param ctx: context object; only ``machine_type`` is read
    """
    all_locks = list_locks()
    if all_locks is None:
        log.error('error retrieving lock statuses')
        return

    # value layout: [machine count, up count, machine type]
    lockd = collections.defaultdict(lambda: [0, 0, 'unknown'])
    for l in all_locks:
        if ctx.machine_type and l['type'] != ctx.machine_type:
            continue
        # key is the tuple (owner or '(free)', machine type)
        who = l['locked_by'] if l['locked'] == 1 else '(free)', l['type']
        lockd[who][0] += 1
        lockd[who][1] += l['up']         # up is 1 or 0
        lockd[who][2] = l['type']

    locks = sorted(lockd.items(),
                   key=lambda entry: (entry[1][2], entry[1][0]))
    total_count, total_up = 0, 0
    print("TYPE     COUNT  UP  OWNER")

    for (owner, (count, upcount, machinetype)) in locks:
        print("{machinetype:8s} {count:3d}  {up:3d}  {owner}".format(
            count=count, up=upcount, owner=owner[0],
            machinetype=machinetype))
        total_count += count
        total_up += upcount

    print("         ---  ---")
    print("{cnt:12d}  {up:3d}".format(cnt=total_count, up=total_up))


def decanonicalize_hostname(s):
    """
    Reduce ``ubuntu@<host>.front.sepia.ceph.com`` to ``<host>``.

    Only names that both start with ``ubuntu@`` and end with
    ``.front.sepia.ceph.com`` are shortened; anything else (including a
    canonical name followed by extra text) is returned unchanged rather
    than being sliced at the wrong offsets.

    :param s: machine name
    :returns: the short host name, or ``s`` itself if it is not canonical
    """
    user_prefix = 'ubuntu@'
    domain_suffix = '.front.sepia.ceph.com'
    if s.startswith(user_prefix) and s.endswith(domain_suffix):
        return s[len(user_prefix):-len(domain_suffix)]
    return s


def _get_downburst_exec():
    """
    First check for downburst in the user's path.
    Then check in ~/src, ~ubuntu/src, and ~teuthology/src.
    Return '' if no executable downburst is found.
    """
    path = os.environ.get('PATH', None)
    if path:
        for p in os.environ.get('PATH', '').split(os.pathsep):
            pth = os.path.join(p, 'downburst')
            if os.access(pth, os.X_OK):
                return pth
    import pwd
    little_old_me = pwd.getpwuid(os.getuid()).pw_name
    for user in [little_old_me, 'ubuntu', 'teuthology']:
        pth = "/home/%s/src/downburst/virtualenv/bin/downburst" % user
        if os.access(pth, os.X_OK):
            return pth
    return ''

#
# Use downburst to create a virtual machine
#


def create_if_vm(ctx, machine_name):
    """
    Use downburst to create the virtual machine ``machine_name``.

    The downburst settings come from the YAML file ``ctx.downburst_conf``
    if that attribute is usable, otherwise from the list of dicts in
    ``ctx.config['downburst']``, otherwise built-in defaults apply.  They
    are written to a temporary file that is passed to
    ``downburst create`` via ``--meta-data``.

    If downburst reports that the guest already exists, the guest is
    destroyed and creation is attempted once more.

    :param ctx:          context object (distro, config, downburst_conf)
    :param machine_name: machine name as known to the lock server
    :returns: False if the machine is unknown or not a VM, the
              configuration file can't be read, no downburst executable
              is found, or the guest still exists after the retry;
              True otherwise
    """
    status_info = ls.get_status(ctx, machine_name)
    if not status_info:
        log.error("Lockserver doesn't know about machine: %s", machine_name)
        return False
    phys_host = status_info['vpshost']
    if not phys_host:
        return False
    os_type = get_distro(ctx)
    os_version = get_distro_version(ctx)

    createMe = decanonicalize_hostname(machine_name)
    with tempfile.NamedTemporaryFile() as tmp:
        try:
            lfile = ctx.downburst_conf
            with open(lfile) as downb_yaml:
                lcnfg = yaml.safe_load(downb_yaml)
                # Unwrap a file whose only top-level key is 'downburst'.
                if list(lcnfg.keys()) == ['downburst']:
                    lcnfg = lcnfg['downburst']
        except (TypeError, AttributeError):
            # No usable downburst_conf; fall back to the job config.
            try:
                lcnfg = {}
                for tdict in ctx.config['downburst']:
                    for key in tdict:
                        lcnfg[key] = tdict[key]
            except (KeyError, AttributeError):
                lcnfg = {}
        except IOError:
            print("Error reading %s" % lfile)
            return False

        distro = lcnfg.get('distro', os_type.lower())
        distroversion = lcnfg.get('distroversion', os_version)

        file_info = {}
        file_info['disk-size'] = lcnfg.get('disk-size', '30G')
        file_info['ram'] = lcnfg.get('ram', '1.9G')
        file_info['cpus'] = lcnfg.get('cpus', 1)
        file_info['networks'] = lcnfg.get(
            'networks',
            [{'source': 'front', 'mac': status_info['mac']}])
        file_info['distro'] = distro
        file_info['distroversion'] = distroversion
        file_info['additional-disks'] = lcnfg.get(
            'additional-disks', 3)
        file_info['additional-disks-size'] = lcnfg.get(
            'additional-disks-size', '200G')
        file_info['arch'] = lcnfg.get('arch', 'x86_64')
        file_out = {'downburst': file_info}
        yaml.safe_dump(file_out, tmp)
        # downburst opens the file by name, so make sure nothing is left
        # in this process's write buffer.
        tmp.flush()
        metadata = "--meta-data=%s" % tmp.name
        dbrst = _get_downburst_exec()
        if not dbrst:
            log.error("No downburst executable found.")
            return False

        recreated = False
        while True:
            p = subprocess.Popen([dbrst, '-c', phys_host,
                                  'create', metadata, createMe],
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,)
            owt, err = p.communicate()
            if err:
                log.info("Downburst completed on %s: %s" %
                         (machine_name, err))
            else:
                log.info("%s created: %s" % (machine_name, owt))
            if 'exists' not in err:
                return True
            if recreated:
                # One destroy-and-retry is all we attempt; retrying
                # without bound could loop forever.
                log.error("Guest files still exist after re-creating "
                          "guest: %s", machine_name)
                return False
            # If the guest already exists first destroy then re-create:
            log.info("Guest files exist. Re-creating guest: %s" %
                     (machine_name))
            destroy_if_vm(ctx, machine_name)
            recreated = True
#
# Use downburst to destroy a virtual machine
#


def destroy_if_vm(ctx, machine_name):
    """
    Use downburst to destroy the virtual machine ``machine_name``.

    Return False only on vm downburst failures: no downburst executable
    found, or ``downburst destroy`` wrote anything to stderr.  Machines
    that are not VMs, or that the lock server has no status for, have
    nothing to destroy and yield True.

    :param ctx:          context object handed to ``ls.get_status``
    :param machine_name: machine name as known to the lock server
    :returns: True if there was nothing to destroy or the destroy ran
              without stderr output, False otherwise
    """
    status_info = ls.get_status(ctx, machine_name)
    if not status_info:
        log.warning("Lockserver doesn't know about machine: %s",
                    machine_name)
        return True
    phys_host = status_info['vpshost']
    if not phys_host:
        return True
    destroyMe = decanonicalize_hostname(machine_name)
    dbrst = _get_downburst_exec()
    if not dbrst:
        log.error("No downburst executable found.")
        return False
    p = subprocess.Popen([dbrst, '-c', phys_host,
                          'destroy', destroyMe],
                         stdout=subprocess.PIPE, stderr=subprocess.PIPE,)
    owt, err = p.communicate()
    if err:
        log.error(err)
        return False
    log.info("%s destroyed: %s" % (machine_name, owt))
    return True