# ceph/teuthology/lock.py

import argparse
import json
import logging
import subprocess
import yaml
import re
import collections
import os
import time
import requests
import urllib
from distutils.spawn import find_executable

import teuthology
from . import misc
from . import provision
from .config import config
from .lockstatus import get_status

log = logging.getLogger(__name__)
# Don't need to see connection pool INFO messages
logging.getLogger("requests.packages.urllib3.connectionpool").setLevel(
    logging.WARNING)


def is_vpm(name):
    """
    Return True if the substring 'vpm' occurs anywhere in the given name.
    """
    return 'vpm' in name

def get_distro_from_downburst():
    """
    Return a table of valid distros.

    The table maps an os type to the list of versions accepted for it.

    If downburst is in path use it.  If either downburst is unavailable,
    fails to run, or does not print valid json for 'list-json', then use a
    default table.
    """
    default_table = {u'rhel_minimal': [u'6.4', u'6.5'],
                     u'fedora': [u'17', u'18', u'19', u'20'],
                     u'centos': [u'6.3', u'6.4', u'6.5', u'7.0'],
                     u'opensuse': [u'12.2'],
                     u'rhel': [u'6.3', u'6.4', u'6.5', u'7.0', u'7beta'],
                     u'centos_minimal': [u'6.4', u'6.5'],
                     u'ubuntu': [u'8.04(hardy)', u'9.10(karmic)',
                                 u'10.04(lucid)', u'10.10(maverick)',
                                 u'11.04(natty)', u'11.10(oneiric)',
                                 u'12.04(precise)', u'12.10(quantal)',
                                 u'13.04(raring)', u'13.10(saucy)',
                                 u'14.04(trusty)', u'utopic(utopic)'],
                     u'sles': [u'11-sp2'],
                     u'debian': [u'6.0', u'7.0']}
    executable_cmd = find_executable('downburst')
    if not executable_cmd:
        log.info('Using default values for supported os_type/os_version')
        return default_table
    try:
        output = subprocess.check_output([executable_cmd, 'list-json'])
        return json.loads(output)
    except (subprocess.CalledProcessError, OSError, ValueError):
        # ValueError is what json.loads raises on output that is not json;
        # without it a chatty or broken downburst would crash the caller
        # instead of falling back to the defaults.
        log.info('Using default values for supported os_type/os_version')
        return default_table


def vps_version_or_type_valid(machine_type, os_type, os_version):
    """
    Validate the os-type / os-version pair requested for a vps lock.

    Any machine type other than 'vps' is accepted without further checks.
    For 'vps', the os type must be a key of the table returned by
    get_distro_from_downburst() and the os version must pass
    validate_distro_version() against that type's version list.

    Returns True when the combination is acceptable, False (after logging
    an error) otherwise.
    """
    if machine_type != 'vps':
        return True

    distro_table = get_distro_from_downburst()
    if os_type not in distro_table:
        log.error('os-type is invalid')
        return False

    if validate_distro_version(os_version, distro_table[os_type]):
        return True

    log.error("os-version '%s' is invalid", os_version)
    return False

def validate_distro_version(version, supported_versions):
    """
    Return True if the version is valid, False otherwise.

    A version is valid when it equals one of the supported entries
    verbatim.  Entries may also have the form 'number(name)', for example
    '12.04(precise)'; for those, either the number or the name alone is
    acceptable.
    """
    if version in supported_versions:
        return True
    for entry in supported_versions:
        pieces = entry.split('(')
        # Only entries with exactly one '(' carry a number/name pair.
        if len(pieces) != 2:
            continue
        number, name = pieces
        # 'name' still ends with the closing ')'; drop that last character.
        if version in (number, name[:-1]):
            return True
    return False
def main(ctx):
    """
    Validate the option combination held in ctx and run the one requested
    lock operation.

    Exactly one of the following branches is executed, in this order of
    precedence: listing (--brief / --list / --list-targets), --summary,
    --lock, --unlock, locking a number of machines (num_to_lock) and
    --update.

    Invalid option combinations are rejected with assert statements, so
    they surface as AssertionError.  A targets file that cannot be opened
    is reported as argparse.ArgumentTypeError.

    Returns 0 on success and 1 when the operation failed.
    """
    if ctx.verbose:
        teuthology.log.setLevel(logging.DEBUG)

    misc.read_config(ctx)

    ret = 0
    user = ctx.owner
    machines = [misc.canonicalize_hostname(m, user=False)
                for m in ctx.machines]
    machines_to_update = []

    if ctx.targets:
        try:
            with open(ctx.targets) as f:
                for new in yaml.safe_load_all(f):
                    # An empty YAML document is loaded as None
                    if new and 'targets' in new:
                        machines.extend(new['targets'])
        except IOError as e:
            raise argparse.ArgumentTypeError(str(e))

    if ctx.f:
        assert ctx.lock or ctx.unlock, \
            '-f is only supported by --lock and --unlock'
    if machines:
        assert ctx.lock or ctx.unlock or ctx.list or ctx.list_targets \
            or ctx.update, \
            'machines cannot be specified with that operation'
    else:
        assert ctx.num_to_lock or ctx.list or ctx.list_targets or \
            ctx.summary or ctx.brief, \
            'machines must be specified for that operation'
    if ctx.all:
        assert ctx.list or ctx.list_targets or ctx.brief, \
            '--all can only be used with --list, --list-targets, and --brief'
        assert ctx.owner is None, \
            '--all and --owner are mutually exclusive'
        assert not machines, \
            '--all and listing specific machines are incompatible'
    if ctx.num_to_lock:
        assert ctx.machine_type, \
            'must specify machine type to lock'

    if ctx.brief or ctx.list or ctx.list_targets:
        assert ctx.desc is None, '--desc does nothing with --list/--brief'

        if machines:
            statuses = []
            for name in machines:
                name = misc.canonicalize_hostname(name)
                status = get_status(name)
                if status:
                    statuses.append(status)
                else:
                    log.error("Lockserver doesn't know about machine: %s" %
                              name)
        else:
            statuses = list_locks()

        # list_locks() returns None when the query fails; treat that as
        # "no virtual machines" so the error branch further down reports it
        # instead of crashing here.
        vmachines = [node['name'] for node in statuses or []
                     if node['vm_host'] and node['locked']]
        if vmachines:
            # Avoid ssh-keyscans for everybody when listing all machines
            # Listing specific machines will update the keys.
            if machines:
                do_update_keys(vmachines)
                # Re-read the statuses now that keys may have changed, and
                # again drop machines the lock server does not know about.
                refreshed = (get_status(m) for m in machines)
                statuses = [status for status in refreshed if status]
            else:
                statuses = list_locks()
        if statuses:
            if ctx.machine_type:
                statuses = [_status for _status in statuses
                            if _status['machine_type'] == ctx.machine_type]
            if not machines and ctx.owner is None and not ctx.all:
                ctx.owner = misc.get_user()
            if ctx.owner is not None:
                statuses = [_status for _status in statuses
                            if _status['locked_by'] == ctx.owner]
            if ctx.status is not None:
                statuses = [_status for _status in statuses
                            if _status['up'] == (ctx.status == 'up')]
            if ctx.locked is not None:
                statuses = [_status for _status in statuses
                            if _status['locked'] == (ctx.locked == 'true')]
            if ctx.desc is not None:
                statuses = [_status for _status in statuses
                            if _status['description'] == ctx.desc]
            if ctx.desc_pattern is not None:
                statuses = [_status for _status in statuses
                            if _status['description'] is not None and
                            _status['description'].find(ctx.desc_pattern) >= 0]

            # When listing, only show the vm_host's name, not every detail
            for s in statuses:
                if not s.get('is_vm', False):
                    continue
                # vm_host may be absent or None; then there is nothing to
                # collapse and the entry is left untouched.
                vm_host_name = (s.get('vm_host') or {}).get('name')
                if vm_host_name:
                    s['vm_host'] = vm_host_name
            if ctx.list:
                print(json.dumps(statuses, indent=4))

            elif ctx.brief:
                for s in sorted(statuses, key=lambda s: s.get('name')):
                    locked = "un" if s['locked'] == 0 else "  "
                    mo = re.match(r'\w+@(\w+?)\..*', s['name'])
                    host = mo.group(1) if mo else s['name']
                    print('{host} {locked}locked {owner} "{desc}"'.format(
                        locked=locked, host=host,
                        owner=s['locked_by'], desc=s['description']))

            else:
                frag = {'targets': {}}
                for f in statuses:
                    frag['targets'][f['name']] = f['ssh_pub_key']
                print(yaml.safe_dump(frag, default_flow_style=False))
        else:
            log.error('error retrieving lock statuses')
            ret = 1

    elif ctx.summary:
        do_summary(ctx)
        return 0

    elif ctx.lock:
        if not vps_version_or_type_valid(ctx.machine_type, ctx.os_type,
                                         ctx.os_version):
            log.error('Invalid os-type or version detected -- lock failed')
            return 1
        for machine in machines:
            if not lock_one(machine, user, ctx.desc):
                ret = 1
                # Without -f the first failure aborts the whole operation
                if not ctx.f:
                    return ret
            else:
                machines_to_update.append(machine)
                provision.create_if_vm(ctx, machine)
    elif ctx.unlock:
        if ctx.owner is None and user is None:
            user = misc.get_user()
        # If none of them are vpm, do them all in one shot
        if not any(is_vpm(m) for m in machines):
            res = unlock_many(machines, user)
            return 0 if res else 1
        for machine in machines:
            if not unlock_one(ctx, machine, user):
                ret = 1
                if not ctx.f:
                    return ret
            else:
                machines_to_update.append(machine)
    elif ctx.num_to_lock:
        result = lock_many(ctx, ctx.num_to_lock, ctx.machine_type, user,
                           ctx.desc, ctx.os_type, ctx.os_version)
        if not result:
            ret = 1
        else:
            machines_to_update = result.keys()
            if ctx.machine_type == 'vps':
                shortnames = ' '.join(
                    [misc.decanonicalize_hostname(name) for name in
                     result.keys()]
                )
                if len(result) < ctx.num_to_lock:
                    # Partial success is treated as failure: release the
                    # machines that did get locked.
                    log.error("Locking failed.")
                    for machn in result:
                        unlock_one(ctx, machn)
                    ret = 1
                else:
                    log.info("Successfully Locked:\n%s\n" % shortnames)
                    log.info(
                        "Unable to display keys at this time (virtual " +
                        "machines are booting).")
                    log.info(
                        "Please run teuthology-lock --list-targets %s once " +
                        "these machines come up.",
                        shortnames)
            else:
                print(yaml.safe_dump(
                    dict(targets=result),
                    default_flow_style=False))
    elif ctx.update:
        assert ctx.desc is not None or ctx.status is not None, \
            'you must specify description or status to update'
        assert ctx.owner is None, 'only description and status may be updated'
        machines_to_update = machines

        if ctx.desc is not None or ctx.status is not None:
            for machine in machines_to_update:
                update_lock(machine, ctx.desc, ctx.status)

    return ret


def lock_many(ctx, num, machine_type, user=None, description=None,
              os_type=None, os_version=None, arch=None):
    """
    Ask the lock server to lock ``num`` machines of the given type(s).

    ``machine_type`` is expanded with misc.get_multi_machine_types().  All
    non-vps types are requested together in one query (joined with '|');
    'vps', if present, gets a query of its own that is only tried when the
    earlier one did not succeed.

    Returns:
      * None when the os type/version check fails,
      * a dict mapping canonical hostname to ssh public key for the first
        query that succeeds (for 'vps' only the machines whose virtual
        machine could be created; the others are unlocked again),
      * an empty list when no query succeeded.
    """
    if user is None:
        user = misc.get_user()

    # Note that the check reads the values stored on ctx, not the
    # machine_type/os_type/os_version arguments.
    settings_ok = vps_version_or_type_valid(ctx.machine_type, ctx.os_type,
                                            ctx.os_version)
    if not settings_ok:
        log.error('Invalid os-type or version detected -- lock failed')
        return

    # Bare-metal types can safely be asked for in a single request, e.g.
    # 'plana,mira,burnupi' becomes one query.  'vps' always gets its own.
    requested_types = misc.get_multi_machine_types(machine_type)
    if requested_types == ['vps']:
        queries = requested_types
    elif 'vps' in requested_types:
        bare_metal = list(requested_types)
        bare_metal.remove('vps')
        queries = ['|'.join(bare_metal), 'vps']
    else:
        queries = ['|'.join(requested_types)]

    uri = os.path.join(config.lock_server, 'nodes', 'lock_many', '')
    for query_type in queries:
        data = dict(
            locked_by=user,
            count=num,
            machine_type=query_type,
            description=description,
        )
        # os_type/os_version only constrain the query for non-vps machines;
        # virtual machines are simply created with the wanted OS.
        if query_type != 'vps':
            if os_type:
                data['os_type'] = os_type
            if os_version:
                data['os_version'] = os_version
        if arch:
            data['arch'] = arch
        log.debug("lock_many request: %s", repr(data))
        response = requests.post(
            uri,
            data=json.dumps(data),
            headers={'content-type': 'application/json'},
        )

        if response.ok:
            locked = {}
            for node in response.json():
                hostname = misc.canonicalize_hostname(node['name'])
                locked[hostname] = node['ssh_pub_key']
            log.debug('locked {machines}'.format(
                machines=', '.join(locked.keys())))
            if query_type != 'vps':
                return locked
            created = {}
            for hostname in locked:
                if provision.create_if_vm(ctx, hostname):
                    created[hostname] = locked[hostname]
                else:
                    log.error('Unable to create virtual machine: %s',
                              hostname)
                    unlock_one(ctx, hostname)
            return created

        if response.status_code == 503:
            log.error('Insufficient nodes available to lock %d %s nodes.',
                      num, query_type)
            log.error(response.text)
        else:
            log.error('Could not lock %d %s nodes, reason: unknown.',
                      num, query_type)
    return []


def lock_one(name, user=None, description=None):
    """
    Lock a single machine on the lock server.

    ``user`` defaults to misc.get_user().  The outcome is logged (debug on
    success, error with the server's message or the status code on
    failure) and the response object of the PUT request is returned in
    either case.
    """
    name = misc.canonicalize_hostname(name, user=None)
    if user is None:
        user = misc.get_user()

    payload = dict(name=name, locked=True, locked_by=user,
                   description=description)
    uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
    response = requests.put(uri, json.dumps(payload))

    if response.ok:
        log.debug('locked %s as %s', name, user)
        return response

    try:
        reason = response.json().get('message')
    except ValueError:
        # Body was not json; fall back to the bare status code
        reason = str(response.status_code)
    log.error('failed to lock {node}. reason: {reason}'.format(
        node=name, reason=reason))
    return response


def unlock_many(names, user):
    """
    Unlock several machines with a single request to the lock server.

    The names are canonicalized first.  Returns the ``ok`` flag of the
    response; the result is also logged.
    """
    names = [misc.canonicalize_hostname(n, user=None) for n in names]
    uri = os.path.join(config.lock_server, 'nodes', 'unlock_many', '')
    payload = dict(
        locked_by=user,
        names=names,
    )
    response = requests.post(
        uri,
        data=json.dumps(payload),
        headers={'content-type': 'application/json'},
    )
    if not response.ok:
        log.error("Failed to unlock: %s", ', '.join(names))
    else:
        log.debug("Unlocked: %s", ', '.join(names))
    return response.ok


def unlock_one(ctx, name, user=None):
    """
    Unlock a single machine on the lock server.

    ``user`` defaults to misc.get_user().  After a successful unlock,
    provision.destroy_if_vm() is called for the machine; a failure there
    is only logged.  Returns the ``ok`` flag of the unlock request.
    """
    if user is None:
        user = misc.get_user()
    name = misc.canonicalize_hostname(name, user=None)

    payload = dict(name=name, locked=False, locked_by=user, description=None)
    uri = os.path.join(config.lock_server, 'nodes', name, 'lock', '')
    response = requests.put(uri, json.dumps(payload))
    success = response.ok

    if not success:
        try:
            reason = response.json().get('message')
        except ValueError:
            # Body was not json; fall back to the bare status code
            reason = str(response.status_code)
        log.error('failed to unlock {node}. reason: {reason}'.format(
            node=name, reason=reason))
        return success

    log.debug('unlocked %s', name)
    destroyed = provision.destroy_if_vm(ctx, name)
    if not destroyed:
        log.error('downburst destroy failed for %s', name)
        log.info('%s is not locked' % name)
    return success


def list_locks(keyed_by_name=False, **kwargs):
    """
    Fetch node records from the lock server.

    Extra keyword arguments are sent as the query string; commas in a
    'machine_type' value are turned into '|' first.

    Returns the decoded json response as is, or, with ``keyed_by_name``,
    a dict mapping each node's 'name' to its record.  Returns None when
    the connection fails or the response is not ok.
    """
    uri = os.path.join(config.lock_server, 'nodes', '')
    if kwargs:
        if 'machine_type' in kwargs:
            kwargs['machine_type'] = kwargs['machine_type'].replace(',','|')
        uri += '?' + urllib.urlencode(kwargs)

    try:
        response = requests.get(uri)
    except requests.ConnectionError:
        return None
    if not response.ok:
        return None

    nodes = response.json()
    if keyed_by_name:
        return {node['name']: node
                for node in nodes}
    return nodes


def update_lock(name, description=None, status=None, ssh_pub_key=None):
    """
    Update the description, up/down status and/or ssh public key of a
    machine on the lock server.

    Only the arguments that are not None are sent.  ``status`` is sent as
    the boolean field 'up', true when it equals 'up'.

    If the lock server reports the machine as a virtual machine, this
    first blocks until ssh_keyscan() returns a non-empty result for it,
    sleeping 10 seconds before every attempt.  The scanned key itself is
    not used.

    Returns the ``ok`` flag of the update request, or True when there was
    nothing to update.
    """
    name = misc.canonicalize_hostname(name, user=None)
    status_info = get_status(name)
    # get_status() can come back falsy for a machine the lock server does
    # not know, and 'is_vm' is not guaranteed to be present; in both cases
    # simply skip the wait instead of raising.
    if status_info and status_info.get('is_vm'):
        ssh_key = None
        while not ssh_key:
            time.sleep(10)
            ssh_key = ssh_keyscan([name])
    updated = {}
    if description is not None:
        updated['description'] = description
    if status is not None:
        updated['up'] = (status == 'up')
    if ssh_pub_key is not None:
        updated['ssh_pub_key'] = ssh_pub_key

    if updated:
        uri = os.path.join(config.lock_server, 'nodes', name, '')
        response = requests.put(
            uri,
            json.dumps(updated))
        return response.ok
    return True


def update_inventory(node_dict):
    """
    Push a complete node record to the lock server without any extra
    processing (unlike update_lock()).

    The record must carry a non-empty 'name', otherwise ValueError is
    raised.  When no lock server is configured nothing happens and None is
    returned.  The node is updated with a PUT; if the server answers 404
    it is created with a POST instead.  Returns the ``ok`` flag of the
    last response.
    """
    name = node_dict.get('name')
    if not name:
        raise ValueError("must specify name")

    lock_server = config.lock_server
    if not lock_server:
        return

    log.info("Updating %s on lock server", name)
    response = requests.put(
        os.path.join(lock_server, 'nodes', name, ''),
        json.dumps(node_dict))
    if response.status_code == 404:
        # Unknown node: create it instead
        log.info("Creating new node %s on lock server", name)
        response = requests.post(
            os.path.join(lock_server, 'nodes', ''),
            json.dumps(node_dict))

    succeeded = response.ok
    if not succeeded:
        log.error("Node update/creation failed for %s: %s",
                  name, response.text)
    return succeeded


def ssh_keyscan(hostnames):
    """
    Fetch the SSH public key of one or more hosts

    ``hostnames`` must be a list (a bare string raises TypeError).  The
    names are canonicalized and handed to 'ssh-keyscan -t rsa'.

    Returns a dict mapping the host field of each output line to the rest
    of that line.  Lines on stderr that do not start with '#' are logged
    as errors.
    """
    if isinstance(hostnames, basestring):
        raise TypeError("'hostnames' must be a list")
    hostnames = [misc.canonicalize_hostname(name, user=None) for name in
                 hostnames]
    args = ['ssh-keyscan', '-t', 'rsa'] + hostnames
    p = subprocess.Popen(
        args=args,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    # communicate() drains both pipes while waiting for the process.
    # Calling wait() first can deadlock once the child has filled a pipe
    # buffer, which is realistic when scanning many hosts.
    out, err = p.communicate()

    for line in err.splitlines():
        if line and not line.startswith('#'):
            log.error(line)

    keys_dict = dict()
    for line in out.splitlines():
        fields = line.strip().split(' ', 1)
        # Skip blank or malformed lines instead of failing on the unpack
        if len(fields) != 2:
            continue
        host, key = fields
        keys_dict[host] = key
    return keys_dict


def updatekeys(args):
    """
    Command entry point: refresh the ssh public keys stored on the lock
    server.

    ``args`` is a mapping holding the options '--verbose', '--all',
    '<machine>' and '--targets'.  The machines to scan are, in order of
    precedence: all machines known to the lock server (--all), the
    machines named explicitly, or the keys of the 'targets' section of
    every document in the given yaml file.

    Returns the result of do_update_keys().
    """
    loglevel = logging.DEBUG if args['--verbose'] else logging.INFO
    logging.basicConfig(
        level=loglevel,
    )
    all_ = args['--all']
    # Start from an empty list so every path below leaves 'machines' bound
    machines = []
    if not all_:
        if args['<machine>']:
            machines = [misc.canonicalize_hostname(m, user=None)
                        for m in args['<machine>']]
        elif args['--targets']:
            with open(args['--targets']) as f:
                for doc in yaml.safe_load_all(f):
                    # Collect the targets of every document; empty
                    # documents (None) and empty sections add nothing.
                    machines.extend((doc or {}).get('targets') or {})

    return do_update_keys(machines, all_)


def do_update_keys(machines, all_=False):
    """
    Scan the ssh keys of the given machines and push changed keys to the
    lock server.

    With ``all_`` set, every machine known to the lock server is scanned
    and ``machines`` is ignored.

    Returns 0 on success and 1 when the lock server could not be queried
    or at least one key update failed.
    """
    reference = list_locks(keyed_by_name=True)
    if reference is None:
        # list_locks() signals a failed query with None
        log.error('error retrieving lock statuses')
        return 1
    if all_:
        machines = list(reference)
    keys_dict = ssh_keyscan(machines)
    return push_new_keys(keys_dict, reference)


def push_new_keys(keys_dict, reference):
    """
    Store every scanned key that differs from the lock server's copy.

    ``keys_dict`` maps hostname to the freshly scanned public key;
    ``reference`` maps hostname to the node record currently held by the
    lock server (its 'ssh_pub_key' entry is compared).

    Returns 0 when everything went fine, 1 when a host is missing from
    ``reference`` or an update failed.
    """
    ret = 0
    for hostname, pubkey in keys_dict.items():
        log.info('Checking %s', hostname)
        node = reference.get(hostname)
        if node is None:
            # A scanned host without a record cannot be compared; report
            # it and carry on with the remaining hosts.
            log.error('%s is not known to the lock server', hostname)
            ret = 1
            continue
        if node['ssh_pub_key'] != pubkey:
            log.info('New key found. Updating...')
            if not update_lock(hostname, ssh_pub_key=pubkey):
                log.error('failed to update %s!', hostname)
                ret = 1
    return ret


def do_summary(ctx):
    """
    Print a table with the number of machines, and how many of them are
    up, per (owner, machine type) pair.

    Unlocked machines are grouped under the owner '(free)'.  The listing
    is limited to ctx.machine_type when that is set.  Rows are sorted by
    machine type, then by count, and followed by a totals line.

    Nothing is printed (an error is logged) when the lock server cannot be
    queried.
    """
    # Per (owner, machine_type): [count, count of machines up, machine_type]
    lockd = collections.defaultdict(lambda: [0, 0, 'unknown'])
    if ctx.machine_type:
        locks = list_locks(machine_type=ctx.machine_type)
    else:
        locks = list_locks()
    if locks is None:
        # list_locks() signals a failed query with None
        log.error('error retrieving lock statuses')
        return
    for node in locks:
        owner = node['locked_by'] if node['locked'] == 1 else '(free)'
        who = (owner, node['machine_type'])
        tally = lockd[who]
        tally[0] += 1
        tally[1] += 1 if node['up'] else 0
        tally[2] = node['machine_type']

    rows = sorted(lockd.items(),
                  key=lambda row: (row[1][2], row[1][0]))
    total_count, total_up = 0, 0
    print("TYPE     COUNT  UP  OWNER")

    for who, (count, upcount, machinetype) in rows:
        print("{machinetype:8s} {count:3d}  {up:3d}  {owner}".format(
            count=count, up=upcount, owner=who[0],
            machinetype=machinetype))
        total_count += count
        total_up += upcount

    print("         ---  ---")
    print("{cnt:12d}  {up:3d}".format(cnt=total_count, up=total_up))