Support added for running scheduled tasks on virtual machines.

This included:
    A.) Changes made so that full path names are used for some files
        (scheduled tasks start in different home directories).
    B.) Changes to ensure tasks come up on the beanstalkc queue properly.
    C.) Finding and inserting the libvirt equivalent code for virtual
        machines in order to simulate ipmi actions.
    D.) Fix host key code, report valgrind issue more clearly.
    E.) Some message and downburst call changes.

    Fix #4988
    Fix #5122
    Signed-off-by: Warren Usui <warren.usui@inktank.com>
This commit is contained in:
Warren Usui 2013-06-06 18:43:43 -07:00
parent c95698045d
commit a4994e3bde
8 changed files with 141 additions and 66 deletions

View File

@ -1,7 +1,7 @@
#!/bin/sh
set -e
for package in python-dev python-pip python-virtualenv libevent-dev; do
for package in python-dev python-pip python-virtualenv libevent-dev python-libvirt; do
if [ "$(dpkg --status -- $package|sed -n 's/^Status: //p')" != "install ok installed" ]; then
# add a space after old values
missing="${missing:+$missing }$package"
@ -13,7 +13,9 @@ if [ -n "$missing" ]; then
exit 1
fi
virtualenv --no-site-packages --distribute virtualenv
# site packages needed because libvirt python bindings are not nicely
# packaged
virtualenv --system-site-packages --distribute virtualenv
# avoid pip bugs
./virtualenv/bin/pip install --upgrade pip

View File

@ -8,6 +8,7 @@ import re
import collections
import tempfile
import os
import time
from teuthology import lockstatus as ls
from teuthology import misc as teuthology
@ -65,6 +66,13 @@ def list_locks(ctx):
return None
def update_lock(ctx, name, description=None, status=None, sshpubkey=None):
status_info = ls.get_status(ctx, name)
phys_host = status_info['vpshost']
if phys_host:
keyscan_out = ''
while not keyscan_out:
time.sleep(10)
keyscan_out, _ = keyscan_check(ctx, [name])
updated = {}
if description is not None:
updated['desc'] = description
@ -540,7 +548,7 @@ def create_if_vm(ctx, machine_name):
if not file_out:
file_info = {}
file_info['disk-size'] = lcnfg.get('disk-size', '30G')
file_info['ram'] = lcnfg.get('ram', '4G')
file_info['ram'] = lcnfg.get('ram', '1.9G')
file_info['cpus'] = lcnfg.get('cpus', 1)
file_info['networks'] = lcnfg.get('networks',
[{'source' : 'front'}])
@ -562,7 +570,7 @@ def create_if_vm(ctx, machine_name):
stdout=subprocess.PIPE,stderr=subprocess.PIPE,)
owt,err = p.communicate()
if err:
log.info("Downburst command to create %s failed: %s" %
log.info("Downburst completed on %s: %s" %
(machine_name,err))
else:
log.info("%s created: %s" % (machine_name,owt))

View File

@ -5,8 +5,11 @@ import logging
log = logging.getLogger(__name__)
def _lock_url(ctx):
return ctx.teuthology_config['lock_server']
try:
return ctx.teuthology_config['lock_server']
except (AttributeError, KeyError):
return "http://teuthology.front.sepia.ceph.com/locker/lock"
def send_request(method, url, body=None, headers=None):
http = httplib2.Http()
resp, content = http.request(url, method=method, body=body, headers=headers)
@ -21,5 +24,3 @@ def get_status(ctx, name):
if success:
return json.loads(content)
return None

View File

@ -727,10 +727,14 @@ def reconnect(ctx, timeout, remotes=None):
for remote in need_reconnect:
try:
log.info('trying to connect to %s', remote.name)
key = ctx.config['targets'][remote.name]
kstat = lockstatus.get_status(ctx,remote.name)
if 'sshpubkey' in kstat:
key = kstat['sshpubkey']
from .orchestra import connection
remote.ssh = connection.connect(
user_at_host=remote.name,
host_key=ctx.config['targets'][remote.name],
host_key=key,
keep_alive=True,
)
except Exception:

View File

@ -416,7 +416,7 @@ def nuke_helper(ctx, log):
log.debug('shortname: %s' % shortname)
log.debug('{ctx}'.format(ctx=ctx))
if not ctx.noipmi and 'ipmi_user' in ctx.teuthology_config:
console = remote.RemoteConsole(name=host,
console = remote.getRemoteConsole(name=host,
ipmiuser=ctx.teuthology_config['ipmi_user'],
ipmipass=ctx.teuthology_config['ipmi_password'],
ipmidomain=ctx.teuthology_config['ipmi_domain'])

View File

@ -46,22 +46,26 @@ class Remote(object):
import pexpect
import re
import logging
import libvirt
from teuthology import lockstatus as ls
log = logging.getLogger(__name__)
class RemoteConsole(object):
def getShortName(name):
    """
    Return the short host name for a machine name.

    Everything up to and including the last '@' is discarded, and the
    leading run of non-dot characters of what remains is returned, e.g.
    'ubuntu@vpm001.front.sepia.ceph.com' yields 'vpm001'.

    :param name: machine name, optionally prefixed with 'user@'.
    :returns: the text before the first '.' of the host part.
    :raises AttributeError: when the host part is empty or begins with
                            '.', because the pattern then does not match.
    """
    hostname = name.split('@')[-1]
    # Raw string so that '\.' reaches the regex engine as an escaped dot
    # instead of being an invalid string escape sequence.
    matched = re.match(r'([^.]+)\.?.*', hostname)
    return matched.groups()[0]
class PhysicalConsole():
def __init__(self, name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
self.name = name
self.shortname = getShortName(name)
self.timeout = timeout
self.logfile = None
self.ipmiuser = ipmiuser
self.ipmipass = ipmipass
self.ipmidomain = ipmidomain
self.timeout = timeout
self.logfile = None
hn = self.name.split('@')[-1]
p = re.compile('([^.]+)\.?.*')
self.shortname = p.match(hn).groups()[0]
def _exec(self, cmd):
if not self.ipmiuser or not self.ipmipass or not self.ipmidomain:
@ -83,6 +87,34 @@ class RemoteConsole(object):
child.logfile = self.logfile
return child
def _exit_session(self, child, timeout=None):
child.send('~.')
t = timeout
if not t:
t = self.timeout
r = child.expect(['terminated ipmitool', pexpect.TIMEOUT, pexpect.EOF], timeout=t)
if r != 0:
self._exec('sol deactivate')
def _wait_for_login(self, timeout=None, attempts=6):
log.debug('Waiting for login prompt on {s}'.format(s=self.shortname))
# wait for login prompt to indicate boot completed
t = timeout
if not t:
t = self.timeout
for i in range(0, attempts):
start = time.time()
while time.time() - start < t:
child = self._exec('sol activate')
child.send('\n')
log.debug('expect: {s} login'.format(s=self.shortname))
r = child.expect(['{s} login: '.format(s=self.shortname), pexpect.TIMEOUT, pexpect.EOF], timeout=(t - (time.time() - start)))
log.debug('expect before: {b}'.format(b=child.before))
log.debug('expect after: {a}'.format(a=child.after))
self._exit_session(child)
if r == 0:
return
def check_power(self, state, timeout=None):
# check power
total_timeout = timeout
@ -118,36 +150,6 @@ class RemoteConsole(object):
log.info('Failed to get ipmi console status for {s}: {e}'.format(s=self.shortname, e=e))
return False
def _exit_session(self, child, timeout=None):
child.send('~.')
t = timeout
if not t:
t = self.timeout
r = child.expect(['terminated ipmitool', pexpect.TIMEOUT, pexpect.EOF], timeout=t)
if r != 0:
self._exec('sol deactivate')
def _wait_for_login(self, timeout=None, attempts=6):
log.debug('Waiting for login prompt on {s}'.format(s=self.shortname))
# wait for login prompt to indicate boot completed
t = timeout
if not t:
t = self.timeout
for i in range(0, attempts):
start = time.time()
while time.time() - start < t:
child = self._exec('sol activate')
child.send('\n')
log.debug('expect: {s} login'.format(s=self.shortname))
r = child.expect(['{s} login: '.format(s=self.shortname), pexpect.TIMEOUT, pexpect.EOF], timeout=(t - (time.time() - start)))
log.debug('expect before: {b}'.format(b=child.before))
log.debug('expect after: {a}'.format(a=child.after))
self._exit_session(child)
if r == 0:
return
raise pexpect.TIMEOUT ('Timeout exceeded ({t}) for {a} attempts in _wait_for_login()'.format(t=t, a=attempts))
def power_cycle(self):
log.info('Power cycling {s}'.format(s=self.shortname))
child = self._exec('power cycle')
@ -166,18 +168,6 @@ class RemoteConsole(object):
self._wait_for_login()
log.info('Hard reset for {s} completed'.format(s=self.shortname))
def power_off(self):
log.info('Power off {s}'.format(s=self.shortname))
start = time.time()
while time.time() - start < self.timeout:
child = self._exec('power off')
r = child.expect(['Chassis Power Control: Down/Off', pexpect.EOF], timeout=self.timeout)
if r == 0:
break
if not self.check_power('off', 60):
log.error('Failed to power off {s}'.format(s=self.shortname))
log.info('Power off for {s} completed'.format(s=self.shortname))
def power_on(self):
log.info('Power on {s}'.format(s=self.shortname))
start = time.time()
@ -190,6 +180,18 @@ class RemoteConsole(object):
log.error('Failed to power on {s}'.format(s=self.shortname))
log.info('Power on for {s} completed'.format(s=self.shortname))
def power_off(self):
log.info('Power off {s}'.format(s=self.shortname))
start = time.time()
while time.time() - start < self.timeout:
child = self._exec('power off')
r = child.expect(['Chassis Power Control: Down/Off', pexpect.EOF], timeout=self.timeout)
if r == 0:
break
if not self.check_power('off', 60):
log.error('Failed to power off {s}'.format(s=self.shortname))
log.info('Power off for {s} completed'.format(s=self.shortname))
def power_off_for_interval(self, interval=30):
log.info('Power off {s} for {i} seconds'.format(s=self.shortname, i=interval))
child = self._exec('power off')
@ -199,7 +201,54 @@ class RemoteConsole(object):
child = self._exec('power on')
child.expect('Chassis Power Control: Up/On', timeout=self.timeout)
self._wait_for_login()
log.info('Power off for {i} seconds completed'.format(s=self.shortname, i=interval))
class VirtualConsole(object):
    """
    Console for a virtual machine, driven through libvirt.

    Provides the same power-control method names as PhysicalConsole, but
    implements them with libvirt domain calls instead of ipmi commands.
    """

    def __init__(self, name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
        """
        Locate the libvirt domain whose name equals the short host name.

        :param name: machine name; reduced to a short name by getShortName.
        :param ipmiuser: unused; accepted so the signature matches
                         PhysicalConsole.
        :param ipmipass: unused, see ipmiuser.
        :param ipmidomain: unused, see ipmiuser.
        :param logfile: unused, see ipmiuser.
        :param timeout: stored on the instance; not used by this class.

        When the lock status lookup yields nothing usable, the console is
        left without a domain and every power method raises RuntimeError.
        """
        self.name = name
        self.timeout = timeout
        self.shortname = getShortName(name)
        # Always define these so later methods never hit a missing attribute.
        self.connection = None
        self.vm_domain = None
        # An empty string is passed where a ctx object is expected.
        status_info = ls.get_status('', self.shortname)
        try:
            phys_host = status_info['vpshost']
        except (TypeError, KeyError):
            # status_info was None or carries no 'vpshost' entry.
            return
        self.connection = libvirt.open(phys_host)
        # NOTE(review): per the libvirt documentation listDomainsID reports
        # only active domains, so a VM that is shut off is not found here --
        # confirm whether that is intended.
        for domain_id in self.connection.listDomainsID():
            domain = self.connection.lookupByID(domain_id)
            if domain.name() == self.shortname:
                self.vm_domain = domain
                break

    def _domain(self):
        """
        Return the libvirt domain found by __init__.

        :raises RuntimeError: when no matching domain was found.
        """
        domain = getattr(self, 'vm_domain', None)
        if domain is None:
            raise RuntimeError(
                'No libvirt domain found for {s}'.format(s=self.shortname))
        return domain

    def check_power(self, state, timeout=None):
        """
        Report whether the domain's power state matches the requested one.

        :param state: 'off' asks whether the domain is powered off; any
                      other value asks whether it is powered on.
        :param timeout: unused; the libvirt query is answered immediately.
        :returns: True when the domain is in the requested state.
        """
        powered = self._domain().info()[0] in (libvirt.VIR_DOMAIN_RUNNING,
                                               libvirt.VIR_DOMAIN_BLOCKED,
                                               libvirt.VIR_DOMAIN_PAUSED)
        if state == 'off':
            return not powered
        return powered

    def check_status(self, timeout=None):
        """
        Return True when the domain state is VIR_DOMAIN_RUNNING.

        :param timeout: unused.
        """
        return self._domain().info()[0] == libvirt.VIR_DOMAIN_RUNNING

    def power_cycle(self):
        """
        Destroy the domain and then create it again.
        """
        domain = self._domain()
        domain.destroy()
        domain.create()

    def hard_reset(self):
        """
        Destroy the domain.
        """
        # NOTE(review): unlike power_cycle this leaves the domain stopped --
        # confirm that a hard reset should not start it again.
        self._domain().destroy()

    def power_on(self):
        """
        Create (start) the domain.
        """
        self._domain().create()

    def power_off(self):
        """
        Destroy (stop) the domain.
        """
        self._domain().destroy()

    def power_off_for_interval(self, interval=30):
        """
        Destroy the domain, sleep, then create it again.

        :param interval: seconds to sleep between destroy and create.
        """
        domain = self._domain()
        log.info('Power off {s} for {i} seconds'.format(s=self.shortname, i=interval))
        domain.destroy()
        time.sleep(interval)
        domain.create()
        log.info('Power off for {i} seconds completed'.format(s=self.shortname, i=interval))
def getRemoteConsole(name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
    """
    Build the console object appropriate for a machine.

    Names beginning with 'vpm' get a VirtualConsole; every other name gets
    a PhysicalConsole. All arguments are handed to the chosen class
    unchanged.
    """
    console_class = VirtualConsole if name.startswith('vpm') else PhysicalConsole
    return console_class(name, ipmiuser, ipmipass, ipmidomain, logfile, timeout)

View File

@ -106,6 +106,7 @@ def lock_machines(ctx, config):
while not keyscan_out:
time.sleep(10)
keyscan_out, current_locks = lock.keyscan_check(ctx, vmlist)
log.info('virtual machine is stil unavailable')
if lock.update_keys(ctx, keyscan_out, current_locks):
log.info("Error in virtual machine keys")
newscandict = {}
@ -172,8 +173,18 @@ def connect(ctx, config):
from ..orchestra import connection, remote
from ..orchestra import cluster
remotes = []
for t, key in ctx.config['targets'].iteritems():
machs = []
for name in ctx.config['targets'].iterkeys():
machs.append(name)
lock.scan_for_locks(ctx, machs)
for t, xkey in ctx.config['targets'].iteritems():
log.debug('connecting to %s', t)
log.info('Key is :%s:', xkey)
oldkeystatus = lockstatus.get_status(ctx, t)
key = xkey
if 'sshpubkey' in oldkeystatus:
log.info('possible key is :%s:',oldkeystatus['sshpubkey'])
key = oldkeystatus['sshpubkey']
remotes.append(
remote.Remote(name=t,
ssh=connection.connect(user_at_host=t,
@ -492,16 +503,16 @@ def vm_setup(ctx, config):
Look for virtual machines and handle their initialization
"""
with parallel() as p:
editinfo = './teuthology/task/edit_sudoers.sh'
editinfo = os.path.join(os.path.dirname(__file__),'edit_sudoers.sh')
for remote in ctx.cluster.remotes.iterkeys():
mname = re.match(".*@([^\.]*)\..*", str(remote)).group(1)
if mname[0:3] == 'vpm':
if mname.startswith('vpm'):
r = remote.run(args=['test', '-e', '/ceph-qa-ready',],
stdout=StringIO(),
check_status=False,)
if r.exitstatus != 0:
p1 = subprocess.Popen(['cat', editinfo], stdout=subprocess.PIPE)
p2 = subprocess.Popen(['ssh','-t','-t',str(remote)],stdin=p1.stdout, stdout=subprocess.PIPE)
p2 = subprocess.Popen(['ssh','-t','-t',str(remote), 'sudo', 'sh'],stdin=p1.stdout, stdout=subprocess.PIPE)
_,err = p2.communicate()
if err:
log.info("Edit of /etc/sudoers failed: %s",err)

View File

@ -105,7 +105,7 @@ def task(ctx, config):
host = t.split('@')[-1]
shortname = host.split('.')[0]
from ..orchestra import remote as oremote
console = oremote.RemoteConsole(
console = oremote.getRemoteConsole(
name=host,
ipmiuser=ctx.teuthology_config['ipmi_user'],
ipmipass=ctx.teuthology_config['ipmi_password'],