Support power cycling osds/nodes through ipmi
This patch defines a RemoteConsole class associated with each Remote class instance, allowing power cycling a target through ipmi. Fixes/Implements #3782.

Signed-off-by: Sam Lang <sam.lang@inktank.com>
Reviewed-by: Josh Durgin <josh.durgin@inktank.com>
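For context, a minimal usage sketch of the interface this patch adds (not part of the diff; the hostname, IPMI credentials, and domain are placeholders, while the constructor arguments and method names are taken from the RemoteConsole class in the changes below):

    from teuthology.orchestra import remote

    # Placeholder target and IPMI settings, for illustration only.
    console = remote.RemoteConsole(name='plana01.example.com',
                                   ipmiuser='ipmi-user',
                                   ipmipass='ipmi-pass',
                                   ipmidomain='ipmi.example.com')

    # check_status() returns True once the serial-over-LAN console reaches a
    # login prompt; the power_* methods drive the chassis through ipmitool.
    if not console.check_status():
        console.power_on()
    console.power_cycle()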
parent 87b9849628
commit 58111595d4
@@ -11,4 +11,4 @@ beanstalkc >=0.2.0
nose >=1.0.0
fudge >=1.0.3
boto >=2.0b4
pexpect
setup.py
@@ -23,6 +23,7 @@ setup(
        'beanstalkc >=0.2.0',
        'nose >=1.0.0',
        'fudge >=1.0.3',
        'pexpect',
        ],

    # to find the code associated with entry point
@@ -1,8 +1,6 @@
import argparse
import yaml

from teuthology import misc as teuthology

def parse_args():
    from teuthology.run import config_file
    from teuthology.run import MergeConfig

@@ -150,10 +148,11 @@ def remove_osd_mounts(ctx, log):
    unmount any osd data mounts (scratch disks)
    """
    from .orchestra import run
    from teuthology.misc import get_testdir
    ctx.cluster.run(
        args=[
            'grep',
            '{tdir}/data/'.format(tdir=teuthology.get_testdir(ctx)),
            '{tdir}/data/'.format(tdir=get_testdir(ctx)),
            '/etc/mtab',
            run.Raw('|'),
            'awk', '{print $2}', run.Raw('|'),

@@ -223,12 +222,13 @@ def reset_syslog_dir(ctx, log):
        proc.exitstatus.get()

def remove_testing_tree(ctx, log):
    from teuthology.misc import get_testdir
    nodes = {}
    for remote in ctx.cluster.remotes.iterkeys():
        proc = remote.run(
            args=[
                'sudo', 'rm', '-rf',
                teuthology.get_testdir(ctx),
                get_testdir(ctx),
                ],
            wait=False,
            )

@@ -355,6 +355,29 @@ def nuke_one(ctx, targets, log, should_unlock, synch_clocks, reboot_all, check_l
    return ret

def nuke_helper(ctx, log):
    # ensure node is up with ipmi
    from teuthology.orchestra import remote

    (target,) = ctx.config['targets'].keys()
    host = target.split('@')[-1]
    shortname = host.split('.')[0]
    log.debug('shortname: %s' % shortname)
    console = remote.RemoteConsole(name=host,
                                   ipmiuser=ctx.teuthology_config['ipmi_user'],
                                   ipmipass=ctx.teuthology_config['ipmi_password'],
                                   ipmidomain=ctx.teuthology_config['ipmi_domain'])
    cname = '{host}.{domain}'.format(host=shortname, domain=ctx.teuthology_config['ipmi_domain'])
    log.info('checking console status of %s' % cname)
    if not console.check_status():
        # not powered on or can't get IPMI status. Try to power on
        console.power_on()
        # try to get status again, waiting for login prompt this time
        if not console.check_status(100):
            log.error('Failed to get console status for %s, disabling console...' % cname)
        log.info('console ready on %s' % cname)
    else:
        log.info('console ready on %s' % cname)

    from teuthology.task.internal import check_lock, connect
    if ctx.check_locks:
        check_lock(ctx, None)
@@ -1,4 +1,5 @@
from . import run
import time

class Remote(object):
    """

@@ -10,10 +11,11 @@ class Remote(object):
    # for unit tests to hook into
    _runner = staticmethod(run.run)

    def __init__(self, name, ssh, shortname=None):
    def __init__(self, name, ssh, shortname=None, console=None):
        self.name = name
        self._shortname = shortname
        self.ssh = ssh
        self.console = console

    @property
    def shortname(self):

@@ -40,3 +42,164 @@ class Remote(object):
        r = self._runner(client=self.ssh, **kwargs)
        r.remote = self
        return r

import pexpect
import re
import logging

log = logging.getLogger(__name__)

class RemoteConsole(object):

    def __init__(self, name, ipmiuser, ipmipass, ipmidomain, logfile=None, timeout=20):
        self.name = name
        self.ipmiuser = ipmiuser
        self.ipmipass = ipmipass
        self.ipmidomain = ipmidomain
        self.timeout = timeout
        self.logfile = None

        hn = self.name.split('@')[-1]
        p = re.compile('([^.]+)\.?.*')
        self.shortname = p.match(hn).groups()[0]

    def _exec(self, cmd):
        if not self.ipmiuser or not self.ipmipass or not self.ipmidomain:
            log.error('Must set ipmi_user, ipmi_password, and ipmi_domain in .teuthology.yaml')
        log.debug('pexpect command: ipmitool -H {s}.{dn} -I lanplus -U {ipmiuser} -P {ipmipass} {cmd}'.format(
            cmd=cmd,
            s=self.shortname,
            dn=self.ipmidomain,
            ipmiuser=self.ipmiuser,
            ipmipass=self.ipmipass))

        child = pexpect.spawn ('ipmitool -H {s}.{dn} -I lanplus -U {ipmiuser} -P {ipmipass} {cmd}'.format(
            cmd=cmd,
            s=self.shortname,
            dn=self.ipmidomain,
            ipmiuser=self.ipmiuser,
            ipmipass=self.ipmipass))
        if self.logfile:
            child.logfile = self.logfile
        return child

    def check_power(self, state, timeout=None):
        # check power
        total_timeout = timeout
        if not total_timeout:
            total_timeout = self.timeout
        t = 1
        total = t
        ta = time.time()
        while total < total_timeout:
            c = self._exec('power status')
            r = c.expect(['Chassis Power is {s}'.format(s=state), pexpect.EOF, pexpect.TIMEOUT], timeout=t)
            tb = time.time()
            if r == 0:
                return True
            elif r == 1:
                # keep trying if EOF is reached, first sleep for remaining
                # timeout interval
                if tb - ta < t:
                    time.sleep(t - (tb - ta))
            # go around again if EOF or TIMEOUT
            ta = tb
            t *= 2
            total += t
        return False

    # returns True if console is at login prompt
    def check_status(self, timeout=None):
        try :
            # check for login prompt at console
            self._wait_for_login(timeout)
            return True
        except Exception as e:
            log.info('Failed to get ipmi console status for {s}: {e}'.format(s=self.shortname, e=e))
            return False

    def _exit_session(self, child, timeout=None):
        child.send('~.')
        t = timeout
        if not t:
            t = self.timeout
        r = child.expect(['terminated ipmitool', pexpect.TIMEOUT, pexpect.EOF], timeout=t)
        if r != 0:
            self._exec('sol deactivate')

    def _wait_for_login(self, timeout=None, attempts=3):
        log.debug('Waiting for login prompt on {s}'.format(s=self.shortname))
        # wait for login prompt to indicate boot completed
        t = timeout
        if not t:
            t = self.timeout
        for i in range(0, attempts):
            start = time.time()
            while time.time() - start < t:
                child = self._exec('sol activate')
                child.send('\n')
                log.debug('expect: {s} login'.format(s=self.shortname))
                r = child.expect(['{s} login: '.format(s=self.shortname), pexpect.TIMEOUT, pexpect.EOF], timeout=(t - (time.time() - start)))
                log.debug('expect before: {b}'.format(b=child.before))
                log.debug('expect after: {a}'.format(a=child.after))

                self._exit_session(child)
                if r == 0:
                    return
        raise pexpect.TIMEOUT ('Timeout exceeded ({t}) for {a} attempts in _wait_for_login()'.format(t=t, a=attempts))

    def power_cycle(self):
        log.info('Power cycling {s}'.format(s=self.shortname))
        child = self._exec('power cycle')
        child.expect('Chassis Power Control: Cycle', timeout=self.timeout)
        self._wait_for_login()
        log.info('Power cycle for {s} completed'.format(s=self.shortname))

    def hard_reset(self):
        log.info('Performing hard reset of {s}'.format(s=self.shortname))
        start = time.time()
        while time.time() - start < self.timeout:
            child = self._exec('power reset')
            r = child.expect(['Chassis Power Control: Reset', pexpect.EOF], timeout=self.timeout)
            if r == 0:
                break
        self._wait_for_login()
        log.info('Hard reset for {s} completed'.format(s=self.shortname))

    def power_off(self):
        log.info('Power off {s}'.format(s=self.shortname))
        start = time.time()
        while time.time() - start < self.timeout:
            child = self._exec('power off')
            r = child.expect(['Chassis Power Control: Down/Off', pexpect.EOF], timeout=self.timeout)
            if r == 0:
                break
        if not self.check_power('off', 60):
            log.error('Failed to power off {s}'.format(s=self.shortname))
        log.info('Power off for {s} completed'.format(s=self.shortname))

    def power_on(self):
        log.info('Power on {s}'.format(s=self.shortname))
        start = time.time()
        while time.time() - start < self.timeout:
            child = self._exec('power on')
            r = child.expect(['Chassis Power Control: Up/On', pexpect.EOF], timeout=self.timeout)
            if r == 0:
                break
        if not self.check_power('on'):
            log.error('Failed to power on {s}'.format(s=self.shortname))
        log.info('Power on for {s} completed'.format(s=self.shortname))

    def power_off_for_interval(self, interval=30):
        log.info('Power off {s} for {i} seconds'.format(s=self.shortname, i=interval))
        child = self._exec('power off')
        child.expect('Chassis Power Control: Down/Off', timeout=self.timeout)

        time.sleep(interval)

        child = self._exec('power on')
        child.expect('Chassis Power Control: Up/On', timeout=self.timeout)

        self._wait_for_login()

        log.info('Power off for {i} seconds completed'.format(s=self.shortname, i=interval))
@@ -56,6 +56,9 @@ class DaemonState(object):
    def running(self):
        return self.proc is not None

    def reset(self):
        self.proc = None


class CephState(object):
    def __init__(self):

@@ -326,6 +329,30 @@ def valgrind_post(ctx, config):
    if valgrind_exception is not None:
        raise valgrind_exception


def mount_osd_data(ctx, remote, osd):
    testdir = teuthology.get_testdir(ctx)
    dev = ctx.disk_config.remote_to_roles_to_dev[remote][osd]
    journal = ctx.disk_config.remote_to_roles_to_journals[remote][osd]
    mount_options = ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][osd]
    fstype = ctx.disk_config.remote_to_roles_to_dev_fstype[remote][osd]

    remote.run(
        args=[
            'sudo',
            'mount',
            '-t', fstype,
            '-o', ','.join(mount_options),
            dev,
            os.path.join('{tdir}/data'.format(tdir=testdir), 'osd.{id}.data'.format(id=osd)),
            ]
        )

    if journal == ('/mnt/osd.%s' % osd):
        remote.run( args=[ 'sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt' ] )
        tmpfs = '/mnt/osd.%s' % osd
        remote.run( args=[ 'truncate', '-s', '1500M', tmpfs ] )

@contextlib.contextmanager
def cluster(ctx, config):
    testdir = teuthology.get_testdir(ctx)

@@ -350,7 +377,7 @@ def cluster(ctx, config):
    roles_to_devs = {}
    roles_to_journals = {}
    if config.get('fs'):
        log.info('fs option selected, checkin for scratch devs')
        log.info('fs option selected, checking for scratch devs')
        log.info('found devs: %s' % (str(devs),))
        roles_to_devs = assign_devs(
            teuthology.roles_of_type(roles_for_host, 'osd'), devs

@@ -359,7 +386,7 @@ def cluster(ctx, config):
    devs = devs[len(roles_to_devs):]
    log.info('dev map: %s' % (str(roles_to_devs),))
    devs_to_clean[remote] = []

    if config.get('block_journal'):
        log.info('block journal enabled')
        roles_to_journals = assign_devs(

@@ -703,6 +730,12 @@ def cluster(ctx, config):
            os.path.join('{tdir}/data'.format(tdir=testdir), 'osd.{id}.data'.format(id=id_)),
            ]
        )
    if not remote in ctx.disk_config.remotes_to_roles_to_dev_mount_options:
        ctx.disk_config.remotes_to_roles_to_dev_mount_options[remote] = {}
    ctx.disk_config.remotes_to_roles_to_dev_mount_options[remote][id_] = mount_options
    if not remote in ctx.disk_config.remotes_to_roles_to_dev_fstype:
        ctx.disk_config.remotes_to_roles_to_dev_fstype[remote] = {}
    ctx.disk_config.remotes_to_roles_to_dev_fstype[remote][id_] = fs
    remote.run(
        args=[
            'sudo', 'chown', '-R', 'ubuntu.ubuntu',
@@ -6,6 +6,7 @@ import gevent
import json
import threading
from teuthology import misc as teuthology
from teuthology.task import ceph as ceph_task

class Thrasher:
    def __init__(self, manager, config, logger=None):

@@ -212,9 +213,10 @@ class Thrasher:
        self.all_up()

class CephManager:
    def __init__(self, controller, ctx=None, logger=None):
    def __init__(self, controller, ctx=None, config=None, logger=None):
        self.lock = threading.RLock()
        self.ctx = ctx
        self.config = config
        self.controller = controller
        if (logger):
            self.log = lambda x: logger.info(x)

@@ -645,7 +647,12 @@ class CephManager:
        self.raw_cluster_cmd('osd', 'out', str(osd))

    def kill_osd(self, osd):
        self.ctx.daemons.get_daemon('osd', osd).stop()
        if 'powercycle' in self.config and self.config['powercycle']:
            (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
            self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
            remote.console.power_off()
        else:
            self.ctx.daemons.get_daemon('osd', osd).stop()

    def blackhole_kill_osd(self, osd):
        self.raw_cluster_cmd('--', 'tell', 'osd.%d' % osd,

@@ -654,6 +661,11 @@ class CephManager:
        self.ctx.daemons.get_daemon('osd', osd).stop()

    def revive_osd(self, osd):
        if 'powercycle' in self.config and self.config['powercycle']:
            (remote,) = self.ctx.cluster.only('osd.{o}'.format(o=osd)).remotes.iterkeys()
            self.log('kill_osd on osd.{o} doing powercycle of {s}'.format(o=osd, s=remote.name))
            remote.console.hard_reset()
            ceph_task.mount_osd_data(self.ctx, remote, osd)
        self.ctx.daemons.get_daemon('osd', osd).restart()

    def mark_down_osd(self, osd):

@@ -666,9 +678,18 @@ class CephManager:
    ## monitors

    def kill_mon(self, mon):
        self.ctx.daemons.get_daemon('mon', mon).stop()
        if 'powercycle' in self.config and self.config['powercycle']:
            (remote,) = self.ctx.cluster.only('mon.{m}'.format(m=mon)).remotes.iterkeys()
            self.log('kill_mon on mon.{m} doing powercycle of {s}'.format(m=mon, s=remote.name))
            remote.console.power_off()
        else:
            self.ctx.daemons.get_daemon('mon', mon).stop()

    def revive_mon(self, mon):
        if 'powercycle' in self.config and self.config['powercycle']:
            (remote,) = self.ctx.cluster.only('mon.{m}'.format(m=mon)).remotes.iterkeys()
            self.log('revive_mon on mon.{m} doing powercycle of {s}'.format(m=mon, s=remote.name))
            remote.console.power_on()
        self.ctx.daemons.get_daemon('mon', mon).restart()

    def get_mon_status(self, mon):

@@ -701,24 +722,30 @@ class CephManager:
    ## metadata servers

    def kill_mds(self, mds):
        self.ctx.daemons.get_daemon('mds', mds).stop()
        if 'powercycle' in self.config and self.config['powercycle']:
            (remote,) = self.ctx.cluster.only('mds.{m}'.format(m=mds)).remotes.iterkeys()
            self.log('kill_mds on mds.{m} doing powercycle of {s}'.format(m=mds, s=remote.name))
            remote.console.power_off()
        else:
            self.ctx.daemons.get_daemon('mds', mds).stop()

    def kill_mds_by_rank(self, rank):
        status = self.get_mds_status_by_rank(rank)
        self.ctx.daemons.get_daemon('mds', status['name']).stop()
        self.kill_mds(status['name'])

    def revive_mds(self, mds, standby_for_rank=None):
        if 'powercycle' in self.config and self.config['powercycle']:
            (remote,) = self.ctx.cluster.only('mds.{m}'.format(m=mds)).remotes.iterkeys()
            self.log('revive_mds on mds.{m} doing powercycle of {s}'.format(m=mds, s=remote.name))
            remote.console.power_on()
        args = []
        if standby_for_rank:
            args.extend(['--hot-standby', standby_for_rank])
        self.ctx.daemons.get_daemon('mds', mds).restart(*args)

    def revive_mds_by_rank(self, rank, standby_for_rank=None):
        args = []
        if standby_for_rank:
            args.extend(['--hot-standby', standby_for_rank])
        status = self.get_mds_status_by_rank(rank)
        self.ctx.daemons.get_daemon('mds', status['name']).restart(*args)
        self.revive_mds(status['name'], standby_for_rank)

    def get_mds_status(self, mds):
        out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
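For illustration, a short sketch (not part of the diff) of how the powercycle option added above is meant to be used; mon, ctx, and log are assumed to be defined as in the thrasher task changes below:

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        config={'powercycle': True},  # illustrative inline config; the task normally passes its own config
        logger=log.getChild('ceph_manager'),
    )
    # With powercycle set, kill_osd() powers off the node hosting the osd via
    # remote.console, and revive_osd() hard-resets it, remounts the osd data
    # (ceph_task.mount_osd_data) and restarts the daemon.
    manager.kill_osd(0)
    manager.revive_osd(0)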
@@ -149,12 +149,29 @@ def connect(ctx, config):
    from ..orchestra import cluster
    remotes = []
    for t, key in ctx.config['targets'].iteritems():
        console = None
        if 'ipmi_user' in ctx.teuthology_config:
            host = t.split('@')[-1]
            shortname = host.split('.')[0]
            console = remote.RemoteConsole(name=host,
                                           ipmiuser=ctx.teuthology_config['ipmi_user'],
                                           ipmipass=ctx.teuthology_config['ipmi_password'],
                                           ipmidomain=ctx.teuthology_config['ipmi_domain'])
            cname = '{host}.{domain}'.format(host=shortname, domain=ctx.teuthology_config['ipmi_domain'])
            log.debug('checking console status of %s' % cname)
            if not console.check_status():
                log.info('Failed to get console status for %s, disabling console...' % cname)
                console=None
            else:
                log.debug('console ready on %s' % cname)

        log.debug('connecting to %s', t)
        remotes.append(
            remote.Remote(name=t,
                          ssh=connection.connect(user_at_host=t,
                                                 host_key=key,
                                                 keep_alive=True)))
                                                 keep_alive=True),
                          console=console))
    ctx.cluster = cluster.Cluster()
    if 'roles' in ctx.config:
        for rem, roles in zip(remotes, ctx.config['roles']):
@@ -68,6 +68,9 @@ def task(ctx, config):
    chance_inject_pause_short: (1) chance of injecting short stall
    chance_inject_pause_long: (0) chance of injecting long stall

    powercycle: (false) whether to power cycle the node instead
                of just the osd process

    example:

    tasks:

@@ -89,6 +92,7 @@ def task(ctx, config):
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        config=config,
        logger=log.getChild('ceph_manager'),
        )
    ctx.manager = manager