ceph/tasks/die_on_err.py

71 lines
1.9 KiB
Python
Raw Normal View History

"""
Raise exceptions on osd coredumps or test err directories
"""
import contextlib
import logging
import time
from teuthology.orchestra import run
import ceph_manager
from teuthology import misc as teuthology
log = logging.getLogger(__name__)
@contextlib.contextmanager
def task(ctx, config):
"""
Die if {testdir}/err exists or if an OSD dumps core
"""
if config is None:
config = {}
first_mon = teuthology.get_first_mon(ctx, config)
(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
log.info('num_osds is %s' % num_osds)
manager = ceph_manager.CephManager(
mon,
ctx=ctx,
logger=log.getChild('ceph_manager'),
)
while len(manager.get_osd_status()['up']) < num_osds:
time.sleep(10)
testdir = teuthology.get_testdir(ctx)
while True:
for i in range(num_osds):
(osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.iterkeys()
p = osd_remote.run(
args = [ 'test', '-e', '{tdir}/err'.format(tdir=testdir) ],
wait=True,
check_status=False,
)
exit_status = p.exitstatus
if exit_status == 0:
log.info("osd %d has an error" % i)
raise Exception("osd %d error" % i)
log_path = '/var/log/ceph/osd.%d.log' % (i)
p = osd_remote.run(
args = [
'tail', '-1', log_path,
run.Raw('|'),
'grep', '-q', 'end dump'
],
wait=True,
check_status=False,
)
exit_status = p.exitstatus
if exit_status == 0:
log.info("osd %d dumped core" % i)
raise Exception("osd %d dumped core" % i)
time.sleep(5)