ceph/teuthology/task/die_on_err.py
Mike Ryan f8e1f5c222 task: die on ceph error or coredump
This task allows ceph to signal to teuth that it should die immediately
by touching a file under /tmp/cephtest

Signed-off-by: Mike Ryan <mike.ryan@inktank.com>
2012-09-04 09:52:38 -07:00

69 lines
1.8 KiB
Python

import contextlib
import logging
import os
import time
from ..orchestra import run
import ceph_manager
from teuthology import misc as teuthology
log = logging.getLogger(__name__)
@contextlib.contextmanager
def task(ctx, config):
"""
Die if /tmp/cephtest/err exists or if an OSD dumps core
"""
if config is None:
config = {}
first_mon = teuthology.get_first_mon(ctx, config)
(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
log.info('num_osds is %s' % num_osds)
manager = ceph_manager.CephManager(
mon,
ctx=ctx,
logger=log.getChild('ceph_manager'),
)
while len(manager.get_osd_status()['up']) < num_osds:
time.sleep(10)
log_path = '/tmp/cephtest/archive/log'
while True:
for i in range(num_osds):
(osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.iterkeys()
p = osd_remote.run(
args = [ 'test', '-e', '/tmp/cephtest/err' ],
wait=True,
check_status=False,
)
exit_status = p.exitstatus
if exit_status == 0:
log.info("osd %d has an error" % i)
raise Exception("osd %d error" % i)
log_path = '/tmp/cephtest/archive/log/osd.%d.log' % i
p = osd_remote.run(
args = [
'tail', '-1', log_path,
run.Raw('|'),
'grep', '-q', 'end dump'
],
wait=True,
check_status=False,
)
exit_status = p.exitstatus
if exit_status == 0:
log.info("osd %d dumped core" % i)
raise Exception("osd %d dumped core" % i)
time.sleep(5)