mirror of
https://github.com/ceph/ceph
synced 2024-12-26 13:33:57 +00:00
620dd5511b
Will run for as long as teuthology runs. By default, fails if any clock skew higher than 0.05 seconds is detected, but will only fail when the teuthology run finishes, after reporting a list of all the detected skews.
Accepted options:
  interval: number of seconds to wait in-between checks. (default: 30.0)
  max-skew: maximum skew, in seconds, that is considered tolerable before issuing a warning. (default: 0.05)
  expect-skew: 'true' or 'false', to indicate whether to expect a skew during the run or not. If 'true', the test will fail if no skew is found, and succeed if a skew is indeed found; if 'false', it's the other way around. (default: false)
  never-fail: don't fail the run if a skew is detected and we weren't expecting it, or if no skew is detected and we were expecting it. (default: False)
Signed-off-by: Joao Eduardo Luis <jecluis@gmail.com>
163 lines
5.2 KiB
Python
163 lines
5.2 KiB
Python
import logging
|
|
import contextlib
|
|
import ceph_manager
|
|
import time
|
|
import gevent
|
|
import json
|
|
from teuthology import misc as teuthology
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
class ClockSkewCheck:
    """
    Periodically check if there are any clock skews among the monitors in the
    quorum. By default, assume no skews are supposed to exist; that can be
    changed using the 'expect-skew' option. If 'never-fail' is set to true,
    then we will always succeed and only report skews if any are found.

    This class does not spawn a thread. It assumes that, if that is indeed
    wanted, it should be done by a third party (for instance, the task using
    this class). We intend it as such in order to reuse this class if need be.

    This task accepts the following options:

    interval     number of seconds to wait in-between checks. (default: 30.0)
    max-skew     maximum skew, in seconds, that is considered tolerable before
                 issuing a warning. (default: 0.05)
    expect-skew  'true' or 'false', to indicate whether to expect a skew during
                 the run or not. If 'true', the test will fail if no skew is
                 found, and succeed if a skew is indeed found; if 'false', it's
                 the other way around. (default: false)
    never-fail   Don't fail the run if a skew is detected and we weren't
                 expecting it, or if no skew is detected and we were expecting
                 it. (default: False)

    Example:
        Expect a skew higher than 0.05 seconds, but only report it without
        failing the teuthology run.

        - mon_clock_skew_check:
            interval: 30
            max-skew: 0.05
            expect-skew: true
            never-fail: true
    """

    def __init__(self, ctx, manager, config, logger):
        """
        Store the collaborators and read the options out of `config`.

        ctx      -- passed to teuthology.get_mon_names() on every check
        manager  -- object providing wait_for_mon_quorum_size() and
                    get_mon_health()
        config   -- dict of options (see the class docstring), or None to
                    use the defaults
        logger   -- object providing info() and warning()
        """
        self.ctx = ctx
        self.manager = manager

        self.stopping = False
        self.logger = logger
        self.config = config

        if self.config is None:
            self.config = dict()

        self.check_interval = float(self.config.get('interval', 30.0))
        self.max_skew = float(self.config.get('max-skew', 0.05))
        self.expect_skew = self.config.get('expect-skew', False)
        self.never_fail = self.config.get('never-fail', False)

    def info(self, x):
        """Log `x` at info level."""
        self.logger.info(x)

    def warn(self, x):
        """Log `x` at warning level."""
        # Logger.warn is a deprecated alias of Logger.warning.
        self.logger.warning(x)

    def finish(self):
        """Ask do_check() to leave its loop once the current pass ends."""
        self.stopping = True

    def _record_skews(self, timechecks, skews):
        """
        Add every timecheck entry whose skew exceeds max_skew to `skews`.

        timechecks -- iterable of dicts with the keys 'skew', 'health',
                      'name' and 'details'
        skews      -- dict updated in place, keyed by monitor name

        Raises AssertionError if a monitor exceeds max_skew while its
        reported health is anything other than 'HEALTH_WARN'.
        """
        for timecheck in timechecks:
            mon_skew = float(timecheck['skew'])
            mon_health = timecheck['health']
            mon_id = timecheck['name']
            if mon_skew > self.max_skew:
                if mon_health != 'HEALTH_WARN':
                    # Raised explicitly (instead of via `assert`) so the
                    # check survives `python -O`; 'health' is passed to
                    # format() so the message can actually be built.
                    raise AssertionError(
                        'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format(
                            id=mon_id, health=mon_health,
                            s=mon_skew, ms=self.max_skew))

                log_str = 'mon.{id} with skew {s} > max {ms}'.format(
                    id=mon_id, s=mon_skew, ms=self.max_skew)

                # add to skew list; a later check of the same monitor
                # overwrites its earlier entry
                details = timecheck['details']
                skews[mon_id] = {'skew': mon_skew, 'details': details}

                if self.expect_skew:
                    self.info('expected skew: {str}'.format(str=log_str))
                else:
                    self.warn('unexpected skew: {str}'.format(str=log_str))

    def do_check(self):
        """
        Poll the monitors for clock skews until finish() is called, then
        report everything that was found.

        Raises AssertionError when the outcome does not match 'expect-skew'
        (unless 'never-fail' is set), or when a skewed monitor does not
        report 'HEALTH_WARN'.
        """
        self.info('start checking for clock skews')
        skews = dict()
        while not self.stopping:
            # Wait until as many monitors as teuthology knows about are in
            # the quorum before asking for health.
            quorum_size = len(teuthology.get_mon_names(self.ctx))
            self.manager.wait_for_mon_quorum_size(quorum_size)

            # NOTE(review): meaning of the positional True is defined by
            # get_mon_health(), which is not visible here.
            health = self.manager.get_mon_health(True)
            self._record_skews(health['timechecks'], skews)

            if self.check_interval > 0.0:
                time.sleep(self.check_interval)

        total = len(skews)
        if total > 0:
            self.info('---------- found {n} skews ----------'.format(n=total))
            # items() works on both Python 2 and 3 (iteritems() is 2-only).
            for mon_id, values in skews.items():
                self.info('mon.{id}: {v}'.format(id=mon_id, v=values))
            self.info('-------------------------------------')
        else:
            self.info('---------- no skews were found ----------')

        error_str = ''
        found_error = False

        if self.expect_skew:
            if total == 0:
                error_str = 'We were expecting a skew, but none was found!'
                found_error = True
        else:
            if total > 0:
                error_str = 'We were not expecting a skew, but we did find it!'
                found_error = True

        if found_error:
            self.info(error_str)
            if not self.never_fail:
                raise AssertionError(error_str)
|
|
|
|
@contextlib.contextmanager
def task(ctx, config):
    """
    Use class ClockSkewCheck to check for clock skews on the monitors.
    This task will spawn a greenlet running ClockSkewCheck's do_check().

    All the configuration will be directly handled by ClockSkewCheck,
    so please refer to the class documentation for further information.

    ctx    -- teuthology run context; used to locate the first monitor and
              handed to CephManager and ClockSkewCheck
    config -- dict of options, or None for the defaults

    Raises AssertionError if `config` is neither None nor a dict. Any
    exception raised by do_check() is expected to surface from the final
    get() on the greenlet when the context exits (see gevent's
    Greenlet.get documentation).
    """
    if config is None:
        config = {}
    if not isinstance(config, dict):
        # Explicit raise rather than `assert` so the validation is not
        # stripped under `python -O`; the exception type is unchanged.
        raise AssertionError(
            'mon_clock_skew_check task only accepts a dict for configuration')
    log.info('Beginning mon_clock_skew_check...')
    first_mon = teuthology.get_first_mon(ctx, config)
    # Exactly one remote is expected for the first monitor; the tuple
    # unpacking fails otherwise. keys() works on Python 2 and 3 alike
    # (iterkeys() exists only on Python 2).
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()
    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    skew_check = ClockSkewCheck(ctx,
                                manager, config,
                                logger=log.getChild('mon_clock_skew_check'))
    skew_check_thread = gevent.spawn(skew_check.do_check)
    try:
        yield
    finally:
        log.info('joining mon_clock_skew_check')
        # Tell the checker to stop, then wait for its final report.
        skew_check.finish()
        skew_check_thread.get()
|
|
|
|
|