2014-02-14 02:18:56 +00:00
|
|
|
"""
|
|
|
|
Handle clock skews in monitors.
|
|
|
|
"""
|
2013-01-04 18:16:58 +00:00
|
|
|
import logging
|
|
|
|
import contextlib
|
|
|
|
import ceph_manager
|
|
|
|
import time
|
|
|
|
import gevent
|
2013-07-22 16:58:41 +00:00
|
|
|
from StringIO import StringIO
|
2013-01-04 18:16:58 +00:00
|
|
|
from teuthology import misc as teuthology
|
|
|
|
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
class ClockSkewCheck:
|
2014-02-14 02:18:56 +00:00
|
|
|
"""
|
|
|
|
Periodically check if there are any clock skews among the monitors in the
|
|
|
|
quorum. By default, assume no skews are supposed to exist; that can be
|
|
|
|
changed using the 'expect-skew' option. If 'fail-on-skew' is set to false,
|
|
|
|
then we will always succeed and only report skews if any are found.
|
|
|
|
|
|
|
|
This class does not spawn a thread. It assumes that, if that is indeed
|
|
|
|
wanted, it should be done by a third party (for instance, the task using
|
|
|
|
this class). We intend it as such in order to reuse this class if need be.
|
|
|
|
|
|
|
|
This task accepts the following options:
|
|
|
|
|
|
|
|
interval amount of seconds to wait in-between checks. (default: 30.0)
|
|
|
|
max-skew maximum skew, in seconds, that is considered tolerable before
|
|
|
|
issuing a warning. (default: 0.05)
|
|
|
|
expect-skew 'true' or 'false', to indicate whether to expect a skew during
|
|
|
|
the run or not. If 'true', the test will fail if no skew is
|
|
|
|
found, and succeed if a skew is indeed found; if 'false', it's
|
|
|
|
the other way around. (default: false)
|
|
|
|
never-fail Don't fail the run if a skew is detected and we weren't
|
|
|
|
expecting it, or if no skew is detected and we were expecting
|
|
|
|
it. (default: False)
|
|
|
|
|
|
|
|
at-least-once Runs at least once, even if we are told to stop.
|
|
|
|
(default: True)
|
|
|
|
at-least-once-timeout If we were told to stop but we are attempting to
|
|
|
|
run at least once, timeout after this many seconds.
|
|
|
|
(default: 600)
|
|
|
|
|
|
|
|
Example:
|
|
|
|
Expect a skew higher than 0.05 seconds, but only report it without
|
|
|
|
failing the teuthology run.
|
2013-01-04 18:16:58 +00:00
|
|
|
|
|
|
|
- mon_clock_skew_check:
|
|
|
|
interval: 30
|
|
|
|
max-skew: 0.05
|
|
|
|
expect_skew: true
|
|
|
|
never-fail: true
|
2014-02-14 02:18:56 +00:00
|
|
|
"""
|
|
|
|
|
|
|
|
def __init__(self, ctx, manager, config, logger):
|
|
|
|
self.ctx = ctx
|
|
|
|
self.manager = manager
|
|
|
|
|
|
|
|
self.stopping = False
|
|
|
|
self.logger = logger
|
|
|
|
self.config = config
|
|
|
|
|
|
|
|
if self.config is None:
|
|
|
|
self.config = dict()
|
|
|
|
|
|
|
|
self.check_interval = float(self.config.get('interval', 30.0))
|
|
|
|
|
|
|
|
first_mon = teuthology.get_first_mon(ctx, config)
|
|
|
|
remote = ctx.cluster.only(first_mon).remotes.keys()[0]
|
|
|
|
proc = remote.run(
|
|
|
|
args=[
|
|
|
|
'sudo',
|
|
|
|
'ceph-mon',
|
|
|
|
'-i', first_mon[4:],
|
|
|
|
'--show-config-value', 'mon_clock_drift_allowed'
|
|
|
|
], stdout=StringIO(), wait=True
|
|
|
|
)
|
|
|
|
self.max_skew = self.config.get('max-skew', float(proc.stdout.getvalue()))
|
|
|
|
|
|
|
|
self.expect_skew = self.config.get('expect-skew', False)
|
|
|
|
self.never_fail = self.config.get('never-fail', False)
|
|
|
|
self.at_least_once = self.config.get('at-least-once', True)
|
|
|
|
self.at_least_once_timeout = self.config.get('at-least-once-timeout', 600.0)
|
|
|
|
|
|
|
|
def info(self, x):
|
|
|
|
"""
|
|
|
|
locally define logger for info messages
|
|
|
|
"""
|
|
|
|
self.logger.info(x)
|
|
|
|
|
|
|
|
def warn(self, x):
|
|
|
|
"""
|
|
|
|
locally define logger for warnings
|
|
|
|
"""
|
|
|
|
self.logger.warn(x)
|
|
|
|
|
|
|
|
def debug(self, x):
|
|
|
|
"""
|
|
|
|
locally define logger for debug messages
|
|
|
|
"""
|
|
|
|
self.logger.info(x)
|
|
|
|
self.logger.debug(x)
|
|
|
|
|
|
|
|
def finish(self):
|
|
|
|
"""
|
|
|
|
Break out of the do_check loop.
|
|
|
|
"""
|
|
|
|
self.stopping = True
|
|
|
|
|
|
|
|
def sleep_interval(self):
|
|
|
|
"""
|
|
|
|
If a sleep interval is set, sleep for that amount of time.
|
|
|
|
"""
|
|
|
|
if self.check_interval > 0.0:
|
|
|
|
self.debug('sleeping for {s} seconds'.format(
|
|
|
|
s=self.check_interval))
|
|
|
|
time.sleep(self.check_interval)
|
|
|
|
|
|
|
|
def print_skews(self, skews):
|
|
|
|
"""
|
|
|
|
Display skew values.
|
|
|
|
"""
|
|
|
|
total = len(skews)
|
|
|
|
if total > 0:
|
|
|
|
self.info('---------- found {n} skews ----------'.format(n=total))
|
|
|
|
for mon_id, values in skews.iteritems():
|
|
|
|
self.info('mon.{id}: {v}'.format(id=mon_id, v=values))
|
|
|
|
self.info('-------------------------------------')
|
2013-01-24 18:00:39 +00:00
|
|
|
else:
|
2014-02-14 02:18:56 +00:00
|
|
|
self.info('---------- no skews were found ----------')
|
|
|
|
|
|
|
|
def do_check(self):
|
|
|
|
"""
|
|
|
|
Clock skew checker. Loops until finish() is called.
|
|
|
|
"""
|
|
|
|
self.info('start checking for clock skews')
|
|
|
|
skews = dict()
|
|
|
|
ran_once = False
|
|
|
|
|
|
|
|
started_on = None
|
|
|
|
|
|
|
|
while not self.stopping or (self.at_least_once and not ran_once):
|
|
|
|
|
|
|
|
if self.at_least_once and not ran_once and self.stopping:
|
|
|
|
if started_on is None:
|
|
|
|
self.info('kicking-off timeout (if any)')
|
|
|
|
started_on = time.time()
|
|
|
|
elif self.at_least_once_timeout > 0.0:
|
|
|
|
assert time.time() - started_on < self.at_least_once_timeout, \
|
|
|
|
'failed to obtain a timecheck before timeout expired'
|
|
|
|
|
|
|
|
quorum_size = len(teuthology.get_mon_names(self.ctx))
|
|
|
|
self.manager.wait_for_mon_quorum_size(quorum_size)
|
|
|
|
|
|
|
|
health = self.manager.get_mon_health(True)
|
|
|
|
timechecks = health['timechecks']
|
|
|
|
|
|
|
|
clean_check = False
|
|
|
|
|
|
|
|
if timechecks['round_status'] == 'finished':
|
|
|
|
assert (timechecks['round'] % 2) == 0, \
|
|
|
|
'timecheck marked as finished but round ' \
|
|
|
|
'disagrees (r {r})'.format(
|
|
|
|
r=timechecks['round'])
|
|
|
|
clean_check = True
|
|
|
|
else:
|
|
|
|
assert timechecks['round_status'] == 'on-going', \
|
|
|
|
'timecheck status expected \'on-going\' ' \
|
|
|
|
'but found \'{s}\' instead'.format(
|
|
|
|
s=timechecks['round_status'])
|
|
|
|
if 'mons' in timechecks.keys() and len(timechecks['mons']) > 1:
|
|
|
|
self.info('round still on-going, but there are available reports')
|
|
|
|
else:
|
|
|
|
self.info('no timechecks available just yet')
|
|
|
|
self.sleep_interval()
|
|
|
|
continue
|
|
|
|
|
|
|
|
assert len(timechecks['mons']) > 1, \
|
|
|
|
'there are not enough reported timechecks; ' \
|
|
|
|
'expected > 1 found {n}'.format(n=len(timechecks['mons']))
|
|
|
|
|
|
|
|
for check in timechecks['mons']:
|
|
|
|
mon_skew = float(check['skew'])
|
|
|
|
mon_health = check['health']
|
|
|
|
mon_id = check['name']
|
|
|
|
if abs(mon_skew) > self.max_skew:
|
|
|
|
assert mon_health == 'HEALTH_WARN', \
|
|
|
|
'mon.{id} health is \'{health}\' but skew {s} > max {ms}'.format(
|
|
|
|
id=mon_id,health=mon_health,s=abs(mon_skew),ms=self.max_skew)
|
|
|
|
|
|
|
|
log_str = 'mon.{id} with skew {s} > max {ms}'.format(
|
|
|
|
id=mon_id,s=abs(mon_skew),ms=self.max_skew)
|
|
|
|
|
|
|
|
""" add to skew list """
|
|
|
|
details = check['details']
|
|
|
|
skews[mon_id] = {'skew': mon_skew, 'details': details}
|
|
|
|
|
|
|
|
if self.expect_skew:
|
|
|
|
self.info('expected skew: {str}'.format(str=log_str))
|
|
|
|
else:
|
|
|
|
self.warn('unexpected skew: {str}'.format(str=log_str))
|
|
|
|
|
|
|
|
if clean_check or (self.expect_skew and len(skews) > 0):
|
|
|
|
ran_once = True
|
|
|
|
self.print_skews(skews)
|
|
|
|
self.sleep_interval()
|
|
|
|
|
|
|
|
total = len(skews)
|
2013-01-30 20:52:03 +00:00
|
|
|
self.print_skews(skews)
|
2013-01-04 18:16:58 +00:00
|
|
|
|
2014-02-14 02:18:56 +00:00
|
|
|
error_str = ''
|
|
|
|
found_error = False
|
2013-01-04 18:16:58 +00:00
|
|
|
|
2014-02-14 02:18:56 +00:00
|
|
|
if self.expect_skew:
|
|
|
|
if total == 0:
|
|
|
|
error_str = 'We were expecting a skew, but none was found!'
|
|
|
|
found_error = True
|
|
|
|
else:
|
|
|
|
if total > 0:
|
|
|
|
error_str = 'We were not expecting a skew, but we did find it!'
|
|
|
|
found_error = True
|
2013-01-04 18:16:58 +00:00
|
|
|
|
2014-02-14 02:18:56 +00:00
|
|
|
if found_error:
|
|
|
|
self.info(error_str)
|
|
|
|
if not self.never_fail:
|
|
|
|
assert False, error_str
|
2013-01-04 18:16:58 +00:00
|
|
|
|
|
|
|
@contextlib.contextmanager
|
|
|
|
def task(ctx, config):
|
2014-02-14 02:18:56 +00:00
|
|
|
"""
|
|
|
|
Use clas ClockSkewCheck to check for clock skews on the monitors.
|
|
|
|
This task will spawn a thread running ClockSkewCheck's do_check().
|
|
|
|
|
|
|
|
All the configuration will be directly handled by ClockSkewCheck,
|
|
|
|
so please refer to the class documentation for further information.
|
|
|
|
"""
|
|
|
|
if config is None:
|
|
|
|
config = {}
|
|
|
|
assert isinstance(config, dict), \
|
|
|
|
'mon_clock_skew_check task only accepts a dict for configuration'
|
|
|
|
log.info('Beginning mon_clock_skew_check...')
|
|
|
|
first_mon = teuthology.get_first_mon(ctx, config)
|
2014-03-27 16:35:28 +00:00
|
|
|
(mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
|
2014-02-14 02:18:56 +00:00
|
|
|
manager = ceph_manager.CephManager(
|
|
|
|
mon,
|
|
|
|
ctx=ctx,
|
|
|
|
logger=log.getChild('ceph_manager'),
|
|
|
|
)
|
|
|
|
|
|
|
|
skew_check = ClockSkewCheck(ctx,
|
|
|
|
manager, config,
|
|
|
|
logger=log.getChild('mon_clock_skew_check'))
|
|
|
|
skew_check_thread = gevent.spawn(skew_check.do_check)
|
|
|
|
try:
|
|
|
|
yield
|
|
|
|
finally:
|
|
|
|
log.info('joining mon_clock_skew_check')
|
|
|
|
skew_check.finish()
|
|
|
|
skew_check_thread.get()
|
2013-01-04 18:16:58 +00:00
|
|
|
|
|
|
|
|