2011-06-13 23:36:21 +00:00
|
|
|
import contextlib
|
|
|
|
import logging
|
|
|
|
import ceph_manager
|
2011-08-31 20:56:42 +00:00
|
|
|
from teuthology import misc as teuthology
|
|
|
|
|
2011-06-13 23:36:21 +00:00
|
|
|
|
|
|
|
# Module-level logger named after this module; task() derives its child
# loggers ('ceph_manager' and 'thrasher') from it via log.getChild().
log = logging.getLogger(__name__)
|
|
|
|
|
|
|
|
@contextlib.contextmanager
def task(ctx, config):
    """
    "Thrash" the OSDs for the duration of the enclosed block.

    "Thrash" the OSDs by randomly marking them out/down (and then back
    in) until the task is ended. This loops, and every op_delay
    seconds it randomly chooses to add or remove an OSD (even odds)
    unless there are fewer than min_out OSDs out of the cluster, or
    more than min_in OSDs in the cluster.

    All commands are run on the first monitor and thrashing stops when
    the context manager exits (__exit__ is called).

    The config is optional, and is a dict containing some or all of:

    min_in: (default 2) the minimum number of OSDs to keep in the
       cluster

    min_out: (default 0) the minimum number of OSDs to keep out of the
       cluster

    op_delay: (5) the length of time to sleep between changing an
       OSD's status

    min_dead: (0) minimum number of osds to leave down/dead.

    max_dead: (0) maximum number of osds to leave down/dead before waiting
       for clean.  This should probably be num_replicas - 1.

    clean_interval: (60) the approximate length of time to loop before
       waiting until the cluster goes clean. (In reality this is used
       to probabilistically choose when to wait, and the method used
       makes it closer to -- but not identical to -- the half-life.)

    chance_down: (0) the probability that the thrasher will mark an
       OSD down rather than marking it out. (The thrasher will not
       consider that OSD out of the cluster, since presently an OSD
       wrongly marked down will mark itself back up again.) This value
       can be either an integer (eg, 75) or a float probability (eg
       0.75).

    chance_test_min_size: (0) chance to run test_pool_min_size,
       which:
       - kills all but one osd
       - waits
       - kills that osd
       - revives all other osds
       - verifies that the osds fully recover

    timeout: (360) the number of seconds to wait for the cluster
       to become clean after each cluster change. If this doesn't
       happen within the timeout, an exception will be raised.

    example:

    tasks:
    - ceph:
    - thrashosds:
        chance_down: 10
        op_delay: 3
        min_in: 1
        timeout: 600
    - interactive:

    :param ctx: run context; its ``cluster`` attribute is used to find
                the remote that hosts the first monitor, and it is also
                handed to the CephManager.
    :param config: dict of the options above, or None for all defaults.
    :yields: nothing; the thrasher is active while the caller's block runs.
    :raises AssertionError: if config is neither None nor a dict.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'thrashosds task only accepts a dict for configuration'

    log.info('Beginning thrashosds...')

    first_mon = teuthology.get_first_mon(ctx, config)
    # Exactly one remote is expected for the first monitor; the one-element
    # tuple unpacking fails loudly otherwise.  keys() is used instead of the
    # Python-2-only iterkeys() so this line works on Python 2 and 3 alike.
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )
    thrash_proc = ceph_manager.Thrasher(
        manager,
        config,
        logger=log.getChild('thrasher')
        )
    try:
        yield
    finally:
        # Always stop the thrasher, even if the enclosed block raised, then
        # wait (up to the configured timeout) for the cluster to recover.
        log.info('joining thrashosds')
        thrash_proc.do_join()
        manager.wait_for_recovery(config.get('timeout', 360))
|