From f12426c3ec687f667eafd96528fa49f95e9028f5 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 27 Feb 2014 14:09:52 +0000 Subject: [PATCH] mds_thrash: PEP8-ize whitespace ...so that I can edit the code in a python IDE without it lighting up like a christmas tree! Signed-off-by: John Spray --- teuthology/task/mds_thrash.py | 417 +++++++++++++++++----------------- 1 file changed, 213 insertions(+), 204 deletions(-) diff --git a/teuthology/task/mds_thrash.py b/teuthology/task/mds_thrash.py index 8a58932a66a..40186e6296c 100644 --- a/teuthology/task/mds_thrash.py +++ b/teuthology/task/mds_thrash.py @@ -11,222 +11,231 @@ from teuthology import misc as teuthology log = logging.getLogger(__name__) + class MDSThrasher: - """ - MDSThrasher:: - - The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc). - - The config is optional. Many of the config parameters are a a maximum value - to use when selecting a random value from a range. To always use the maximum - value, set no_random to true. The config is a dict containing some or all of: - - seed: [no default] seed the random number generator - - randomize: [default: true] enables randomization and use the max/min values - - max_thrash: [default: 1] the maximum number of MDSs that will be thrashed at - any given time. - - max_thrash_delay: [default: 30] maximum number of seconds to delay before - thrashing again. - - max_revive_delay: [default: 10] maximum number of seconds to delay before - bringing back a thrashed MDS - - thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed - during replay. Value should be between 0.0 and 1.0 - - max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in - the replay state before thrashing - - thrash_weights: allows specific MDSs to be thrashed more/less frequently. This option - overrides anything specified by max_thrash. This option is a dict containing - mds.x: weight pairs. For example, [mds.a: 0.7, mds.b: 0.3, mds.c: 0.0]. Each weight - is a value from 0.0 to 1.0. Any MDSs not specified will be automatically - given a weight of 0.0. For a given MDS, by default the trasher delays for up - to max_thrash_delay, trashes, waits for the MDS to recover, and iterates. If a non-zero - weight is specified for an MDS, for each iteration the thrasher chooses whether to thrash - during that iteration based on a random value [0-1] not exceeding the weight of that MDS. - - Examples:: - - - The following example sets the likelihood that mds.a will be thrashed - to 80%, mds.b to 20%, and other MDSs will not be thrashed. It also sets the - likelihood that an MDS will be thrashed in replay to 40%. - Thrash weights do not have to sum to 1. - - tasks: - - ceph: - - mds_thrash: - thrash_weights: - - mds.a: 0.8 - - mds.b: 0.2 - thrash_in_replay: 0.4 - - ceph-fuse: - - workunit: - clients: - all: [suites/fsx.sh] - - The following example disables randomization, and uses the max delay values: - - tasks: - - ceph: - - mds_thrash: - max_thrash_delay: 10 - max_revive_delay: 1 - max_replay_thrash_delay: 4 - - """ - def __init__(self, ctx, manager, config, logger, failure_group, weight): - self.ctx = ctx - self.manager = manager - self.manager.wait_for_clean() - - self.stopping = False - self.logger = logger - self.config = config - - self.randomize = bool(self.config.get('randomize', True)) - self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0)) - self.thrash_in_replay = float(self.config.get('thrash_in_replay', False)) - assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(v=self.thrash_in_replay) - - self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0)) - - self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0)) - - self.thread = gevent.spawn(self.do_thrash) - - self.failure_group = failure_group - self.weight = weight - - def log(self, x): - """Write data to logger assigned to this MDThrasher""" - self.logger.info(x) - - def do_join(self): - """Thread finished""" - self.stopping = True - self.thread.get() - - def do_thrash(self): """ - Perform the random thrashing action + MDSThrasher:: + + The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc). + + The config is optional. Many of the config parameters are a a maximum value + to use when selecting a random value from a range. To always use the maximum + value, set no_random to true. The config is a dict containing some or all of: + + seed: [no default] seed the random number generator + + randomize: [default: true] enables randomization and use the max/min values + + max_thrash: [default: 1] the maximum number of MDSs that will be thrashed at + any given time. + + max_thrash_delay: [default: 30] maximum number of seconds to delay before + thrashing again. + + max_revive_delay: [default: 10] maximum number of seconds to delay before + bringing back a thrashed MDS + + thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed + during replay. Value should be between 0.0 and 1.0 + + max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in + the replay state before thrashing + + thrash_weights: allows specific MDSs to be thrashed more/less frequently. This option + overrides anything specified by max_thrash. This option is a dict containing + mds.x: weight pairs. For example, [mds.a: 0.7, mds.b: 0.3, mds.c: 0.0]. Each weight + is a value from 0.0 to 1.0. Any MDSs not specified will be automatically + given a weight of 0.0. For a given MDS, by default the trasher delays for up + to max_thrash_delay, trashes, waits for the MDS to recover, and iterates. If a non-zero + weight is specified for an MDS, for each iteration the thrasher chooses whether to thrash + during that iteration based on a random value [0-1] not exceeding the weight of that MDS. + + Examples:: + + + The following example sets the likelihood that mds.a will be thrashed + to 80%, mds.b to 20%, and other MDSs will not be thrashed. It also sets the + likelihood that an MDS will be thrashed in replay to 40%. + Thrash weights do not have to sum to 1. + + tasks: + - ceph: + - mds_thrash: + thrash_weights: + - mds.a: 0.8 + - mds.b: 0.2 + thrash_in_replay: 0.4 + - ceph-fuse: + - workunit: + clients: + all: [suites/fsx.sh] + + The following example disables randomization, and uses the max delay values: + + tasks: + - ceph: + - mds_thrash: + max_thrash_delay: 10 + max_revive_delay: 1 + max_replay_thrash_delay: 4 + """ - self.log('starting mds_do_thrash for failure group: ' + ', '.join(['mds.{_id}'.format(_id=_f) for _f in self.failure_group])) - while not self.stopping: - delay = self.max_thrash_delay - if self.randomize: - delay = random.randrange(0.0, self.max_thrash_delay) - if delay > 0.0: - self.log('waiting for {delay} secs before thrashing'.format(delay=delay)) - time.sleep(delay) + def __init__(self, ctx, manager, config, logger, failure_group, weight): + self.ctx = ctx + self.manager = manager + self.manager.wait_for_clean() - skip = random.randrange(0.0, 1.0) - if self.weight < 1.0 and skip > self.weight: - self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip, weight=self.weight)) - continue + self.stopping = False + self.logger = logger + self.config = config - # find the active mds in the failure group - statuses = [self.manager.get_mds_status(m) for m in self.failure_group] - actives = filter(lambda s: s['state'] == 'up:active', statuses) - assert len(actives) == 1, 'Can only have one active in a failure group' + self.randomize = bool(self.config.get('randomize', True)) + self.max_thrash_delay = float(self.config.get('thrash_delay', 30.0)) + self.thrash_in_replay = float(self.config.get('thrash_in_replay', False)) + assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format( + v=self.thrash_in_replay) - active_mds = actives[0]['name'] - active_rank = actives[0]['rank'] + self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0)) - self.log('kill mds.{id} (rank={r})'.format(id=active_mds, r=active_rank)) - self.manager.kill_mds_by_rank(active_rank) + self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0)) - # wait for mon to report killed mds as crashed - status = {} - last_laggy_since = None - itercount = 0 - while True: - failed = self.manager.get_mds_status_all()['failed'] - status = self.manager.get_mds_status(active_mds) - if not status: - break - if 'laggy_since' in status: - last_laggy_since = status['laggy_since'] - break - if any([(f == active_mds) for f in failed]): - break - self.log('waiting till mds map indicates mds.{_id} is laggy/crashed, in failed state, or mds.{_id} is removed from mdsmap'.format(_id=active_mds)) - itercount = itercount + 1 - if itercount > 10: - self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all())) - time.sleep(2) - if last_laggy_since: - self.log('mds.{_id} reported laggy/crashed since: {since}'.format(_id=active_mds, since=last_laggy_since)) - else: - self.log('mds.{_id} down, removed from mdsmap'.format(_id=active_mds, since=last_laggy_since)) + self.thread = gevent.spawn(self.do_thrash) - # wait for a standby mds to takeover and become active - takeover_mds = None - takeover_rank = None - itercount = 0 - while True: - statuses = [self.manager.get_mds_status(m) for m in self.failure_group] - actives = filter(lambda s: s and s['state'] == 'up:active', statuses) - if len(actives) > 0: - assert len(actives) == 1, 'Can only have one active in failure group' - takeover_mds = actives[0]['name'] - takeover_rank = actives[0]['rank'] - break - itercount = itercount + 1 - if itercount > 10: - self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all())) + self.failure_group = failure_group + self.weight = weight - self.log('New active mds is mds.{_id}'.format(_id=takeover_mds)) + def log(self, x): + """Write data to logger assigned to this MDThrasher""" + self.logger.info(x) - # wait for a while before restarting old active to become new - # standby - delay = self.max_revive_delay - if self.randomize: - delay = random.randrange(0.0, self.max_revive_delay) + def do_join(self): + """Thread finished""" + self.stopping = True + self.thread.get() - self.log('waiting for {delay} secs before reviving mds.{id}'.format( - delay=delay,id=active_mds)) - time.sleep(delay) + def do_thrash(self): + """ + Perform the random thrashing action + """ + self.log('starting mds_do_thrash for failure group: ' + ', '.join( + ['mds.{_id}'.format(_id=_f) for _f in self.failure_group])) + while not self.stopping: + delay = self.max_thrash_delay + if self.randomize: + delay = random.randrange(0.0, self.max_thrash_delay) - self.log('reviving mds.{id}'.format(id=active_mds)) - self.manager.revive_mds(active_mds, standby_for_rank=takeover_rank) + if delay > 0.0: + self.log('waiting for {delay} secs before thrashing'.format(delay=delay)) + time.sleep(delay) - status = {} - while True: - status = self.manager.get_mds_status(active_mds) - if status and (status['state'] == 'up:standby' or status['state'] == 'up:standby-replay'): - break - self.log('waiting till mds map indicates mds.{_id} is in standby or standby-replay'.format(_id=active_mds)) - time.sleep(2) - self.log('mds.{_id} reported in {state} state'.format(_id=active_mds, state=status['state'])) + skip = random.randrange(0.0, 1.0) + if self.weight < 1.0 and skip > self.weight: + self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip, + weight=self.weight)) + continue - # don't do replay thrashing right now - continue - # this might race with replay -> active transition... - if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay: + # find the active mds in the failure group + statuses = [self.manager.get_mds_status(m) for m in self.failure_group] + actives = filter(lambda s: s['state'] == 'up:active', statuses) + assert len(actives) == 1, 'Can only have one active in a failure group' - delay = self.max_replay_thrash_delay - if self.randomize: - delay = random.randrange(0.0, self.max_replay_thrash_delay) - time.sleep(delay) - self.log('kill replaying mds.{id}'.format(id=self.to_kill)) - self.manager.kill_mds(self.to_kill) + active_mds = actives[0]['name'] + active_rank = actives[0]['rank'] - delay = self.max_revive_delay - if self.randomize: - delay = random.randrange(0.0, self.max_revive_delay) + self.log('kill mds.{id} (rank={r})'.format(id=active_mds, r=active_rank)) + self.manager.kill_mds_by_rank(active_rank) - self.log('waiting for {delay} secs before reviving mds.{id}'.format( - delay=delay,id=self.to_kill)) - time.sleep(delay) + # wait for mon to report killed mds as crashed + status = {} + last_laggy_since = None + itercount = 0 + while True: + failed = self.manager.get_mds_status_all()['failed'] + status = self.manager.get_mds_status(active_mds) + if not status: + break + if 'laggy_since' in status: + last_laggy_since = status['laggy_since'] + break + if any([(f == active_mds) for f in failed]): + break + self.log( + 'waiting till mds map indicates mds.{_id} is laggy/crashed, in failed state, or mds.{_id} is removed from mdsmap'.format( + _id=active_mds)) + itercount = itercount + 1 + if itercount > 10: + self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all())) + time.sleep(2) + if last_laggy_since: + self.log( + 'mds.{_id} reported laggy/crashed since: {since}'.format(_id=active_mds, since=last_laggy_since)) + else: + self.log('mds.{_id} down, removed from mdsmap'.format(_id=active_mds, since=last_laggy_since)) - self.log('revive mds.{id}'.format(id=self.to_kill)) - self.manager.revive_mds(self.to_kill) + # wait for a standby mds to takeover and become active + takeover_mds = None + takeover_rank = None + itercount = 0 + while True: + statuses = [self.manager.get_mds_status(m) for m in self.failure_group] + actives = filter(lambda s: s and s['state'] == 'up:active', statuses) + if len(actives) > 0: + assert len(actives) == 1, 'Can only have one active in failure group' + takeover_mds = actives[0]['name'] + takeover_rank = actives[0]['rank'] + break + itercount = itercount + 1 + if itercount > 10: + self.log('mds map: {status}'.format(status=self.manager.get_mds_status_all())) + + self.log('New active mds is mds.{_id}'.format(_id=takeover_mds)) + + # wait for a while before restarting old active to become new + # standby + delay = self.max_revive_delay + if self.randomize: + delay = random.randrange(0.0, self.max_revive_delay) + + self.log('waiting for {delay} secs before reviving mds.{id}'.format( + delay=delay, id=active_mds)) + time.sleep(delay) + + self.log('reviving mds.{id}'.format(id=active_mds)) + self.manager.revive_mds(active_mds, standby_for_rank=takeover_rank) + + status = {} + while True: + status = self.manager.get_mds_status(active_mds) + if status and (status['state'] == 'up:standby' or status['state'] == 'up:standby-replay'): + break + self.log( + 'waiting till mds map indicates mds.{_id} is in standby or standby-replay'.format(_id=active_mds)) + time.sleep(2) + self.log('mds.{_id} reported in {state} state'.format(_id=active_mds, state=status['state'])) + + # don't do replay thrashing right now + continue + # this might race with replay -> active transition... + if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay: + + delay = self.max_replay_thrash_delay + if self.randomize: + delay = random.randrange(0.0, self.max_replay_thrash_delay) + time.sleep(delay) + self.log('kill replaying mds.{id}'.format(id=self.to_kill)) + self.manager.kill_mds(self.to_kill) + + delay = self.max_revive_delay + if self.randomize: + delay = random.randrange(0.0, self.max_revive_delay) + + self.log('waiting for {delay} secs before reviving mds.{id}'.format( + delay=delay, id=self.to_kill)) + time.sleep(delay) + + self.log('revive mds.{id}'.format(id=self.to_kill)) + self.manager.revive_mds(self.to_kill) @contextlib.contextmanager @@ -268,7 +277,7 @@ def task(ctx, config): statuses = None statuses_by_rank = None while True: - statuses = {m : manager.get_mds_status(m) for m in mdslist} + statuses = {m: manager.get_mds_status(m) for m in mdslist} statuses_by_rank = {} for _, s in statuses.iteritems(): if isinstance(s, dict): @@ -285,18 +294,18 @@ def task(ctx, config): # setup failure groups failure_groups = {} - actives = {s['name'] : s for (_,s) in statuses.iteritems() if s['state'] == 'up:active'} + actives = {s['name']: s for (_, s) in statuses.iteritems() if s['state'] == 'up:active'} log.info('Actives is: {d}'.format(d=actives)) log.info('Statuses is: {d}'.format(d=statuses_by_rank)) for active in actives: - for (r,s) in statuses.iteritems(): + for (r, s) in statuses.iteritems(): if s['standby_for_name'] == active: if not active in failure_groups: failure_groups[active] = [] log.info('Assigning mds rank {r} to failure group {g}'.format(r=r, g=active)) failure_groups[active].append(r) - for (active,standbys) in failure_groups.iteritems(): + for (active, standbys) in failure_groups.iteritems(): weight = 1.0 if 'thrash_weights' in config: @@ -307,10 +316,10 @@ def task(ctx, config): thrashers[active] = MDSThrasher( ctx, manager, config, logger=log.getChild('mds_thrasher.failure_group.[{a}, {sbs}]'.format( - a=active, - sbs=', '.join(standbys) - ) - ), + a=active, + sbs=', '.join(standbys) + ) + ), failure_group=failure_group, weight=weight) # if thrash_weights isn't specified and we've reached max_thrash,