ceph_manager: add filestore and heartbeat stalls

Signed-off-by: Samuel Just <sam.just@inktank.com>
This commit is contained in:
Samuel Just 2013-01-23 16:13:22 -08:00
parent e714c77812
commit 566ae5332e
2 changed files with 42 additions and 0 deletions

View File

@ -124,6 +124,29 @@ class Thrasher:
timeout=self.config.get('timeout')
)
def inject_pause(self, conf_key, duration, check_after, should_be_down):
the_one = random.choice(self.in_osds)
self.log("inject_pause on {osd}".format(osd = the_one))
self.log(
"Testing {key} pause injection for duration {duration}".format(
key = conf_key,
duration = duration
))
self.log(
"Checking after {after}, should_be_down={shouldbedown}".format(
after = check_after,
shouldbedown = should_be_down
))
self.ceph_manager.set_config(the_one, **{conf_key:duration})
if not should_be_down:
return
time.sleep(check_after)
status = self.ceph_manager.get_osd_status()
assert the_one in status['down']
time.sleep(duration - check_after + 20)
status = self.ceph_manager.get_osd_status()
assert not the_one in status['down']
def choose_action(self):
chance_down = self.config.get('chance_down', 0)
chance_test_min_size = self.config.get('chance_test_min_size', 0)
@ -148,6 +171,19 @@ class Thrasher:
actions.append((self.grow_pool, self.config.get('chance_pgnum_grow', 0),))
actions.append((self.fix_pgp_num, self.config.get('chance_pgpnum_fix', 0),))
actions.append((self.test_pool_min_size, chance_test_min_size,))
for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
for scenario in [
(lambda: self.inject_pause(key,
self.config.get('pause_short', 3),
0,
False),
self.config.get('chance_inject_pause_short', 1),),
(lambda: self.inject_pause(key,
self.config.get('pause_long', 150),
self.config.get('pause_check_after', 120),
True),
self.config.get('chance_inject_pause_long', 0.2),)]:
actions.append(scenario)
total = sum([y for (x,y) in actions])
val = random.uniform(0, total)

View File

@ -62,6 +62,12 @@ def task(ctx, config):
pool_grow_by: (10) amount to increase pgnum by
max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd
pause_short: (3) duration of short pause
pause_long: (150) duration of long pause
pause_check_after: (120) assert osd down after this long
chance_inject_pause_short: (1) chance of injecting short stall
chance_inject_pause_long: (0.1) chance of injecting long stall
example:
tasks: