wait_till_clean -> wait_for_clean and wait_for_recovery

Clean now also means the correct number of replicas, whereas recovered
means we have done all the work we can do given the replicas/osds we have.
For example, degraded and clean are now mutually exclusive.

Also move away from 'till'.
Sage Weil 2012-02-17 21:53:25 -08:00
parent ad9d7fb6e1
commit 196d4a1f16
4 changed files with 39 additions and 13 deletions
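The distinction matters for the callers in the diffs below: while an OSD is down or out, PGs can become active (and degraded) but never clean, so a test operating in that window can only wait for recovery; once every OSD is back in, waiting for clean is meaningful again. A minimal sketch of that usage, assuming a CephManager instance like the one the tasks below already create; the two helper names and the 360-second timeout are illustrative and not part of this commit:

    def wait_after_osd_loss(manager, timeout=360):
        # An OSD is still down/out, so the full replica count cannot be
        # restored; the strongest condition to wait for is recovery
        # (every PG active and no longer recovering), not clean.
        manager.wait_for_recovery(timeout=timeout)

    def wait_after_osd_return(manager, osd_id, timeout=360):
        # The OSD is back, so all replicas can be restored; waiting for
        # clean (every PG active+clean) is appropriate again.
        manager.revive_osd(osd_id)
        manager.raw_cluster_cmd('tell', 'osd.%d' % osd_id, 'flush_pg_stats')
        manager.wait_for_clean(timeout=timeout)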


@@ -50,7 +50,7 @@ def task(ctx, config):
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_clean()
    # write some data
    p = rados_start(mon, ['-p', 'data', 'bench', '15', 'write', '-b', '4096'])
@@ -67,7 +67,7 @@ def task(ctx, config):
    # wait for everything to peer and be happy...
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()
    # write some new data
    p = rados_start(mon, ['-p', 'data', 'bench', '30', 'write', '-b', '4096'])
@@ -87,12 +87,12 @@ def task(ctx, config):
    # cluster must recover
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()
    # re-add osd.0
    manager.revive_osd(0)
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_clean()


@@ -8,7 +8,7 @@ import json
class Thrasher(gevent.Greenlet):
    def __init__(self, manager, config, logger=None):
        self.ceph_manager = manager
-        self.ceph_manager.wait_till_clean()
+        self.ceph_manager.wait_for_clean()
        osd_status = self.ceph_manager.get_osd_status()
        self.in_osds = osd_status['in']
        self.live_osds = osd_status['live']
@@ -124,7 +124,7 @@ class Thrasher(gevent.Greenlet):
            if random.uniform(0,1) < (float(delay) / cleanint):
                while len(self.dead_osds) > maxdead:
                    self.revive_osd()
-                self.ceph_manager.wait_till_clean(
+                self.ceph_manager.wait_for_recovery(
                    timeout=self.config.get('timeout')
                    )
            self.choose_action()()
@@ -226,6 +226,14 @@ class CephManager:
                num += 1
        return num
+    def get_num_active_recovered(self):
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if pg['state'].count('active') and not pg['state'].count('recovering'):
+                num += 1
+        return num
    def get_num_active(self):
        pgs = self.get_pg_stats()
        num = 0
@@ -237,8 +245,11 @@
    def is_clean(self):
        return self.get_num_active_clean() == self.get_num_pgs()
-    def wait_till_clean(self, timeout=None):
-        self.log("waiting till clean")
+    def is_recovered(self):
+        return self.get_num_active_recovered() == self.get_num_pgs()
+    def wait_for_clean(self, timeout=None):
+        self.log("waiting for clean")
        start = time.time()
        num_active_clean = self.get_num_active_clean()
        while not self.is_clean():
@@ -252,6 +263,21 @@
            time.sleep(3)
        self.log("clean!")
+    def wait_for_recovery(self, timeout=None):
+        self.log("waiting for recovery to complete")
+        start = time.time()
+        num_active_recovered = self.get_num_active_recovered()
+        while not self.is_recovered():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'failed to recover before timeout expired'
+            cur_active_recovered = self.get_num_active_recovered()
+            if cur_active_recovered != num_active_recovered:
+                start = time.time()
+                num_active_recovered = cur_active_recovered
+            time.sleep(3)
+        self.log("recovered!")
    def osd_is_up(self, osd):
        osds = self.get_osd_dump()
        return osds[osd]['up'] > 0
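A note on the wait_for_recovery loop above: start is reset whenever the active+recovered count changes, so the timeout bounds how long recovery may stall rather than how long it may take overall. A usage sketch (the 300-second value is illustrative, not from this commit):

    # The assertion fires only if the active+recovered PG count stays
    # unchanged for 300 consecutive seconds; a slow but steadily
    # progressing recovery will not trip it.
    manager.wait_for_recovery(timeout=300)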


@@ -45,7 +45,7 @@ def task(ctx, config):
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_clean()
    # something that is always there
    dummyfile = '/etc/fstab'
@@ -60,7 +60,7 @@ def task(ctx, config):
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()
    # create old objects
    for f in range(1, 10):
@@ -135,7 +135,7 @@ def task(ctx, config):
    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
    manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()
    # verify result
    for f in range(1, 10):
@@ -149,4 +149,4 @@ def task(ctx, config):
    # see if osd.1 can cope
    manager.revive_osd(1)
    manager.wait_till_osd_is_up(1)
-    manager.wait_till_clean()
+    manager.wait_for_clean()


@@ -82,4 +82,4 @@ def task(ctx, config):
    finally:
        log.info('joining thrashosds')
        thrash_proc.do_join()
-        manager.wait_till_clean(config.get('timeout', 360))
+        manager.wait_for_recovery(config.get('timeout', 360))