mirror of https://github.com/ceph/ceph
wait_till_clean -> wait_for_clean and wait_for_recovery
Clean now also means the correct number of replicas, whereas recovered means we have done all the work we can do given the replicas/osds we have. For example, degraded and clean are now mutually exclusive. Also move away from 'till'.
parent ad9d7fb6e1
commit 196d4a1f16

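As a rough caller sketch of the new split (a hypothetical test snippet, not part of this diff, assuming `manager` is the CephManager instance created by the task and that osd.0 was taken down earlier in the test): wait only for recovery while a replica is missing, and wait for clean once every OSD is back.

    # Hypothetical usage sketch (not part of this commit).
    # While osd.0 is still down the remaining OSDs can finish recovery, but
    # PGs stay degraded, so waiting for "clean" here would never return.
    manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
    manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
    manager.wait_for_recovery(timeout=360)

    # Once osd.0 is back, every PG can regain its full replica count,
    # so it is safe to wait for "clean".
    manager.revive_osd(0)
    manager.wait_till_osd_is_up(0)
    manager.wait_for_clean(timeout=360)
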
@@ -50,7 +50,7 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_clean()

     # write some data
     p = rados_start(mon, ['-p', 'data', 'bench', '15', 'write', '-b', '4096'])

@@ -67,7 +67,7 @@ def task(ctx, config):
     # wait for everything to peer and be happy...
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()

     # write some new data
     p = rados_start(mon, ['-p', 'data', 'bench', '30', 'write', '-b', '4096'])

@@ -87,12 +87,12 @@ def task(ctx, config):
     # cluster must recover
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()

     # re-add osd.0
     manager.revive_osd(0)
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_clean()

@@ -8,7 +8,7 @@ import json
 class Thrasher(gevent.Greenlet):
     def __init__(self, manager, config, logger=None):
         self.ceph_manager = manager
-        self.ceph_manager.wait_till_clean()
+        self.ceph_manager.wait_for_clean()
         osd_status = self.ceph_manager.get_osd_status()
         self.in_osds = osd_status['in']
         self.live_osds = osd_status['live']

@@ -124,7 +124,7 @@ class Thrasher(gevent.Greenlet):
             if random.uniform(0,1) < (float(delay) / cleanint):
                 while len(self.dead_osds) > maxdead:
                     self.revive_osd()
-                self.ceph_manager.wait_till_clean(
+                self.ceph_manager.wait_for_recovery(
                     timeout=self.config.get('timeout')
                     )
             self.choose_action()()

@@ -226,6 +226,14 @@ class CephManager:
                 num += 1
         return num

+    def get_num_active_recovered(self):
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if pg['state'].count('active') and not pg['state'].count('recovering'):
+                num += 1
+        return num
+
     def get_num_active(self):
         pgs = self.get_pg_stats()
         num = 0

@@ -237,8 +245,11 @@ class CephManager:
     def is_clean(self):
         return self.get_num_active_clean() == self.get_num_pgs()

-    def wait_till_clean(self, timeout=None):
-        self.log("waiting till clean")
+    def is_recovered(self):
+        return self.get_num_active_recovered() == self.get_num_pgs()
+
+    def wait_for_clean(self, timeout=None):
+        self.log("waiting for clean")
         start = time.time()
         num_active_clean = self.get_num_active_clean()
         while not self.is_clean():

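To make the clean/recovered distinction concrete, here is an illustrative, hand-written set of PG entries in the shape returned by get_pg_stats(), assuming get_num_active_clean() counts states containing both 'active' and 'clean' (that method is not shown in this diff):

    # Illustrative only: fabricated PG entries to show which states each
    # predicate counts; 'state' strings follow Ceph's PG state naming.
    pgs = [
        {'state': 'active+clean'},               # clean and recovered
        {'state': 'active+degraded'},            # recovered, but not clean
        {'state': 'active+recovering+degraded'}, # neither: still recovering
    ]

    # assumed condition used by get_num_active_clean()
    num_clean = sum(1 for pg in pgs
                    if pg['state'].count('active') and pg['state'].count('clean'))
    # condition used by the new get_num_active_recovered()
    num_recovered = sum(1 for pg in pgs
                        if pg['state'].count('active')
                        and not pg['state'].count('recovering'))

    assert num_clean == 1      # only the fully replicated PG
    assert num_recovered == 2  # degraded-but-idle PGs also count as recovered
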
@@ -252,6 +263,21 @@ class CephManager:
             time.sleep(3)
         self.log("clean!")

+    def wait_for_recovery(self, timeout=None):
+        self.log("waiting for recovery to complete")
+        start = time.time()
+        num_active_recovered = self.get_num_active_recovered()
+        while not self.is_recovered():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'failed to recover before timeout expired'
+            cur_active_recovered = self.get_num_active_recovered()
+            if cur_active_recovered != num_active_recovered:
+                start = time.time()
+                num_active_recovered = cur_active_recovered
+            time.sleep(3)
+        self.log("recovered!")
+
     def osd_is_up(self, osd):
         osds = self.get_osd_dump()
         return osds[osd]['up'] > 0

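Note that the wait loop above restarts its timer whenever the recovered-PG count changes, so the timeout bounds time without progress rather than total wall-clock time. A standalone sketch of that pattern with generic names (not from this diff):

    import time

    def wait_until(predicate, get_progress, timeout=None, interval=3):
        """Poll until predicate() is true. The timeout only trips if
        get_progress() stops changing for `timeout` seconds."""
        start = time.time()
        progress = get_progress()
        while not predicate():
            if timeout is not None:
                assert time.time() - start < timeout, 'stalled before timeout expired'
            current = get_progress()
            if current != progress:
                start = time.time()   # progress was made: restart the clock
                progress = current
            time.sleep(interval)
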
@@ -45,7 +45,7 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_clean()

     # something that is always there
     dummyfile = '/etc/fstab'

@@ -60,7 +60,7 @@ def task(ctx, config):

     manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.1', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()

     # create old objects
     for f in range(1, 10):

@@ -135,7 +135,7 @@ def task(ctx, config):
     manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
     manager.raw_cluster_cmd('tell', 'osd.0', 'flush_pg_stats')
     manager.raw_cluster_cmd('tell', 'osd.2', 'flush_pg_stats')
-    manager.wait_till_clean()
+    manager.wait_for_recovery()

     # verify result
     for f in range(1, 10):

@@ -149,4 +149,4 @@ def task(ctx, config):
     # see if osd.1 can cope
     manager.revive_osd(1)
     manager.wait_till_osd_is_up(1)
-    manager.wait_till_clean()
+    manager.wait_for_clean()

@@ -82,4 +82,4 @@ def task(ctx, config):
     finally:
         log.info('joining thrashosds')
         thrash_proc.do_join()
-        manager.wait_till_clean(config.get('timeout', 360))
+        manager.wait_for_recovery(config.get('timeout', 360))