verify all osds start before checking health

Just checking health isn't good enough, since it races with OSD startup:
we can have a healthy cluster with 0 (or something else < total) OSDs.
This commit is contained in:
Sage Weil 2012-01-08 15:14:18 -08:00
parent f4883ebf09
commit 50463ffddd
2 changed files with 30 additions and 0 deletions

View File

@ -9,6 +9,7 @@ import time
import urllib2
import urlparse
import yaml
import json
from .orchestra import run
@ -286,6 +287,31 @@ def wait_until_healthy(remote):
break
time.sleep(1)
def _osd_is_up(osd):
    """Return True unless an ``osd dump`` entry reports the OSD as not up."""
    # NOTE(review): the 'up' flag and the 'state' list are the field names
    # assumed for Ceph's JSON OSD map dump -- confirm against the ceph
    # version under test.
    if 'up' in osd:
        return bool(osd['up'])
    if 'state' in osd:
        return 'up' in osd['state']
    # The entry carries no status information at all: keep counting it,
    # which is what the previous len(j['osds']) check did.
    return True


def wait_until_osds_up(cluster, remote):
    """
    Wait until all Ceph OSDs are booted.

    Runs ``ceph osd dump --format=json`` on ``remote`` once per second and
    returns as soon as the number of OSDs reported as up equals the number
    of 'osd' instances configured in ``cluster``.

    An OSD that merely appears in the dump is not enough: entries that are
    present but marked down are not counted, otherwise this wait would
    still race with OSD startup.

    There is no timeout; this blocks for as long as any OSD stays down.

    :param cluster: cluster object handed to ``num_instances_of_type`` to
                    find out how many OSDs are expected.
    :param remote: remote on which the ``ceph`` command is run.
    """
    num_osds = num_instances_of_type(cluster, 'osd')
    while True:
        r = remote.run(
            args=[
                '/tmp/cephtest/enable-coredump',
                '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
                '/tmp/cephtest/archive/coverage',
                '/tmp/cephtest/binary/usr/local/bin/ceph',
                '-c', '/tmp/cephtest/ceph.conf',
                '--concise',
                'osd', 'dump', '--format=json'
                ],
            stdout=StringIO(),
            logger=log.getChild('health'),
            )
        out = r.stdout.getvalue()
        # The first line of the output is discarded before parsing;
        # presumably it is a status line that precedes the JSON document
        # -- TODO confirm.
        j = json.loads('\n'.join(out.split('\n')[1:]))
        # Count only OSDs that are actually up, not every entry in the map.
        up = sum(1 for osd in j['osds'] if _osd_is_up(osd))
        log.debug('%d of %d OSDs are up', up, num_osds)
        if up == num_osds:
            break
        time.sleep(1)
def wait_until_fuse_mounted(remote, fuse, mountpoint):
while True:
proc = remote.run(

View File

@ -904,6 +904,10 @@ def healthy(ctx, config):
log.info('Waiting until ceph is healthy...')
firstmon = teuthology.get_first_mon(ctx, config)
(mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
teuthology.wait_until_osds_up(
cluster=ctx.cluster,
remote=mon0_remote
)
teuthology.wait_until_healthy(
remote=mon0_remote,
)