verify all osds start before checking health

Checking health alone isn't good enough, because it races with OSD startup: the cluster can report healthy while 0 (or any number fewer than the total) OSDs have started.
2025-03-11 02:39:05 +00:00 · 2012-01-08 15:14:18 -08:00 · 2012-01-08 15:14:18 -08:00 · 50463ffddd
commit 50463ffddd
parent f4883ebf09
2 changed files with 30 additions and 0 deletions
--- a/teuthology/misc.py
+++ b/teuthology/misc.py
@ -9,6 +9,7 @@ import time
 import urllib2
 import urlparse
 import yaml
+import json

 from .orchestra import run

@ -286,6 +287,31 @@ def wait_until_healthy(remote):
            break
        time.sleep(1)

+def wait_until_osds_up(cluster, remote):
+    """
+    Wait until all Ceph OSDs are booted.
+
+    Repeatedly runs ``ceph osd dump --format=json`` through ``remote``,
+    sleeping one second between attempts, and returns once the number
+    of entries in the dump's ``osds`` list equals the value returned by
+    ``num_instances_of_type(cluster, 'osd')``.  There is no timeout:
+    the loop only ends when the two counts match.
+
+    :param cluster: passed to ``num_instances_of_type`` to obtain the
+                    expected number of 'osd' instances.
+    :param remote: object whose ``run()`` method executes the ceph
+                   command; its result must expose the captured
+                   output as ``stdout``.
+    """
+    # Expected OSD count is computed once, before polling starts.
+    num_osds = num_instances_of_type(cluster, 'osd')
+    while True:
+        # Invoke the ceph CLI behind the enable-coredump and
+        # ceph-coverage wrappers, capturing stdout in memory.
+        # NOTE(review): the logger child is named 'health' although
+        # this is the OSD check -- confirm that is intended.
+        r = remote.run(
+            args=[
+                '/tmp/cephtest/enable-coredump',
+                '/tmp/cephtest/binary/usr/local/bin/ceph-coverage',
+                '/tmp/cephtest/archive/coverage',
+                '/tmp/cephtest/binary/usr/local/bin/ceph',
+                '-c', '/tmp/cephtest/ceph.conf',
+                '--concise',
+                'osd', 'dump', '--format=json'
+                ],
+            stdout=StringIO(),
+            logger=log.getChild('health'),
+            )
+        out = r.stdout.getvalue()
+        # The first line of output is dropped before parsing;
+        # presumably it is a non-JSON header line -- TODO confirm.
+        j = json.loads('\n'.join(out.split('\n')[1:]))
+        # NOTE(review): this counts every entry in 'osds', not only
+        # entries marked as up.  If the dump also lists OSDs that exist
+        # but have not booted, the loop would exit too early -- verify
+        # against the `ceph osd dump` JSON schema.
+        up = len(j['osds'])
+        log.debug('%d of %d OSDs are up' % (up, num_osds))
+        if up == num_osds:
+            break
+        time.sleep(1)
+
 def wait_until_fuse_mounted(remote, fuse, mountpoint):
    while True:
        proc = remote.run(
--- a/teuthology/task/ceph.py
+++ b/teuthology/task/ceph.py
@ -904,6 +904,10 @@ def healthy(ctx, config):
    log.info('Waiting until ceph is healthy...')
    firstmon = teuthology.get_first_mon(ctx, config)
    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+    teuthology.wait_until_osds_up(
+        cluster=ctx.cluster,
+        remote=mon0_remote
+        )
    teuthology.wait_until_healthy(
        remote=mon0_remote,
        )