mirror of
https://github.com/ceph/ceph
synced 2025-02-23 11:07:35 +00:00
Merge PR #43613 into master
* refs/pull/43613/head:
	qa: lengthen health warning wait

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
This commit is contained in:
commit
10d8c7a4a5
@ -156,6 +156,7 @@ class CephTestCase(unittest.TestCase):
|
||||
log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
|
||||
return False
|
||||
|
||||
log.info(f"waiting {timeout}s for health warning matching {pattern}")
|
||||
self.wait_until_true(seen_health_warning, timeout)
|
||||
|
||||
def wait_for_health_clear(self, timeout):
|
||||
|
@ -478,6 +478,17 @@ class MDSCluster(CephCluster):
|
||||
for fs in self.status().get_filesystems():
|
||||
Filesystem(ctx=self._ctx, fscid=fs['id']).destroy()
|
||||
|
||||
@property
def beacon_timeout(self):
    """
    Generate an acceptable timeout for the mons to drive some MDSMap change
    because of missed beacons from some MDS. This involves looking up the
    grace period in use by the mons and adding an acceptable buffer.

    :return: twice the mons' ``mds_beacon_grace`` value plus a fixed buffer
             of 15, as a float. Callers pass it as the ``timeout`` of
             ``wait_until_true`` / ``wait_for_health``.
    """
    # The config value is coerced with float(), so a string result from
    # get_config() is accepted as well as a number.
    grace = float(self.get_config("mds_beacon_grace", service_type="mon"))

    # Two full grace periods plus a constant 15 of slack: the previous
    # per-test "grace * 2" waits were replaced by this longer, shared value.
    return grace * 2 + 15
|
||||
|
||||
|
||||
class Filesystem(MDSCluster):
|
||||
"""
|
||||
|
@ -306,8 +306,6 @@ class TestFailover(CephFSTestCase):
|
||||
in thrashing tests.
|
||||
"""
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
(original_active, ) = self.fs.get_active_names()
|
||||
original_standbys = self.mds_cluster.get_standby_daemons()
|
||||
|
||||
@ -321,7 +319,7 @@ class TestFailover(CephFSTestCase):
|
||||
|
||||
log.info("Waiting for promotion of one of the original standbys {0}".format(
|
||||
original_standbys))
|
||||
self.wait_until_true(promoted, timeout=grace*2)
|
||||
self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
|
||||
|
||||
# Start the original rank 0 daemon up again, see that he becomes a standby
|
||||
self.fs.mds_restart(original_active)
|
||||
@ -343,8 +341,6 @@ class TestFailover(CephFSTestCase):
|
||||
if not require_active:
|
||||
self.skipTest("fuse_require_active_mds is not set")
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
# Check it's not laggy to begin with
|
||||
(original_active, ) = self.fs.get_active_names()
|
||||
self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
|
||||
@ -367,7 +363,7 @@ class TestFailover(CephFSTestCase):
|
||||
|
||||
return True
|
||||
|
||||
self.wait_until_true(laggy, grace * 2)
|
||||
self.wait_until_true(laggy, self.fs.beacon_timeout)
|
||||
with self.assertRaises(CommandFailedError):
|
||||
self.mounts[0].mount_wait()
|
||||
|
||||
@ -379,8 +375,6 @@ class TestFailover(CephFSTestCase):
|
||||
# Need all my standbys up as well as the active daemons
|
||||
self.wait_for_daemon_start()
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
standbys = self.mds_cluster.get_standby_daemons()
|
||||
self.assertGreaterEqual(len(standbys), 1)
|
||||
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
|
||||
@ -388,8 +382,7 @@ class TestFailover(CephFSTestCase):
|
||||
# Kill a standby and check for warning
|
||||
victim = standbys.pop()
|
||||
self.fs.mds_stop(victim)
|
||||
log.info("waiting for insufficient standby daemon warning")
|
||||
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
|
||||
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
|
||||
|
||||
# restart the standby, see that he becomes a standby, check health clears
|
||||
self.fs.mds_restart(victim)
|
||||
@ -403,8 +396,7 @@ class TestFailover(CephFSTestCase):
|
||||
standbys = self.mds_cluster.get_standby_daemons()
|
||||
self.assertGreaterEqual(len(standbys), 1)
|
||||
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
|
||||
log.info("waiting for insufficient standby daemon warning")
|
||||
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
|
||||
self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
|
||||
|
||||
# Set it to 0
|
||||
self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
|
||||
@ -420,7 +412,6 @@ class TestFailover(CephFSTestCase):
|
||||
|
||||
self.mount_a.umount_wait()
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
|
||||
|
||||
mds_0 = self.fs.get_rank(rank=0, status=status)
|
||||
@ -428,7 +419,7 @@ class TestFailover(CephFSTestCase):
|
||||
self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
|
||||
self.wait_until_true(
|
||||
lambda: "laggy_since" in self.fs.get_rank(),
|
||||
timeout=grace * 2
|
||||
timeout=self.fs.beacon_timeout
|
||||
)
|
||||
|
||||
self.fs.rank_fail(rank=1)
|
||||
@ -441,7 +432,7 @@ class TestFailover(CephFSTestCase):
|
||||
self.fs.rank_signal(signal.SIGCONT, rank=0)
|
||||
self.wait_until_true(
|
||||
lambda: "laggy_since" not in self.fs.get_rank(rank=0),
|
||||
timeout=grace * 2
|
||||
timeout=self.fs.beacon_timeout
|
||||
)
|
||||
|
||||
# mds.b will be stuck at 'reconnect' state if snapserver gets confused
|
||||
|
@ -168,15 +168,13 @@ done
|
||||
# Kill the rank 0
|
||||
self.fs.mds_stop(original_active)
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
def promoted():
|
||||
active = self.fs.get_active_names()
|
||||
return active and active[0] in original_standbys
|
||||
|
||||
log.info("Waiting for promotion of one of the original standbys {0}".format(
|
||||
original_standbys))
|
||||
self.wait_until_true(promoted, timeout=grace*2)
|
||||
self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
|
||||
|
||||
self._check_task_status_na()
|
||||
|
||||
|
@ -70,8 +70,6 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.set_max_mds(2)
|
||||
status = self.fs.wait_for_daemons()
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
# setup subtrees
|
||||
self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
|
||||
self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
|
||||
@ -92,7 +90,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_freeze(True, rank=0)
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank0['name']);
|
||||
|
||||
self.fs.rank_fail(rank=0)
|
||||
@ -120,7 +118,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_freeze(True, rank=1) # prevent failover...
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*3);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank0['name']);
|
||||
|
||||
self.fs.rank_signal(signal.SIGKILL, rank=1)
|
||||
@ -168,7 +166,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_freeze(True, rank=1) # prevent failover...
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank1['name']);
|
||||
|
||||
self.mount_a.kill()
|
||||
@ -210,7 +208,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status)
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank1['name']);
|
||||
|
||||
self.mount_a.kill()
|
||||
@ -223,7 +221,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.wait_for_daemon_start([rank1['name']])
|
||||
|
||||
# rollback triggers assertion
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank0['name']);
|
||||
self.fs.rank_fail(rank=0)
|
||||
self.fs.mds_restart(rank0['name'])
|
||||
@ -244,8 +242,6 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.set_max_mds(3)
|
||||
status = self.fs.wait_for_daemons()
|
||||
|
||||
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
|
||||
|
||||
self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
|
||||
self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
|
||||
self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
|
||||
@ -302,7 +298,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_freeze(True, rank=2)
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank2['name']);
|
||||
|
||||
# mksnap should wait for notify ack from mds.2
|
||||
@ -328,7 +324,7 @@ class TestSnapshots(CephFSTestCase):
|
||||
self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status)
|
||||
last_created = self._get_last_created_snap(rank=0)
|
||||
proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
|
||||
self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
|
||||
self.delete_mds_coredump(rank2['name']);
|
||||
|
||||
self.mount_a.kill()
|
||||
|
Loading…
Reference in New Issue
Block a user