Merge PR #43613 into master

* refs/pull/43613/head:
	qa: lengthen health warning wait

Reviewed-by: Jeff Layton <jlayton@redhat.com>
Reviewed-by: Xiubo Li <xiubli@redhat.com>
Patrick Donnelly 2021-11-04 16:52:23 -04:00
commit 10d8c7a4a5
5 changed files with 26 additions and 29 deletions


@@ -156,6 +156,7 @@ class CephTestCase(unittest.TestCase):
             log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
             return False
 
+        log.info(f"waiting {timeout}s for health warning matching {pattern}")
         self.wait_until_true(seen_health_warning, timeout)
 
     def wait_for_health_clear(self, timeout):


@@ -478,6 +478,17 @@ class MDSCluster(CephCluster):
         for fs in self.status().get_filesystems():
             Filesystem(ctx=self._ctx, fscid=fs['id']).destroy()
 
+    @property
+    def beacon_timeout(self):
+        """
+        Generate an acceptable timeout for the mons to drive some MDSMap change
+        because of missed beacons from some MDS. This involves looking up the
+        grace period in use by the mons and adding an acceptable buffer.
+        """
+
+        grace = float(self.get_config("mds_beacon_grace", service_type="mon"))
+        return grace*2+15
+
 
 class Filesystem(MDSCluster):
     """


@@ -306,8 +306,6 @@ class TestFailover(CephFSTestCase):
         in thrashing tests.
         """
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         (original_active, ) = self.fs.get_active_names()
         original_standbys = self.mds_cluster.get_standby_daemons()
@ -321,7 +319,7 @@ class TestFailover(CephFSTestCase):
log.info("Waiting for promotion of one of the original standbys {0}".format(
original_standbys))
self.wait_until_true(promoted, timeout=grace*2)
self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
# Start the original rank 0 daemon up again, see that he becomes a standby
self.fs.mds_restart(original_active)
@@ -343,8 +341,6 @@ class TestFailover(CephFSTestCase):
         if not require_active:
             self.skipTest("fuse_require_active_mds is not set")
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         # Check it's not laggy to begin with
         (original_active, ) = self.fs.get_active_names()
         self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active))
@@ -367,7 +363,7 @@ class TestFailover(CephFSTestCase):
             return True
 
-        self.wait_until_true(laggy, grace * 2)
+        self.wait_until_true(laggy, self.fs.beacon_timeout)
 
         with self.assertRaises(CommandFailedError):
             self.mounts[0].mount_wait()
@@ -379,8 +375,6 @@ class TestFailover(CephFSTestCase):
         # Need all my standbys up as well as the active daemons
         self.wait_for_daemon_start()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         standbys = self.mds_cluster.get_standby_daemons()
         self.assertGreaterEqual(len(standbys), 1)
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
@@ -388,8 +382,7 @@ class TestFailover(CephFSTestCase):
         # Kill a standby and check for warning
         victim = standbys.pop()
         self.fs.mds_stop(victim)
-        log.info("waiting for insufficient standby daemon warning")
-        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
 
         # restart the standby, see that he becomes a standby, check health clears
         self.fs.mds_restart(victim)
@@ -403,8 +396,7 @@ class TestFailover(CephFSTestCase):
         standbys = self.mds_cluster.get_standby_daemons()
         self.assertGreaterEqual(len(standbys), 1)
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
-        log.info("waiting for insufficient standby daemon warning")
-        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout)
 
         # Set it to 0
         self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
@@ -420,7 +412,6 @@ class TestFailover(CephFSTestCase):
         self.mount_a.umount_wait()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
         monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds"))
 
         mds_0 = self.fs.get_rank(rank=0, status=status)
@@ -428,7 +419,7 @@ class TestFailover(CephFSTestCase):
         self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status)
         self.wait_until_true(
             lambda: "laggy_since" in self.fs.get_rank(),
-            timeout=grace * 2
+            timeout=self.fs.beacon_timeout
         )
 
         self.fs.rank_fail(rank=1)
@@ -441,7 +432,7 @@ class TestFailover(CephFSTestCase):
         self.fs.rank_signal(signal.SIGCONT, rank=0)
         self.wait_until_true(
             lambda: "laggy_since" not in self.fs.get_rank(rank=0),
-            timeout=grace * 2
+            timeout=self.fs.beacon_timeout
         )
 
         # mds.b will be stuck at 'reconnect' state if snapserver gets confused
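The remaining hunks repeat this substitution. As a before/after sketch of the pattern (condition stands in for whatever predicate a given test polls; the assumed context is a CephFSTestCase with self.fs set up):

# before: each test derived its own wait from the mon-side grace period
grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
self.wait_until_true(condition, timeout=grace * 2)

# after: one shared definition, with the 15s buffer built in
self.wait_until_true(condition, timeout=self.fs.beacon_timeout)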


@@ -168,15 +168,13 @@ done
         # Kill the rank 0
         self.fs.mds_stop(original_active)
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         def promoted():
             active = self.fs.get_active_names()
             return active and active[0] in original_standbys
 
         log.info("Waiting for promotion of one of the original standbys {0}".format(
             original_standbys))
-        self.wait_until_true(promoted, timeout=grace*2)
+        self.wait_until_true(promoted, timeout=self.fs.beacon_timeout)
 
         self._check_task_status_na()


@@ -70,8 +70,6 @@ class TestSnapshots(CephFSTestCase):
         self.fs.set_max_mds(2)
         status = self.fs.wait_for_daemons()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         # setup subtrees
         self.mount_a.run_shell(["mkdir", "-p", "d1/dir"])
         self.mount_a.setfattr("d1", "ceph.dir.pin", "1")
@@ -92,7 +90,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_freeze(True, rank=0)
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
             proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank0['name']);
             self.fs.rank_fail(rank=0)
@@ -120,7 +118,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_freeze(True, rank=1) # prevent failover...
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status)
             proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*3);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank0['name']);
 
             self.fs.rank_signal(signal.SIGKILL, rank=1)
@@ -168,7 +166,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_freeze(True, rank=1) # prevent failover...
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status)
             proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank1['name']);
 
             self.mount_a.kill()
@@ -210,7 +208,7 @@ class TestSnapshots(CephFSTestCase):
         self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status)
         self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status)
         proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False)
-        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=grace*2);
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout);
         self.delete_mds_coredump(rank1['name']);
 
         self.mount_a.kill()
@@ -223,7 +221,7 @@ class TestSnapshots(CephFSTestCase):
         self.wait_for_daemon_start([rank1['name']])
 
         # rollback triggers assertion
-        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=grace*2);
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout);
         self.delete_mds_coredump(rank0['name']);
         self.fs.rank_fail(rank=0)
         self.fs.mds_restart(rank0['name'])
@@ -244,8 +242,6 @@ class TestSnapshots(CephFSTestCase):
         self.fs.set_max_mds(3)
         status = self.fs.wait_for_daemons()
 
-        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
-
         self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"])
         self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"])
         self.mount_a.setfattr("d0", "ceph.dir.pin", "0")
@@ -302,7 +298,7 @@ class TestSnapshots(CephFSTestCase):
         self.fs.rank_freeze(True, rank=2)
         self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status)
         proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False)
-        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+        self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
         self.delete_mds_coredump(rank2['name']);
 
         # mksnap should wait for notify ack from mds.2
@@ -328,7 +324,7 @@ class TestSnapshots(CephFSTestCase):
             self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status)
             last_created = self._get_last_created_snap(rank=0)
             proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False)
-            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=grace*2);
+            self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout);
             self.delete_mds_coredump(rank2['name']);
 
             self.mount_a.kill()