tasks: create ceph-mgr tests

Signed-off-by: John Spray <john.spray@redhat.com>
This commit is contained in:
John Spray 2016-07-16 22:16:53 +01:00
parent a9df9e088d
commit 38c23c1841
3 changed files with 166 additions and 0 deletions

0
tasks/mgr/__init__.py Normal file
View File

View File

@@ -0,0 +1,85 @@
from unittest import case
import json
from teuthology import misc
from tasks.ceph_test_case import CephTestCase
# TODO move definition of CephCluster
from tasks.cephfs.filesystem import CephCluster
class MgrCluster(CephCluster):
    """
    Convenience wrapper around the ceph-mgr daemons in a teuthology
    cluster: start/stop/fail individual managers and inspect the mgrmap.

    Raises RuntimeError at construction time if the cluster has no
    'mgr' roles at all.
    """
    def __init__(self, ctx):
        super(MgrCluster, self).__init__(ctx)
        self.mgr_ids = list(misc.all_roles_of_type(ctx.cluster, 'mgr'))

        if len(self.mgr_ids) == 0:
            raise RuntimeError(
                "This task requires at least one manager daemon")

        # Map of mgr id -> teuthology daemon handle, for stop/restart.
        self.mgr_daemons = dict(
            [(mgr_id, self._ctx.daemons.get_daemon('mgr', mgr_id)) for mgr_id
             in self.mgr_ids])

    @property
    def admin_remote(self):
        """Remote of the first mon, used for running admin commands."""
        first_mon = misc.get_first_mon(self._ctx, None)
        # BUGFIX: dict.iterkeys() is Python-2-only; .keys() unpacks fine on
        # both Python 2 (list) and Python 3 (view), and we expect exactly
        # one remote for the first mon role.
        (result,) = self._ctx.cluster.only(first_mon).remotes.keys()
        return result

    def mgr_stop(self, mgr_id):
        """Stop the daemon process for `mgr_id` (it stays in the map)."""
        self.mgr_daemons[mgr_id].stop()

    def mgr_fail(self, mgr_id):
        """Mark `mgr_id` failed via the mons ('ceph mgr fail <id>')."""
        self.mon_manager.raw_cluster_cmd("mgr", "fail", mgr_id)

    def mgr_restart(self, mgr_id):
        """Restart the daemon process for `mgr_id`."""
        self.mgr_daemons[mgr_id].restart()

    def get_mgr_map(self):
        """Return the 'mgrmap' section of `ceph status` as a dict."""
        status = json.loads(
            self.mon_manager.raw_cluster_cmd("status", "--format=json-pretty"))

        return status["mgrmap"]

    def get_active_id(self):
        """Return the id of the currently active mgr ("" if none)."""
        return self.get_mgr_map()["active_name"]

    def get_standby_ids(self):
        """Return the list of standby mgr ids from the mgrmap."""
        return [s['name'] for s in self.get_mgr_map()["standbys"]]
class MgrTestCase(CephTestCase):
    """
    Base class for ceph-mgr tests: checks the required number of mgr
    daemons is present, then restarts them all to reach a clean state
    (one active, the rest standbys) before each test.
    """
    # Subclasses override this to demand more manager daemons.
    REQUIRE_MGRS = 1

    def setUp(self):
        super(MgrTestCase, self).setUp()

        # The test runner should have populated this
        assert self.mgr_cluster is not None

        cluster = self.mgr_cluster
        if len(cluster.mgr_ids) < self.REQUIRE_MGRS:
            raise case.SkipTest("Only have {0} manager daemons, "
                                "{1} are required".format(
                                    len(cluster.mgr_ids), self.REQUIRE_MGRS))

        # Restart all the daemons
        for d in cluster.mgr_daemons.values():
            d.stop()

        for mgr_id in cluster.mgr_ids:
            cluster.mgr_fail(mgr_id)

        for d in cluster.mgr_daemons.values():
            d.restart()

        # Wait for an active to come up
        self.wait_until_true(lambda: cluster.get_active_id() != "",
                             timeout=20)

        # Then wait until everyone else settles as a standby.
        expect_standbys = set(cluster.mgr_ids) - {cluster.get_active_id()}
        self.wait_until_true(
            lambda: set(cluster.get_standby_ids()) == expect_standbys,
            timeout=20)

View File

@@ -0,0 +1,81 @@
import logging
from tasks.mgr.mgr_test_case import MgrTestCase
log = logging.getLogger(__name__)
class TestFailover(MgrTestCase):
    """
    Exercise mgr active/standby transitions: beacon timeout of the
    active, explicit `ceph mgr fail`, and beacon timeout of a standby.
    """
    REQUIRE_MGRS = 2

    def test_timeout(self):
        """
        That when an active mgr stops responding, a standby is promoted
        after mon_mgr_beacon_grace.
        """
        # Query which mgr is active
        active_id = self.mgr_cluster.get_active_id()
        standby_ids = self.mgr_cluster.get_standby_ids()

        # Stop that daemon
        self.mgr_cluster.mgr_stop(active_id)

        # Assert that the other mgr becomes active
        self.wait_until_true(
            lambda: self.mgr_cluster.get_active_id() in standby_ids,
            timeout=60
        )

        # Bring the old active back; it should rejoin as a standby.
        self.mgr_cluster.mgr_restart(active_id)
        self.wait_until_true(
            lambda: active_id in self.mgr_cluster.get_standby_ids(),
            timeout=10
        )

    def test_explicit_fail(self):
        """
        That when a user explicitly fails a daemon, a standby immediately
        replaces it.
        :return:
        """
        # Query which mgr is active
        active_id = self.mgr_cluster.get_active_id()
        standby_ids = self.mgr_cluster.get_standby_ids()

        self.mgr_cluster.mgr_fail(active_id)

        # A standby should take over
        self.wait_until_true(
            lambda: self.mgr_cluster.get_active_id() in standby_ids,
            timeout=60
        )

        # The one we failed should come back as a standby (he isn't
        # really dead)
        self.wait_until_true(
            lambda: active_id in self.mgr_cluster.get_standby_ids(),
            timeout=10
        )

    def test_standby_timeout(self):
        """
        That when a standby daemon stops sending beacons, it is
        removed from the list of standbys
        :return:
        """
        active_id = self.mgr_cluster.get_active_id()
        standby_ids = self.mgr_cluster.get_standby_ids()

        # Kill the first standby and expect only it to drop out.
        victim = standby_ids[0]
        self.mgr_cluster.mgr_stop(victim)
        survivors = set(standby_ids) - {victim}

        self.wait_until_true(
            lambda: set(self.mgr_cluster.get_standby_ids()) == survivors,
            timeout=60
        )

        # The active must be unaffected by losing a standby.
        self.assertEqual(self.mgr_cluster.get_active_id(), active_id)