qa: Added mon connection score tests

When we deploy 3 MONs, check that the connection
scores are clean, with a 60-second grace period.

Fixes: https://tracker.ceph.com/issues/65695

Signed-off-by: Kamoltat <ksirivad@redhat.com>
Kamoltat 2024-07-17 22:26:55 +00:00
parent c05d4e2716
commit ed7f4e8829
3 changed files with 139 additions and 7 deletions
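
The test exercises the "connection scores" that each mon tracks about its peers for the connectivity election strategy. As a minimal sketch of how those scores can be dumped and parsed outside of teuthology, here is the same admin-socket command the new test issues, wrapped in a hypothetical helper (the helper name is made up, and it assumes it runs on the host that owns the mon's admin socket):

import json
import subprocess

def dump_connection_scores(mon_id):
    # Same command the test below issues via mon_manager.raw_cluster_cmd():
    # ask the mon's admin socket for its connection score report.
    out = subprocess.check_output(
        ["ceph", "daemon", "mon.{}".format(mon_id),
         "connection", "scores", "dump"])
    return json.loads(out)

scores = dump_connection_scores("a")
print(scores["rank"], len(scores["reports"]))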


@@ -0,0 +1,40 @@
roles:
- - mon.a
  - mon.b
  - mon.c
  - osd.0
  - osd.1
  - osd.2
  - mgr.x
  - client.0
openstack:
- volumes: # attached to each instance
    count: 3
    size: 10 # GB
tasks:
- install:
- ceph:
    pre-mgr-commands:
      - sudo ceph config set mgr mgr_pool false --force
    log-ignorelist:
      - overall HEALTH_
      - \(OSDMAP_FLAGS\)
      - \(OSD_
      - \(PG_
      - \(POOL_
      - \(CACHE_POOL_
      - \(OBJECT_
      - \(SLOW_OPS\)
      - \(REQUEST_SLOW\)
      - \(TOO_FEW_PGS\)
      - slow request
      - \(POOL_APP_NOT_ENABLED\)
      - overall HEALTH_
      - \(MGR_DOWN\)
      - \(MON_DOWN\)
      - \(PG_AVAILABILITY\)
      - \(SLOW_OPS\)
- cephfs_test_runner:
    modules:
      - tasks.mon_connection_score

qa/tasks/ceph_test_case.py

@@ -353,13 +353,10 @@ class CephTestCase(unittest.TestCase, RunCephCmd):
         while True:
             if condition():
                 success_time_elapsed = 0
-                while success_time_elapsed < success_hold_time:
-                    if condition():
-                        success_time_elapsed += 1
-                        time.sleep(1)
-                        elapsed += 1
-                    else:
-                        break
+                while success_time_elapsed < success_hold_time and condition():
+                    success_time_elapsed += 1
+                    time.sleep(1)
+                    elapsed += 1
                 if success_time_elapsed == success_hold_time:
                     log.debug("wait_until_true_and_hold: success for {0}s".format(success_hold_time))
                     return
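
For readers skimming the hunk: the rework folds the inner if/else into the while condition, so a single failed re-check of condition() exits the hold loop and falls through to the success_hold_time comparison. A standalone sketch of the resulting hold-and-retry pattern (the signature, timeout bookkeeping, and exception below are illustrative assumptions, not the actual ceph_test_case implementation):

import time

def wait_until_true_and_hold(condition, timeout, success_hold_time):
    # Poll `condition` until it holds continuously for `success_hold_time`
    # seconds, or give up once roughly `timeout` seconds have elapsed.
    elapsed = 0
    while True:
        if condition():
            success_time_elapsed = 0
            # Any failed re-check drops out of this loop and restarts the hold.
            while success_time_elapsed < success_hold_time and condition():
                success_time_elapsed += 1
                time.sleep(1)
                elapsed += 1
            if success_time_elapsed == success_hold_time:
                return
        if elapsed >= timeout:
            raise RuntimeError(
                "condition did not hold for {}s within {}s".format(
                    success_hold_time, timeout))
        time.sleep(1)
        elapsed += 1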

qa/tasks/mon_connection_score.py

@@ -0,0 +1,95 @@
from tasks.ceph_test_case import CephTestCase
import json
import logging

log = logging.getLogger(__name__)


class TestStretchClusterNew(CephTestCase):
    CLUSTER = "ceph"
    MONS = {
        "a": {
            "rank": 0,
        },
        "b": {
            "rank": 1,
        },
        "c": {
            "rank": 2,
        }
    }
    WRITE_PERIOD = 10
    RECOVERY_PERIOD = WRITE_PERIOD * 6
    SUCCESS_HOLD_TIME = 10

    def setUp(self):
        """
        Set up the cluster for the test.
        """
        super(TestStretchClusterNew, self).setUp()

    def tearDown(self):
        """
        Clean up the cluster after the test.
        """
        super(TestStretchClusterNew, self).tearDown()

    def _check_connection_score(self):
        """
        Check the connection score of all the mons.
        """
        for mon, _ in self.MONS.items():
            # get the connection score
            cscore = self.ceph_cluster.mon_manager.raw_cluster_cmd(
                'daemon', 'mon.{}'.format(mon),
                'connection', 'scores', 'dump')
            # parse the connection score
            cscore = json.loads(cscore)
            # check if the current mon rank is correct
            if cscore["rank"] != self.MONS[mon]["rank"]:
                log.error(
                    "Rank mismatch {} != {}".format(
                        cscore["rank"], self.MONS[mon]["rank"]
                    )
                )
                return False
            # check if the current mon has a report for every peer and itself
            if len(cscore['reports']) != len(self.MONS):
                log.error(
                    "Reports count mismatch {}".format(cscore['reports'])
                )
                return False
            for report in cscore["reports"]:
                report_rank = []
                for peer in report["peer_scores"]:
                    # check if the peer is alive
                    if not peer["peer_alive"]:
                        log.error("Peer {} is not alive".format(peer))
                        return False
                    report_rank.append(peer["peer_rank"])
                # check if the report covers all the ranks with no duplicates
                expected_ranks = [
                    rank
                    for data in self.MONS.values()
                    for rank in data.values()
                ]
                if sorted(report_rank) != sorted(expected_ranks):
                    log.error("Rank mismatch in report {}".format(report))
                    return False
        log.info("Connection score is clean!")
        return True

    def test_connection_score(self):
        # check if all mons are in quorum
        self.ceph_cluster.mon_manager.wait_for_mon_quorum_size(3)
        # check if all connection scores reflect this
        self.wait_until_true_and_hold(
            lambda: self._check_connection_score(),
            # wait up to 4 minutes for the connection scores to recover
            timeout=self.RECOVERY_PERIOD * 4,
            # hold the clean connection score for 60 seconds
            success_hold_time=self.SUCCESS_HOLD_TIME * 6
        )
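
For reference, _check_connection_score() expects each dump to look roughly like the structure below (an illustrative example for a healthy 3-mon quorum; the field names are the ones the test reads, while the values and any omitted fields are made up):

example_dump_for_mon_a = {
    "rank": 0,
    "reports": [
        {
            "peer_scores": [
                {"peer_rank": 0, "peer_alive": True},
                {"peer_rank": 1, "peer_alive": True},
                {"peer_rank": 2, "peer_alive": True},
            ],
        },
        # ... one report per quorum member, three in total here
    ],
}

Note that the rank comparison in the checker compares sorted() copies of the rank lists rather than the return value of list.sort(), since list.sort() sorts in place and returns None, which would make the comparison meaningless.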