Merge pull request #47079 from neesingh-rh/fix_56483

mgr/stats: missing clients in perf stats command output.

Reviewed-by: Venky Shankar <vshankar@redhat.com>
Reviewed-by: Jos Collin <jcollin@redhat.com>
2025-02-19 08:57:27 +00:00 · 2022-08-24 09:49:47 +05:30 · 2022-08-24 09:49:47 +05:30 · 49e66ed0c6
commit 49e66ed0c6
parent cadd5fcfc0 e717e216ea
2 changed files with 91 additions and 10 deletions
--- a/qa/tasks/cephfs/test_mds_metrics.py
+++ b/qa/tasks/cephfs/test_mds_metrics.py
@@ -511,3 +511,79 @@ class TestMDSMetrics(CephFSTestCase):
            if not (client_metadata[i]['valid_metrics']):
                raise RuntimeError("valid_metrics not found!")

+    def test_perf_stats_stale_metrics_with_multiple_filesystem(self):
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
+                    "enable_multiple", "true", "--yes-i-really-mean-it")
+
+        # creating filesystem
+        fs_b = self._setup_fs(fs_name="fs2")
+
+        # Mount a client on fs_b
+        self.mount_b.mount_wait(cephfs_name=fs_b.name)
+        self.mount_b.write_n_mb("test.bin", 1)
+        self.mount_b.path_to_ino("test.bin")
+        self.mount_b.create_files()
+
+        # creating another filesystem
+        fs_a = self._setup_fs(fs_name="fs1")
+
+        # Mount a client on fs_a
+        self.mount_a.mount_wait(cephfs_name=fs_a.name)
+        self.mount_a.write_n_mb("pad.bin", 1)
+        self.mount_a.write_n_mb("test.bin", 2)
+        self.mount_a.path_to_ino("test.bin")
+        self.mount_a.create_files()
+
+        # validate
+        valid, metrics = self._get_metrics(
+            self.verify_mds_metrics(client_count=1, mul_fs=[fs_a.id, fs_b.id]), 30)
+        log.debug(f"metrics={metrics}")
+        self.assertTrue(valid)
+
+        # get mounted client's entries from the global_metrics.
+        client_a_name = f'client.{self.mount_a.get_global_id()}'
+
+        global_metrics = metrics['global_metrics']
+        client_a_metrics = global_metrics.get("fs1", {}).get(client_a_name, {})
+
+        # fail active mds of fs_a
+        fs_a_mds = fs_a.get_active_names()[0]
+        self.mds_cluster.mds_fail(fs_a_mds)
+        fs_a.wait_for_state('up:active', rank=0, timeout=30)
+
+        # spread directory per rank
+        self._spread_directory_on_all_ranks(fs_a.id)
+
+        # spread some I/O
+        self._do_spread_io_all_clients(fs_a.id)
+
+        # wait a bit for mgr to get updated metrics
+        time.sleep(5)
+
+        # validate
+        try:
+            valid, metrics_new = self._get_metrics(
+                self.verify_mds_metrics(client_count=1, mul_fs=[fs_a.id, fs_b.id]), 30)
+            log.debug(f'metrics={metrics_new}')
+            self.assertTrue(valid)
+
+            client_metadata = metrics_new['client_metadata']
+            client_a_metadata = client_metadata.get("fs1", {}).get(client_a_name, {})
+
+            global_metrics = metrics_new['global_metrics']
+            client_a_metrics_new = global_metrics.get("fs1", {}).get(client_a_name, {})
+
+            # the metrics should be different for the test to succeed.
+            self.assertTrue(client_a_metadata and client_a_metrics_new
+                            and (client_a_metrics_new != client_a_metrics),
+                            "Invalid 'ceph fs perf stats' metrics after"
+                            f" rank0 mds of {fs_a.name} failover")
+        except MaxWhileTries:
+            raise RuntimeError("Failed to fetch `ceph fs perf stats` metrics")
+        finally:
+            # cleanup test directories
+            self._cleanup_test_dirs()
+
--- a/src/pybind/mgr/stats/fs/perf_stats.py
+++ b/src/pybind/mgr/stats/fs/perf_stats.py
@@ -195,15 +195,17 @@ class FSPerfStats(object):
            gid_state = FSPerfStats.get_rank0_mds_gid_state(self.module.get('fs_map'))
            if not gid_state:
                return
-            rank0_gid, state = gid_state
-            if (rank0_gid and rank0_gid != self.prev_rank0_gid and state == 'up:active'):
-                #the new rank0 MDS is up:active
-                ua_last_updated = time.monotonic()
-                if (self.rqtimer and self.rqtimer.is_alive()):
-                    self.rqtimer.cancel()
-                self.rqtimer = Timer(REREGISTER_TIMER_INTERVAL,
-                                     self.re_register_queries, args=(rank0_gid, ua_last_updated,))
-                self.rqtimer.start()
+            for value in gid_state:
+                rank0_gid, state = value
+                if (rank0_gid and rank0_gid != self.prev_rank0_gid and state == 'up:active'):
+                    #the new rank0 MDS is up:active
+                    ua_last_updated = time.monotonic()
+                    if (self.rqtimer and self.rqtimer.is_alive()):
+                        self.rqtimer.cancel()
+                    self.rqtimer = Timer(REREGISTER_TIMER_INTERVAL,
+                                         self.re_register_queries,
+                                         args=(rank0_gid, ua_last_updated,))
+                    self.rqtimer.start()

    def re_register_queries(self, rank0_gid, ua_last_updated):
        #reregister queries if the metrics are the latest. Otherwise reschedule the timer and
@@ -221,12 +223,15 @@ class FSPerfStats(object):

    @staticmethod
    def get_rank0_mds_gid_state(fsmap):
+        gid_state = []
        for fs in fsmap['filesystems']:
            mds_map = fs['mdsmap']
            if mds_map is not None:
                for mds_id, mds_status in mds_map['info'].items():
                    if mds_status['rank'] == 0:
-                        return mds_status['gid'], mds_status['state']
+                        gid_state.append([mds_status['gid'], mds_status['state']])
+        if gid_state:
+            return gid_state
        logger.warn("No rank0 mds in the fsmap")

    def update_client_meta(self):