Merge pull request #56066 from rishabh-d-dave/mds-fail-confirm

mon,cephfs: require confirmation flag to bring down unhealthy MDS

Reviewed-by: Leonid Usov <leonid.usov@ibm.com>
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
Rishabh Dave 2024-05-03 13:13:34 +05:30 committed by GitHub
commit 18c7799cce
11 changed files with 280 additions and 7 deletions

@ -178,6 +178,11 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
and it can be disabled by using:
`ceph config set mgr mgr/volumes/snapshot_clone_no_wait false`
* CephFS: The commands "ceph mds fail" and "ceph fs fail" now require a
confirmation flag when some MDSs exhibit the health warning MDS_TRIM or
MDS_CACHE_OVERSIZED. This is to prevent an accidental MDS failover from
causing further delays in recovery.
>=18.0.0
* The RGW policy parser now rejects unknown principals by default. If you are

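For operators and scripts, the practical effect of the CephFS note above is that these two commands can now return EPERM unless the flag is supplied while one of those warnings is active. A minimal sketch of driving "ceph mds fail" from Python (the helper name and the use of subprocess are illustrative, not part of this change)::

    import subprocess

    def mds_fail(role_or_name: str, confirm: bool = False) -> None:
        # Without the flag, the monitor now refuses (EPERM, non-zero exit)
        # while the target MDS has MDS_TRIM or MDS_CACHE_OVERSIZED.
        cmd = ['ceph', 'mds', 'fail', role_or_name]
        if confirm:
            cmd.append('--yes-i-really-mean-it')
        subprocess.run(cmd, check=True)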
@ -117,4 +117,11 @@ the following method.
$ sudo rm -rf /var/lib/ceph/mds/ceph-${id}
.. note:: When an active MDS has either the health warning MDS_TRIM or
MDS_CACHE_OVERSIZED, the confirmation flag (``--yes-i-really-mean-it``)
must be passed, otherwise the command will fail. It is not recommended to
restart an MDS which has these warnings, since slow recovery at restart may
lead to more problems.
.. _MDS Config Reference: ../mds-config-ref

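Whether these warnings are currently raised can be checked from the same health report the tests in this change consume; a minimal sketch, assuming the "ceph" CLI is on PATH::

    import json
    import subprocess

    def mds_fail_needs_confirmation() -> bool:
        # 'ceph health detail --format json' lists active health checks
        # under the 'checks' key; the confirmation flag is only needed
        # when one of these two warnings is present.
        out = subprocess.check_output(
            ['ceph', 'health', 'detail', '--format', 'json'])
        checks = json.loads(out).get('checks', {})
        return any(w in checks for w in ('MDS_TRIM', 'MDS_CACHE_OVERSIZED'))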
@ -193,7 +193,11 @@ file system and MDS daemons down, use the ``ceph fs fail`` command:
::
ceph fs fail <fs_name>
ceph fs fail <fs_name> {--yes-i-really-mean-it}
.. note:: The confirmation flag is optional: it is only required when the
MDS is active and has the health warning MDS_TRIM or MDS_CACHE_OVERSIZED.
This command sets a file system flag to prevent standbys from
activating on the file system (the ``joinable`` flag).
@ -210,7 +214,11 @@ respawn as standbys. The file system will be left in a degraded state.
::
# For all ranks, 0-N:
ceph mds fail <fs_name>:<n>
ceph mds fail <fs_name>:<n> {--yes-i-really-mean-it}
.. note:: The confirmation flag is optional: it is only required when the
MDS is active and has the health warning MDS_TRIM or MDS_CACHE_OVERSIZED.
Once all ranks are inactive, the file system may also be deleted or left in
this state for other purposes (perhaps disaster recovery).

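Automation that brings a file system down can first try the plain command and add the flag only when the monitor refuses, which is the pattern adopted by the qa helper changed below; a hedged sketch using subprocess::

    import subprocess

    def fail_fs(fs_name: str) -> None:
        cmd = ['ceph', 'fs', 'fail', fs_name]
        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError:
            # Refused because an active MDS has MDS_TRIM or
            # MDS_CACHE_OVERSIZED; retry with the confirmation flag.
            subprocess.run(cmd + ['--yes-i-really-mean-it'], check=True)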
@ -5,6 +5,8 @@ overrides:
lockdep: true
log-ignorelist:
- missing required features
- \(MDS_CACHE_OVERSIZED\)
- \(MDS_TRIM\)
tasks:
- cephfs_test_runner:
fail_on_skip: false

@ -612,7 +612,12 @@ class FilesystemBase(MDSClusterBase):
self.run_ceph_cmd("fs", "reset", str(self.name), '--yes-i-really-mean-it')
def fail(self):
self.run_ceph_cmd("fs", "fail", str(self.name))
cmd = ["fs", "fail", str(self.name)]
try:
self.run_ceph_cmd(cmd)
except CommandFailedError:
cmd.append("--yes-i-really-mean-it")
self.run_ceph_cmd(cmd)
def set_flag(self, var, *args):
a = map(lambda x: str(x).lower(), args)
@ -1181,8 +1186,13 @@ class FilesystemBase(MDSClusterBase):
def rank_repaired(self, rank):
self.run_ceph_cmd("mds", "repaired", "{}:{}".format(self.id, rank))
def rank_fail(self, rank=0):
self.run_ceph_cmd("mds", "fail", "{}:{}".format(self.id, rank))
def rank_fail(self, rank=0, confirm=True):
cmd = f'mds fail {self.id}:{rank}'
try:
self.run_ceph_cmd(args=cmd)
except CommandFailedError:
cmd += ' --yes-i-really-mean-it'
self.run_ceph_cmd(args=cmd)
def rank_is_running(self, rank=0, status=None):
name = self.get_rank(rank=rank, status=status)['name']

@ -91,6 +91,38 @@ class TestAdminCommands(CephFSTestCase):
if overwrites:
self.run_ceph_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true')
def _get_unhealthy_mds_id(self, health_report, health_warn):
'''
Return the ID of the MDS for which the health warning in "health_warn"
has been generated.
'''
# "msg" should hold a string like this -
# 'mds.b(mds.0): Behind on trimming (865/10) max_segments: 10,
# num_segments: 86'
msg = health_report['checks'][health_warn]['detail'][0]['message']
mds_id = msg.split('(')[0]
mds_id = mds_id.replace('mds.', '')
return mds_id
def wait_till_health_warn(self, health_warn, active_mds_id, sleep=3,
tries=10):
errmsg = (f'Expected health warning "{health_warn}" to eventually '
'show up in output of command "ceph health detail". Tried '
f'{tries} times with interval of {sleep} seconds but the '
'health warning didn\'t turn up.')
with safe_while(sleep=sleep, tries=tries, action=errmsg) as proceed:
while proceed():
self.get_ceph_cmd_stdout(
f'tell mds.{active_mds_id} cache status')
health_report = json.loads(self.get_ceph_cmd_stdout(
'health detail --format json'))
if health_warn in health_report['checks']:
return
@classhook('_add_valid_tell')
class TestValidTell(TestAdminCommands):
@classmethod
@ -2154,3 +2186,166 @@ class TestPermErrMsg(CephFSTestCase):
args=(f'fs authorize {self.fs.name} {self.CLIENT_NAME} / '
f'{wrong_perm}'), retval=self.EXPECTED_ERRNO,
errmsgs=self.EXPECTED_ERRMSG)
class TestFSFail(TestAdminCommands):
MDSS_REQUIRED = 2
CLIENTS_REQUIRED = 1
def test_with_health_warn_oversize_cache(self):
'''
Test that, when the health warning MDS_CACHE_OVERSIZED is present for an
MDS, the command "ceph fs fail" fails without the confirmation flag and
passes when the confirmation flag is passed.
'''
health_warn = 'MDS_CACHE_OVERSIZED'
self.config_set('mds', 'mds_cache_memory_limit', '1K')
self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
active_mds_id = self.fs.get_active_names()[0]
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, active_mds_id)
# actual testing begins now.
errmsg = 'mds_cache_oversized'
self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
def test_with_health_warn_trim(self):
'''
Test that, when the health warning MDS_TRIM is present for an MDS, the
command "ceph fs fail" fails without the confirmation flag and passes
when the confirmation flag is passed.
'''
health_warn = 'MDS_TRIM'
# for generating health warning MDS_TRIM
self.config_set('mds', 'mds_debug_subtrees', 'true')
# this will drastically slow down trimming, so that MDS_TRIM stays
# around for longer.
self.config_set('mds', 'mds_log_trim_decay_rate', '60')
self.config_set('mds', 'mds_log_trim_threshold', '1')
active_mds_id = self.fs.get_active_names()[0]
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, active_mds_id)
# actual testing begins now.
errmsg = 'mds_trim'
self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
def test_with_health_warn_with_2_active_MDSs(self):
'''
Test that, when a CephFS has 2 active MDSs and one of them has either
the health warning MDS_TRIM or MDS_CACHE_OVERSIZED, running "ceph fs fail"
fails without the confirmation flag and passes when the confirmation flag
is passed.
'''
health_warn = 'MDS_CACHE_OVERSIZED'
self.fs.set_max_mds(2)
self.config_set('mds', 'mds_cache_memory_limit', '1K')
self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
self.fs.wait_for_daemons()
mds1_id, mds2_id = self.fs.get_active_names()
self.mount_a.open_n_background('.', 400)
# wait for the health warning to be raised against one of the active MDSs.
self.wait_till_health_warn(health_warn, mds1_id)
# actual testing begins now.
errmsg = 'mds_cache_oversized'
self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
class TestMDSFail(TestAdminCommands):
MDSS_REQUIRED = 2
CLIENTS_REQUIRED = 1
def test_with_health_warn_oversize_cache(self):
'''
Test that, when the health warning MDS_CACHE_OVERSIZED is present for an
MDS, the command "ceph mds fail" fails without the confirmation flag and
passes when the confirmation flag is passed.
'''
health_warn = 'MDS_CACHE_OVERSIZED'
self.config_set('mds', 'mds_cache_memory_limit', '1K')
self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
active_mds_id = self.fs.get_active_names()[0]
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, active_mds_id)
# actual testing begins now.
errmsg = 'mds_cache_oversized'
self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')
def test_with_health_warn_trim(self):
'''
Test that, when the health warning MDS_TRIM is present for an MDS, the
command "ceph mds fail" fails without the confirmation flag and passes
when the confirmation flag is passed.
'''
health_warn = 'MDS_TRIM'
# for generating health warning MDS_TRIM
self.config_set('mds', 'mds_debug_subtrees', 'true')
# this will drastically slow down trimming, so that MDS_TRIM stays
# around for longer.
self.config_set('mds', 'mds_log_trim_decay_rate', '60')
self.config_set('mds', 'mds_log_trim_threshold', '1')
active_mds_id = self.fs.get_active_names()[0]
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, active_mds_id)
# actual testing begins now...
errmsg = 'mds_trim'
self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')
def test_with_health_warn_with_2_active_MDSs(self):
'''
Test that, when a CephFS has 2 active MDSs and one of them has either
the health warning MDS_TRIM or MDS_CACHE_OVERSIZED, running "ceph mds fail"
fails for both MDSs without the confirmation flag and passes for both when
the confirmation flag is passed.
'''
health_warn = 'MDS_CACHE_OVERSIZED'
self.fs.set_max_mds(2)
self.config_set('mds', 'mds_cache_memory_limit', '1K')
self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
self.fs.wait_for_daemons()
mds1_id, mds2_id = self.fs.get_active_names()
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, mds1_id)
health_report = json.loads(self.get_ceph_cmd_stdout('health detail '
'--format json'))
# MDS ID for which health warning has been generated.
hw_mds_id = self._get_unhealthy_mds_id(health_report, health_warn)
if mds1_id == hw_mds_id:
non_hw_mds_id = mds2_id
elif mds2_id == hw_mds_id:
non_hw_mds_id = mds1_id
else:
raise RuntimeError('There are only 2 MDSs right now but apparently '
'the health warning was raised for an MDS other '
'than these two. This is definitely an error.')
# actual testing begins now...
errmsg = 'mds_cache_oversized'
self.negtest_ceph_cmd(args=f'mds fail {non_hw_mds_id}', retval=1,
errmsgs=errmsg)
self.negtest_ceph_cmd(args=f'mds fail {hw_mds_id}', retval=1,
errmsgs=errmsg)
self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it')
self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it')

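The detail-message format that _get_unhealthy_mds_id() above relies on can be checked in isolation; a small sketch using the sample message quoted in that helper's comment::

    # Sample 'ceph health detail' message for MDS_TRIM, as quoted above.
    msg = ('mds.b(mds.0): Behind on trimming (865/10) max_segments: 10, '
           'num_segments: 86')

    # Strip everything from the first '(' and drop the 'mds.' prefix,
    # leaving just the daemon name, exactly as _get_unhealthy_mds_id() does.
    mds_id = msg.split('(')[0].replace('mds.', '')
    assert mds_id == 'b'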
@ -108,6 +108,15 @@ class FailHandler : public FileSystemCommandHandler
return -ENOENT;
}
bool confirm = false;
cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
if (!confirm &&
mon->mdsmon()->has_health_warnings({
MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) {
ss << errmsg_for_unhealthy_mds;
return -EPERM;
}
auto f = [](auto&& fs) {
fs.get_mds_map().set_flag(CEPH_MDSMAP_NOT_JOINABLE);
};

@ -91,4 +91,12 @@ public:
std::ostream &ss) = 0;
};
static constexpr auto errmsg_for_unhealthy_mds = \
"MDS has one of two health warnings which could extend recovery: "
"MDS_TRIM or MDS_CACHE_OVERSIZED. MDS failover is not recommended "
"since it might cause unexpected file system unavailability. If "
"you wish to proceed, pass --yes-i-really-mean-it";
#endif

@ -1491,6 +1491,23 @@ out:
}
}
bool MDSMonitor::has_health_warnings(vector<mds_metric_t> warnings)
{
for (auto& [gid, health] : pending_daemon_health) {
for (auto& metric : health.metrics) {
// metric.type here is the type of the health warning. We are only
// looking for the types of health warnings passed to this member
// function through the parameter "warnings".
auto it = std::find(warnings.begin(), warnings.end(), metric.type);
if (it != warnings.end()) {
return true;
}
}
}
return false;
}
int MDSMonitor::filesystem_command(
FSMap &fsmap,
MonOpRequestRef op,
@ -1528,6 +1545,8 @@ int MDSMonitor::filesystem_command(
} else if (prefix == "mds fail") {
string who;
cmd_getval(cmdmap, "role_or_gid", who);
bool confirm = false;
cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
MDSMap::mds_info_t failed_info;
mds_gid_t gid = gid_from_arg(fsmap, who, ss);
@ -1547,6 +1566,12 @@ int MDSMonitor::filesystem_command(
return -EPERM;
}
if (!confirm &&
has_health_warnings({MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) {
ss << errmsg_for_unhealthy_mds;
return -EPERM;
}
r = fail_mds(fsmap, ss, who, &failed_info);
if (r < 0 && r == -EAGAIN) {
mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));

@ -20,6 +20,7 @@
#include <map>
#include <set>
#include <vector>
#include "include/types.h"
#include "PaxosFSMap.h"
@ -51,6 +52,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
bool preprocess_query(MonOpRequestRef op) override; // true if processed.
bool prepare_update(MonOpRequestRef op) override;
bool should_propose(double& delay) override;
bool has_health_warnings(std::vector<mds_metric_t> warnings);
bool should_print_status() const {
auto& fs = get_fsmap();

@ -319,7 +319,8 @@ COMMAND_WITH_FLAG("mds set_state "
"name=gid,type=CephInt,range=0 "
"name=state,type=CephInt,range=0|20",
"set mds state of <gid> to <numeric-state>", "mds", "rw", FLAG(HIDDEN))
COMMAND("mds fail name=role_or_gid,type=CephString",
COMMAND("mds fail name=role_or_gid,type=CephString "
"name=yes_i_really_mean_it,type=CephBool,req=false",
"Mark MDS failed: trigger a failover if a standby is available",
"mds", "rw")
COMMAND("mds repaired name=role,type=CephString",
@ -355,7 +356,8 @@ COMMAND("fs new "
"make new filesystem using named pools <metadata> and <data>",
"fs", "rw")
COMMAND("fs fail "
"name=fs_name,type=CephString ",
"name=fs_name,type=CephString "
"name=yes_i_really_mean_it,type=CephBool,req=false",
"bring the file system down and all of its ranks",
"fs", "rw")
COMMAND("fs rm "