Merge pull request #56066 from rishabh-d-dave/mds-fail-confirm

mon,cephfs: require confirmation flag to bring down unhealthy MDS

Reviewed-by: Leonid Usov <leonid.usov@ibm.com>
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
Rishabh Dave 2024-05-03 13:13:34 +05:30 committed by GitHub
commit 18c7799cce
11 changed files with 280 additions and 7 deletions

@ -178,6 +178,11 @@ CephFS: Disallow delegating preallocated inode ranges to clients. Config
and it can be disabled by using:
`ceph config set mgr mgr/volumes/snapshot_clone_no_wait false`
* CephFS: The commands "ceph mds fail" and "ceph fs fail" now require a
confirmation flag when some MDSs exhibit the health warning MDS_TRIM or
MDS_CACHE_OVERSIZED. This is to prevent an accidental MDS failover from
causing further delays in recovery.
>=18.0.0
* The RGW policy parser now rejects unknown principals by default. If you are

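For operators and scripts, the practical effect of the CephFS note above is that these two commands can now return EPERM unless the flag is supplied while one of those warnings is active. A minimal sketch of driving "ceph mds fail" from Python (the helper name and the use of subprocess are illustrative, not part of this change)::

    import subprocess

    def mds_fail(role_or_name: str, confirm: bool = False) -> None:
        # Without the flag, the monitor now refuses (EPERM, non-zero exit)
        # while the target MDS has MDS_TRIM or MDS_CACHE_OVERSIZED.
        cmd = ['ceph', 'mds', 'fail', role_or_name]
        if confirm:
            cmd.append('--yes-i-really-mean-it')
        subprocess.run(cmd, check=True)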
@ -117,4 +117,11 @@ the following method.
$ sudo rm -rf /var/lib/ceph/mds/ceph-${id}
.. note:: When an active MDS has either the health warning MDS_TRIM or
MDS_CACHE_OVERSIZED, the confirmation flag (``--yes-i-really-mean-it``)
must be passed, otherwise the command will fail. It is not recommended to
restart an MDS which has these warnings, since slow recovery at restart may
lead to more problems.
.. _MDS Config Reference: ../mds-config-ref

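Whether these warnings are currently raised can be checked from the same health report the tests in this change consume; a minimal sketch, assuming the "ceph" CLI is on PATH::

    import json
    import subprocess

    def mds_fail_needs_confirmation() -> bool:
        # 'ceph health detail --format json' lists active health checks
        # under the 'checks' key; the confirmation flag is only needed
        # when one of these two warnings is present.
        out = subprocess.check_output(
            ['ceph', 'health', 'detail', '--format', 'json'])
        checks = json.loads(out).get('checks', {})
        return any(w in checks for w in ('MDS_TRIM', 'MDS_CACHE_OVERSIZED'))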
@ -193,7 +193,11 @@ file system and MDS daemons down, use the ``ceph fs fail`` command:
::
ceph fs fail <fs_name>
ceph fs fail <fs_name> {--yes-i-really-mean-it}
.. note:: The confirmation flag is optional: it is only required when the
MDS is active and has the health warning MDS_TRIM or MDS_CACHE_OVERSIZED.
This command sets a file system flag to prevent standbys from
activating on the file system (the ``joinable`` flag).
@ -210,7 +214,11 @@ respawn as standbys. The file system will be left in a degraded state.
::
# For all ranks, 0-N:
ceph mds fail <fs_name>:<n>
ceph mds fail <fs_name>:<n> {--yes-i-really-mean-it}
.. note:: The confirmation flag is optional: it is only required when the
MDS is active and has the health warning MDS_TRIM or MDS_CACHE_OVERSIZED.
Once all ranks are inactive, the file system may also be deleted or left in
this state for other purposes (perhaps disaster recovery).

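Automation that brings a file system down can first try the plain command and add the flag only when the monitor refuses, which is the pattern adopted by the qa helper changed below; a hedged sketch using subprocess::

    import subprocess

    def fail_fs(fs_name: str) -> None:
        cmd = ['ceph', 'fs', 'fail', fs_name]
        try:
            subprocess.run(cmd, check=True)
        except subprocess.CalledProcessError:
            # Refused because an active MDS has MDS_TRIM or
            # MDS_CACHE_OVERSIZED; retry with the confirmation flag.
            subprocess.run(cmd + ['--yes-i-really-mean-it'], check=True)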
@ -5,6 +5,8 @@ overrides:
lockdep: true
log-ignorelist:
- missing required features
- \(MDS_CACHE_OVERSIZED\)
- \(MDS_TRIM\)
tasks:
- cephfs_test_runner:
fail_on_skip: false

@ -612,7 +612,12 @@ class FilesystemBase(MDSClusterBase):
self.run_ceph_cmd("fs", "reset", str(self.name), '--yes-i-really-mean-it')
def fail(self):
self.run_ceph_cmd("fs", "fail", str(self.name))
cmd = ["fs", "fail", str(self.name)]
try:
self.run_ceph_cmd(cmd)
except CommandFailedError:
cmd.append("--yes-i-really-mean-it")
self.run_ceph_cmd(cmd)
def set_flag(self, var, *args):
a = map(lambda x: str(x).lower(), args)
@ -1181,8 +1186,13 @@ class FilesystemBase(MDSClusterBase):
def rank_repaired(self, rank):
self.run_ceph_cmd("mds", "repaired", "{}:{}".format(self.id, rank))
def rank_fail(self, rank=0):
self.run_ceph_cmd("mds", "fail", "{}:{}".format(self.id, rank))
def rank_fail(self, rank=0, confirm=True):
cmd = f'mds fail {self.id}:{rank}'
try:
self.run_ceph_cmd(args=cmd)
except CommandFailedError:
cmd += ' --yes-i-really-mean-it'
self.run_ceph_cmd(args=cmd)
def rank_is_running(self, rank=0, status=None):
name = self.get_rank(rank=rank, status=status)['name']

@ -91,6 +91,38 @@ class TestAdminCommands(CephFSTestCase):
if overwrites:
self.run_ceph_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true')
def _get_unhealthy_mds_id(self, health_report, health_warn):
'''
Return the ID of the MDS for which the health warning in "health_warn"
has been generated.
'''
# "msg" should hold a string like this -
# 'mds.b(mds.0): Behind on trimming (865/10) max_segments: 10,
# num_segments: 86'
msg = health_report['checks'][health_warn]['detail'][0]['message']
mds_id = msg.split('(')[0]
mds_id = mds_id.replace('mds.', '')
return mds_id
def wait_till_health_warn(self, health_warn, active_mds_id, sleep=3,
tries=10):
errmsg = (f'Expected health warning "{health_warn}" to eventually '
'show up in output of command "ceph health detail". Tried '
f'{tries} times with interval of {sleep} seconds but the '
'health warning didn\'t turn up.')
with safe_while(sleep=sleep, tries=tries, action=errmsg) as proceed:
while proceed():
self.get_ceph_cmd_stdout(
f'tell mds.{active_mds_id} cache status')
health_report = json.loads(self.get_ceph_cmd_stdout(
'health detail --format json'))
if health_warn in health_report['checks']:
return
@classhook('_add_valid_tell')
class TestValidTell(TestAdminCommands):
@classmethod
@ -2154,3 +2186,166 @@ class TestPermErrMsg(CephFSTestCase):
args=(f'fs authorize {self.fs.name} {self.CLIENT_NAME} / '
f'{wrong_perm}'), retval=self.EXPECTED_ERRNO,
errmsgs=self.EXPECTED_ERRMSG)
class TestFSFail(TestAdminCommands):
MDSS_REQUIRED = 2
CLIENTS_REQUIRED = 1
def test_with_health_warn_oversize_cache(self):
'''
Test that, when the health warning MDS_CACHE_OVERSIZED is present for an
MDS, the command "ceph fs fail" fails without the confirmation flag and
passes when the confirmation flag is passed.
'''
health_warn = 'MDS_CACHE_OVERSIZED'
self.config_set('mds', 'mds_cache_memory_limit', '1K')
self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
active_mds_id = self.fs.get_active_names()[0]
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, active_mds_id)
# actual testing begins now.
errmsg = 'mds_cache_oversized'
self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
def test_with_health_warn_trim(self):
'''
Test that, when the health warning MDS_TRIM is present for an MDS, the
command "ceph fs fail" fails without the confirmation flag and passes
when the confirmation flag is passed.
'''
health_warn = 'MDS_TRIM'
# for generating health warning MDS_TRIM
self.config_set('mds', 'mds_debug_subtrees', 'true')
# this will drastically slow down trimming, so that MDS_TRIM stays
# around for longer.
self.config_set('mds', 'mds_log_trim_decay_rate', '60')
self.config_set('mds', 'mds_log_trim_threshold', '1')
active_mds_id = self.fs.get_active_names()[0]
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, active_mds_id)
# actual testing begins now.
errmsg = 'mds_trim'
self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
def test_with_health_warn_with_2_active_MDSs(self):
'''
Test that, when a CephFS has 2 active MDSs and one of them has either
the health warning MDS_TRIM or MDS_CACHE_OVERSIZED, running "ceph fs fail"
fails without the confirmation flag and passes when the confirmation flag
is passed.
'''
health_warn = 'MDS_CACHE_OVERSIZED'
self.fs.set_max_mds(2)
self.config_set('mds', 'mds_cache_memory_limit', '1K')
self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
self.fs.wait_for_daemons()
mds1_id, mds2_id = self.fs.get_active_names()
self.mount_a.open_n_background('.', 400)
# wait for the health warning to be raised against one of the active MDSs.
self.wait_till_health_warn(health_warn, mds1_id)
# actual testing begins now.
errmsg = 'mds_cache_oversized'
self.negtest_ceph_cmd(args=f'fs fail {self.fs.name}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'fs fail {self.fs.name} --yes-i-really-mean-it')
class TestMDSFail(TestAdminCommands):
MDSS_REQUIRED = 2
CLIENTS_REQUIRED = 1
def test_with_health_warn_oversize_cache(self):
'''
Test that, when the health warning MDS_CACHE_OVERSIZED is present for an
MDS, the command "ceph mds fail" fails without the confirmation flag and
passes when the confirmation flag is passed.
'''
health_warn = 'MDS_CACHE_OVERSIZED'
self.config_set('mds', 'mds_cache_memory_limit', '1K')
self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
active_mds_id = self.fs.get_active_names()[0]
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, active_mds_id)
# actual testing begins now.
errmsg = 'mds_cache_oversized'
self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')
def test_with_health_warn_trim(self):
'''
Test that, when the health warning MDS_TRIM is present for an MDS, the
command "ceph mds fail" fails without the confirmation flag and passes
when the confirmation flag is passed.
'''
health_warn = 'MDS_TRIM'
# for generating health warning MDS_TRIM
self.config_set('mds', 'mds_debug_subtrees', 'true')
# this will drastically slow down trimming, so that MDS_TRIM stays
# around for longer.
self.config_set('mds', 'mds_log_trim_decay_rate', '60')
self.config_set('mds', 'mds_log_trim_threshold', '1')
active_mds_id = self.fs.get_active_names()[0]
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, active_mds_id)
# actual testing begins now...
errmsg = 'mds_trim'
self.negtest_ceph_cmd(args=f'mds fail {active_mds_id}',
retval=1, errmsgs=errmsg)
self.run_ceph_cmd(f'mds fail {active_mds_id} --yes-i-really-mean-it')
def test_with_health_warn_with_2_active_MDSs(self):
'''
Test that, when a CephFS has 2 active MDSs and one of them has either
the health warning MDS_TRIM or MDS_CACHE_OVERSIZED, running "ceph mds fail"
fails for both MDSs without the confirmation flag and passes for both when
the confirmation flag is passed.
'''
health_warn = 'MDS_CACHE_OVERSIZED'
self.fs.set_max_mds(2)
self.config_set('mds', 'mds_cache_memory_limit', '1K')
self.config_set('mds', 'mds_health_cache_threshold', '1.00000')
self.fs.wait_for_daemons()
mds1_id, mds2_id = self.fs.get_active_names()
self.mount_a.open_n_background('.', 400)
self.wait_till_health_warn(health_warn, mds1_id)
health_report = json.loads(self.get_ceph_cmd_stdout('health detail '
'--format json'))
# MDS ID for which health warning has been generated.
hw_mds_id = self._get_unhealthy_mds_id(health_report, health_warn)
if mds1_id == hw_mds_id:
non_hw_mds_id = mds2_id
elif mds2_id == hw_mds_id:
non_hw_mds_id = mds1_id
else:
raise RuntimeError('There are only 2 MDSs right now but apparently '
'the health warning was raised for an MDS other '
'than these two. This is definitely an error.')
# actual testing begins now...
errmsg = 'mds_cache_oversized'
self.negtest_ceph_cmd(args=f'mds fail {non_hw_mds_id}', retval=1,
errmsgs=errmsg)
self.negtest_ceph_cmd(args=f'mds fail {hw_mds_id}', retval=1,
errmsgs=errmsg)
self.run_ceph_cmd(f'mds fail {mds1_id} --yes-i-really-mean-it')
self.run_ceph_cmd(f'mds fail {mds2_id} --yes-i-really-mean-it')

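The detail-message format that _get_unhealthy_mds_id() above relies on can be checked in isolation; a small sketch using the sample message quoted in that helper's comment::

    # Sample 'ceph health detail' message for MDS_TRIM, as quoted above.
    msg = ('mds.b(mds.0): Behind on trimming (865/10) max_segments: 10, '
           'num_segments: 86')

    # Strip everything from the first '(' and drop the 'mds.' prefix,
    # leaving just the daemon name, exactly as _get_unhealthy_mds_id() does.
    mds_id = msg.split('(')[0].replace('mds.', '')
    assert mds_id == 'b'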
@ -108,6 +108,15 @@ class FailHandler : public FileSystemCommandHandler
return -ENOENT;
}
bool confirm = false;
cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
if (!confirm &&
mon->mdsmon()->has_health_warnings({
MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) {
ss << errmsg_for_unhealthy_mds;
return -EPERM;
}
auto f = [](auto&& fs) {
fs.get_mds_map().set_flag(CEPH_MDSMAP_NOT_JOINABLE);
};

@ -91,4 +91,12 @@ public:
std::ostream &ss) = 0;
};
static constexpr auto errmsg_for_unhealthy_mds = \
"MDS has one of two health warnings which could extend recovery: "
"MDS_TRIM or MDS_CACHE_OVERSIZED. MDS failover is not recommended "
"since it might cause unexpected file system unavailability. If "
"you wish to proceed, pass --yes-i-really-mean-it";
#endif

@ -1491,6 +1491,23 @@ out:
}
}
bool MDSMonitor::has_health_warnings(vector<mds_metric_t> warnings)
{
for (auto& [gid, health] : pending_daemon_health) {
for (auto& metric : health.metrics) {
// metric.type here is the type of the health warning. We are only
// looking for the types of health warnings passed to this member
// function through the parameter "warnings".
auto it = std::find(warnings.begin(), warnings.end(), metric.type);
if (it != warnings.end()) {
return true;
}
}
}
return false;
}
int MDSMonitor::filesystem_command(
FSMap &fsmap,
MonOpRequestRef op,
@ -1528,6 +1545,8 @@ int MDSMonitor::filesystem_command(
} else if (prefix == "mds fail") {
string who;
cmd_getval(cmdmap, "role_or_gid", who);
bool confirm = false;
cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
MDSMap::mds_info_t failed_info;
mds_gid_t gid = gid_from_arg(fsmap, who, ss);
@ -1547,6 +1566,12 @@ int MDSMonitor::filesystem_command(
return -EPERM;
}
if (!confirm &&
has_health_warnings({MDS_HEALTH_TRIM, MDS_HEALTH_CACHE_OVERSIZED})) {
ss << errmsg_for_unhealthy_mds;
return -EPERM;
}
r = fail_mds(fsmap, ss, who, &failed_info);
if (r < 0 && r == -EAGAIN) {
mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));

@ -20,6 +20,7 @@
#include <map>
#include <set>
#include <vector>
#include "include/types.h"
#include "PaxosFSMap.h"
@ -51,6 +52,7 @@ class MDSMonitor : public PaxosService, public PaxosFSMap, protected CommandHand
bool preprocess_query(MonOpRequestRef op) override; // true if processed.
bool prepare_update(MonOpRequestRef op) override;
bool should_propose(double& delay) override;
bool has_health_warnings(std::vector<mds_metric_t> warnings);
bool should_print_status() const {
auto& fs = get_fsmap();

@ -319,7 +319,8 @@ COMMAND_WITH_FLAG("mds set_state "
"name=gid,type=CephInt,range=0 "
"name=state,type=CephInt,range=0|20",
"set mds state of <gid> to <numeric-state>", "mds", "rw", FLAG(HIDDEN))
COMMAND("mds fail name=role_or_gid,type=CephString",
COMMAND("mds fail name=role_or_gid,type=CephString "
"name=yes_i_really_mean_it,type=CephBool,req=false",
"Mark MDS failed: trigger a failover if a standby is available",
"mds", "rw")
COMMAND("mds repaired name=role,type=CephString",
@ -355,7 +356,8 @@ COMMAND("fs new "
"make new filesystem using named pools <metadata> and <data>",
"fs", "rw")
COMMAND("fs fail "
"name=fs_name,type=CephString ",
"name=fs_name,type=CephString "
"name=yes_i_really_mean_it,type=CephBool,req=false",
"bring the file system down and all of its ranks",
"fs", "rw")
COMMAND("fs rm "