mirror of
https://github.com/ceph/ceph
synced 2025-02-21 01:47:25 +00:00
mds: fold mds_revoke_cap_timeout into mds_session_timeout
Right now, we have two different timeout settings -- one for when the client is just not responding at all (mds_session_timeout), and one for when the client is otherwise responding but isn't returning caps in a timely fashion (mds_cap_revoke_timeout). The default settings on them are equivalent (60s), but only the mds_session_timeout is communicated via the mdsmap. The mds_cap_revoke_timeout is known only to the MDS. Neither timeout results in anything other than warnings in the current codebase. There is also a third setting (mds_session_autoclose) that is also communicated via the MDSmap. Exceeding that value (default of 300s) could eventually result in the client being blacklisted from the cluster. The code to implement that doesn't exist yet, however. The current codebase doesn't do any real sanity checking of these timeouts, so the potential for admins to get them wrong is rather high. It's hard to concoct a use-case where we'd want to warn about these events at different intervals. Simplify this by just removing the mds_cap_revoke_timeout setting, and replace its use in the code with the mds_session_timeout. With that, the client can at least determine when warnings might start showing up in the MDS' logs. Signed-off-by: Jeff Layton <jlayton@redhat.com>
This commit is contained in:
parent
d9c6a9129e
commit
3321cc7b37
@ -69,7 +69,7 @@ are like locks. Sometimes, for example when another client needs access,
|
||||
the MDS will request clients release their capabilities. If the client
|
||||
is unresponsive or buggy, it might fail to do so promptly or fail to do
|
||||
so at all. This message appears if a client has taken longer than
|
||||
``mds_revoke_cap_timeout`` (default 60s) to comply.
|
||||
``mds_session_timeout`` (default 60s) to comply.
|
||||
|
||||
Message: "Client *name* failing to respond to cache pressure"
|
||||
Code: MDS_HEALTH_CLIENT_RECALL, MDS_HEALTH_CLIENT_RECALL_MANY
|
||||
|
@ -134,10 +134,10 @@ class TestClientLimits(CephFSTestCase):
|
||||
# Client B tries to stat the file that client A created
|
||||
rproc = self.mount_b.write_background("file1")
|
||||
|
||||
# After mds_revoke_cap_timeout, we should see a health warning (extra lag from
|
||||
# After mds_session_timeout, we should see a health warning (extra lag from
|
||||
# MDS beacon period)
|
||||
mds_revoke_cap_timeout = float(self.fs.get_config("mds_revoke_cap_timeout"))
|
||||
self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_revoke_cap_timeout + 10)
|
||||
mds_session_timeout = float(self.fs.get_config("mds_session_timeout"))
|
||||
self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_session_timeout + 10)
|
||||
|
||||
# Client B should still be stuck
|
||||
self.assertFalse(rproc.finished)
|
||||
|
@ -435,12 +435,11 @@ OPTION(mds_beacon_grace, OPT_FLOAT)
|
||||
OPTION(mds_enforce_unique_name, OPT_BOOL)
|
||||
OPTION(mds_blacklist_interval, OPT_FLOAT) // how long to blacklist failed nodes
|
||||
|
||||
OPTION(mds_session_timeout, OPT_FLOAT) // cap bits and leases time out if client idle
|
||||
OPTION(mds_session_timeout, OPT_FLOAT) // cap bits and leases time out if client unresponsive or not returning its caps
|
||||
OPTION(mds_session_blacklist_on_timeout, OPT_BOOL) // whether to blacklist clients whose sessions are dropped due to timeout
|
||||
OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands
|
||||
|
||||
OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation?
|
||||
OPTION(mds_revoke_cap_timeout, OPT_FLOAT) // detect clients which aren't revoking caps
|
||||
OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps
|
||||
OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock
|
||||
OPTION(mds_session_autoclose, OPT_FLOAT) // autoclose idle session
|
||||
|
@ -5479,10 +5479,6 @@ std::vector<Option> get_mds_options() {
|
||||
.set_default(1024)
|
||||
.set_description(""),
|
||||
|
||||
Option("mds_revoke_cap_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
|
||||
.set_default(60)
|
||||
.set_description(""),
|
||||
|
||||
Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
|
||||
.set_default(60)
|
||||
.set_description(""),
|
||||
|
@ -3492,7 +3492,7 @@ void Locker::remove_client_cap(CInode *in, client_t client)
|
||||
|
||||
/**
|
||||
* Return true if any currently revoking caps exceed the
|
||||
* mds_revoke_cap_timeout threshold.
|
||||
* mds_session_timeout threshold.
|
||||
*/
|
||||
bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking) const
|
||||
{
|
||||
@ -3503,7 +3503,7 @@ bool Locker::any_late_revoking_caps(xlist<Capability*> const &revoking) const
|
||||
} else {
|
||||
utime_t now = ceph_clock_now();
|
||||
utime_t age = now - (*p)->get_last_revoke_stamp();
|
||||
if (age <= g_conf->mds_revoke_cap_timeout) {
|
||||
if (age <= g_conf->mds_session_timeout) {
|
||||
return false;
|
||||
} else {
|
||||
return true;
|
||||
@ -3548,8 +3548,8 @@ void Locker::caps_tick()
|
||||
|
||||
utime_t age = now - cap->get_last_revoke_stamp();
|
||||
dout(20) << __func__ << " age = " << age << cap->get_client() << "." << cap->get_inode()->ino() << dendl;
|
||||
if (age <= g_conf->mds_revoke_cap_timeout) {
|
||||
dout(20) << __func__ << " age below timeout " << g_conf->mds_revoke_cap_timeout << dendl;
|
||||
if (age <= g_conf->mds_session_timeout) {
|
||||
dout(20) << __func__ << " age below timeout " << g_conf->mds_session_timeout << dendl;
|
||||
break;
|
||||
} else {
|
||||
++i;
|
||||
@ -3560,7 +3560,7 @@ void Locker::caps_tick()
|
||||
}
|
||||
}
|
||||
// exponential backoff of warning intervals
|
||||
if (age > g_conf->mds_revoke_cap_timeout * (1 << cap->get_num_revoke_warnings())) {
|
||||
if (age > g_conf->mds_session_timeout * (1 << cap->get_num_revoke_warnings())) {
|
||||
cap->inc_num_revoke_warnings();
|
||||
stringstream ss;
|
||||
ss << "client." << cap->get_client() << " isn't responding to mclientcaps(revoke), ino "
|
||||
|
Loading…
Reference in New Issue
Block a user