mds: recall caps from quiescent sessions

This introduces two new config options [1,2] that dictate when a session
is considered quiescent by the MDS. (Options are documented fully in
options.cc.) When a session is quiescent, the MDS will preemptively
recall caps to reduce the number of outstanding capabilities, which
reduces the work required during failover.

[1] mds_session_cache_liveness_magnitude
[2] mds_session_cache_liveness_decay_rate

Fixes: https://tracker.ceph.com/issues/22446
Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
Patrick Donnelly 2019-08-26 14:39:30 -07:00
parent 63f9448947
commit 740f6f99a1
No known key found for this signature in database
GPG Key ID: 3A2A7E25BEA8AADB
7 changed files with 40 additions and 3 deletions

View File

@ -7823,6 +7823,18 @@ std::vector<Option> get_mds_options() {
.set_default(60.0)
.set_description("decay rate for warning on slow session cap recall"),
Option("mds_session_cache_liveness_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.add_see_also("mds_session_cache_liveness_magnitude")
.set_default(5_min)
.set_description("decay rate for session liveness leading to preemptive cap recall")
.set_long_description("This determines how long a session needs to be quiescent before the MDS begins preemptively recalling capabilities. The default of 5 minutes will cause 10 halvings of the decay counter after 1 hour, or 1/1024. The default magnitude of 10 (2^10 or 1024) is chosen so that the MDS considers a previously chatty session (approximately) to be quiescent after 1 hour."),
Option("mds_session_cache_liveness_magnitude", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.add_see_also("mds_session_cache_liveness_decay_rate")
.set_default(10)
.set_description("decay magnitude for preemptively recalling caps on quiet client")
.set_long_description("This is the order of magnitude difference (in base 2) of the internal liveness decay counter and the number of capabilities the session holds. When this difference occurs, the MDS treats the session as quiescent and begins recalling capabilities."),
Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(30)
.set_description(""),

View File

@ -175,7 +175,8 @@ MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
trim_client_leases();
trim();
check_memory_usage();
mds->server->recall_client_state(nullptr, Server::RecallFlags::ENFORCE_MAX);
auto flags = Server::RecallFlags::ENFORCE_MAX|Server::RecallFlags::ENFORCE_LIVENESS;
mds->server->recall_client_state(nullptr, flags);
upkeep_last_trim = clock::now();
} else {
dout(10) << "cache not ready for trimming" << dendl;

View File

@ -3671,6 +3671,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
"mds_recall_max_decay_rate",
"mds_recall_warning_decay_rate",
"mds_request_load_average_decay_rate",
"mds_session_cache_liveness_decay_rate",
NULL
};
return KEYS;

View File

@ -1570,6 +1570,7 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
const auto now = clock::now();
const bool steady = !!(flags&RecallFlags::STEADY);
const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
const bool trim = !!(flags&RecallFlags::TRIM);
const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
@ -1577,6 +1578,7 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
dout(7) << __func__ << ":"
<< " min=" << min_caps_per_client
@ -1587,9 +1589,10 @@ std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather,
/* trim caps of sessions with the most caps first */
std::multimap<uint64_t, Session*> caps_session;
auto f = [&caps_session, enforce_max, trim, max_caps_per_client](auto& s) {
auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
auto num_caps = s->caps.size();
if (trim || (enforce_max && num_caps > max_caps_per_client)) {
auto cache_liveness = s->get_session_cache_liveness();
if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
}
};

View File

@ -174,6 +174,7 @@ public:
STEADY = (1<<0),
ENFORCE_MAX = (1<<1),
TRIM = (1<<2),
ENFORCE_LIVENESS = (1<<3),
};
std::pair<bool, uint64_t> recall_client_state(MDSGatherBuilder* gather, RecallFlags=RecallFlags::NONE);
void force_clients_readonly();

View File

@ -588,6 +588,7 @@ void Session::dump(Formatter *f) const
f->dump_object("release_caps", release_caps);
f->dump_object("recall_caps_throttle", recall_caps_throttle);
f->dump_object("recall_caps_throttle2o", recall_caps_throttle2o);
f->dump_object("session_cache_liveness", session_cache_liveness);
info.dump(f);
}
@ -1072,6 +1073,14 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
};
apply_to_open_sessions(mut);
}
if (changed.count("mds_session_cache_liveness_decay_rate")) {
auto d = g_conf().get_val<double>("mds_session_cache_liveness_decay_rate");
auto mut = [d](auto s) {
s->session_cache_liveness = DecayCounter(d);
s->session_cache_liveness.hit(s->caps.size()); /* so the MDS doesn't immediately start trimming a new session */
};
apply_to_open_sessions(mut);
}
}
void SessionMap::update_average_session_age() {

View File

@ -128,6 +128,9 @@ private:
// New limit in SESSION_RECALL
uint32_t recall_limit = 0;
// session caps liveness
DecayCounter session_cache_liveness;
// session start time -- used to track average session time
// note that this is initialized in the constructor rather
// than at the time of adding a session to the sessionmap
@ -204,6 +207,9 @@ public:
// Accessor: returns whatever release_caps.get() currently reports.
auto get_release_caps() const {
return release_caps.get();
}
// Accessor: returns the current reading of the session_cache_liveness
// DecayCounter, which is bumped each time a cap or lease is touched.
auto get_session_cache_liveness() const {
return session_cache_liveness.get();
}
inodeno_t next_ino() const {
if (info.prealloc_inos.empty())
@ -306,14 +312,17 @@ public:
}
// Records one hit on the session's cache-liveness counter, then links the
// capability's session list item at the front of this session's caps list.
void touch_cap(Capability *cap) {
session_cache_liveness.hit(1.0);
caps.push_front(&cap->item_session_caps);
}
// Same as touch_cap, except the capability's list item is linked at the
// back of the caps list. The liveness counter is still hit once.
void touch_cap_bottom(Capability *cap) {
session_cache_liveness.hit(1.0);
caps.push_back(&cap->item_session_caps);
}
// Records one hit on the session's cache-liveness counter, then links the
// lease's session list item at the back of this session's leases list.
void touch_lease(ClientLease *r) {
session_cache_liveness.hit(1.0);
leases.push_back(&r->item_session_lease);
}
@ -412,6 +421,7 @@ public:
release_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
recall_caps_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
recall_caps_throttle2o(0.5),
session_cache_liveness(g_conf().get_val<double>("mds_session_cache_liveness_decay_rate")),
birth_time(clock::now()),
auth_caps(g_ceph_context),
item_session_list(this),