Merge PR #52944 into main

* refs/pull/52944/head:
	PendingReleaseNotes: add a note for `mds_session_metadata_threshold` mds config
	test: add test to verify that a buggy client is blocklisted
	mds: add perf counter to track number of sessions evicted due to metadata threshold being exceeded
	mds: blocklist clients with "bloated" session metadata

Reviewed-by: Robin H. Johnson <robbat2@orbis-terrarum.net>
Reviewed-by: Dhairya Parmar <dparmar@redhat.com>
Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
Venky Shankar 2023-08-25 18:46:19 +05:30
commit e9f8be4bac
7 changed files with 138 additions and 16 deletions

View File

@ -174,3 +174,4 @@ parth-gr Parth Arora <paarora@redhat.com>
phlogistonjohn John Mulligan <jmulligan@redhat.com>
baergj Joshua Baergen <jbaergen@digitalocean.com>
zmc Zack Cerza <zack@redhat.com>
robbat2 Robin H. Johnson <robbat2@orbis-terrarum.net>

View File

@ -8,6 +8,10 @@
For multi-site deployments that make any use of Server-Side Encryption, we
recommended running this command against every bucket in every zone after all
zones have upgraded.
* CEPHFS: MDS evicts clients which are not advancing their request tids which causes
a large buildup of session metadata resulting in the MDS going read-only due to
the RADOS operation exceeding the size threshold. `mds_session_metadata_threshold`
config controls the maximum size that a (encoded) session metadata can grow.
>=18.0.0

View File

@ -9,6 +9,7 @@ from textwrap import dedent
from tasks.ceph_test_case import TestTimeoutError
from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
from tasks.cephfs.fuse_mount import FuseMount
from teuthology.exceptions import CommandFailedError
import os
from io import StringIO
@ -242,6 +243,56 @@ class TestClientLimits(CephFSTestCase):
self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
rproc.wait()
def test_client_blocklisted_oldest_tid(self):
"""
that a client is blocklisted when its encoded session metadata exceeds the
configured threshold (due to ever growing `completed_requests` caused due
to an unidentified bug (in the client or the MDS)).
"""
# num of requests client issues
max_requests = 10000
# The debug hook to inject the failure only exists in the fuse client
if not isinstance(self.mount_a, FuseMount):
self.skipTest("Require FUSE client to inject client release failure")
self.config_set('client', 'client inject fixed oldest tid', 'true')
self.mount_a.teardown()
self.mount_a.mount_wait()
self.config_set('mds', 'mds_max_completed_requests', max_requests);
# Create lots of files
self.mount_a.create_n_files("testdir/file1", max_requests + 100)
# Create a few files synchronously. This makes sure previous requests are completed
self.mount_a.create_n_files("testdir/file2", 5, True)
# Wait for the health warnings. Assume mds can handle 10 request per second at least
self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id))
# why a multiplier of 20, you may ask - I arrieved at this from some debugs
# that I put when testing the fix in a vstart cluster where its a ratio of
# encoded session information to the number of completed requests.
self.config_set('mds', 'mds_session_metadata_threshold', max_requests*20);
# Create a few more files synchronously. This would hit the session metadata threshold
# causing the client to get blocklisted.
with self.assertRaises(CommandFailedError):
self.mount_a.create_n_files("testdir/file2", 20, True)
self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr())
# the mds should bump up the relevant perf counter
pd = self.perf_dump()
self.assertGreater(pd['mds_sessions']['md_thresh_evicted'], 0)
# reset the config
self.config_set('client', 'client inject fixed oldest tid', 'false')
self.mount_a.kill_cleanup()
self.mount_a.mount_wait()
def test_client_oldest_tid(self):
"""
When a client does not advance its oldest tid, the MDS should notice that

View File

@ -1577,3 +1577,13 @@ options:
- mds
flags:
- runtime
- name: mds_session_metadata_threshold
type: size
level: advanced
desc: Evict non-advancing client-tid sessions exceeding the config size.
long_desc: Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow.
default: 16_M
services:
- mds
flags:
- runtime

View File

@ -3867,6 +3867,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
"mds_session_cap_acquisition_throttle",
"mds_session_max_caps_throttle_ratio",
"mds_symlink_recovery",
"mds_session_metadata_threshold",
NULL
};
return KEYS;

View File

@ -45,6 +45,11 @@ class SessionMapIOContext : public MDSIOContextBase
};
};
SessionMap::SessionMap(MDSRank *m)
: mds(m),
mds_session_metadata_threshold(g_conf().get_val<Option::size_t>("mds_session_metadata_threshold")) {
}
void SessionMap::register_perfcounters()
{
PerfCountersBuilder plb(g_ceph_context, "mds_sessions",
@ -66,6 +71,8 @@ void SessionMap::register_perfcounters()
plb.add_u64(l_mdssm_avg_load, "average_load", "Average Load");
plb.add_u64(l_mdssm_avg_session_uptime, "avg_session_uptime",
"Average session uptime");
plb.add_u64(l_mdssm_metadata_threshold_sessions_evicted, "mdthresh_evicted",
"Sessions evicted on reaching metadata threshold");
logger = plb.create_perf_counters();
g_ceph_context->get_perfcounters_collection()->add(logger);
@ -375,6 +382,11 @@ public:
};
}
bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) {
session->info.encode(bl, mds->mdsmap->get_up_features());
return bl.length() < mds_session_metadata_threshold;
}
void SessionMap::save(MDSContext *onsave, version_t needv)
{
dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
@ -410,6 +422,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
dout(20) << " updating keys:" << dendl;
map<string, bufferlist> to_set;
std::set<entity_name_t> to_blocklist;
for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
i != dirty_sessions.end(); ++i) {
const entity_name_t name = *i;
@ -420,13 +433,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
session->is_stale() ||
session->is_killing()) {
dout(20) << " " << name << dendl;
// Serialize K
CachedStackStringStream css;
*css << name;
// Serialize V
bufferlist bl;
session->info.encode(bl, mds->mdsmap->get_up_features());
if (!validate_and_encode_session(mds, session, bl)) {
derr << __func__ << ": session (" << name << ") exceeds"
<< " sesion metadata threshold - blocklisting" << dendl;
to_blocklist.emplace(name);
continue;
}
// Serialize K
CachedStackStringStream css;
*css << name;
// Add to RADOS op
to_set[std::string(css->strv())] = bl;
@ -461,6 +480,8 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
0,
new C_OnFinisher(new C_IO_SM_Save(this, version),
mds->finisher));
apply_blocklist(to_blocklist);
logger->inc(l_mdssm_metadata_threshold_sessions_evicted, to_blocklist.size());
}
void SessionMap::_save_finish(version_t v)
@ -826,7 +847,8 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
{
ceph_assert(gather_bld != NULL);
std::vector<entity_name_t> write_sessions;
std::set<entity_name_t> to_blocklist;
std::map<entity_name_t, bufferlist> write_sessions;
// Decide which sessions require a write
for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
@ -851,13 +873,24 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
// need to pre-empt that.
continue;
}
// Serialize V
bufferlist bl;
if (!validate_and_encode_session(mds, session, bl)) {
derr << __func__ << ": session (" << session_id << ") exceeds"
<< " sesion metadata threshold - blocklisting" << dendl;
to_blocklist.emplace(session_id);
continue;
}
// Okay, passed all our checks, now we write
// this session out. The version we write
// into the OMAP may now be higher-versioned
// than the version in the header, but that's
// okay because it's never a problem to have
// an overly-fresh copy of a session.
write_sessions.push_back(*i);
write_sessions.emplace(session_id, std::move(bl));
session->clear_dirty_completed_requests();
}
dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;
@ -865,21 +898,15 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
// Batch writes into mds_sessionmap_keys_per_op
const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op;
map<string, bufferlist> to_set;
for (uint32_t i = 0; i < write_sessions.size(); ++i) {
const entity_name_t &session_id = write_sessions[i];
Session *session = session_map[session_id];
session->clear_dirty_completed_requests();
uint32_t i = 0;
for (auto &[session_id, bl] : write_sessions) {
// Serialize K
CachedStackStringStream css;
*css << session_id;
// Serialize V
bufferlist bl;
session->info.encode(bl, mds->mdsmap->get_up_features());
// Add to RADOS op
to_set[css->str()] = bl;
to_set[css->str()] = std::move(bl);
// Complete this write transaction?
if (i == write_sessions.size() - 1
@ -898,7 +925,11 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
new C_IO_SM_Save_One(this, on_safe),
mds->finisher));
}
++i;
}
apply_blocklist(to_blocklist);
logger->inc(l_mdssm_metadata_threshold_sessions_evicted, to_blocklist.size());
}
// =================
@ -1112,6 +1143,10 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
};
apply_to_open_sessions(mut);
}
if (changed.count("mds_session_metadata_threshold")) {
mds_session_metadata_threshold = g_conf().get_val<Option::size_t>("mds_session_metadata_threshold");
}
}
void SessionMap::update_average_session_age() {
@ -1123,6 +1158,20 @@ void SessionMap::update_average_session_age() {
logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime);
}
void SessionMap::apply_blocklist(const std::set<entity_name_t>& victims) {
if (victims.empty()) {
return;
}
C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
for (auto &victim : victims) {
CachedStackStringStream css;
mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css,
gather.new_sub());
}
gather.activate();
}
int SessionFilter::parse(
const std::vector<std::string> &args,
std::ostream *ss)

View File

@ -45,6 +45,7 @@ enum {
l_mdssm_total_load,
l_mdssm_avg_load,
l_mdssm_avg_session_uptime,
l_mdssm_metadata_threshold_sessions_evicted,
l_mdssm_last,
};
@ -594,7 +595,7 @@ protected:
class SessionMap : public SessionMapStore {
public:
SessionMap() = delete;
explicit SessionMap(MDSRank *m) : mds(m) {}
explicit SessionMap(MDSRank *m);
~SessionMap() override
{
@ -843,6 +844,11 @@ private:
}
time avg_birth_time = clock::zero();
size_t mds_session_metadata_threshold;
bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl);
void apply_blocklist(const std::set<entity_name_t>& victims);
};
std::ostream& operator<<(std::ostream &out, const Session &s);