mirror of https://github.com/ceph/ceph
Merge PR #52944 into main
* refs/pull/52944/head: PendingReleaseNotes: add a note for `mds_session_metadata_threshold` mds config test: add test to verify that a buggy client is blocklisted mds: add perf counter to track number of sessions evicted due to metadata threshold being exceeded mds: blocklist clients with "bloated" session metadata Reviewed-by: Robin H. Johnson <robbat2@orbis-terrarum.net> Reviewed-by: Dhairya Parmar <dparmar@redhat.com> Reviewed-by: Patrick Donnelly <pdonnell@redhat.com>
This commit is contained in:
commit
e9f8be4bac
|
@ -174,3 +174,4 @@ parth-gr Parth Arora <paarora@redhat.com>
|
|||
phlogistonjohn John Mulligan <jmulligan@redhat.com>
|
||||
baergj Joshua Baergen <jbaergen@digitalocean.com>
|
||||
zmc Zack Cerza <zack@redhat.com>
|
||||
robbat2 Robin H. Johnson <robbat2@orbis-terrarum.net>
|
||||
|
|
|
@ -8,6 +8,10 @@
|
|||
For multi-site deployments that make any use of Server-Side Encryption, we
|
||||
recommended running this command against every bucket in every zone after all
|
||||
zones have upgraded.
|
||||
* CEPHFS: MDS evicts clients which are not advancing their request tids which causes
|
||||
a large buildup of session metadata resulting in the MDS going read-only due to
|
||||
the RADOS operation exceeding the size threshold. `mds_session_metadata_threshold`
|
||||
config controls the maximum size that a (encoded) session metadata can grow.
|
||||
|
||||
>=18.0.0
|
||||
|
||||
|
|
|
@ -9,6 +9,7 @@ from textwrap import dedent
|
|||
from tasks.ceph_test_case import TestTimeoutError
|
||||
from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
|
||||
from tasks.cephfs.fuse_mount import FuseMount
|
||||
from teuthology.exceptions import CommandFailedError
|
||||
import os
|
||||
from io import StringIO
|
||||
|
||||
|
@ -242,6 +243,56 @@ class TestClientLimits(CephFSTestCase):
|
|||
self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
|
||||
rproc.wait()
|
||||
|
||||
def test_client_blocklisted_oldest_tid(self):
|
||||
"""
|
||||
that a client is blocklisted when its encoded session metadata exceeds the
|
||||
configured threshold (due to ever growing `completed_requests` caused due
|
||||
to an unidentified bug (in the client or the MDS)).
|
||||
"""
|
||||
|
||||
# num of requests client issues
|
||||
max_requests = 10000
|
||||
|
||||
# The debug hook to inject the failure only exists in the fuse client
|
||||
if not isinstance(self.mount_a, FuseMount):
|
||||
self.skipTest("Require FUSE client to inject client release failure")
|
||||
|
||||
self.config_set('client', 'client inject fixed oldest tid', 'true')
|
||||
self.mount_a.teardown()
|
||||
self.mount_a.mount_wait()
|
||||
|
||||
self.config_set('mds', 'mds_max_completed_requests', max_requests);
|
||||
|
||||
# Create lots of files
|
||||
self.mount_a.create_n_files("testdir/file1", max_requests + 100)
|
||||
|
||||
# Create a few files synchronously. This makes sure previous requests are completed
|
||||
self.mount_a.create_n_files("testdir/file2", 5, True)
|
||||
|
||||
# Wait for the health warnings. Assume mds can handle 10 request per second at least
|
||||
self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id))
|
||||
|
||||
# why a multiplier of 20, you may ask - I arrieved at this from some debugs
|
||||
# that I put when testing the fix in a vstart cluster where its a ratio of
|
||||
# encoded session information to the number of completed requests.
|
||||
self.config_set('mds', 'mds_session_metadata_threshold', max_requests*20);
|
||||
|
||||
# Create a few more files synchronously. This would hit the session metadata threshold
|
||||
# causing the client to get blocklisted.
|
||||
with self.assertRaises(CommandFailedError):
|
||||
self.mount_a.create_n_files("testdir/file2", 20, True)
|
||||
|
||||
self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr())
|
||||
# the mds should bump up the relevant perf counter
|
||||
pd = self.perf_dump()
|
||||
self.assertGreater(pd['mds_sessions']['md_thresh_evicted'], 0)
|
||||
|
||||
# reset the config
|
||||
self.config_set('client', 'client inject fixed oldest tid', 'false')
|
||||
|
||||
self.mount_a.kill_cleanup()
|
||||
self.mount_a.mount_wait()
|
||||
|
||||
def test_client_oldest_tid(self):
|
||||
"""
|
||||
When a client does not advance its oldest tid, the MDS should notice that
|
||||
|
|
|
@ -1577,3 +1577,13 @@ options:
|
|||
- mds
|
||||
flags:
|
||||
- runtime
|
||||
- name: mds_session_metadata_threshold
|
||||
type: size
|
||||
level: advanced
|
||||
desc: Evict non-advancing client-tid sessions exceeding the config size.
|
||||
long_desc: Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow.
|
||||
default: 16_M
|
||||
services:
|
||||
- mds
|
||||
flags:
|
||||
- runtime
|
||||
|
|
|
@ -3867,6 +3867,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const
|
|||
"mds_session_cap_acquisition_throttle",
|
||||
"mds_session_max_caps_throttle_ratio",
|
||||
"mds_symlink_recovery",
|
||||
"mds_session_metadata_threshold",
|
||||
NULL
|
||||
};
|
||||
return KEYS;
|
||||
|
|
|
@ -45,6 +45,11 @@ class SessionMapIOContext : public MDSIOContextBase
|
|||
};
|
||||
};
|
||||
|
||||
SessionMap::SessionMap(MDSRank *m)
|
||||
: mds(m),
|
||||
mds_session_metadata_threshold(g_conf().get_val<Option::size_t>("mds_session_metadata_threshold")) {
|
||||
}
|
||||
|
||||
void SessionMap::register_perfcounters()
|
||||
{
|
||||
PerfCountersBuilder plb(g_ceph_context, "mds_sessions",
|
||||
|
@ -66,6 +71,8 @@ void SessionMap::register_perfcounters()
|
|||
plb.add_u64(l_mdssm_avg_load, "average_load", "Average Load");
|
||||
plb.add_u64(l_mdssm_avg_session_uptime, "avg_session_uptime",
|
||||
"Average session uptime");
|
||||
plb.add_u64(l_mdssm_metadata_threshold_sessions_evicted, "mdthresh_evicted",
|
||||
"Sessions evicted on reaching metadata threshold");
|
||||
|
||||
logger = plb.create_perf_counters();
|
||||
g_ceph_context->get_perfcounters_collection()->add(logger);
|
||||
|
@ -375,6 +382,11 @@ public:
|
|||
};
|
||||
}
|
||||
|
||||
bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) {
|
||||
session->info.encode(bl, mds->mdsmap->get_up_features());
|
||||
return bl.length() < mds_session_metadata_threshold;
|
||||
}
|
||||
|
||||
void SessionMap::save(MDSContext *onsave, version_t needv)
|
||||
{
|
||||
dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl;
|
||||
|
@ -410,6 +422,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
|
|||
|
||||
dout(20) << " updating keys:" << dendl;
|
||||
map<string, bufferlist> to_set;
|
||||
std::set<entity_name_t> to_blocklist;
|
||||
for(std::set<entity_name_t>::iterator i = dirty_sessions.begin();
|
||||
i != dirty_sessions.end(); ++i) {
|
||||
const entity_name_t name = *i;
|
||||
|
@ -420,13 +433,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
|
|||
session->is_stale() ||
|
||||
session->is_killing()) {
|
||||
dout(20) << " " << name << dendl;
|
||||
// Serialize K
|
||||
CachedStackStringStream css;
|
||||
*css << name;
|
||||
|
||||
// Serialize V
|
||||
bufferlist bl;
|
||||
session->info.encode(bl, mds->mdsmap->get_up_features());
|
||||
if (!validate_and_encode_session(mds, session, bl)) {
|
||||
derr << __func__ << ": session (" << name << ") exceeds"
|
||||
<< " sesion metadata threshold - blocklisting" << dendl;
|
||||
to_blocklist.emplace(name);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Serialize K
|
||||
CachedStackStringStream css;
|
||||
*css << name;
|
||||
|
||||
// Add to RADOS op
|
||||
to_set[std::string(css->strv())] = bl;
|
||||
|
@ -461,6 +480,8 @@ void SessionMap::save(MDSContext *onsave, version_t needv)
|
|||
0,
|
||||
new C_OnFinisher(new C_IO_SM_Save(this, version),
|
||||
mds->finisher));
|
||||
apply_blocklist(to_blocklist);
|
||||
logger->inc(l_mdssm_metadata_threshold_sessions_evicted, to_blocklist.size());
|
||||
}
|
||||
|
||||
void SessionMap::_save_finish(version_t v)
|
||||
|
@ -826,7 +847,8 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
|
|||
{
|
||||
ceph_assert(gather_bld != NULL);
|
||||
|
||||
std::vector<entity_name_t> write_sessions;
|
||||
std::set<entity_name_t> to_blocklist;
|
||||
std::map<entity_name_t, bufferlist> write_sessions;
|
||||
|
||||
// Decide which sessions require a write
|
||||
for (std::set<entity_name_t>::iterator i = tgt_sessions.begin();
|
||||
|
@ -851,13 +873,24 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
|
|||
// need to pre-empt that.
|
||||
continue;
|
||||
}
|
||||
|
||||
// Serialize V
|
||||
bufferlist bl;
|
||||
if (!validate_and_encode_session(mds, session, bl)) {
|
||||
derr << __func__ << ": session (" << session_id << ") exceeds"
|
||||
<< " sesion metadata threshold - blocklisting" << dendl;
|
||||
to_blocklist.emplace(session_id);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Okay, passed all our checks, now we write
|
||||
// this session out. The version we write
|
||||
// into the OMAP may now be higher-versioned
|
||||
// than the version in the header, but that's
|
||||
// okay because it's never a problem to have
|
||||
// an overly-fresh copy of a session.
|
||||
write_sessions.push_back(*i);
|
||||
write_sessions.emplace(session_id, std::move(bl));
|
||||
session->clear_dirty_completed_requests();
|
||||
}
|
||||
|
||||
dout(4) << __func__ << ": writing " << write_sessions.size() << dendl;
|
||||
|
@ -865,21 +898,15 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
|
|||
// Batch writes into mds_sessionmap_keys_per_op
|
||||
const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op;
|
||||
map<string, bufferlist> to_set;
|
||||
for (uint32_t i = 0; i < write_sessions.size(); ++i) {
|
||||
const entity_name_t &session_id = write_sessions[i];
|
||||
Session *session = session_map[session_id];
|
||||
session->clear_dirty_completed_requests();
|
||||
|
||||
uint32_t i = 0;
|
||||
for (auto &[session_id, bl] : write_sessions) {
|
||||
// Serialize K
|
||||
CachedStackStringStream css;
|
||||
*css << session_id;
|
||||
|
||||
// Serialize V
|
||||
bufferlist bl;
|
||||
session->info.encode(bl, mds->mdsmap->get_up_features());
|
||||
|
||||
// Add to RADOS op
|
||||
to_set[css->str()] = bl;
|
||||
to_set[css->str()] = std::move(bl);
|
||||
|
||||
// Complete this write transaction?
|
||||
if (i == write_sessions.size() - 1
|
||||
|
@ -898,7 +925,11 @@ void SessionMap::save_if_dirty(const std::set<entity_name_t> &tgt_sessions,
|
|||
new C_IO_SM_Save_One(this, on_safe),
|
||||
mds->finisher));
|
||||
}
|
||||
++i;
|
||||
}
|
||||
|
||||
apply_blocklist(to_blocklist);
|
||||
logger->inc(l_mdssm_metadata_threshold_sessions_evicted, to_blocklist.size());
|
||||
}
|
||||
|
||||
// =================
|
||||
|
@ -1112,6 +1143,10 @@ void SessionMap::handle_conf_change(const std::set<std::string>& changed)
|
|||
};
|
||||
apply_to_open_sessions(mut);
|
||||
}
|
||||
|
||||
if (changed.count("mds_session_metadata_threshold")) {
|
||||
mds_session_metadata_threshold = g_conf().get_val<Option::size_t>("mds_session_metadata_threshold");
|
||||
}
|
||||
}
|
||||
|
||||
void SessionMap::update_average_session_age() {
|
||||
|
@ -1123,6 +1158,20 @@ void SessionMap::update_average_session_age() {
|
|||
logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime);
|
||||
}
|
||||
|
||||
void SessionMap::apply_blocklist(const std::set<entity_name_t>& victims) {
|
||||
if (victims.empty()) {
|
||||
return;
|
||||
}
|
||||
|
||||
C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop);
|
||||
for (auto &victim : victims) {
|
||||
CachedStackStringStream css;
|
||||
mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css,
|
||||
gather.new_sub());
|
||||
}
|
||||
gather.activate();
|
||||
}
|
||||
|
||||
int SessionFilter::parse(
|
||||
const std::vector<std::string> &args,
|
||||
std::ostream *ss)
|
||||
|
|
|
@ -45,6 +45,7 @@ enum {
|
|||
l_mdssm_total_load,
|
||||
l_mdssm_avg_load,
|
||||
l_mdssm_avg_session_uptime,
|
||||
l_mdssm_metadata_threshold_sessions_evicted,
|
||||
l_mdssm_last,
|
||||
};
|
||||
|
||||
|
@ -594,7 +595,7 @@ protected:
|
|||
class SessionMap : public SessionMapStore {
|
||||
public:
|
||||
SessionMap() = delete;
|
||||
explicit SessionMap(MDSRank *m) : mds(m) {}
|
||||
explicit SessionMap(MDSRank *m);
|
||||
|
||||
~SessionMap() override
|
||||
{
|
||||
|
@ -843,6 +844,11 @@ private:
|
|||
}
|
||||
|
||||
time avg_birth_time = clock::zero();
|
||||
|
||||
size_t mds_session_metadata_threshold;
|
||||
|
||||
bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl);
|
||||
void apply_blocklist(const std::set<entity_name_t>& victims);
|
||||
};
|
||||
|
||||
std::ostream& operator<<(std::ostream &out, const Session &s);
|
||||
|
|
Loading…
Reference in New Issue