From bc6814d72a9fbec9c41ed75aee2314666cfca34b Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Fri, 11 Aug 2023 04:36:52 -0400 Subject: [PATCH 1/4] mds: blocklist clients with "bloated" session metadata Buggy clients (or maybe a MDS bug) causes a huge buildup of `completed_requests` metadata in its session information. This could cause the MDS to go read-only when its flushing session metadata to the journal since the bloated metadata causes the ODSOp payload to exceed the maximum write size. Blocklist such clients so as to allow the MDS to continue servicing requests. Fixes: http://tracker.ceph.com/issues/61947 Signed-off-by: Venky Shankar --- src/common/options/mds.yaml.in | 10 +++++ src/mds/MDSRank.cc | 1 + src/mds/SessionMap.cc | 75 +++++++++++++++++++++++++++------- src/mds/SessionMap.h | 7 +++- 4 files changed, 77 insertions(+), 16 deletions(-) diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index d320ce139db..5a06326f9fc 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1577,3 +1577,13 @@ options: - mds flags: - runtime +- name: mds_session_metadata_threshold + type: size + level: advanced + desc: Evict non-advancing client-tid sessions exceeding the config size. + long_desc: Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow. + default: 16_M + services: + - mds + flags: + - runtime diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 31ea7f6710f..cc810d7e39a 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -3867,6 +3867,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const "mds_session_cap_acquisition_throttle", "mds_session_max_caps_throttle_ratio", "mds_symlink_recovery", + "mds_session_metadata_threshold", NULL }; return KEYS; diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index c21ac469356..873dedbe68d 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -45,6 +45,11 @@ class SessionMapIOContext : public MDSIOContextBase }; }; +SessionMap::SessionMap(MDSRank *m) + : mds(m), + mds_session_metadata_threshold(g_conf().get_val("mds_session_metadata_threshold")) { +} + void SessionMap::register_perfcounters() { PerfCountersBuilder plb(g_ceph_context, "mds_sessions", @@ -375,6 +380,11 @@ public: }; } +bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) { + session->info.encode(bl, mds->mdsmap->get_up_features()); + return bl.length() < mds_session_metadata_threshold; +} + void SessionMap::save(MDSContext *onsave, version_t needv) { dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl; @@ -410,6 +420,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv) dout(20) << " updating keys:" << dendl; map to_set; + std::set to_blocklist; for(std::set::iterator i = dirty_sessions.begin(); i != dirty_sessions.end(); ++i) { const entity_name_t name = *i; @@ -420,13 +431,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv) session->is_stale() || session->is_killing()) { dout(20) << " " << name << dendl; - // Serialize K - CachedStackStringStream css; - *css << name; // Serialize V bufferlist bl; - session->info.encode(bl, mds->mdsmap->get_up_features()); + if (!validate_and_encode_session(mds, session, bl)) { + derr << __func__ << ": session (" << name << ") exceeds" + << " sesion metadata threshold - blocklisting" << dendl; + to_blocklist.emplace(name); + continue; + } + + // Serialize K + CachedStackStringStream css; + *css << name; // Add to RADOS op to_set[std::string(css->strv())] = bl; @@ -461,6 +478,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv) 0, new C_OnFinisher(new C_IO_SM_Save(this, version), mds->finisher)); + apply_blocklist(to_blocklist); } void SessionMap::_save_finish(version_t v) @@ -826,7 +844,8 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, { ceph_assert(gather_bld != NULL); - std::vector write_sessions; + std::set to_blocklist; + std::map write_sessions; // Decide which sessions require a write for (std::set::iterator i = tgt_sessions.begin(); @@ -851,13 +870,24 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, // need to pre-empt that. continue; } + + // Serialize V + bufferlist bl; + if (!validate_and_encode_session(mds, session, bl)) { + derr << __func__ << ": session (" << session_id << ") exceeds" + << " sesion metadata threshold - blocklisting" << dendl; + to_blocklist.emplace(session_id); + continue; + } + // Okay, passed all our checks, now we write // this session out. The version we write // into the OMAP may now be higher-versioned // than the version in the header, but that's // okay because it's never a problem to have // an overly-fresh copy of a session. - write_sessions.push_back(*i); + write_sessions.emplace(session_id, std::move(bl)); + session->clear_dirty_completed_requests(); } dout(4) << __func__ << ": writing " << write_sessions.size() << dendl; @@ -865,21 +895,15 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, // Batch writes into mds_sessionmap_keys_per_op const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op; map to_set; - for (uint32_t i = 0; i < write_sessions.size(); ++i) { - const entity_name_t &session_id = write_sessions[i]; - Session *session = session_map[session_id]; - session->clear_dirty_completed_requests(); + uint32_t i = 0; + for (auto &[session_id, bl] : write_sessions) { // Serialize K CachedStackStringStream css; *css << session_id; - // Serialize V - bufferlist bl; - session->info.encode(bl, mds->mdsmap->get_up_features()); - // Add to RADOS op - to_set[css->str()] = bl; + to_set[css->str()] = std::move(bl); // Complete this write transaction? if (i == write_sessions.size() - 1 @@ -898,7 +922,10 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, new C_IO_SM_Save_One(this, on_safe), mds->finisher)); } + ++i; } + + apply_blocklist(to_blocklist); } // ================= @@ -1112,6 +1139,10 @@ void SessionMap::handle_conf_change(const std::set& changed) }; apply_to_open_sessions(mut); } + + if (changed.count("mds_session_metadata_threshold")) { + mds_session_metadata_threshold = g_conf().get_val("mds_session_metadata_threshold"); + } } void SessionMap::update_average_session_age() { @@ -1123,6 +1154,20 @@ void SessionMap::update_average_session_age() { logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime); } +void SessionMap::apply_blocklist(const std::set& victims) { + if (victims.empty()) { + return; + } + + C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop); + for (auto &victim : victims) { + CachedStackStringStream css; + mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css, + gather.new_sub()); + } + gather.activate(); +} + int SessionFilter::parse( const std::vector &args, std::ostream *ss) diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index 067e1474cc3..c1c28a8455b 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -594,7 +594,7 @@ protected: class SessionMap : public SessionMapStore { public: SessionMap() = delete; - explicit SessionMap(MDSRank *m) : mds(m) {} + explicit SessionMap(MDSRank *m); ~SessionMap() override { @@ -843,6 +843,11 @@ private: } time avg_birth_time = clock::zero(); + + size_t mds_session_metadata_threshold; + + bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl); + void apply_blocklist(const std::set& victims); }; std::ostream& operator<<(std::ostream &out, const Session &s); From 59dd587ddb3bc95a95d0fb8715511d194181ec6f Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Mon, 21 Aug 2023 03:50:22 -0400 Subject: [PATCH 2/4] mds: add perf counter to track number of sessions evicted due to metadata threshold being exceeded Signed-off-by: Venky Shankar --- src/mds/SessionMap.cc | 4 ++++ src/mds/SessionMap.h | 1 + 2 files changed, 5 insertions(+) diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index 873dedbe68d..9cc2b013847 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -71,6 +71,8 @@ void SessionMap::register_perfcounters() plb.add_u64(l_mdssm_avg_load, "average_load", "Average Load"); plb.add_u64(l_mdssm_avg_session_uptime, "avg_session_uptime", "Average session uptime"); + plb.add_u64(l_mdssm_metadata_threshold_sessions_evicted, "mdthresh_evicted", + "Sessions evicted on reaching metadata threshold"); logger = plb.create_perf_counters(); g_ceph_context->get_perfcounters_collection()->add(logger); @@ -479,6 +481,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv) new C_OnFinisher(new C_IO_SM_Save(this, version), mds->finisher)); apply_blocklist(to_blocklist); + logger->inc(l_mdssm_metadata_threshold_sessions_evicted, to_blocklist.size()); } void SessionMap::_save_finish(version_t v) @@ -926,6 +929,7 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, } apply_blocklist(to_blocklist); + logger->inc(l_mdssm_metadata_threshold_sessions_evicted, to_blocklist.size()); } // ================= diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index c1c28a8455b..360dd66a27b 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -45,6 +45,7 @@ enum { l_mdssm_total_load, l_mdssm_avg_load, l_mdssm_avg_session_uptime, + l_mdssm_metadata_threshold_sessions_evicted, l_mdssm_last, }; From 84df4b3d0c9e767a74cf5af80e8138239992df2c Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Fri, 11 Aug 2023 04:40:36 -0400 Subject: [PATCH 3/4] test: add test to verify that a buggy client is blocklisted ... when its session metadata is bloated due to buildup of `completed_requests`. Signed-off-by: Venky Shankar --- qa/tasks/cephfs/test_client_limits.py | 51 +++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py index 836f81af164..102f22f960c 100644 --- a/qa/tasks/cephfs/test_client_limits.py +++ b/qa/tasks/cephfs/test_client_limits.py @@ -9,6 +9,7 @@ from textwrap import dedent from tasks.ceph_test_case import TestTimeoutError from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming from tasks.cephfs.fuse_mount import FuseMount +from teuthology.exceptions import CommandFailedError import os @@ -221,6 +222,56 @@ class TestClientLimits(CephFSTestCase): self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) rproc.wait() + def test_client_blocklisted_oldest_tid(self): + """ + that a client is blocklisted when its encoded session metadata exceeds the + configured threshold (due to ever growing `completed_requests` caused due + to an unidentified bug (in the client or the MDS)). + """ + + # num of requests client issues + max_requests = 10000 + + # The debug hook to inject the failure only exists in the fuse client + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to inject client release failure") + + self.config_set('client', 'client inject fixed oldest tid', 'true') + self.mount_a.teardown() + self.mount_a.mount_wait() + + self.config_set('mds', 'mds_max_completed_requests', max_requests); + + # Create lots of files + self.mount_a.create_n_files("testdir/file1", max_requests + 100) + + # Create a few files synchronously. This makes sure previous requests are completed + self.mount_a.create_n_files("testdir/file2", 5, True) + + # Wait for the health warnings. Assume mds can handle 10 request per second at least + self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id)) + + # why a multiplier of 20, you may ask - I arrieved at this from some debugs + # that I put when testing the fix in a vstart cluster where its a ratio of + # encoded session information to the number of completed requests. + self.config_set('mds', 'mds_session_metadata_threshold', max_requests*20); + + # Create a few more files synchronously. This would hit the session metadata threshold + # causing the client to get blocklisted. + with self.assertRaises(CommandFailedError): + self.mount_a.create_n_files("testdir/file2", 20, True) + + self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr()) + # the mds should bump up the relevant perf counter + pd = self.perf_dump() + self.assertGreater(pd['mds_sessions']['md_thresh_evicted'], 0) + + # reset the config + self.config_set('client', 'client inject fixed oldest tid', 'false') + + self.mount_a.kill_cleanup() + self.mount_a.mount_wait() + def test_client_oldest_tid(self): """ When a client does not advance its oldest tid, the MDS should notice that From ac3ab1a203ad65c7db07bc2c02acb5620800fb25 Mon Sep 17 00:00:00 2001 From: Venky Shankar Date: Thu, 17 Aug 2023 10:10:37 +0530 Subject: [PATCH 4/4] PendingReleaseNotes: add a note for `mds_session_metadata_threshold` mds config Signed-off-by: Venky Shankar --- PendingReleaseNotes | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/PendingReleaseNotes b/PendingReleaseNotes index df9c22cc192..8917fdadc0a 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -8,6 +8,10 @@ For multi-site deployments that make any use of Server-Side Encryption, we recommended running this command against every bucket in every zone after all zones have upgraded. +* CEPHFS: MDS evicts clients which are not advancing their request tids which causes + a large buildup of session metadata resulting in the MDS going read-only due to + the RADOS operation exceeding the size threshold. `mds_session_metadata_threshold` + config controls the maximum size that a (encoded) session metadata can grow. >=18.0.0