diff --git a/.githubmap b/.githubmap index 08da187a58e..b5d44763134 100644 --- a/.githubmap +++ b/.githubmap @@ -174,3 +174,4 @@ parth-gr Parth Arora phlogistonjohn John Mulligan baergj Joshua Baergen zmc Zack Cerza +robbat2 Robin H. Johnson diff --git a/PendingReleaseNotes b/PendingReleaseNotes index df9c22cc192..8917fdadc0a 100644 --- a/PendingReleaseNotes +++ b/PendingReleaseNotes @@ -8,6 +8,10 @@ For multi-site deployments that make any use of Server-Side Encryption, we recommended running this command against every bucket in every zone after all zones have upgraded. +* CEPHFS: MDS evicts clients which are not advancing their request tids which causes + a large buildup of session metadata resulting in the MDS going read-only due to + the RADOS operation exceeding the size threshold. `mds_session_metadata_threshold` + config controls the maximum size that a (encoded) session metadata can grow. >=18.0.0 diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py index f2d63920564..a9bd7d631c0 100644 --- a/qa/tasks/cephfs/test_client_limits.py +++ b/qa/tasks/cephfs/test_client_limits.py @@ -9,6 +9,7 @@ from textwrap import dedent from tasks.ceph_test_case import TestTimeoutError from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming from tasks.cephfs.fuse_mount import FuseMount +from teuthology.exceptions import CommandFailedError import os from io import StringIO @@ -242,6 +243,56 @@ class TestClientLimits(CephFSTestCase): self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) rproc.wait() + def test_client_blocklisted_oldest_tid(self): + """ + that a client is blocklisted when its encoded session metadata exceeds the + configured threshold (due to ever growing `completed_requests` caused due + to an unidentified bug (in the client or the MDS)). + """ + + # num of requests client issues + max_requests = 10000 + + # The debug hook to inject the failure only exists in the fuse client + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to inject client release failure") + + self.config_set('client', 'client inject fixed oldest tid', 'true') + self.mount_a.teardown() + self.mount_a.mount_wait() + + self.config_set('mds', 'mds_max_completed_requests', max_requests); + + # Create lots of files + self.mount_a.create_n_files("testdir/file1", max_requests + 100) + + # Create a few files synchronously. This makes sure previous requests are completed + self.mount_a.create_n_files("testdir/file2", 5, True) + + # Wait for the health warnings. Assume mds can handle 10 request per second at least + self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id)) + + # why a multiplier of 20, you may ask - I arrieved at this from some debugs + # that I put when testing the fix in a vstart cluster where its a ratio of + # encoded session information to the number of completed requests. + self.config_set('mds', 'mds_session_metadata_threshold', max_requests*20); + + # Create a few more files synchronously. This would hit the session metadata threshold + # causing the client to get blocklisted. + with self.assertRaises(CommandFailedError): + self.mount_a.create_n_files("testdir/file2", 20, True) + + self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr()) + # the mds should bump up the relevant perf counter + pd = self.perf_dump() + self.assertGreater(pd['mds_sessions']['md_thresh_evicted'], 0) + + # reset the config + self.config_set('client', 'client inject fixed oldest tid', 'false') + + self.mount_a.kill_cleanup() + self.mount_a.mount_wait() + def test_client_oldest_tid(self): """ When a client does not advance its oldest tid, the MDS should notice that diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index d320ce139db..5a06326f9fc 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1577,3 +1577,13 @@ options: - mds flags: - runtime +- name: mds_session_metadata_threshold + type: size + level: advanced + desc: Evict non-advancing client-tid sessions exceeding the config size. + long_desc: Evict clients which are not advancing their request tids which causes a large buildup of session metadata (`completed_requests`) in the MDS causing the MDS to go read-only since the RADOS operation exceeds the size threashold. This config is the maximum size (in bytes) that a session metadata (encoded) can grow. + default: 16_M + services: + - mds + flags: + - runtime diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 31ea7f6710f..cc810d7e39a 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -3867,6 +3867,7 @@ const char** MDSRankDispatcher::get_tracked_conf_keys() const "mds_session_cap_acquisition_throttle", "mds_session_max_caps_throttle_ratio", "mds_symlink_recovery", + "mds_session_metadata_threshold", NULL }; return KEYS; diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index c21ac469356..9cc2b013847 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -45,6 +45,11 @@ class SessionMapIOContext : public MDSIOContextBase }; }; +SessionMap::SessionMap(MDSRank *m) + : mds(m), + mds_session_metadata_threshold(g_conf().get_val("mds_session_metadata_threshold")) { +} + void SessionMap::register_perfcounters() { PerfCountersBuilder plb(g_ceph_context, "mds_sessions", @@ -66,6 +71,8 @@ void SessionMap::register_perfcounters() plb.add_u64(l_mdssm_avg_load, "average_load", "Average Load"); plb.add_u64(l_mdssm_avg_session_uptime, "avg_session_uptime", "Average session uptime"); + plb.add_u64(l_mdssm_metadata_threshold_sessions_evicted, "mdthresh_evicted", + "Sessions evicted on reaching metadata threshold"); logger = plb.create_perf_counters(); g_ceph_context->get_perfcounters_collection()->add(logger); @@ -375,6 +382,11 @@ public: }; } +bool SessionMap::validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl) { + session->info.encode(bl, mds->mdsmap->get_up_features()); + return bl.length() < mds_session_metadata_threshold; +} + void SessionMap::save(MDSContext *onsave, version_t needv) { dout(10) << __func__ << ": needv " << needv << ", v " << version << dendl; @@ -410,6 +422,7 @@ void SessionMap::save(MDSContext *onsave, version_t needv) dout(20) << " updating keys:" << dendl; map to_set; + std::set to_blocklist; for(std::set::iterator i = dirty_sessions.begin(); i != dirty_sessions.end(); ++i) { const entity_name_t name = *i; @@ -420,13 +433,19 @@ void SessionMap::save(MDSContext *onsave, version_t needv) session->is_stale() || session->is_killing()) { dout(20) << " " << name << dendl; - // Serialize K - CachedStackStringStream css; - *css << name; // Serialize V bufferlist bl; - session->info.encode(bl, mds->mdsmap->get_up_features()); + if (!validate_and_encode_session(mds, session, bl)) { + derr << __func__ << ": session (" << name << ") exceeds" + << " sesion metadata threshold - blocklisting" << dendl; + to_blocklist.emplace(name); + continue; + } + + // Serialize K + CachedStackStringStream css; + *css << name; // Add to RADOS op to_set[std::string(css->strv())] = bl; @@ -461,6 +480,8 @@ void SessionMap::save(MDSContext *onsave, version_t needv) 0, new C_OnFinisher(new C_IO_SM_Save(this, version), mds->finisher)); + apply_blocklist(to_blocklist); + logger->inc(l_mdssm_metadata_threshold_sessions_evicted, to_blocklist.size()); } void SessionMap::_save_finish(version_t v) @@ -826,7 +847,8 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, { ceph_assert(gather_bld != NULL); - std::vector write_sessions; + std::set to_blocklist; + std::map write_sessions; // Decide which sessions require a write for (std::set::iterator i = tgt_sessions.begin(); @@ -851,13 +873,24 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, // need to pre-empt that. continue; } + + // Serialize V + bufferlist bl; + if (!validate_and_encode_session(mds, session, bl)) { + derr << __func__ << ": session (" << session_id << ") exceeds" + << " sesion metadata threshold - blocklisting" << dendl; + to_blocklist.emplace(session_id); + continue; + } + // Okay, passed all our checks, now we write // this session out. The version we write // into the OMAP may now be higher-versioned // than the version in the header, but that's // okay because it's never a problem to have // an overly-fresh copy of a session. - write_sessions.push_back(*i); + write_sessions.emplace(session_id, std::move(bl)); + session->clear_dirty_completed_requests(); } dout(4) << __func__ << ": writing " << write_sessions.size() << dendl; @@ -865,21 +898,15 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, // Batch writes into mds_sessionmap_keys_per_op const uint32_t kpo = g_conf()->mds_sessionmap_keys_per_op; map to_set; - for (uint32_t i = 0; i < write_sessions.size(); ++i) { - const entity_name_t &session_id = write_sessions[i]; - Session *session = session_map[session_id]; - session->clear_dirty_completed_requests(); + uint32_t i = 0; + for (auto &[session_id, bl] : write_sessions) { // Serialize K CachedStackStringStream css; *css << session_id; - // Serialize V - bufferlist bl; - session->info.encode(bl, mds->mdsmap->get_up_features()); - // Add to RADOS op - to_set[css->str()] = bl; + to_set[css->str()] = std::move(bl); // Complete this write transaction? if (i == write_sessions.size() - 1 @@ -898,7 +925,11 @@ void SessionMap::save_if_dirty(const std::set &tgt_sessions, new C_IO_SM_Save_One(this, on_safe), mds->finisher)); } + ++i; } + + apply_blocklist(to_blocklist); + logger->inc(l_mdssm_metadata_threshold_sessions_evicted, to_blocklist.size()); } // ================= @@ -1112,6 +1143,10 @@ void SessionMap::handle_conf_change(const std::set& changed) }; apply_to_open_sessions(mut); } + + if (changed.count("mds_session_metadata_threshold")) { + mds_session_metadata_threshold = g_conf().get_val("mds_session_metadata_threshold"); + } } void SessionMap::update_average_session_age() { @@ -1123,6 +1158,20 @@ void SessionMap::update_average_session_age() { logger->set(l_mdssm_avg_session_uptime, (uint64_t)avg_uptime); } +void SessionMap::apply_blocklist(const std::set& victims) { + if (victims.empty()) { + return; + } + + C_GatherBuilder gather(g_ceph_context, new C_MDSInternalNoop); + for (auto &victim : victims) { + CachedStackStringStream css; + mds->evict_client(victim.num(), false, g_conf()->mds_session_blocklist_on_evict, *css, + gather.new_sub()); + } + gather.activate(); +} + int SessionFilter::parse( const std::vector &args, std::ostream *ss) diff --git a/src/mds/SessionMap.h b/src/mds/SessionMap.h index 067e1474cc3..360dd66a27b 100644 --- a/src/mds/SessionMap.h +++ b/src/mds/SessionMap.h @@ -45,6 +45,7 @@ enum { l_mdssm_total_load, l_mdssm_avg_load, l_mdssm_avg_session_uptime, + l_mdssm_metadata_threshold_sessions_evicted, l_mdssm_last, }; @@ -594,7 +595,7 @@ protected: class SessionMap : public SessionMapStore { public: SessionMap() = delete; - explicit SessionMap(MDSRank *m) : mds(m) {} + explicit SessionMap(MDSRank *m); ~SessionMap() override { @@ -843,6 +844,11 @@ private: } time avg_birth_time = clock::zero(); + + size_t mds_session_metadata_threshold; + + bool validate_and_encode_session(MDSRank *mds, Session *session, bufferlist& bl); + void apply_blocklist(const std::set& victims); }; std::ostream& operator<<(std::ostream &out, const Session &s);