Merge PR #26038 into master

* refs/pull/26038/head:
	mds: simplify recall warnings
	mds: add extra details for cache drop output
	qa: test mds_max_caps_per_client conf
	mds: limit maximum number of caps held by session
	mds: adapt drop cache for incremental recall
	mds: recall caps incrementally
	mds: adapt drop cache for incremental trim
	mds: add throttle for trimming MDCache
	mds: cleanup SessionMap init
	mds: cleanup Session init

Reviewed-by: Zheng Yan <zyan@redhat.com>
Patrick Donnelly 2019-01-31 12:08:26 -08:00
commit 0d26266ccb
14 changed files with 443 additions and 205 deletions

View File

@ -165,6 +165,26 @@
respectively. This is to clarify that these warnings are related to pg scrubbing
and are a ratio of the related interval. These options are now enabled by default.
* The MDS cache trimming is now throttled. Dropping the MDS cache
via the `ceph tell mds.<foo> cache drop` command or large reductions in the
cache size will no longer cause service unavailability.
* The CephFS MDS behavior when recalling caps has been significantly improved:
the MDS no longer attempts to recall too many caps at once, which previously
led to instability. An MDS with a large cache (64GB+) should be more stable.
* The MDS now provides a config option "mds_max_caps_per_client" (default: 1M)
to limit the number of caps a client session may hold. Long-running client
sessions holding a large number of caps have been a source of instability in
the MDS when all of those caps must be processed during certain session
events. It is recommended not to increase this value unnecessarily.
* The MDS config mds_recall_state_timeout has been removed. Late client recall
warnings are now generated based on the number of caps the MDS has recalled
that have not yet been released. The new configs mds_recall_warning_threshold
(default: 32K) and mds_recall_warning_decay_rate (default: 60s) set the
threshold and decay rate for this warning (see the example below).
>=13.1.0
--------
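As a rough illustration of how these new knobs might be tuned at runtime (the
option names come from this change; the values and the wildcard target are
examples only, not recommendations):

    ceph tell mds.* config set mds_max_caps_per_client 524288      # e.g. halve the 1M default
    ceph tell mds.* config set mds_recall_warning_threshold 16384  # e.g. warn sooner than the 32K default
    ceph tell mds.* config set mds_recall_max_caps 10000           # e.g. recall more caps per RECALL message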

View File

@ -10,7 +10,6 @@ overrides:
tasks:
- exec:
mon.a:
- "ceph tell mds.* config set mds_max_ratio_caps_per_client 1"
- "ceph tell mds.* config set mds_min_caps_per_client 1"
- background_exec:
mon.a:

View File

@ -42,12 +42,14 @@ class TestClientLimits(CephFSTestCase):
cache_size = open_files/2
self.set_conf('mds', 'mds cache size', cache_size)
self.set_conf('mds', 'mds_recall_max_caps', open_files/2)
self.set_conf('mds', 'mds_recall_warning_threshold', open_files)
self.fs.mds_fail_restart()
self.fs.wait_for_daemons()
mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
mds_recall_warning_decay_rate = self.fs.get_config("mds_recall_warning_decay_rate")
self.assertTrue(open_files >= mds_min_caps_per_client)
mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client"))
mount_a_client_id = self.mount_a.get_global_id()
path = "subdir/mount_a" if use_subdir else "mount_a"
@ -64,13 +66,11 @@ class TestClientLimits(CephFSTestCase):
# MDS should not be happy about that, as the client is failing to comply
# with the SESSION_RECALL messages it is being sent
mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout"))
self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10)
self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2)
# We can also test that the MDS health warning for oversized
# cache is functioning as intended.
self.wait_for_health("MDS_CACHE_OVERSIZED",
mds_recall_state_timeout + 10)
self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2)
# When the client closes the files, it should retain only as many caps as allowed
# under the SESSION_RECALL policy
@ -84,14 +84,13 @@ class TestClientLimits(CephFSTestCase):
# The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
# which depend on the caps outstanding, cache size and overall ratio
recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2))
def expected_caps():
num_caps = self.get_session(mount_a_client_id)['num_caps']
if num_caps < mds_min_caps_per_client:
raise RuntimeError("client caps fell below min!")
elif num_caps == mds_min_caps_per_client:
return True
elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05:
elif num_caps < cache_size:
return True
else:
return False
@ -237,3 +236,28 @@ class TestClientLimits(CephFSTestCase):
def test_client_cache_size(self):
self._test_client_cache_size(False)
self._test_client_cache_size(True)
def test_client_max_caps(self):
"""
That the MDS will not let a client sit above mds_max_caps_per_client caps.
"""
mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
mds_max_caps_per_client = 2*mds_min_caps_per_client
self.set_conf('mds', 'mds_max_caps_per_client', mds_max_caps_per_client)
self.fs.mds_fail_restart()
self.fs.wait_for_daemons()
self.mount_a.create_n_files("foo/", 3*mds_max_caps_per_client, sync=True)
mount_a_client_id = self.mount_a.get_global_id()
def expected_caps():
num_caps = self.get_session(mount_a_client_id)['num_caps']
if num_caps < mds_min_caps_per_client:
raise RuntimeError("client caps fell below min!")
elif num_caps <= mds_max_caps_per_client:
return True
else:
return False
self.wait_until_true(expected_caps, timeout=60)

View File

@ -410,7 +410,6 @@ OPTION(mds_session_blacklist_on_timeout, OPT_BOOL) // whether to blacklist cl
OPTION(mds_session_blacklist_on_evict, OPT_BOOL) // whether to blacklist clients whose sessions are dropped via admin commands
OPTION(mds_sessionmap_keys_per_op, OPT_U32) // how many sessions should I try to load/store in a single OMAP operation?
OPTION(mds_recall_state_timeout, OPT_FLOAT) // detect clients which aren't trimming caps
OPTION(mds_freeze_tree_timeout, OPT_FLOAT) // detecting freeze tree deadlock
OPTION(mds_health_summarize_threshold, OPT_INT) // collapse N-client health metrics to a single 'many'
OPTION(mds_reconnect_timeout, OPT_FLOAT) // seconds to wait for clients during mds restart

View File

@ -7179,6 +7179,14 @@ std::vector<Option> get_mds_options() {
.set_default(.7)
.set_description("midpoint for MDS cache LRU"),
Option("mds_cache_trim_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(1)
.set_description("decay rate for trimming MDS cache throttle"),
Option("mds_cache_trim_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(64_K)
.set_description("threshold for number of dentries that can be trimmed"),
Option("mds_max_file_recover", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(32)
.set_description("maximum number of files to recover file sizes in parallel"),
@ -7223,9 +7231,29 @@ std::vector<Option> get_mds_options() {
.set_default(1024)
.set_description("number of omap keys to read from the SessionMap in one operation"),
Option("mds_recall_state_timeout", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(60)
.set_description("timeout for clients late on cap recall to create health warnings"),
Option("mds_recall_max_caps", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(5000)
.set_description("maximum number of caps to recall from client session in single recall"),
Option("mds_recall_max_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(2.5)
.set_description("decay rate for throttle on recalled caps on a session"),
Option("mds_recall_max_decay_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(16_K)
.set_description("decay threshold for throttle on recalled caps on a session"),
Option("mds_recall_global_max_decay_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(64_K)
.set_description("decay threshold for throttle on recalled caps globally"),
Option("mds_recall_warning_threshold", Option::TYPE_SIZE, Option::LEVEL_ADVANCED)
.set_default(32_K)
.set_description("decay threshold for warning on slow session cap recall"),
Option("mds_recall_warning_decay_rate", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(60.0)
.set_description("decay rate for warning on slow session cap recall"),
Option("mds_freeze_tree_timeout", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(30)
@ -7605,9 +7633,9 @@ std::vector<Option> get_mds_options() {
.set_default(100)
.set_description("minimum number of capabilities a client may hold"),
Option("mds_max_ratio_caps_per_client", Option::TYPE_FLOAT, Option::LEVEL_DEV)
.set_default(.8)
.set_description("maximum ratio of current caps that may be recalled during MDS cache pressure"),
Option("mds_max_caps_per_client", Option::TYPE_UINT, Option::LEVEL_ADVANCED)
.set_default(1_M)
.set_description("maximum number of capabilities a client may hold"),
Option("mds_hack_allow_loading_invalid_metadata", Option::TYPE_BOOL, Option::LEVEL_ADVANCED)
.set_default(0)
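For reference, the effective values of the new options can be checked on a
running daemon. A minimal sketch, assuming an MDS named "a" and access to its
admin socket on that host:

    ceph daemon mds.a config get mds_recall_max_caps                    # default 5000 per the option above
    ceph daemon mds.a config get mds_recall_global_max_decay_threshold  # default 64K
    ceph daemon mds.a config get mds_max_caps_per_client                # default 1M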

View File

@ -374,40 +374,27 @@ void Beacon::notify_health(MDSRank const *mds)
set<Session*> sessions;
mds->sessionmap.get_client_session_set(sessions);
auto mds_recall_state_timeout = g_conf()->mds_recall_state_timeout;
auto last_recall = mds->mdcache->last_recall_state;
auto last_recall_span = std::chrono::duration<double>(clock::now()-last_recall).count();
bool recall_state_timedout = last_recall_span > mds_recall_state_timeout;
const auto recall_warning_threshold = g_conf().get_val<Option::size_t>("mds_recall_warning_threshold");
const auto max_completed_requests = g_conf()->mds_max_completed_requests;
const auto max_completed_flushes = g_conf()->mds_max_completed_flushes;
std::list<MDSHealthMetric> late_recall_metrics;
std::list<MDSHealthMetric> large_completed_requests_metrics;
for (auto& session : sessions) {
if (session->recalled_at != Session::clock::zero()) {
auto last_recall_sent = session->last_recall_sent;
auto recalled_at = session->recalled_at;
auto recalled_at_span = std::chrono::duration<double>(clock::now()-recalled_at).count();
dout(20) << "Session servicing RECALL " << session->info.inst
<< ": " << recalled_at_span << "s ago " << session->recall_release_count
<< "/" << session->recall_count << dendl;
if (recall_state_timedout || last_recall_sent < last_recall) {
dout(20) << " no longer recall" << dendl;
session->clear_recalled_at();
} else if (recalled_at_span > mds_recall_state_timeout) {
dout(20) << " exceeded timeout " << recalled_at_span << " vs. " << mds_recall_state_timeout << dendl;
std::ostringstream oss;
oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
m.metadata["client_id"] = stringify(session->get_client());
late_recall_metrics.push_back(m);
} else {
dout(20) << " within timeout " << recalled_at_span << " vs. " << mds_recall_state_timeout << dendl;
}
const uint64_t recall_caps = session->get_recall_caps();
if (recall_caps > recall_warning_threshold) {
dout(2) << "Session " << *session <<
" is not releasing caps fast enough. Recalled caps at " << recall_caps
<< " > " << recall_warning_threshold << " (mds_recall_warning_threshold)." << dendl;
std::ostringstream oss;
oss << "Client " << session->get_human_name() << " failing to respond to cache pressure";
MDSHealthMetric m(MDS_HEALTH_CLIENT_RECALL, HEALTH_WARN, oss.str());
m.metadata["client_id"] = stringify(session->get_client());
late_recall_metrics.push_back(m);
}
if ((session->get_num_trim_requests_warnings() > 0 &&
session->get_num_completed_requests() >= g_conf()->mds_max_completed_requests) ||
session->get_num_completed_requests() >= max_completed_requests) ||
(session->get_num_trim_flushes_warnings() > 0 &&
session->get_num_completed_flushes() >= g_conf()->mds_max_completed_flushes)) {
session->get_num_completed_flushes() >= max_completed_flushes)) {
std::ostringstream oss;
oss << "Client " << session->get_human_name() << " failing to advance its oldest client/flush tid";
MDSHealthMetric m(MDS_HEALTH_CLIENT_OLDEST_TID, HEALTH_WARN, oss.str());
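With this change, a session whose decaying recall_caps counter stays above
mds_recall_warning_threshold raises the same MDS_CLIENT_RECALL health warning
as before. The `ceph health detail` output would look roughly like the
following (illustrative formatting; the MDS and client names are made up):

    HEALTH_WARN 1 clients failing to respond to cache pressure
    MDS_CLIENT_RECALL 1 clients failing to respond to cache pressure
        mds.a(mds.0): Client client.4123 failing to respond to cache pressure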

View File

@ -139,6 +139,7 @@ MDCache::MDCache(MDSRank *m, PurgeQueue &purge_queue_) :
exceeded_size_limit(false),
recovery_queue(m),
stray_manager(m, purge_queue_),
trim_counter(g_conf().get_val<double>("mds_cache_trim_decay_rate")),
open_file_table(m)
{
migrator.reset(new Migrator(mds, this));
@ -211,6 +212,9 @@ void MDCache::handle_conf_change(const ConfigProxy& conf,
cache_health_threshold = g_conf().get_val<double>("mds_health_cache_threshold");
if (changed.count("mds_cache_mid"))
lru.lru_set_midpoint(g_conf().get_val<double>("mds_cache_mid"));
if (changed.count("mds_cache_trim_decay_rate")) {
trim_counter = DecayCounter(g_conf().get_val<double>("mds_cache_trim_decay_rate"));
}
migrator->handle_conf_change(conf, changed, mdsmap);
mds->balancer->handle_conf_change(conf, changed, mdsmap);
@ -6528,12 +6532,14 @@ void MDCache::start_recovered_truncates()
// ================================================================================
// cache trimming
void MDCache::trim_lru(uint64_t count, expiremap& expiremap)
std::pair<bool, uint64_t> MDCache::trim_lru(uint64_t count, expiremap& expiremap)
{
bool is_standby_replay = mds->is_standby_replay();
std::vector<CDentry *> unexpirables;
uint64_t trimmed = 0;
auto trim_threshold = g_conf().get_val<Option::size_t>("mds_cache_trim_threshold");
dout(7) << "trim_lru trimming " << count
<< " items from LRU"
<< " size=" << lru.lru_get_size()
@ -6542,7 +6548,11 @@ void MDCache::trim_lru(uint64_t count, expiremap& expiremap)
<< " pinned=" << lru.lru_get_num_pinned()
<< dendl;
for (;;) {
const uint64_t trim_counter_start = trim_counter.get();
bool throttled = false;
while (1) {
throttled |= trim_counter_start+trimmed >= trim_threshold;
if (throttled) break;
CDentry *dn = static_cast<CDentry*>(bottom_lru.lru_expire());
if (!dn)
break;
@ -6559,7 +6569,9 @@ void MDCache::trim_lru(uint64_t count, expiremap& expiremap)
unexpirables.clear();
// trim dentries from the LRU until count is reached
while (cache_toofull() || count > 0) {
while (!throttled && (cache_toofull() || count > 0)) {
throttled |= trim_counter_start+trimmed >= trim_threshold;
if (throttled) break;
CDentry *dn = static_cast<CDentry*>(lru.lru_expire());
if (!dn) {
break;
@ -6574,6 +6586,7 @@ void MDCache::trim_lru(uint64_t count, expiremap& expiremap)
if (count > 0) count--;
}
}
trim_counter.hit(trimmed);
for (auto &dn : unexpirables) {
lru.lru_insert_mid(dn);
@ -6581,6 +6594,7 @@ void MDCache::trim_lru(uint64_t count, expiremap& expiremap)
unexpirables.clear();
dout(7) << "trim_lru trimmed " << trimmed << " items" << dendl;
return std::pair<bool, uint64_t>(throttled, trimmed);
}
/*
@ -6589,7 +6603,7 @@ void MDCache::trim_lru(uint64_t count, expiremap& expiremap)
*
* @param count is number of dentries to try to expire
*/
bool MDCache::trim(uint64_t count)
std::pair<bool, uint64_t> MDCache::trim(uint64_t count)
{
uint64_t used = cache_size();
uint64_t limit = cache_memory_limit;
@ -6603,7 +6617,8 @@ bool MDCache::trim(uint64_t count)
// process delayed eval_stray()
stray_manager.advance_delayed();
trim_lru(count, expiremap);
auto result = trim_lru(count, expiremap);
auto& trimmed = result.second;
// trim non-auth, non-bound subtrees
for (auto p = subtrees.begin(); p != subtrees.end();) {
@ -6619,6 +6634,7 @@ bool MDCache::trim(uint64_t count)
continue;
migrator->export_empty_import(dir);
++trimmed;
}
} else {
if (!diri->is_auth()) {
@ -6635,6 +6651,7 @@ bool MDCache::trim(uint64_t count)
rejoin_ack_gather.count(dir->get_dir_auth().first))
continue;
trim_dirfrag(dir, 0, expiremap);
++trimmed;
}
}
}
@ -6645,11 +6662,15 @@ bool MDCache::trim(uint64_t count)
root->get_dirfrags(ls);
for (list<CDir*>::iterator p = ls.begin(); p != ls.end(); ++p) {
CDir *dir = *p;
if (dir->get_num_ref() == 1) // subtree pin
if (dir->get_num_ref() == 1) { // subtree pin
trim_dirfrag(dir, 0, expiremap);
++trimmed;
}
}
if (root->get_num_ref() == 0)
if (root->get_num_ref() == 0) {
trim_inode(0, root, 0, expiremap);
++trimmed;
}
}
std::set<mds_rank_t> stopping;
@ -6673,11 +6694,15 @@ bool MDCache::trim(uint64_t count)
list<CDir*> ls;
mdsdir_in->get_dirfrags(ls);
for (auto dir : ls) {
if (dir->get_num_ref() == 1) // subtree pin
if (dir->get_num_ref() == 1) { // subtree pin
trim_dirfrag(dir, dir, expiremap);
++trimmed;
}
}
if (mdsdir_in->get_num_ref() == 0)
if (mdsdir_in->get_num_ref() == 0) {
trim_inode(NULL, mdsdir_in, NULL, expiremap);
++trimmed;
}
} else {
dout(20) << __func__ << ": some unexpirable contents in mdsdir" << dendl;
}
@ -6694,6 +6719,7 @@ bool MDCache::trim(uint64_t count)
dout(20) << __func__ << ": maybe trimming base: " << *base_in << dendl;
if (base_in->get_num_ref() == 0) {
trim_inode(NULL, base_in, NULL, expiremap);
++trimmed;
}
}
}
@ -6702,7 +6728,7 @@ bool MDCache::trim(uint64_t count)
// send any expire messages
send_expire_messages(expiremap);
return true;
return result;
}
void MDCache::send_expire_messages(expiremap& expiremap)
@ -7539,8 +7565,7 @@ void MDCache::check_memory_usage()
mds->mlogger->set(l_mdm_heap, last.get_heap());
if (cache_toofull()) {
last_recall_state = clock::now();
mds->server->recall_client_state(-1.0, false, nullptr);
mds->server->recall_client_state(nullptr);
}
// If the cache size had exceeded its limit, but we're back in bounds

View File

@ -19,6 +19,7 @@
#include <string_view>
#include "common/DecayCounter.h"
#include "include/types.h"
#include "include/filepath.h"
#include "include/elist.h"
@ -743,9 +744,9 @@ public:
size_t get_cache_size() { return lru.lru_get_size(); }
// trimming
bool trim(uint64_t count=0);
std::pair<bool, uint64_t> trim(uint64_t count=0);
private:
void trim_lru(uint64_t count, expiremap& expiremap);
std::pair<bool, uint64_t> trim_lru(uint64_t count, expiremap& expiremap);
bool trim_dentry(CDentry *dn, expiremap& expiremap);
void trim_dirfrag(CDir *dir, CDir *con, expiremap& expiremap);
bool trim_inode(CDentry *dn, CInode *in, CDir *con, expiremap&);
@ -779,8 +780,6 @@ public:
void trim_client_leases();
void check_memory_usage();
time last_recall_state;
// shutdown
private:
set<inodeno_t> shutdown_exporting_strays;
@ -1187,6 +1186,10 @@ private:
LogSegment *ls, bufferlist *rollback=NULL);
void finish_uncommitted_fragment(dirfrag_t basedirfrag, int op);
void rollback_uncommitted_fragment(dirfrag_t basedirfrag, frag_vec_t&& old_frags);
DecayCounter trim_counter;
public:
void wait_for_uncommitted_fragment(dirfrag_t dirfrag, MDSInternalContextBase *c) {
ceph_assert(uncommitted_fragments.count(dirfrag));

View File

@ -370,6 +370,7 @@ const char** MDSDaemon::get_tracked_conf_keys() const
"mds_health_cache_threshold",
"mds_cache_mid",
"mds_dump_cache_threshold_formatter",
"mds_cache_trim_decay_rate",
"mds_dump_cache_threshold_file",
// MDBalancer
"mds_bal_fragment_dirs",
@ -383,8 +384,10 @@ const char** MDSDaemon::get_tracked_conf_keys() const
"mds_inject_migrator_session_race",
"host",
"fsid",
"mds_request_load_average_decay_rate",
"mds_cap_revoke_eviction_timeout",
// SessionMap
"mds_request_load_average_decay_rate",
"mds_recall_max_decay_rate",
NULL
};
return KEYS;

View File

@ -245,7 +245,8 @@ public:
Formatter *f, Context *on_finish)
: MDSInternalContext(mds),
server(server), mdcache(mdcache), mdlog(mdlog),
recall_timeout(recall_timeout), f(f), on_finish(on_finish),
recall_timeout(recall_timeout), recall_start(mono_clock::now()),
f(f), on_finish(on_finish),
whoami(mds->whoami), incarnation(mds->incarnation) {
}
@ -255,6 +256,7 @@ public:
assert(mds->mds_lock.is_locked());
dout(20) << __func__ << dendl;
f->open_object_section("result");
recall_client_state();
}
@ -313,25 +315,40 @@ private:
void recall_client_state() {
dout(20) << __func__ << dendl;
f->open_object_section("result");
auto now = mono_clock::now();
auto duration = std::chrono::duration<double>(now-recall_start).count();
MDSGatherBuilder *gather = new MDSGatherBuilder(g_ceph_context);
server->recall_client_state(1.0, true, gather);
if (!gather->has_subs()) {
handle_recall_client_state(0);
delete gather;
return;
auto [throttled, count] = server->recall_client_state(gather, Server::RecallFlags::STEADY);
dout(10) << __func__
<< (throttled ? " (throttled)" : "")
<< " recalled " << count << " caps" << dendl;
caps_recalled += count;
if ((throttled || count > 0) && (recall_timeout == 0 || duration < recall_timeout)) {
auto timer = new FunctionContext([this](int _) {
recall_client_state();
});
mds->timer.add_event_after(1.0, timer);
} else {
if (!gather->has_subs()) {
delete gather;
return handle_recall_client_state(0);
} else if (recall_timeout > 0 && duration > recall_timeout) {
delete gather;
return handle_recall_client_state(-ETIMEDOUT);
} else {
uint64_t remaining = (recall_timeout == 0 ? 0 : recall_timeout-duration);
C_ContextTimeout *ctx = new C_ContextTimeout(
mds, remaining, new FunctionContext([this](int r) {
handle_recall_client_state(r);
}));
ctx->start_timer();
gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
gather->activate();
}
}
C_ContextTimeout *ctx = new C_ContextTimeout(
mds, recall_timeout, new FunctionContext([this](int r) {
handle_recall_client_state(r);
}));
ctx->start_timer();
gather->set_finisher(new MDSInternalContextWrapper(mds, ctx));
gather->activate();
}
void handle_recall_client_state(int r) {
@ -341,6 +358,7 @@ private:
f->open_object_section("client_recall");
f->dump_int("return_code", r);
f->dump_string("message", cpp_strerror(r));
f->dump_int("recalled", caps_recalled);
f->close_section();
// we can still continue after recall timeout
@ -379,21 +397,30 @@ private:
void trim_cache() {
dout(20) << __func__ << dendl;
if (!mdcache->trim(UINT64_MAX)) {
cmd_err(f, "failed to trim cache");
complete(-EINVAL);
return;
auto [throttled, count] = mdcache->trim(UINT64_MAX);
dout(10) << __func__
<< (throttled ? " (throttled)" : "")
<< " trimmed " << count << " caps" << dendl;
dentries_trimmed += count;
if (throttled && count > 0) {
auto timer = new FunctionContext([this](int _) {
trim_cache();
});
mds->timer.add_event_after(1.0, timer);
} else {
cache_status();
}
cache_status();
}
void cache_status() {
dout(20) << __func__ << dendl;
f->open_object_section("trim_cache");
f->dump_int("trimmed", dentries_trimmed);
f->close_section();
// cache status section
mdcache->cache_status(f);
f->close_section();
complete(0);
}
@ -401,6 +428,10 @@ private:
void finish(int r) override {
dout(20) << __func__ << ": r=" << r << dendl;
auto d = std::chrono::duration<double>(mono_clock::now()-recall_start);
f->dump_float("duration", d.count());
f->close_section();
on_finish->complete(r);
}
@ -408,11 +439,14 @@ private:
MDCache *mdcache;
MDLog *mdlog;
uint64_t recall_timeout;
mono_time recall_start;
Formatter *f;
Context *on_finish;
int retval = 0;
std::stringstream ss;
uint64_t caps_recalled = 0;
uint64_t dentries_trimmed = 0;
// so as to use dout
mds_rank_t whoami;
@ -693,6 +727,7 @@ void MDSRankDispatcher::tick()
sessionmap.update_average_session_age();
if (is_active() || is_stopping()) {
server->recall_client_state(nullptr, Server::RecallFlags::ENFORCE_MAX);
mdcache->trim();
mdcache->trim_client_leases();
mdcache->check_memory_usage();
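The `cache drop` admin command now reports how much work it actually did. A
sketch of an invocation and the rough shape of its output, based on the
formatter sections above (the MDS name, timeout and counts are made up; the
real output also includes a cache status section):

    ceph tell mds.a cache drop 300    # optional timeout in seconds
    {
        "client_recall": { "return_code": 0, "message": "(0) Success", "recalled": 12345 },
        "trim_cache": { "trimmed": 67890 },
        "duration": 14.2
    }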

View File

@ -17,6 +17,7 @@
#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/range/adaptor/reversed.hpp>
#include "MDSRank.h"
#include "Server.h"
@ -49,6 +50,7 @@
#include "osd/OSDMap.h"
#include <errno.h>
#include <math.h>
#include <list>
#include <iostream>
@ -188,7 +190,8 @@ Server::Server(MDSRank *m) :
reconnect_done(NULL),
failed_reconnects(0),
reconnect_evicting(false),
terminating_sessions(false)
terminating_sessions(false),
recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
{
supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}
@ -1072,6 +1075,9 @@ void Server::handle_conf_change(const ConfigProxy& conf,
dout(20) << __func__ << " cap revoke eviction timeout changed to "
<< cap_revoke_eviction_timeout << dendl;
}
if (changed.count("mds_recall_max_decay_rate")) {
recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
}
}
/*
@ -1510,62 +1516,122 @@ void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
}
}
/**
* Call this when the MDCache is oversized, to send requests to the clients
* to trim some caps, and consequently unpin some inodes in the MDCache so
* that it can trim too.
*/
void Server::recall_client_state(double ratio, bool flush_client_session,
MDSGatherBuilder *gather) {
if (flush_client_session) {
assert(gather != nullptr);
}
std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
{
const auto now = clock::now();
const bool steady = flags&RecallFlags::STEADY;
const bool enforce_max = flags&RecallFlags::ENFORCE_MAX;
/* try to recall at least 80% of all caps */
uint64_t max_caps_per_client = Capability::count() * g_conf().get_val<double>("mds_max_ratio_caps_per_client");
uint64_t min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
if (max_caps_per_client < min_caps_per_client) {
dout(0) << "max_caps_per_client " << max_caps_per_client
<< " < min_caps_per_client " << min_caps_per_client << dendl;
max_caps_per_client = min_caps_per_client + 1;
}
const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
/* unless this ratio is smaller: */
/* ratio: determine the amount of caps to recall from each client. Use
* percentage full over the cache reservation. Cap the ratio at 80% of client
* caps. */
if (ratio < 0.0)
ratio = 1.0 - fmin(0.80, mdcache->cache_toofull_ratio());
dout(7) << __func__ << ":"
<< " min=" << min_caps_per_client
<< " max=" << max_caps_per_client
<< " total=" << Capability::count()
<< " flags=0x" << std::hex << flags
<< dendl;
dout(10) << __func__ << ": ratio=" << ratio << ", caps per client "
<< min_caps_per_client << "-" << max_caps_per_client << dendl;
/* trim caps of sessions with the most caps first */
std::multimap<uint64_t, Session*> caps_session;
auto f = [&caps_session, enforce_max, max_caps_per_client](auto& s) {
auto num_caps = s->caps.size();
if (!enforce_max || num_caps > max_caps_per_client) {
caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
}
};
mds->sessionmap.get_client_sessions(std::move(f));
set<Session*> sessions;
mds->sessionmap.get_client_session_set(sessions);
for (auto &session : sessions) {
std::pair<bool, uint64_t> result = {false, 0};
auto& [throttled, caps_recalled] = result;
last_recall_state = now;
for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
if (!session->is_open() ||
!session->get_connection() ||
!session->info.inst.name.is_client())
continue;
dout(10) << " session " << session->info.inst
<< " caps " << session->caps.size()
dout(10) << __func__ << ":"
<< " session " << session->info.inst
<< " caps " << num_caps
<< ", leases " << session->leases.size()
<< dendl;
uint64_t newlim = std::max(std::min<uint64_t>((session->caps.size() * ratio), max_caps_per_client), min_caps_per_client);
if (session->caps.size() > newlim) {
uint64_t newlim;
if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
newlim = min_caps_per_client;
} else {
newlim = num_caps-recall_max_caps;
}
if (num_caps > newlim) {
/* now limit the number of caps we recall at a time to prevent overloading ourselves */
uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
newlim = num_caps-recall;
const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
const uint64_t global_recall_throttle = recall_throttle.get();
if (session_recall_throttle+recall > recall_max_decay_threshold) {
dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
throttled = true;
continue;
} else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
throttled = true;
break;
}
// now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
if (steady) {
const auto session_recall = session->get_recall_caps();
const auto session_release = session->get_release_caps();
if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
/* The session has been unable to keep up with the number of caps
* recalled (by half); additionally, to prevent marking sessions
* we've just begun to recall from, the session_recall counter
* (decayed count of caps recently recalled) is **greater** than the
* session threshold for the session's cap recall throttle.
*/
dout(15) << " 2*session_release < session_recall"
" (2*" << session_release << " < " << session_recall << ");"
" Skipping because we are unlikely to get more released." << dendl;
continue;
} else if (recall < recall_max_caps && 2*recall < session_recall) {
/* The number of caps recalled is less than the number we *could*
* recall (so there isn't much left to recall?) and the number of
* caps is less than the current recall_caps counter (decayed count
* of caps recently recalled).
*/
dout(15) << " 2*recall < session_recall "
" (2*" << recall << " < " << session_recall << ") &&"
" recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
" Skipping because we are unlikely to get more released." << dendl;
continue;
}
}
dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
auto m = MClientSession::create(CEPH_SESSION_RECALL_STATE);
m->head.max_caps = newlim;
mds->send_message_client(m, session);
if (flush_client_session) {
if (gather) {
flush_session(session, gather);
}
session->notify_recall_sent(newlim);
caps_recalled += session->notify_recall_sent(newlim);
recall_throttle.hit(recall);
}
}
dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
return result;
}
void Server::force_clients_readonly()

View File

@ -17,6 +17,8 @@
#include <string_view>
#include <common/DecayCounter.h>
#include "messages/MClientReconnect.h"
#include "messages/MClientReply.h"
#include "messages/MClientRequest.h"
@ -130,6 +132,10 @@ public:
bool waiting_for_reconnect(client_t c) const;
void dump_reconnect_status(Formatter *f) const;
time last_recalled() const {
return last_recall_state;
}
void handle_client_session(const MClientSession::const_ref &m);
void _session_logged(Session *session, uint64_t state_seq,
bool open, version_t pv, interval_set<inodeno_t>& inos,version_t piv);
@ -163,8 +169,12 @@ public:
void reconnect_tick();
void recover_filelocks(CInode *in, bufferlist locks, int64_t client);
void recall_client_state(double ratio, bool flush_client_session,
MDSGatherBuilder *gather);
enum RecallFlags {
NONE = 0,
STEADY = (1<<0),
ENFORCE_MAX = (1<<1),
};
std::pair<bool, uint64_t> recall_client_state(MDSGatherBuilder* gather, enum RecallFlags=RecallFlags::NONE);
void force_clients_readonly();
// -- requests --
@ -339,6 +349,9 @@ public:
private:
void reply_client_request(MDRequestRef& mdr, const MClientReply::ref &reply);
void flush_session(Session *session, MDSGatherBuilder *gather);
DecayCounter recall_throttle;
time last_recall_state;
};
#endif

View File

@ -547,7 +547,7 @@ void SessionMapStore::decode_legacy(bufferlist::const_iterator& p)
while (n-- && !p.end()) {
auto p2 = p;
Session *s = new Session(nullptr);
Session *s = new Session(ConnectionRef());
s->info.decode(p);
if (session_map.count(s->info.inst.name)) {
// eager client connected too fast! aie.
@ -855,11 +855,8 @@ size_t Session::get_request_count()
*/
void Session::notify_cap_release(size_t n_caps)
{
if (recalled_at != clock::zero()) {
recall_release_count += n_caps;
if (recall_release_count >= recall_count)
clear_recalled_at();
}
recall_caps.hit(-(double)n_caps);
release_caps.hit(n_caps);
}
/**
@ -868,25 +865,26 @@ void Session::notify_cap_release(size_t n_caps)
* in order to generate health metrics if the session doesn't see
* a commensurate number of calls to ::notify_cap_release
*/
void Session::notify_recall_sent(const size_t new_limit)
uint64_t Session::notify_recall_sent(size_t new_limit)
{
if (recalled_at == clock::zero()) {
// Entering recall phase, set up counters so we can later
// judge whether the client has respected the recall request
recalled_at = last_recall_sent = clock::now();
assert (new_limit < caps.size()); // Behaviour of Server::recall_client_state
recall_count = caps.size() - new_limit;
recall_release_count = 0;
const auto num_caps = caps.size();
ceph_assert(new_limit < num_caps); // Behaviour of Server::recall_client_state
const auto count = num_caps-new_limit;
uint64_t new_change;
if (recall_limit != new_limit) {
new_change = count;
} else {
last_recall_sent = clock::now();
new_change = 0; /* no change! */
}
}
void Session::clear_recalled_at()
{
recalled_at = last_recall_sent = clock::zero();
recall_count = 0;
recall_release_count = 0;
/* Always hit the session counter as a RECALL message is still sent to the
* client and we do not want the MDS to burn its global counter tokens on a
* session that is not releasing caps (i.e. allow the session counter to
* throttle future RECALL messages).
*/
recall_caps_throttle.hit(count);
recall_caps.hit(count);
return new_change;
}
/**
@ -980,25 +978,47 @@ void SessionMap::hit_session(Session *session) {
}
void SessionMap::handle_conf_change(const ConfigProxy &conf,
const std::set <std::string> &changed) {
const std::set <std::string> &changed)
{
auto apply_to_open_sessions = [this](auto f) {
if (auto it = by_state.find(Session::STATE_OPEN); it != by_state.end()) {
for (const auto &session : *(it->second)) {
f(session);
}
}
if (auto it = by_state.find(Session::STATE_STALE); it != by_state.end()) {
for (const auto &session : *(it->second)) {
f(session);
}
}
};
if (changed.count("mds_request_load_average_decay_rate")) {
decay_rate = g_conf().get_val<double>("mds_request_load_average_decay_rate");
dout(20) << __func__ << " decay rate changed to " << decay_rate << dendl;
auto d = g_conf().get_val<double>("mds_request_load_average_decay_rate");
dout(20) << __func__ << " decay rate changed to " << d << dendl;
total_load_avg = DecayCounter(decay_rate);
decay_rate = d;
total_load_avg = DecayCounter(d);
auto p = by_state.find(Session::STATE_OPEN);
if (p != by_state.end()) {
for (const auto &session : *(p->second)) {
session->set_load_avg_decay_rate(decay_rate);
}
}
p = by_state.find(Session::STATE_STALE);
if (p != by_state.end()) {
for (const auto &session : *(p->second)) {
session->set_load_avg_decay_rate(decay_rate);
}
}
auto mut = [d](auto s) {
s->set_load_avg_decay_rate(d);
};
apply_to_open_sessions(mut);
}
if (changed.count("mds_recall_max_decay_rate")) {
auto d = g_conf().get_val<double>("mds_recall_max_decay_rate");
auto mut = [d](auto s) {
s->recall_caps_throttle = DecayCounter(d);
};
apply_to_open_sessions(mut);
}
if (changed.count("mds_recall_warning_decay_rate")) {
auto d = g_conf().get_val<double>("mds_recall_warning_decay_rate");
auto mut = [d](auto s) {
s->recall_caps = DecayCounter(d);
s->release_caps = DecayCounter(d);
};
apply_to_open_sessions(mut);
}
}

View File

@ -27,10 +27,10 @@ using std::set;
#include "mdstypes.h"
#include "mds/MDSAuthCaps.h"
#include "common/perf_counters.h"
#include "common/DecayCounter.h"
class CInode;
struct MDRequestImpl;
class DecayCounter;
#include "CInode.h"
#include "Capability.h"
@ -97,9 +97,9 @@ public:
}
private:
int state;
uint64_t state_seq;
int importing_count;
int state = STATE_CLOSED;
uint64_t state_seq = 0;
int importing_count = 0;
friend class SessionMap;
// Human (friendly) name is soft state generated from client metadata
@ -113,6 +113,16 @@ private:
// request load average for this session
DecayCounter load_avg;
// Ephemeral state for tracking progress of capability recalls
// caps being recalled recently by this session; used for Beacon warnings
DecayCounter recall_caps;
// caps that have been released
DecayCounter release_caps;
// throttle on caps recalled
DecayCounter recall_caps_throttle;
// New limit in SESSION_RECALL
uint32_t recall_limit = 0;
// session start time -- used to track average session time
// note that this is initialized in the constructor rather
// than at the time of adding a session to the sessionmap
@ -153,12 +163,6 @@ public:
const std::string& get_human_name() const {return human_name;}
// Ephemeral state for tracking progress of capability recalls
time recalled_at = clock::zero(); // When was I asked to SESSION_RECALL?
time last_recall_sent = clock::zero();
uint32_t recall_count; // How many caps was I asked to SESSION_RECALL?
uint32_t recall_release_count; // How many caps have I actually revoked?
session_info_t info; ///< durable bits
MDSAuthCaps auth_caps;
@ -177,8 +181,16 @@ public:
interval_set<inodeno_t> pending_prealloc_inos; // journaling prealloc, will be added to prealloc_inos
void notify_cap_release(size_t n_caps);
void notify_recall_sent(const size_t new_limit);
void clear_recalled_at();
uint64_t notify_recall_sent(size_t new_limit);
auto get_recall_caps_throttle() const {
return recall_caps_throttle.get();
}
auto get_recall_caps() const {
return recall_caps.get();
}
auto get_release_caps() const {
return release_caps.get();
}
inodeno_t next_ino() const {
if (info.prealloc_inos.empty())
@ -249,8 +261,8 @@ public:
// -- caps --
private:
uint32_t cap_gen;
version_t cap_push_seq; // cap push seq #
uint32_t cap_gen = 0;
version_t cap_push_seq = 0; // cap push seq #
map<version_t, MDSInternalContextBase::vec > waitfor_flush; // flush session messages
public:
@ -291,16 +303,16 @@ public:
}
// -- leases --
uint32_t lease_seq;
uint32_t lease_seq = 0;
// -- completed requests --
private:
// Has completed_requests been modified since the last time we
// wrote this session out?
bool completed_requests_dirty;
bool completed_requests_dirty = false;
unsigned num_trim_flushes_warnings;
unsigned num_trim_requests_warnings;
unsigned num_trim_flushes_warnings = 0;
unsigned num_trim_requests_warnings = 0;
public:
void add_completed_request(ceph_tid_t t, inodeno_t created) {
info.completed_requests[t] = created;
@ -375,21 +387,17 @@ public:
int check_access(CInode *in, unsigned mask, int caller_uid, int caller_gid,
const vector<uint64_t> *gid_list, int new_uid, int new_gid);
Session(Connection *con) :
state(STATE_CLOSED), state_seq(0), importing_count(0),
birth_time(clock::now()), recall_count(0),
recall_release_count(0), auth_caps(g_ceph_context),
connection(NULL), item_session_list(this),
requests(0), // member_offset passed to front() manually
cap_gen(0), cap_push_seq(0),
lease_seq(0),
completed_requests_dirty(false),
num_trim_flushes_warnings(0),
num_trim_requests_warnings(0) {
if (con) {
set_connection(con);
}
Session() = delete;
Session(ConnectionRef con) :
recall_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
release_caps(g_conf().get_val<double>("mds_recall_warning_decay_rate")),
recall_caps_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
birth_time(clock::now()),
auth_caps(g_ceph_context),
item_session_list(this),
requests(0) // member_offset passed to front() manually
{
set_connection(std::move(con));
}
~Session() override {
if (state == STATE_CLOSED) {
@ -400,9 +408,11 @@ public:
preopen_out_queue.clear();
}
void set_connection(Connection *con) {
connection = con;
socket_addr = con->get_peer_socket_addr();
void set_connection(ConnectionRef con) {
connection = std::move(con);
if (connection) {
socket_addr = connection->get_peer_socket_addr();
}
}
const ConnectionRef& get_connection() const {
return connection;
@ -490,7 +500,7 @@ public:
if (session_map_entry != session_map.end()) {
s = session_map_entry->second;
} else {
s = session_map[i.name] = new Session(nullptr);
s = session_map[i.name] = new Session(ConnectionRef());
s->info.inst = i;
s->last_cap_renew = Session::clock::now();
if (logger) {
@ -522,17 +532,15 @@ public:
MDSRank *mds;
protected:
version_t projected, committing, committed;
version_t projected = 0, committing = 0, committed = 0;
public:
map<int,xlist<Session*>* > by_state;
uint64_t set_state(Session *session, int state);
map<version_t, MDSInternalContextBase::vec > commit_waiters;
void update_average_session_age();
explicit SessionMap(MDSRank *m) : mds(m),
projected(0), committing(0), committed(0),
loaded_legacy(false)
{ }
SessionMap() = delete;
explicit SessionMap(MDSRank *m) : mds(m) {}
~SessionMap() override
{
@ -626,12 +634,20 @@ public:
void dump();
void get_client_session_set(set<Session*>& s) const {
for (ceph::unordered_map<entity_name_t,Session*>::const_iterator p = session_map.begin();
p != session_map.end();
++p)
if (p->second->info.inst.name.is_client())
s.insert(p->second);
template<typename F>
void get_client_sessions(F&& f) const {
for (const auto& p : session_map) {
auto& session = p.second;
if (session->info.inst.name.is_client())
f(session);
}
}
template<typename C>
void get_client_session_set(C& c) const {
auto f = [&c](auto& s) {
c.insert(s);
};
get_client_sessions(f);
}
void replay_open_sessions(map<client_t,entity_inst_t>& client_map,
@ -695,7 +711,7 @@ public:
protected:
std::set<entity_name_t> dirty_sessions;
std::set<entity_name_t> null_sessions;
bool loaded_legacy;
bool loaded_legacy = false;
void _mark_dirty(Session *session);
public: