From 402919cbe65c9fa219a9ec7a0f6534c6e1479f03 Mon Sep 17 00:00:00 2001 From: Patrick Donnelly Date: Mon, 8 Nov 2021 14:55:05 -0500 Subject: [PATCH] mds: test connections to bootstrapping MDS Signed-off-by: Patrick Donnelly --- .../fs/thrash/workloads/overrides/races.yaml | 5 ++++ qa/tasks/cephfs/test_failover.py | 10 ++++++++ src/common/options/mds.yaml.in | 12 ++++++++++ src/mds/MDSRank.cc | 23 +++++++++++++++++-- 4 files changed, 48 insertions(+), 2 deletions(-) create mode 100644 qa/suites/fs/thrash/workloads/overrides/races.yaml diff --git a/qa/suites/fs/thrash/workloads/overrides/races.yaml b/qa/suites/fs/thrash/workloads/overrides/races.yaml new file mode 100644 index 00000000000..e7d75389699 --- /dev/null +++ b/qa/suites/fs/thrash/workloads/overrides/races.yaml @@ -0,0 +1,5 @@ +overrides: + ceph: + conf: + mds: + mds_sleep_rank_change: 5000000.0 diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py index 7558ede1684..7147807bf52 100644 --- a/qa/tasks/cephfs/test_failover.py +++ b/qa/tasks/cephfs/test_failover.py @@ -441,6 +441,16 @@ class TestFailover(CephFSTestCase): self.assertEqual(mds_0['gid'], self.fs.get_rank(rank=0)['gid']) self.fs.rank_freeze(False, rank=0) + def test_connect_bootstrapping(self): + self.config_set("mds", "mds_sleep_rank_change", 10000000.0) + self.config_set("mds", "mds_connect_bootstrapping", True) + self.fs.set_max_mds(2) + self.fs.wait_for_daemons() + self.fs.rank_fail(rank=0) + # rank 0 will get stuck in up:resolve, see https://tracker.ceph.com/issues/53194 + self.fs.wait_for_daemons() + + class TestStandbyReplay(CephFSTestCase): CLIENTS_REQUIRED = 0 MDSS_REQUIRED = 4 diff --git a/src/common/options/mds.yaml.in b/src/common/options/mds.yaml.in index f418be951ab..dde26e8abf4 100644 --- a/src/common/options/mds.yaml.in +++ b/src/common/options/mds.yaml.in @@ -1409,3 +1409,15 @@ options: - mds flags: - runtime +- name: mds_sleep_rank_change + type: float + level: dev + default: 0.0 + flags: + - runtime +- 
name: mds_connect_bootstrapping + type: bool + level: dev + default: false + flags: + - runtime diff --git a/src/mds/MDSRank.cc b/src/mds/MDSRank.cc index 186b528010e..5aa1b126917 100644 --- a/src/mds/MDSRank.cc +++ b/src/mds/MDSRank.cc @@ -2247,9 +2247,16 @@ void MDSRankDispatcher::handle_mds_map( if (oldstate != state) { // update messenger. - if (state == MDSMap::STATE_STANDBY_REPLAY) { + auto sleep_rank_change = g_conf().get_val<double>("mds_sleep_rank_change"); + if (unlikely(sleep_rank_change > 0)) { + // This is to trigger a race where another rank tries to connect to this + // MDS before an update to the messenger "myname" is processed. This race + // should be closed by ranks holding messages until the rank is out of a + // "bootstrapping" state. + usleep(sleep_rank_change); + } if (state == MDSMap::STATE_STANDBY_REPLAY) { dout(1) << "handle_mds_map i am now mds." << mds_gid << "." << incarnation - << " replaying mds." << whoami << "." << incarnation << dendl; + << " replaying mds." << whoami << "." << incarnation << dendl; messenger->set_myname(entity_name_t::MDS(mds_gid)); } else { dout(1) << "handle_mds_map i am now mds." << whoami << "." << incarnation << dendl; @@ -2437,6 +2444,18 @@ void MDSRankDispatcher::handle_mds_map( } } + // for testing: send the current mdsmap to every rank in replay, creating or starting + if (unlikely(g_conf().get_val<bool>("mds_connect_bootstrapping"))) { + std::set<mds_rank_t> bootstrapping; + mdsmap->get_mds_set(bootstrapping, MDSMap::STATE_REPLAY); + mdsmap->get_mds_set(bootstrapping, MDSMap::STATE_CREATING); + mdsmap->get_mds_set(bootstrapping, MDSMap::STATE_STARTING); + for (const auto& rank : bootstrapping) { + auto m = make_message<MMDSMap>(monc->get_fsid(), *mdsmap); + send_message_mds(std::move(m), rank); + } + } + // did someone go active? if (state >= MDSMap::STATE_CLIENTREPLAY && oldstate >= MDSMap::STATE_CLIENTREPLAY) {