From 4d447092c3542bf57dfb4942db766adf2923c069 Mon Sep 17 00:00:00 2001
From: Sage Weil
Date: Thu, 18 Mar 2021 11:45:48 -0500
Subject: [PATCH] mon/MgrStatMonitor: ignore MMgrReport from non-active mgr

If it's not the active mgr, we should ignore it.

Since the mgr instance is best identified by the gid, add that to the
message. (We can't use the source_addrs for the message since that is
the MgrStandby monc addr, not the active mgr addrs in the MgrMap.)

This fixes a problem where a just-demoted mgr report gets processed and a
new mgr gets a ServiceMap with an epoch >= its pending map. (At least,
that is my theory!)

Fixes: https://tracker.ceph.com/issues/48022
Signed-off-by: Sage Weil
---
 src/messages/MMonMgrReport.h | 10 ++++++++--
 src/mgr/DaemonServer.cc      |  1 +
 src/mon/MgrStatMonitor.cc    |  8 ++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/src/messages/MMonMgrReport.h b/src/messages/MMonMgrReport.h
index 0ca37a8ef9d..f5a68c7d398 100644
--- a/src/messages/MMonMgrReport.h
+++ b/src/messages/MMonMgrReport.h
@@ -23,7 +23,7 @@
 
 class MMonMgrReport final : public PaxosServiceMessage {
 private:
-  static constexpr int HEAD_VERSION = 2;
+  static constexpr int HEAD_VERSION = 3;
   static constexpr int COMPAT_VERSION = 1;
 
 public:
@@ -31,6 +31,7 @@ public:
   health_check_map_t health_checks;
   ceph::buffer::list service_map_bl; // encoded ServiceMap
   std::map progress_events;
+  uint64_t gid = 0;
 
   MMonMgrReport()
     : PaxosServiceMessage{MSG_MON_MGR_REPORT, 0, HEAD_VERSION, COMPAT_VERSION}
@@ -42,7 +43,8 @@ public:
   std::string_view get_type_name() const override { return "monmgrreport"; }
 
   void print(std::ostream& out) const override {
-    out << get_type_name() << "(" << health_checks.checks.size() << " checks, "
+    out << get_type_name() << "(gid " << gid
+        << ", " << health_checks.checks.size() << " checks, "
         << progress_events.size() << " progress events)";
   }
 
@@ -52,6 +54,7 @@ public:
     encode(health_checks, payload);
     encode(service_map_bl, payload);
     encode(progress_events, payload);
+    encode(gid, payload);
 
     if (!HAVE_FEATURE(features, SERVER_NAUTILUS) ||
         !HAVE_FEATURE(features, SERVER_MIMIC)) {
@@ -79,6 +82,9 @@ public:
     if (header.version >= 2) {
       decode(progress_events, p);
     }
+    if (header.version >= 3) {
+      decode(gid, p);
+    }
   }
 private:
   template
diff --git a/src/mgr/DaemonServer.cc b/src/mgr/DaemonServer.cc
index f665d28304b..64edc8bdfb7 100644
--- a/src/mgr/DaemonServer.cc
+++ b/src/mgr/DaemonServer.cc
@@ -2525,6 +2525,7 @@ void DaemonServer::send_report()
   }
 
   auto m = ceph::make_message();
+  m->gid = monc->get_global_id();
   py_modules.get_health_checks(&m->health_checks);
   py_modules.get_progress_events(&m->progress_events);
 
diff --git a/src/mon/MgrStatMonitor.cc b/src/mon/MgrStatMonitor.cc
index 40a322d7698..4996c9b38bf 100644
--- a/src/mon/MgrStatMonitor.cc
+++ b/src/mon/MgrStatMonitor.cc
@@ -3,6 +3,7 @@
 
 #include "MgrStatMonitor.h"
 #include "mon/OSDMonitor.h"
+#include "mon/MgrMonitor.h"
 #include "mon/PGMap.h"
 #include "messages/MGetPoolStats.h"
 #include "messages/MGetPoolStatsReply.h"
@@ -211,7 +212,14 @@ bool MgrStatMonitor::prepare_update(MonOpRequestRef op)
 
 bool MgrStatMonitor::preprocess_report(MonOpRequestRef op)
 {
+  auto m = op->get_req();
   mon.no_reply(op);
+  if (m->gid &&
+      m->gid != mon.mgrmon()->get_map().get_active_gid()) {
+    dout(10) << "ignoring report from non-active mgr " << m->gid
+             << dendl;
+    return true;
+  }
   return false;
 }
 