From 767a4be12d64c42f5857cf390bb1d868cab96c72 Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Fri, 11 Nov 2022 22:56:46 +0000 Subject: [PATCH] mon: clear connection score during update & add sanity check live/dead connection report When upgrading the monitors (include booting up), we check if `peer_tracker` is dirty or not. If so, we clear it. Added some functions in `Elector` and `ConnectionTracker` class to check for clean `peer_tracker`. Moreover, there could be some cases where due to startup weirdness or abnormal circumstances, we might get a report from our own rank. Therefore, it doesn't hurt to add a sanity check in `ConnectionTracker::report_live_connection` and `ConnectionTracker::report_dead_connection`. Fixes: https://tracker.ceph.com/issues/58049 Signed-off-by: Kamoltat --- src/mon/ConnectionTracker.cc | 24 ++++++++++++++++++++++++ src/mon/ConnectionTracker.h | 8 ++++++++ src/mon/Elector.cc | 5 +++++ src/mon/Elector.h | 5 +++++ src/mon/Monitor.cc | 8 ++++++++ 5 files changed, 50 insertions(+) diff --git a/src/mon/ConnectionTracker.cc b/src/mon/ConnectionTracker.cc index 022522aa970..272ad40c274 100644 --- a/src/mon/ConnectionTracker.cc +++ b/src/mon/ConnectionTracker.cc @@ -106,6 +106,10 @@ void ConnectionTracker::report_live_connection(int peer_rank, double units_alive { ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_alive: " << units_alive << dendl; ldout(cct, 30) << "my_reports before: " << my_reports << dendl; + if (peer_rank == rank) { + lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl; + return; + } // we need to "auto-initialize" to 1, do shenanigans auto i = my_reports.history.find(peer_rank); if (i == my_reports.history.end()) { @@ -130,6 +134,10 @@ void ConnectionTracker::report_dead_connection(int peer_rank, double units_dead) { ldout(cct, 30) << __func__ << " peer_rank: " << peer_rank << " units_dead: " << units_dead << dendl; ldout(cct, 30) << "my_reports before: " << my_reports << dendl; + if (peer_rank == rank) { + lderr(cct) << "Got a report from my own rank, hopefully this is startup weirdness, dropping" << dendl; + return; + } // we need to "auto-initialize" to 1, do shenanigans auto i = my_reports.history.find(peer_rank); if (i == my_reports.history.end()) { @@ -246,6 +254,22 @@ void ConnectionTracker::notify_rank_removed(int rank_removed, int new_rank) increase_version(); } +bool ConnectionTracker::is_clean(int mon_rank, int monmap_size) +{ + ldout(cct, 30) << __func__ << dendl; + // check consistency between our rank according + // to monmap and our rank according to our report. + if (rank != mon_rank || + my_reports.rank != mon_rank) { + return false; + } else if (!peer_reports.empty()){ + // if peer_report max rank is greater than monmap max rank + // then there is a problem. + if (peer_reports.rbegin()->first > monmap_size - 1) return false; + } + return true; +} + void ConnectionTracker::encode(bufferlist &bl) const { ENCODE_START(1, 1, bl); diff --git a/src/mon/ConnectionTracker.h b/src/mon/ConnectionTracker.h index 4196bca2793..c1a32c08fe0 100644 --- a/src/mon/ConnectionTracker.h +++ b/src/mon/ConnectionTracker.h @@ -120,6 +120,13 @@ class ConnectionTracker { */ void get_total_connection_score(int peer_rank, double *rating, int *live_count) const; + /** + * Check if our ranks are clean and make + * sure there are no extra peer_report lingering. + * In the future we also want to check the reports + * current and history of each peer_report. + */ + bool is_clean(int mon_rank, int monmap_size); /** * Encode this ConnectionTracker. Useful both for storing on disk * and for sending off to peers for decoding and import @@ -185,6 +192,7 @@ class ConnectionTracker { rank = new_rank; my_reports.rank = rank; } + void notify_rank_changed(int new_rank); void notify_rank_removed(int rank_removed, int new_rank); friend std::ostream& operator<<(std::ostream& o, const ConnectionTracker& c); diff --git a/src/mon/Elector.cc b/src/mon/Elector.cc index b5fa55a9250..9ef74d70a4e 100644 --- a/src/mon/Elector.cc +++ b/src/mon/Elector.cc @@ -716,6 +716,11 @@ void Elector::start_participating() logic.participating = true; } +bool Elector::peer_tracker_is_clean() +{ + return peer_tracker.is_clean(mon->rank, paxos_size()); +} + void Elector::notify_clear_peer_state() { dout(10) << __func__ << dendl; diff --git a/src/mon/Elector.h b/src/mon/Elector.h index 60afa5dd60c..be5eee5d4da 100644 --- a/src/mon/Elector.h +++ b/src/mon/Elector.h @@ -357,6 +357,11 @@ class Elector : public ElectionOwner, RankProvider { * @post @p participating is true */ void start_participating(); + /** + * Check if our peer_tracker is self-consistent, not suffering from + * https://tracker.ceph.com/issues/58049 + */ + bool peer_tracker_is_clean(); /** * Forget everything about our peers. :( */ diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index e12b7f0102c..0a2a9d336ef 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -943,7 +943,15 @@ int Monitor::init() osdmon()->get_filestore_osd_list(); state = STATE_PROBING; + bootstrap(); + + if (!elector.peer_tracker_is_clean()){ + dout(10) << "peer_tracker looks inconsistent" + << " previous bad logic, clearing ..." << dendl; + elector.notify_clear_peer_state(); + } + // add features of myself into feature_map session_map.feature_map.add_mon(con_self->get_features()); return 0;