From 06ae53e2b6029a1faec921957749b20635e0520e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Jul 2013 14:50:03 -0700 Subject: [PATCH 1/8] mon: improve osdmap subscription debug output Signed-off-by: Sage Weil --- src/mon/OSDMonitor.cc | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index c8baac58c83..ecf6b89d110 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -1651,6 +1651,7 @@ epoch_t OSDMonitor::blacklist(const entity_addr_t& a, utime_t until) void OSDMonitor::check_subs() { + dout(10) << __func__ << dendl; string type = "osdmap"; if (mon->session_map.subs.count(type) == 0) return; @@ -1664,6 +1665,8 @@ void OSDMonitor::check_subs() void OSDMonitor::check_sub(Subscription *sub) { + dout(10) << __func__ << " " << sub << " next " << sub->next + << (sub->onetime ? " (onetime)":" (ongoing)") << dendl; if (sub->next <= osdmap.get_epoch()) { if (sub->next >= 1) send_incremental(sub->next, sub->session->inst, sub->incremental_onetime); From 0a9964934de1214088de866c763f2065e4cd5ff5 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Jul 2013 16:22:48 -0700 Subject: [PATCH 2/8] Revert "mon/OSDMonitor: fix typo" This reverts commit d656aed599ee754646e16386ce5a4ab0117f2d6e. --- src/mon/OSDMonitor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index ecf6b89d110..65443b2354a 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -298,7 +298,7 @@ void OSDMonitor::on_active() check_subs(); if (thrash_map) { - if (mon->is_leader()) { + if (is_leader()) { if (thrash()) propose_pending(); } else { From 2795eb123231dc0227bf76a47ae0bd8b48f3da5e Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Jul 2013 16:23:04 -0700 Subject: [PATCH 3/8] Revert "mon: OSDMonitor: only thrash and propose if we are the leader" This reverts commit 5eac38797d9eb5a59fcff1d81571cff7a2f10e66. --- src/mon/OSDMonitor.cc | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 65443b2354a..5c7573e4d60 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -297,14 +297,8 @@ void OSDMonitor::on_active() send_to_waiting(); check_subs(); - if (thrash_map) { - if (is_leader()) { - if (thrash()) - propose_pending(); - } else { - thrash_map = 0; - } - } + if (thrash_map && thrash()) + propose_pending(); if (mon->is_leader()) mon->clog.info() << "osdmap " << osdmap << "\n"; From 6edec516bf82f2c6d622a5485a4eca7b0959d3d7 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Jul 2013 16:39:47 -0700 Subject: [PATCH 4/8] Revert "mon/OSDMonitor: send_to_waiting() in on_active()" This reverts commit f06a124a7fa0717ef8c523408b31d814df57caca. On peons, on_active() is only called when we *first* become active after an election. Only on the leader is it called after each commit/update. This makes this change cause other problems (broken subscriptions on peons, in particular). We possibly should fix that, but there is also a simpler fix for the original problem we were trying to solve. Signed-off-by: Sage Weil --- src/mon/OSDMonitor.cc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 5c7573e4d60..95bd72c0b58 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -185,8 +185,12 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) mon->pgmon()->check_osd_map(osdmap.epoch); } - update_logger(); + send_to_waiting(); + check_subs(); + share_map_with_random_osd(); + update_logger(); + process_failures(); // make sure our feature bits reflect the latest map @@ -294,9 +298,6 @@ void OSDMonitor::on_active() { update_logger(); - send_to_waiting(); - check_subs(); - if (thrash_map && thrash()) propose_pending(); @@ -1305,7 +1306,6 @@ void OSDMonitor::_reply_map(PaxosServiceMessage *m, epoch_t e) { dout(7) << "_reply_map " << e << " from " << m->get_orig_source_inst() - << " for " << m << dendl; send_latest(m, e); } From e4f2e3ecd0aee8f30005f915595271c5ae2149cb Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Jul 2013 16:35:02 -0700 Subject: [PATCH 5/8] mon/OSDMonitor: do not wait for readable in send_latest() send_latest() checks for readable and, if untrue, will wait before sending out the latest OSDMap. This is completely unnecessary; I think it is a hold-over from when we have independent paxos states. An audit of all callers confirms that everyone would be happy with whatever is committed, even if we are in the process of committing an even newer version. Effectively, everyone waits *above* this layer in the usual PaxosService traps for whether we are readable or not. This means that waiting_for_map and send_to_waiting() go away entirely, which is nice. This addresses, among other things: send_to_waiting() is called from update_from_paxos(), which can be called when we are not readable due to the paxos commit/finish timing changes in f1ce8d7c955a24 and c711203c0d4b. If no subsequent update happens, those waiters never get their maps. Instead, we send them immediately--we know they are committed and old history is as good as future history. Fixes: #5643 Signed-off-by: Sage Weil --- src/mon/OSDMonitor.cc | 63 +++++-------------------------------------- src/mon/OSDMonitor.h | 3 --- 2 files changed, 7 insertions(+), 59 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 95bd72c0b58..82ff36d8469 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -185,7 +185,6 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap) mon->pgmon()->check_osd_map(osdmap.epoch); } - send_to_waiting(); check_subs(); share_map_with_random_osd(); @@ -312,16 +311,6 @@ void OSDMonitor::on_active() void OSDMonitor::on_shutdown() { dout(10) << __func__ << dendl; - map >::iterator p = waiting_for_map.begin(); - while (p != waiting_for_map.end()) { - while (!p->second.empty()) { - Message *m = p->second.front(); - dout(20) << " discarding " << m << " " << *m << dendl; - m->put(); - p->second.pop_front(); - } - waiting_for_map.erase(p++); - } } void OSDMonitor::update_logger() @@ -1444,53 +1433,15 @@ bool OSDMonitor::prepare_remove_snaps(MRemoveSnaps *m) // --------------- // map helpers -void OSDMonitor::send_to_waiting() -{ - dout(10) << "send_to_waiting " << osdmap.get_epoch() << dendl; - - map >::iterator p = waiting_for_map.begin(); - while (p != waiting_for_map.end()) { - epoch_t from = p->first; - - if (from) { - if (from <= osdmap.get_epoch()) { - while (!p->second.empty()) { - send_incremental(p->second.front(), from); - p->second.front()->put(); - p->second.pop_front(); - } - } else { - dout(10) << "send_to_waiting from " << from << dendl; - ++p; - continue; - } - } else { - while (!p->second.empty()) { - send_full(p->second.front()); - p->second.front()->put(); - p->second.pop_front(); - } - } - - waiting_for_map.erase(p++); - } -} - void OSDMonitor::send_latest(PaxosServiceMessage *m, epoch_t start) { - if (is_readable()) { - dout(5) << "send_latest to " << m->get_orig_source_inst() - << " start " << start << dendl; - if (start == 0) - send_full(m); - else - send_incremental(m, start); - m->put(); - } else { - dout(5) << "send_latest to " << m->get_orig_source_inst() - << " start " << start << " later" << dendl; - waiting_for_map[start].push_back(m); - } + dout(5) << "send_latest to " << m->get_orig_source_inst() + << " start " << start << dendl; + if (start == 0) + send_full(m); + else + send_incremental(m, start); + m->put(); } diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index d6553228321..dda2374d7e8 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -118,8 +118,6 @@ public: OSDMap osdmap; private: - map > waiting_for_map; - // [leader] OSDMap::Incremental pending_inc; map failure_info; @@ -192,7 +190,6 @@ private: bool can_mark_in(int o); // ... - void send_to_waiting(); // send current map to waiters. MOSDMap *build_latest_full(); MOSDMap *build_incremental(epoch_t first, epoch_t last); void send_full(PaxosServiceMessage *m); From 8371680bab780933944dffd02246c73b17d6d930 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Jul 2013 16:36:01 -0700 Subject: [PATCH 6/8] mon: OSDMonitor: only thrash and propose if we are the leader 'thrash_map' is only set if we are the leader, so we would thrash and propose the pending value if we are the leader. However, we should keep the 'is_leader()' check not only for clarity's sake (an unfamiliar reader may cry OMGBUG, prompting to a patch much like this), but also because we may lose a subsequent election and become a peon instead, while still holding a 'thrash_map' value > 0 -- and we really don't want to propose while being a peon. [This is a rebased version of 5eac38797d9eb5a59fcff1d81571cff7a2f10e66, complete with the typo fix in d656aed599ee754646e16386ce5a4ab0117f2d6e.] Signed-off-by: Joao Eduardo Luis Reviewed-by: Sage Weil --- src/mon/OSDMonitor.cc | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index 82ff36d8469..c41a59fa148 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -297,8 +297,14 @@ void OSDMonitor::on_active() { update_logger(); - if (thrash_map && thrash()) - propose_pending(); + if (thrash_map) { + if (mon->is_leader()) { + if (thrash()) + propose_pending(); + } else { + thrash_map = 0; + } + } if (mon->is_leader()) mon->clog.info() << "osdmap " << osdmap << "\n"; From 6d326b84248b6069e8c90acdf1389204848f1bd3 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Jul 2013 16:55:03 -0700 Subject: [PATCH 7/8] mon/OSDMonitor: discard failure waiters, info on shutdown This would prevent a leak, if we didn't assert before that in the failure_reporter_t dtor. Signed-off-by: Sage Weil --- src/mon/OSDMonitor.cc | 26 ++++++++++++++++---------- src/mon/OSDMonitor.h | 2 +- 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index c41a59fa148..20e4eac88cb 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -310,13 +310,26 @@ void OSDMonitor::on_active() mon->clog.info() << "osdmap " << osdmap << "\n"; if (!mon->is_leader()) { - kick_all_failures(); + list ls; + take_all_failures(ls); + while (!ls.empty()) { + dispatch(ls.front()); + ls.pop_front(); + } } } void OSDMonitor::on_shutdown() { dout(10) << __func__ << dendl; + + // discard failure info, waiters + list ls; + take_all_failures(ls); + while (!ls.empty()) { + ls.front()->put(); + ls.pop_front(); + } } void OSDMonitor::update_logger() @@ -1039,23 +1052,16 @@ void OSDMonitor::process_failures() } } -void OSDMonitor::kick_all_failures() +void OSDMonitor::take_all_failures(list& ls) { - dout(10) << "kick_all_failures on " << failure_info.size() << " osds" << dendl; - assert(!mon->is_leader()); + dout(10) << __func__ << " on " << failure_info.size() << " osds" << dendl; - list ls; for (map::iterator p = failure_info.begin(); p != failure_info.end(); ++p) { p->second.take_report_messages(ls); } failure_info.clear(); - - while (!ls.empty()) { - dispatch(ls.front()); - ls.pop_front(); - } } diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index dda2374d7e8..d7cb8fdf369 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -209,7 +209,7 @@ private: bool prepare_failure(class MOSDFailure *m); bool prepare_mark_me_down(class MOSDMarkMeDown *m); void process_failures(); - void kick_all_failures(); + void take_all_failures(list& ls); bool preprocess_boot(class MOSDBoot *m); bool prepare_boot(class MOSDBoot *m); From 0356eebfa51d0f3bb94102c271776250f610d090 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 19 Jul 2013 16:59:15 -0700 Subject: [PATCH 8/8] mon/PaxosService: update on_active() docs to clarify calling rules Signed-off-by: Sage Weil --- src/mon/PaxosService.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mon/PaxosService.h b/src/mon/PaxosService.h index a5761d19ad8..74d5a90494c 100644 --- a/src/mon/PaxosService.h +++ b/src/mon/PaxosService.h @@ -438,8 +438,9 @@ public: /** * This is called when the Paxos state goes to active. * - * @remarks It's a courtesy method, in case the class implementing this - * service has anything it wants/needs to do at that time. + * On the peon, this is after each election. + * On the leader, this is after each election, *and* after each completed + * proposal. * * @note This function may get called twice in certain recovery cases. */