diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst index e16f3e0569d..dbb8f5224be 100644 --- a/doc/rados/operations/health-checks.rst +++ b/doc/rados/operations/health-checks.rst @@ -21,6 +21,57 @@ that are defined by ceph-mgr python modules. Definitions =========== +Monitor +------- + +MON_DOWN +________ + +One or more monitor daemons are currently down. The cluster requires a +majority (more than 1/2) of the monitors in order to function. When +one or more monitors are down, clients may have a harder time forming +their initial connection to the cluster as they may need to try more +addresses before they reach an operating monitor. + +The down monitor daemon should generally be restarted as soon as +possible to reduce the risk of a subsequent monitor failure leading to +a service outage. + +MON_CLOCK_SKEW +______________ + +The clocks on the hosts running the ceph-mon monitor daemons are not +sufficiently well synchronized. This health alert is raised if the +cluster detects a clock skew greater than ``mon_clock_drift_allowed``. + +This is best resolved by synchronizing the clocks using a tool like +``ntpd`` or ``chrony``. + +If it is impractical to keep the clocks closely synchronized, the +``mon_clock_drift_allowed`` threshold can also be increased, but this +value must stay significantly below the ``mon_lease`` interval in +order for the monitor cluster to function properly. + +MON_MSGR2_NOT_ENABLED +_____________________ + +The ``ms_bind_msgr2`` option is enabled but one or more monitors are +not configured to bind to a v2 port in the cluster's monmap. This +means that features specific to the msgr2 protocol (e.g., encryption) +are not available on some or all connections. 
+ +In most cases this can be corrected by issuing the command:: + + ceph mon enable-msgr2 + +That command will change any monitor configured for the old default +port 6789 to continue to listen for v1 connections on 6789 and also +listen for v2 connections on the new default 3300 port. + +If a monitor is configured to listen for v1 connections on a non-standard port (not 6789), then the monmap will need to be modified manually. + + + Manager ------- diff --git a/qa/suites/rados/thrash-old-clients/ceph.yaml b/qa/suites/rados/thrash-old-clients/ceph.yaml index 1b9c8568005..364f9d03b06 100644 --- a/qa/suites/rados/thrash-old-clients/ceph.yaml +++ b/qa/suites/rados/thrash-old-clients/ceph.yaml @@ -1,3 +1,7 @@ tasks: - ceph: mon_bind_addrvec: false + mon_bind_msgr2: false + conf: + global: + ms bind msgr2: false diff --git a/qa/suites/rados/upgrade/luminous-x-singleton/0-cluster/start.yaml b/qa/suites/rados/upgrade/luminous-x-singleton/0-cluster/start.yaml index c67ec972d91..3c65ea9ccc5 100644 --- a/qa/suites/rados/upgrade/luminous-x-singleton/0-cluster/start.yaml +++ b/qa/suites/rados/upgrade/luminous-x-singleton/0-cluster/start.yaml @@ -11,20 +11,27 @@ overrides: conf: global: ms dump corrupt message level: 0 + ms bind msgr2: false mds: debug ms: 1 debug mds: 20 roles: - - mon.a - - mon.c - mgr.x - - mgr.y - mds.a - osd.0 - osd.1 - osd.2 -- - mon.b - osd.3 +- - mon.b - osd.4 - osd.5 + - osd.6 + - osd.7 +- - mon.c + - mgr.y + - osd.8 + - osd.9 + - osd.10 + - osd.11 - - client.0 diff --git a/qa/suites/rados/upgrade/luminous-x-singleton/2-partial-upgrade/firsthalf.yaml b/qa/suites/rados/upgrade/luminous-x-singleton/2-partial-upgrade/firsthalf.yaml index 51f32ccc13d..a6cf4a46094 100644 --- a/qa/suites/rados/upgrade/luminous-x-singleton/2-partial-upgrade/firsthalf.yaml +++ b/qa/suites/rados/upgrade/luminous-x-singleton/2-partial-upgrade/firsthalf.yaml @@ -5,13 +5,14 @@ meta: restart : osd.0,1,2,3,4,5 tasks: - install.upgrade: - osd.0: + mon.a: + mon.b: - print: "**** done 
install.upgrade osd.0" - ceph.restart: - daemons: [mon.a, mon.c] + daemons: [mon.a, mon.b] wait-for-healthy: false mon-health-to-clog: false - ceph.restart: - daemons: [osd.0, osd.1, osd.2] + daemons: [osd.0, osd.1, osd.2, osd.3, osd.4, osd.5, osd.6, osd.7] wait-for-healthy: false -- print: "**** done ceph.restart 1st half" +- print: "**** done ceph.restart 1st 2/3s" diff --git a/qa/suites/rados/upgrade/luminous-x-singleton/6-finish-upgrade.yaml b/qa/suites/rados/upgrade/luminous-x-singleton/6-finish-upgrade.yaml index bd917f24f90..e7fa4b2f47c 100644 --- a/qa/suites/rados/upgrade/luminous-x-singleton/6-finish-upgrade.yaml +++ b/qa/suites/rados/upgrade/luminous-x-singleton/6-finish-upgrade.yaml @@ -10,18 +10,21 @@ overrides: - \(MDS_ tasks: - install.upgrade: - osd.3: + mon.c: - ceph.restart: - daemons: [mon.b, mgr.x, mgr.y] + daemons: [mon.c, mgr.x, mgr.y] wait-for-up: true wait-for-healthy: false - ceph.restart: - daemons: [osd.3, osd.4, osd.5] + daemons: [osd.8, osd.9, osd.10, osd.11] wait-for-up: true wait-for-healthy: false - ceph.restart: daemons: [mds.a] wait-for-up: true wait-for-healthy: false +- exec: + mon.a: + - ceph mon enable-msgr2 - install.upgrade: client.0: diff --git a/src/ceph_mon.cc b/src/ceph_mon.cc index 450544ae3da..79f28cf2269 100644 --- a/src/ceph_mon.cc +++ b/src/ceph_mon.cc @@ -801,6 +801,9 @@ int main(int argc, const char **argv) mon = new Monitor(g_ceph_context, g_conf()->name.get_id(), store, msgr, mgr_msgr, &monmap); + mon->orig_argc = argc; + mon->orig_argv = argv; + if (force_sync) { derr << "flagging a forced sync ..." 
<< dendl; ostringstream oss; diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index e255b16e2f1..84594251474 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -366,6 +366,27 @@ bool HealthMonitor::check_leader_health() } } + // MON_MSGR2_NOT_ENABLED + if (g_conf().get_val<bool>("ms_bind_msgr2") && + mon->monmap->get_required_features().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + list<string> details; + for (auto& i : mon->monmap->mon_info) { + if (!i.second.public_addrs.has_msgr2()) { + ostringstream ds; + ds << "mon." << i.first << " is not bound to a msgr2 port, only " + << i.second.public_addrs; + details.push_back(ds.str()); + } + } + if (!details.empty()) { + ostringstream ss; + ss << details.size() << " monitors have not enabled msgr2"; + auto& d = next.add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str()); + d.detail.swap(details); + } + } + if (next != leader_checks) { changed = true; leader_checks = next; diff --git a/src/mon/MonCommands.h b/src/mon/MonCommands.h index 2c5451185ea..785b07c4ed7 100644 --- a/src/mon/MonCommands.h +++ b/src/mon/MonCommands.h @@ -451,6 +451,9 @@ COMMAND("mon set-rank " \ "name=rank,type=CephInt", "set the rank for the specified mon", "mon", "rw") +COMMAND("mon enable-msgr2", + "enable the msgr2 protocol on port 3300", + "mon", "rw") /* * OSD commands diff --git a/src/mon/MonMap.h b/src/mon/MonMap.h index d376a12068c..ee3d06338de 100644 --- a/src/mon/MonMap.h +++ b/src/mon/MonMap.h @@ -334,10 +334,17 @@ public: } int get_rank(const entity_addr_t& a) const { string n = get_name(a); - if (n.empty()) - return -1; - - return get_rank(n); + if (!n.empty()) { + return get_rank(n); + } + return -1; + } + int get_rank(const entity_addrvec_t& av) const { + string n = get_name(av); + if (!n.empty()) { + return get_rank(n); + } + return -1; } bool get_addr_name(const entity_addr_t& a, string& name) { if (addr_mons.count(a) == 0) @@ -358,6 +365,7 @@ public: void set_addrvec(const string& 
n, const entity_addrvec_t& a) { ceph_assert(mon_info.count(n)); mon_info[n].public_addrs = a; + calc_addr_mons(); } void encode(bufferlist& blist, uint64_t con_features) const; diff --git a/src/mon/Monitor.cc b/src/mon/Monitor.cc index e3dcd96b1bc..0690639f6d4 100644 --- a/src/mon/Monitor.cc +++ b/src/mon/Monitor.cc @@ -30,6 +30,7 @@ #include "common/version.h" #include "common/blkdev.h" #include "common/cmdparse.h" +#include "common/signal.h" #include "osd/OSDMap.h" @@ -1029,6 +1030,54 @@ void Monitor::wait_for_paxos_write() } } +void Monitor::respawn() +{ + dout(0) << __func__ << dendl; + + char *new_argv[orig_argc+1]; + dout(1) << " e: '" << orig_argv[0] << "'" << dendl; + for (int i=0; i<orig_argc; i++) { + new_argv[i] = (char *)orig_argv[i]; + dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl; + } + new_argv[orig_argc] = NULL; + + /* Determine the path to our executable, test if Linux /proc/self/exe exists. + * This allows us to exec the same executable even if it has since been + * unlinked. + */ + char exe_path[PATH_MAX] = ""; +#ifdef PROCPREFIX + if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) != -1) { + dout(1) << "respawning with exe " << exe_path << dendl; + strcpy(exe_path, PROCPREFIX "/proc/self/exe"); + } else { +#else + { +#endif + /* Print CWD for the user's interest */ + char buf[PATH_MAX]; + char *cwd = getcwd(buf, sizeof(buf)); + ceph_assert(cwd); + dout(1) << " cwd " << cwd << dendl; + + /* Fall back to a best-effort: just running in our CWD */ + strncpy(exe_path, orig_argv[0], PATH_MAX-1); + } + + dout(1) << " exe_path " << exe_path << dendl; + + unblock_all_signals(NULL); + execv(exe_path, new_argv); + + dout(0) << "respawn execv " << orig_argv[0] + << " failed with " << cpp_strerror(errno) << dendl; + + // We have to assert out here, because suicide() returns, and callers + // to respawn expect it never to return. 
+ ceph_abort(); +} + void Monitor::bootstrap() { dout(10) << "bootstrap" << dendl; @@ -1045,7 +1094,7 @@ void Monitor::bootstrap() dout(10) << "monmap " << *monmap << dendl; // note my rank - int newrank = monmap->get_rank(messenger->get_myaddr()); + int newrank = monmap->get_rank(messenger->get_myaddrs()); if (newrank < 0 && rank >= 0) { // was i ever part of the quorum? if (has_ever_joined) { @@ -1053,6 +1102,14 @@ void Monitor::bootstrap() exit(0); } } + if (newrank >= 0 && + monmap->get_addrs(newrank) != messenger->get_myaddrs()) { + dout(0) << " monmap addrs for rank " << newrank << " changed, i am " + << messenger->get_myaddrs() + << ", monmap is " << monmap->get_addrs(newrank) << ", respawning" + << dendl; + respawn(); + } if (newrank != rank) { dout(0) << " my rank is now " << newrank << " (was " << rank << ")" << dendl; messenger->set_myname(entity_name_t::MON(newrank)); @@ -1799,7 +1856,8 @@ void Monitor::handle_probe_probe(MonOpRequestRef op) void Monitor::handle_probe_reply(MonOpRequestRef op) { MMonProbe *m = static_cast<MMonProbe*>(op->get_req()); - dout(10) << "handle_probe_reply " << m->get_source_inst() << *m << dendl; + dout(10) << "handle_probe_reply " << m->get_source_inst() + << " " << *m << dendl; dout(10) << " monmap is " << *monmap << dendl; // discover name and addrs during probing or electing states. @@ -1840,8 +1898,10 @@ void Monitor::handle_probe_reply(MonOpRequestRef op) bootstrap(); return; } - } else { + } else if (peer_name.size()) { dout(10) << " peer name is " << peer_name << dendl; + } else { + dout(10) << " peer " << m->get_source_addr() << " not in map" << dendl; } // new initial peer? 
diff --git a/src/mon/Monitor.h b/src/mon/Monitor.h index f6ba7231258..4b81f113c0b 100644 --- a/src/mon/Monitor.h +++ b/src/mon/Monitor.h @@ -125,6 +125,9 @@ public: class Monitor : public Dispatcher, public md_config_obs_t { public: + int orig_argc = 0; + const char **orig_argv = nullptr; + // me string name; int rank; @@ -593,6 +596,7 @@ private: void _reset(); ///< called from bootstrap, start_, or join_election void wait_for_paxos_write(); void _finish_svc_election(); ///< called by {win,lose}_election + void respawn(); public: void bootstrap(); void join_election(); diff --git a/src/mon/MonmapMonitor.cc b/src/mon/MonmapMonitor.cc index 97e36e49145..b3bb2f89179 100644 --- a/src/mon/MonmapMonitor.cc +++ b/src/mon/MonmapMonitor.cc @@ -732,6 +732,32 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op) pending_map.set_rank(name, rank); pending_map.last_changed = ceph_clock_now(); propose = true; + } else if (prefix == "mon enable-msgr2") { + if (!monmap.get_required_features().contains_all( + ceph::features::mon::FEATURE_NAUTILUS)) { + err = -EACCES; + ss << "all monitors must be running nautilus to enable v2"; + goto reply; + } + for (auto& i : pending_map.mon_info) { + if (i.second.public_addrs.v.size() == 1 && + i.second.public_addrs.front().is_legacy() && + i.second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) { + entity_addrvec_t av; + entity_addr_t a = i.second.public_addrs.front(); + a.set_type(entity_addr_t::TYPE_MSGR2); + a.set_port(CEPH_MON_PORT_IANA); + av.v.push_back(a); + av.v.push_back(i.second.public_addrs.front()); + dout(10) << " setting mon." 
<< i.first + << " addrs " << i.second.public_addrs + << " -> " << av << dendl; + pending_map.set_addrvec(i.first, av); + propose = true; + pending_map.last_changed = ceph_clock_now(); + } + } + err = 0; } else { ss << "unknown command " << prefix; err = -EINVAL; diff --git a/src/msg/async/AsyncMessenger.cc b/src/msg/async/AsyncMessenger.cc index 852402a9cfd..c8403130d42 100644 --- a/src/msg/async/AsyncMessenger.cc +++ b/src/msg/async/AsyncMessenger.cc @@ -601,34 +601,42 @@ AsyncConnectionRef AsyncMessenger::create_connect( return conn; } -ConnectionRef AsyncMessenger::connect_to(int type, const entity_addrvec_t& addrs) -{ - Mutex::Locker l(lock); - if (*my_addrs == addrs || - (addrs.v.size() == 1 && - my_addrs->contains(addrs.front()))) { - // local - return local_connection; - } - - AsyncConnectionRef conn = _lookup_conn(addrs); - if (conn) { - ldout(cct, 10) << __func__ << " " << addrs << " existing " << conn << dendl; - } else { - conn = create_connect(addrs, type); - ldout(cct, 10) << __func__ << " " << addrs << " new " << conn << dendl; - } - - return conn; -} ConnectionRef AsyncMessenger::get_loopback_connection() { return local_connection; } -int AsyncMessenger::_send_to(Message *m, int type, const entity_addrvec_t& addrs) +entity_addrvec_t AsyncMessenger::_filter_addrs(int type, + const entity_addrvec_t& addrs) { + if (did_bind && + !get_myaddrs().has_msgr2() && + get_mytype() == type) { + // if we are bound to v1 only, and we are connecting to a peer, we cannot + // use the peer's v2 address (yet). otherwise the connection is asymmetrical, + // because they would have to use v1 to connect to us, and we would use v2, + // and connection race detection etc would totally break down (among other + // things). 
+ ldout(cct, 10) << __func__ << " " << addrs << " type " << type + << " limiting to v1 ()" << dendl; + entity_addrvec_t r; + for (auto& i : addrs.v) { + if (i.is_msgr2()) { + continue; + } + r.v.push_back(i); + } + return r; + } else { + return addrs; + } +} + +int AsyncMessenger::send_to(Message *m, int type, const entity_addrvec_t& addrs) +{ + Mutex::Locker l(lock); + FUNCTRACE(cct); ceph_assert(m); @@ -648,11 +656,35 @@ int AsyncMessenger::_send_to(Message *m, int type, const entity_addrvec_t& addrs return -EINVAL; } - AsyncConnectionRef conn = _lookup_conn(addrs); - submit_message(m, conn, addrs, type); + auto av = _filter_addrs(type, addrs); + AsyncConnectionRef conn = _lookup_conn(av); + submit_message(m, conn, av, type); return 0; } +ConnectionRef AsyncMessenger::connect_to(int type, const entity_addrvec_t& addrs) +{ + Mutex::Locker l(lock); + if (*my_addrs == addrs || + (addrs.v.size() == 1 && + my_addrs->contains(addrs.front()))) { + // local + return local_connection; + } + + auto av = _filter_addrs(type, addrs); + + AsyncConnectionRef conn = _lookup_conn(av); + if (conn) { + ldout(cct, 10) << __func__ << " " << av << " existing " << conn << dendl; + } else { + conn = create_connect(av, type); + ldout(cct, 10) << __func__ << " " << av << " new " << conn << dendl; + } + + return conn; +} + void AsyncMessenger::submit_message(Message *m, AsyncConnectionRef con, const entity_addrvec_t& dest_addrs, int dest_type) diff --git a/src/msg/async/AsyncMessenger.h b/src/msg/async/AsyncMessenger.h index b12dce625fd..666187a720d 100644 --- a/src/msg/async/AsyncMessenger.h +++ b/src/msg/async/AsyncMessenger.h @@ -137,11 +137,7 @@ public: * @defgroup Messaging * @{ */ - int send_to(Message *m, int type, const entity_addrvec_t& addrs) override { - Mutex::Locker l(lock); - - return _send_to(m, type, addrs); - } + int send_to(Message *m, int type, const entity_addrvec_t& addrs) override; /** @} // Messaging */ @@ -216,10 +212,12 @@ private: void submit_message(Message *m, 
AsyncConnectionRef con, const entity_addrvec_t& dest_addrs, int dest_type); - int _send_to(Message *m, int type, const entity_addrvec_t& addrs); void _finish_bind(const entity_addrvec_t& bind_addrs, const entity_addrvec_t& listen_addrs); + entity_addrvec_t _filter_addrs(int type, + const entity_addrvec_t& addrs); + private: static const uint64_t ReapDeadConnectionThreshold = 5; diff --git a/src/msg/msg_types.h b/src/msg/msg_types.h index dcaae42063d..8400194b9f5 100644 --- a/src/msg/msg_types.h +++ b/src/msg/msg_types.h @@ -609,6 +609,14 @@ struct entity_addrvec_t { } return entity_addr_t(); } + bool has_msgr2() const { + for (auto& a : v) { + if (a.is_msgr2()) { + return true; + } + } + return false; + } bool parse(const char *s, const char **end = 0);