Merge PR #25849 into master

* refs/pull/25849/head:
	qa/suites/rados/upgrade: one mon per node, and enable-msgr2 at end
	qa/rados/thrash-old-clients: avoid msgr2
	mon: make bootstrap rank check more robust
	mon: clean up probe debug output a bit
	msg/async: use v1 for v1 <-> [v2,v1] peers
	msg/async/AsyncMessenger: drop single-use _send_to
	mon/HealthMonitor: raise MON_MSGR2_NOT_ENABLED if mons not bound to msgr2
	doc/rados/operations/health-checks: document MON_* health warnings
	mon/MonMapMonitor: add 'mon enable-msgr2' command
	mon: respawn if rank addr changes
	mon/MonMap: calc_addr_mons() after setting rank addrvec

Reviewed-by: Ricardo Dias <rdias@suse.com>
This commit is contained in:
Sage Weil 2019-01-17 11:04:30 -06:00
commit b5e5ee6f40
15 changed files with 275 additions and 46 deletions

View File

@ -21,6 +21,57 @@ that are defined by ceph-mgr python modules.
Definitions
===========
Monitor
-------
MON_DOWN
________
One or more monitor daemons are currently down. The cluster requires a
majority (more than 1/2) of the monitors in order to function. When
one or more monitors are down, clients may have a harder time forming
their initial connection to the cluster as they may need to try more
addresses before they reach an operating monitor.
The down monitor daemon should generally be restarted as soon as
possible to reduce the risk of a subsequent monitor failure leading to
a service outage.
MON_CLOCK_SKEW
______________
The clocks on the hosts running the ceph-mon monitor daemons are not
sufficiently well synchronized. This health alert is raised if the
cluster detects a clock skew greater than ``mon_clock_drift_allowed``.
This is best resolved by synchronizing the clocks using a tool like
``ntpd`` or ``chrony``.
If it is impractical to keep the clocks closely synchronized, the
``mon_clock_drift_allowed`` threshold can also be increased, but this
value must stay significantly below the ``mon_lease`` interval in
order for the monitor cluster to function properly.
MON_MSGR2_NOT_ENABLED
_____________________
The ``ms_bind_msgr2`` option is enabled but one or more monitors are
not configured to bind to a v2 port in the cluster's monmap. This
means that features specific to the msgr2 protocol (e.g., encryption)
are not available on some or all connections.
In most cases this can be corrected by issuing the command::
ceph mon enable-msgr2
That command will change any monitor configured for the old default
port 6789 to continue to listen for v1 connections on 6789 and also
listen for v2 connections on the new default 3300 port.
If a monitor is configured to listen for v1 connections on a non-standard port (not 6789), then the monmap will need to be modified manually.
Manager
-------

View File

@ -1,3 +1,7 @@
tasks:
- ceph:
mon_bind_addrvec: false
mon_bind_msgr2: false
conf:
global:
ms bind msgr2: false

View File

@ -11,20 +11,27 @@ overrides:
conf:
global:
ms dump corrupt message level: 0
ms bind msgr2: false
mds:
debug ms: 1
debug mds: 20
roles:
- - mon.a
- mon.c
- mgr.x
- mgr.y
- mds.a
- osd.0
- osd.1
- osd.2
- - mon.b
- osd.3
- - mon.b
- osd.4
- osd.5
- osd.6
- osd.7
- - mon.c
- mgr.y
- osd.8
- osd.9
- osd.10
- osd.11
- - client.0

View File

@ -5,13 +5,14 @@ meta:
restart : osd.0,1,2,3,4,5
tasks:
- install.upgrade:
osd.0:
mon.a:
mon.b:
- print: "**** done install.upgrade osd.0"
- ceph.restart:
daemons: [mon.a, mon.c]
daemons: [mon.a, mon.b]
wait-for-healthy: false
mon-health-to-clog: false
- ceph.restart:
daemons: [osd.0, osd.1, osd.2]
daemons: [osd.0, osd.1, osd.2, osd.3, osd.4, osd.5, osd.6, osd.7]
wait-for-healthy: false
- print: "**** done ceph.restart 1st half"
- print: "**** done ceph.restart 1st 2/3s"

View File

@ -10,18 +10,21 @@ overrides:
- \(MDS_
tasks:
- install.upgrade:
osd.3:
mon.c:
- ceph.restart:
daemons: [mon.b, mgr.x, mgr.y]
daemons: [mon.c, mgr.x, mgr.y]
wait-for-up: true
wait-for-healthy: false
- ceph.restart:
daemons: [osd.3, osd.4, osd.5]
daemons: [osd.8, osd.9, osd.10, osd.11]
wait-for-up: true
wait-for-healthy: false
- ceph.restart:
daemons: [mds.a]
wait-for-up: true
wait-for-healthy: false
- exec:
mon.a:
- ceph mon enable-msgr2
- install.upgrade:
client.0:

View File

@ -801,6 +801,9 @@ int main(int argc, const char **argv)
mon = new Monitor(g_ceph_context, g_conf()->name.get_id(), store,
msgr, mgr_msgr, &monmap);
mon->orig_argc = argc;
mon->orig_argv = argv;
if (force_sync) {
derr << "flagging a forced sync ..." << dendl;
ostringstream oss;

View File

@ -366,6 +366,27 @@ bool HealthMonitor::check_leader_health()
}
}
// MON_MSGR2_NOT_ENABLED
if (g_conf().get_val<bool>("ms_bind_msgr2") &&
mon->monmap->get_required_features().contains_all(
ceph::features::mon::FEATURE_NAUTILUS)) {
list<string> details;
for (auto& i : mon->monmap->mon_info) {
if (!i.second.public_addrs.has_msgr2()) {
ostringstream ds;
ds << "mon." << i.first << " is not bound to a msgr2 port, only "
<< i.second.public_addrs;
details.push_back(ds.str());
}
}
if (!details.empty()) {
ostringstream ss;
ss << details.size() << " monitors have not enabled msgr2";
auto& d = next.add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str());
d.detail.swap(details);
}
}
if (next != leader_checks) {
changed = true;
leader_checks = next;

View File

@ -451,6 +451,9 @@ COMMAND("mon set-rank " \
"name=rank,type=CephInt",
"set the rank for the specified mon",
"mon", "rw")
COMMAND("mon enable-msgr2",
"enable the msgr2 protocol on port 3300",
"mon", "rw")
/*
* OSD commands

View File

@ -334,10 +334,17 @@ public:
}
// Look up the rank of the monitor that owns the single address `a`.
// Returns -1 when get_name() produces no monitor name for that address;
// otherwise defers to the name-based get_rank() overload.
int get_rank(const entity_addr_t& a) const {
  string n = get_name(a);
  if (!n.empty()) {
    return get_rank(n);
  }
  return -1;
}
int get_rank(const entity_addrvec_t& av) const {
string n = get_name(av);
if (!n.empty()) {
return get_rank(n);
}
return -1;
}
bool get_addr_name(const entity_addr_t& a, string& name) {
if (addr_mons.count(a) == 0)
@ -358,6 +365,7 @@ public:
// Replace the public address vector of the monitor named `n` with `a`.
// The monitor must already be present in mon_info (asserted).
void set_addrvec(const string& n, const entity_addrvec_t& a) {
  ceph_assert(mon_info.count(n));
  auto& info = mon_info[n];
  info.public_addrs = a;
  // presumably refreshes the address-to-monitor lookup (addr_mons) so that
  // address-based queries see the new vector -- confirm in calc_addr_mons()
  calc_addr_mons();
}
void encode(bufferlist& blist, uint64_t con_features) const;

View File

@ -30,6 +30,7 @@
#include "common/version.h"
#include "common/blkdev.h"
#include "common/cmdparse.h"
#include "common/signal.h"
#include "osd/OSDMap.h"
@ -1029,6 +1030,54 @@ void Monitor::wait_for_paxos_write()
}
}
// Re-execute this process in place using the command line saved in
// orig_argc/orig_argv. This function does not return: either execv()
// replaces the process image, or the failure path below aborts.
void Monitor::respawn()
{
dout(0) << __func__ << dendl;
// Build the NULL-terminated argument vector that execv() expects.
// NOTE(review): a runtime-sized array is a compiler extension in C++, not
// standard C++ -- confirm this is acceptable for all supported compilers.
char *new_argv[orig_argc+1];
dout(1) << " e: '" << orig_argv[0] << "'" << dendl;
for (int i=0; i<orig_argc; i++) {
// execv() takes non-const char* entries, hence the cast away from const.
new_argv[i] = (char *)orig_argv[i];
dout(1) << " " << i << ": '" << orig_argv[i] << "'" << dendl;
}
new_argv[orig_argc] = NULL;
/* Determine the path to our executable, test if Linux /proc/self/exe exists.
* This allows us to exec the same executable even if it has since been
* unlinked.
*/
// The "" initializer zero-fills the whole buffer, so the copies below that
// are bounded by PATH_MAX-1 always leave exe_path NUL-terminated.
char exe_path[PATH_MAX] = "";
#ifdef PROCPREFIX
if (readlink(PROCPREFIX "/proc/self/exe", exe_path, PATH_MAX-1) != -1) {
// The resolved target is only logged; the path handed to execv() is the
// /proc/self/exe link itself, which is overwritten into exe_path next.
dout(1) << "respawning with exe " << exe_path << dendl;
strcpy(exe_path, PROCPREFIX "/proc/self/exe");
} else {
#else
// Without PROCPREFIX the fallback below is an unconditional scope rather
// than the else-branch of the readlink() check.
{
#endif
/* Print CWD for the user's interest */
char buf[PATH_MAX];
char *cwd = getcwd(buf, sizeof(buf));
ceph_assert(cwd);
dout(1) << " cwd " << cwd << dendl;
/* Fall back to a best-effort: just running in our CWD */
strncpy(exe_path, orig_argv[0], PATH_MAX-1);
}
dout(1) << " exe_path " << exe_path << dendl;
// presumably clears the blocked-signal mask so the new image does not
// inherit it -- confirm against the unblock_all_signals() helper
unblock_all_signals(NULL);
execv(exe_path, new_argv);
// Reaching this point means execv() returned, i.e. it failed.
// NOTE(review): the message prints orig_argv[0], which can differ from the
// exe_path that was actually passed to execv().
dout(0) << "respawn execv " << orig_argv[0]
<< " failed with " << cpp_strerror(errno) << dendl;
// Callers of respawn() expect it never to return, so abort here rather
// than falling back into the caller.
ceph_abort();
}
void Monitor::bootstrap()
{
dout(10) << "bootstrap" << dendl;
@ -1045,7 +1094,7 @@ void Monitor::bootstrap()
dout(10) << "monmap " << *monmap << dendl;
// note my rank
int newrank = monmap->get_rank(messenger->get_myaddr());
int newrank = monmap->get_rank(messenger->get_myaddrs());
if (newrank < 0 && rank >= 0) {
// was i ever part of the quorum?
if (has_ever_joined) {
@ -1053,6 +1102,14 @@ void Monitor::bootstrap()
exit(0);
}
}
if (newrank >= 0 &&
monmap->get_addrs(newrank) != messenger->get_myaddrs()) {
dout(0) << " monmap addrs for rank " << newrank << " changed, i am "
<< messenger->get_myaddrs()
<< ", monmap is " << monmap->get_addrs(newrank) << ", respawning"
<< dendl;
respawn();
}
if (newrank != rank) {
dout(0) << " my rank is now " << newrank << " (was " << rank << ")" << dendl;
messenger->set_myname(entity_name_t::MON(newrank));
@ -1799,7 +1856,8 @@ void Monitor::handle_probe_probe(MonOpRequestRef op)
void Monitor::handle_probe_reply(MonOpRequestRef op)
{
MMonProbe *m = static_cast<MMonProbe*>(op->get_req());
dout(10) << "handle_probe_reply " << m->get_source_inst() << *m << dendl;
dout(10) << "handle_probe_reply " << m->get_source_inst()
<< " " << *m << dendl;
dout(10) << " monmap is " << *monmap << dendl;
// discover name and addrs during probing or electing states.
@ -1840,8 +1898,10 @@ void Monitor::handle_probe_reply(MonOpRequestRef op)
bootstrap();
return;
}
} else {
} else if (peer_name.size()) {
dout(10) << " peer name is " << peer_name << dendl;
} else {
dout(10) << " peer " << m->get_source_addr() << " not in map" << dendl;
}
// new initial peer?

View File

@ -125,6 +125,9 @@ public:
class Monitor : public Dispatcher,
public md_config_obs_t {
public:
int orig_argc = 0;
const char **orig_argv = nullptr;
// me
string name;
int rank;
@ -593,6 +596,7 @@ private:
void _reset(); ///< called from bootstrap, start_, or join_election
void wait_for_paxos_write();
void _finish_svc_election(); ///< called by {win,lose}_election
void respawn();
public:
void bootstrap();
void join_election();

View File

@ -732,6 +732,32 @@ bool MonmapMonitor::prepare_command(MonOpRequestRef op)
pending_map.set_rank(name, rank);
pending_map.last_changed = ceph_clock_now();
propose = true;
} else if (prefix == "mon enable-msgr2") {
if (!monmap.get_required_features().contains_all(
ceph::features::mon::FEATURE_NAUTILUS)) {
err = -EACCES;
ss << "all monitors must be running nautilus to enable v2";
goto reply;
}
for (auto& i : pending_map.mon_info) {
if (i.second.public_addrs.v.size() == 1 &&
i.second.public_addrs.front().is_legacy() &&
i.second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) {
entity_addrvec_t av;
entity_addr_t a = i.second.public_addrs.front();
a.set_type(entity_addr_t::TYPE_MSGR2);
a.set_port(CEPH_MON_PORT_IANA);
av.v.push_back(a);
av.v.push_back(i.second.public_addrs.front());
dout(10) << " setting mon." << i.first
<< " addrs " << i.second.public_addrs
<< " -> " << av << dendl;
pending_map.set_addrvec(i.first, av);
propose = true;
pending_map.last_changed = ceph_clock_now();
}
}
err = 0;
} else {
ss << "unknown command " << prefix;
err = -EINVAL;

View File

@ -601,34 +601,42 @@ AsyncConnectionRef AsyncMessenger::create_connect(
return conn;
}
// Return a connection to the peer described by `addrs`, reusing an existing
// one when _lookup_conn() finds it and creating a new one otherwise.
// Requests that target our own addresses get the loopback connection.
ConnectionRef AsyncMessenger::connect_to(int type, const entity_addrvec_t& addrs)
{
  Mutex::Locker l(lock);

  // "local" means either an exact match of our address vector, or a
  // single-address request for one of the addresses we own.
  const bool targets_self =
    (*my_addrs == addrs) ||
    (addrs.v.size() == 1 && my_addrs->contains(addrs.front()));
  if (targets_self) {
    return local_connection;
  }

  AsyncConnectionRef conn = _lookup_conn(addrs);
  if (conn) {
    ldout(cct, 10) << __func__ << " " << addrs << " existing " << conn << dendl;
    return conn;
  }
  conn = create_connect(addrs, type);
  ldout(cct, 10) << __func__ << " " << addrs << " new " << conn << dendl;
  return conn;
}
// Return this messenger's local (loopback) connection, i.e. the same
// local_connection object that connect_to() hands out when the requested
// addresses are our own.
ConnectionRef AsyncMessenger::get_loopback_connection()
{
return local_connection;
}
int AsyncMessenger::_send_to(Message *m, int type, const entity_addrvec_t& addrs)
// Choose which of a peer's addresses may be used for an outgoing connection.
//
// When this messenger is bound, has no msgr2 address of its own, and the
// target is of our own entity type, every msgr2 entry is dropped from
// `addrs` and only the remaining addresses are returned. In all other cases
// `addrs` is returned unchanged.
entity_addrvec_t AsyncMessenger::_filter_addrs(int type,
                                               const entity_addrvec_t& addrs)
{
  const bool v1_only_to_peer =
    did_bind &&
    !get_myaddrs().has_msgr2() &&
    get_mytype() == type;
  if (!v1_only_to_peer) {
    return addrs;
  }

  // We are bound to v1 only and are connecting to a peer, so we cannot use
  // the peer's v2 address (yet). Otherwise the connection would be
  // asymmetrical: they would have to use v1 to reach us while we used v2,
  // and connection race detection etc would totally break down (among other
  // things).
  ldout(cct, 10) << __func__ << " " << addrs << " type " << type
                 << " limiting to v1 ()" << dendl;
  entity_addrvec_t v1_addrs;
  for (auto& addr : addrs.v) {
    if (!addr.is_msgr2()) {
      v1_addrs.v.push_back(addr);
    }
  }
  return v1_addrs;
}
int AsyncMessenger::send_to(Message *m, int type, const entity_addrvec_t& addrs)
{
Mutex::Locker l(lock);
FUNCTRACE(cct);
ceph_assert(m);
@ -648,11 +656,35 @@ int AsyncMessenger::_send_to(Message *m, int type, const entity_addrvec_t& addrs
return -EINVAL;
}
AsyncConnectionRef conn = _lookup_conn(addrs);
submit_message(m, conn, addrs, type);
auto av = _filter_addrs(type, addrs);
AsyncConnectionRef conn = _lookup_conn(av);
submit_message(m, conn, av, type);
return 0;
}
// Return a connection to the peer described by `addrs`.
//
// Requests that target our own addresses get the loopback connection.
// Otherwise the address vector is first passed through _filter_addrs(), and
// the filtered vector is used both to look up an existing connection and,
// failing that, to create a new one.
ConnectionRef AsyncMessenger::connect_to(int type, const entity_addrvec_t& addrs)
{
  Mutex::Locker l(lock);

  // "local" means either an exact match of our address vector, or a
  // single-address request for one of the addresses we own.
  const bool targets_self =
    (*my_addrs == addrs) ||
    (addrs.v.size() == 1 && my_addrs->contains(addrs.front()));
  if (targets_self) {
    return local_connection;
  }

  auto av = _filter_addrs(type, addrs);
  AsyncConnectionRef conn = _lookup_conn(av);
  if (conn) {
    ldout(cct, 10) << __func__ << " " << av << " existing " << conn << dendl;
    return conn;
  }
  conn = create_connect(av, type);
  ldout(cct, 10) << __func__ << " " << av << " new " << conn << dendl;
  return conn;
}
void AsyncMessenger::submit_message(Message *m, AsyncConnectionRef con,
const entity_addrvec_t& dest_addrs,
int dest_type)

View File

@ -137,11 +137,7 @@ public:
* @defgroup Messaging
* @{
*/
int send_to(Message *m, int type, const entity_addrvec_t& addrs) override {
Mutex::Locker l(lock);
return _send_to(m, type, addrs);
}
int send_to(Message *m, int type, const entity_addrvec_t& addrs) override;
/** @} // Messaging */
@ -216,10 +212,12 @@ private:
void submit_message(Message *m, AsyncConnectionRef con,
const entity_addrvec_t& dest_addrs, int dest_type);
int _send_to(Message *m, int type, const entity_addrvec_t& addrs);
void _finish_bind(const entity_addrvec_t& bind_addrs,
const entity_addrvec_t& listen_addrs);
entity_addrvec_t _filter_addrs(int type,
const entity_addrvec_t& addrs);
private:
static const uint64_t ReapDeadConnectionThreshold = 5;

View File

@ -609,6 +609,14 @@ struct entity_addrvec_t {
}
return entity_addr_t();
}
// True when at least one address in this vector is a msgr2 address;
// false for an empty vector or one holding no msgr2 entries.
bool has_msgr2() const {
  bool found = false;
  for (auto& addr : v) {
    if (addr.is_msgr2()) {
      found = true;
      break;
    }
  }
  return found;
}
bool parse(const char *s, const char **end = 0);