mirror of
https://github.com/ceph/ceph
synced 2025-01-31 07:22:56 +00:00
mon: new health check framework
Signed-off-by: Sage Weil <sage@redhat.com>
This commit is contained in:
parent
0b59a7f8ad
commit
8e815abe4e
@ -309,7 +309,6 @@ OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for cl
|
||||
OPTION(mon_timecheck_interval, OPT_FLOAT, 300.0) // on leader, timecheck (clock drift check) interval (seconds)
|
||||
OPTION(mon_timecheck_skew_interval, OPT_FLOAT, 30.0) // on leader, timecheck (clock drift check) interval when in presence of a skew (seconds)
|
||||
OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info)
|
||||
OPTION(mon_health_max_detail, OPT_INT, 50) // max detailed pgs to report in health detail
|
||||
OPTION(mon_pg_min_inactive, OPT_U64, 1) // the number of PGs which have to be inactive longer than 'mon_pg_stuck_threshold' before health goes into ERR. 0 means disabled, never go into ERR.
|
||||
OPTION(mon_pg_warn_min_per_osd, OPT_INT, 30) // min # pgs per (in) osd before we warn the admin
|
||||
OPTION(mon_pg_warn_max_per_osd, OPT_INT, 300) // max # pgs per (in) osd before we warn the admin
|
||||
@ -352,6 +351,8 @@ OPTION(mon_health_data_update_interval, OPT_FLOAT, 60.0)
|
||||
OPTION(mon_health_to_clog, OPT_BOOL, true)
|
||||
OPTION(mon_health_to_clog_interval, OPT_INT, 3600)
|
||||
OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE, 60.0)
|
||||
OPTION(mon_health_preluminous_compat, OPT_BOOL, false)
|
||||
OPTION(mon_health_max_detail, OPT_INT, 50) // max detailed pgs to report in health detail
|
||||
OPTION(mon_data_avail_crit, OPT_INT, 5)
|
||||
OPTION(mon_data_avail_warn, OPT_INT, 30)
|
||||
OPTION(mon_data_size_warn, OPT_U64, 15*1024*1024*1024) // issue a warning when the monitor's data store goes over 15GB (in bytes)
|
||||
|
47
src/messages/MMonHealthChecks.h
Normal file
47
src/messages/MMonHealthChecks.h
Normal file
@ -0,0 +1,47 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
|
||||
#ifndef CEPH_MMON_HEALTH_CHECKS_H
|
||||
#define CEPH_MMON_HEALTH_CHECKS_H
|
||||
|
||||
#include "messages/PaxosServiceMessage.h"
|
||||
#include "mon/health_check.h"
|
||||
|
||||
struct MMonHealthChecks : public PaxosServiceMessage
|
||||
{
|
||||
static const int HEAD_VERSION = 1;
|
||||
static const int COMPAT_VERSION = 1;
|
||||
|
||||
health_check_map_t health_checks;
|
||||
|
||||
MMonHealthChecks()
|
||||
: PaxosServiceMessage(MSG_MON_HEALTH_CHECKS, HEAD_VERSION, COMPAT_VERSION) {
|
||||
}
|
||||
MMonHealthChecks(health_check_map_t& m)
|
||||
: PaxosServiceMessage(MSG_MON_HEALTH_CHECKS, HEAD_VERSION, COMPAT_VERSION),
|
||||
health_checks(m) {
|
||||
}
|
||||
|
||||
private:
|
||||
~MMonHealthChecks() override { }
|
||||
|
||||
public:
|
||||
const char *get_type_name() const override { return "mon_health_checks"; }
|
||||
void print(ostream &o) const override {
|
||||
o << "mon_health_checks(" << health_checks.checks.size() << " checks)";
|
||||
}
|
||||
|
||||
void decode_payload() override {
|
||||
bufferlist::iterator p = payload.begin();
|
||||
paxos_decode(p);
|
||||
::decode(health_checks, p);
|
||||
}
|
||||
|
||||
void encode_payload(uint64_t features) override {
|
||||
paxos_encode();
|
||||
::encode(health_checks, payload);
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
#endif
|
@ -15,6 +15,7 @@ set(lib_mon_srcs
|
||||
LogMonitor.cc
|
||||
AuthMonitor.cc
|
||||
Elector.cc
|
||||
HealthMonitor.cc
|
||||
OldHealthMonitor.cc
|
||||
DataHealthService.cc
|
||||
PGMonitor.cc
|
||||
|
355
src/mon/HealthMonitor.cc
Normal file
355
src/mon/HealthMonitor.cc
Normal file
@ -0,0 +1,355 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2013 Inktank, Inc
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <limits.h>
|
||||
#include <sstream>
|
||||
#include <boost/regex.hpp>
|
||||
|
||||
#include "include/assert.h"
|
||||
#include "include/stringify.h"
|
||||
|
||||
#include "mon/Monitor.h"
|
||||
#include "mon/HealthService.h"
|
||||
#include "mon/HealthMonitor.h"
|
||||
#include "mon/DataHealthService.h"
|
||||
|
||||
#include "messages/MMonHealth.h"
|
||||
#include "messages/MMonHealthChecks.h"
|
||||
|
||||
#include "common/Formatter.h"
|
||||
|
||||
#define dout_subsys ceph_subsys_mon
|
||||
#undef dout_prefix
|
||||
#define dout_prefix _prefix(_dout, mon, this)
|
||||
// Emits the per-line debug prefix used by dout in this file:
// "mon.<name>@<rank>(<state>).health ". The hmon argument is not used.
static ostream& _prefix(std::ostream *_dout, const Monitor *mon,
                        const HealthMonitor *hmon) {
  std::ostream& out = *_dout;
  out << "mon." << mon->name << "@" << mon->rank;
  out << "(" << mon->get_state_name() << ").health ";
  return out;
}
|
||||
|
||||
// Forwards all arguments to the PaxosService base; no additional setup.
HealthMonitor::HealthMonitor(Monitor *m, Paxos *p, const string& service_name)
  : PaxosService(m, p, service_name)
{
}
|
||||
|
||||
void HealthMonitor::init()
|
||||
{
|
||||
dout(10) << __func__ << dendl;
|
||||
}
|
||||
|
||||
void HealthMonitor::create_initial()
|
||||
{
|
||||
dout(10) << __func__ << dendl;
|
||||
}
|
||||
|
||||
void HealthMonitor::update_from_paxos(bool *need_bootstrap)
|
||||
{
|
||||
version = get_last_committed();
|
||||
dout(10) << __func__ << dendl;
|
||||
load_health();
|
||||
|
||||
bufferlist qbl;
|
||||
mon->store->get(service_name, "quorum", qbl);
|
||||
if (qbl.length()) {
|
||||
auto p = qbl.begin();
|
||||
::decode(quorum_checks, p);
|
||||
} else {
|
||||
quorum_checks.clear();
|
||||
}
|
||||
|
||||
bufferlist lbl;
|
||||
mon->store->get(service_name, "leader", lbl);
|
||||
if (lbl.length()) {
|
||||
auto p = lbl.begin();
|
||||
::decode(leader_checks, p);
|
||||
} else {
|
||||
leader_checks.clear();
|
||||
}
|
||||
|
||||
dout(20) << "dump:";
|
||||
JSONFormatter jf(true);
|
||||
jf.open_object_section("health");
|
||||
jf.open_object_section("quorum_health");
|
||||
for (auto& p : quorum_checks) {
|
||||
string s = string("mon.") + stringify(p.first);
|
||||
jf.dump_object(s.c_str(), p.second);
|
||||
}
|
||||
jf.close_section();
|
||||
jf.dump_object("leader_health", leader_checks);
|
||||
jf.close_section();
|
||||
jf.flush(*_dout);
|
||||
*_dout << dendl;
|
||||
}
|
||||
|
||||
void HealthMonitor::create_pending()
|
||||
{
|
||||
dout(10) << " " << version << dendl;
|
||||
}
|
||||
|
||||
/**
 * Stage this service's next state into transaction @p t.
 *
 * Bumps the version, stores the raw per-rank (quorum_checks) and leader-only
 * (leader_checks) maps under the "quorum" and "leader" keys, then builds the
 * combined map that is handed to encode_health():
 *   - every per-rank map is merged into one map,
 *   - the %num%, %names%, %plurals% and %isorare% placeholders in each merged
 *     summary are expanded from the set of monitors reporting that code,
 *   - leader_checks is merged in last.
 */
void HealthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
{
  ++version;
  dout(10) << " " << version << dendl;
  put_last_committed(t, version);

  bufferlist qbl;
  ::encode(quorum_checks, qbl);
  t->put(service_name, "quorum", qbl);
  bufferlist lbl;
  ::encode(leader_checks, lbl);
  t->put(service_name, "leader", lbl);

  health_check_map_t pending_health;

  // combine per-mon details carefully...
  map<string,set<string>> names; // code -> <mon names>
  for (const auto& p : quorum_checks) {
    for (const auto& q : p.second.checks) {
      names[q.first].insert(mon->monmap->get_name(p.first));
    }
    pending_health.merge(p.second);
  }

  // Iterate by reference: the substitutions must land in pending_health
  // itself. Iterating by value would rewrite a temporary copy and leave the
  // raw placeholders in the summaries that get encoded below.
  for (auto& p : pending_health.checks) {
    const set<string>& who = names[p.first];
    const bool plural = who.size() > 1;
    string& summary = p.second.summary;
    summary = boost::regex_replace(
      summary,
      boost::regex("%num%"), stringify(who.size()));
    summary = boost::regex_replace(
      summary,
      boost::regex("%names%"), stringify(who));
    summary = boost::regex_replace(
      summary,
      boost::regex("%plurals%"),
      plural ? "s" : "");
    summary = boost::regex_replace(
      summary,
      boost::regex("%isorare%"),
      plural ? "are" : "is");
  }

  pending_health.merge(leader_checks);
  encode_health(pending_health, t);
}
|
||||
|
||||
// Returns the version to trim up to: version - 5 once version exceeds 5,
// otherwise 0.
version_t HealthMonitor::get_trim_to()
{
  // we don't actually need *any* old states, but keep a few.
  if (version <= 5) {
    return 0;
  }
  return version - 5;
}
|
||||
|
||||
/**
 * Dispatch an incoming request by message type.
 *
 * MSG_MON_HEALTH is routed to the HealthService registered for the message's
 * service type (logged and answered with false when none is registered);
 * MSG_MON_HEALTH_CHECKS goes to preprocess_health_checks(). Any other type
 * yields false.
 */
bool HealthMonitor::preprocess_query(MonOpRequestRef op)
{
  const auto msg_type = op->get_req()->get_type();

  if (msg_type == MSG_MON_HEALTH_CHECKS) {
    return preprocess_health_checks(op);
  }
  if (msg_type != MSG_MON_HEALTH) {
    return false;
  }

  MMonHealth *hm = static_cast<MMonHealth*>(op->get_req());
  int service_type = hm->get_service_type();
  auto svc = services.find(service_type);
  if (svc == services.end()) {
    dout(1) << __func__ << " service type " << service_type
            << " not registered -- drop message!" << dendl;
    return false;
  }
  return svc->second->service_dispatch(op);
}
|
||||
|
||||
// No request is handled on the update path; always returns false.
bool HealthMonitor::prepare_update(MonOpRequestRef op)
{
  return false;
}
|
||||
|
||||
// Records the sender's reported checks in quorum_checks, keyed by the
// numeric id of the message source, and returns true.
// NOTE(review): this mutates quorum_checks from the preprocess path and
// nothing in this function requests a proposal -- confirm the stored value
// is meant to be picked up by a later propose rather than via prepare_update.
bool HealthMonitor::preprocess_health_checks(MonOpRequestRef op)
{
  MMonHealthChecks *msg = static_cast<MMonHealthChecks*>(op->get_req());
  const auto sender = msg->get_source().num();
  quorum_checks[sender] = msg->health_checks;
  return true;
}
|
||||
|
||||
void HealthMonitor::tick()
|
||||
{
|
||||
if (!is_active()) {
|
||||
return;
|
||||
}
|
||||
dout(10) << __func__ << dendl;
|
||||
bool changed = false;
|
||||
if (check_member_health()) {
|
||||
changed = true;
|
||||
}
|
||||
if (mon->is_leader()) {
|
||||
if (check_leader_health()) {
|
||||
changed = true;
|
||||
}
|
||||
}
|
||||
if (changed) {
|
||||
propose_pending();
|
||||
}
|
||||
}
|
||||
|
||||
bool HealthMonitor::check_member_health()
|
||||
{
|
||||
dout(20) << __func__ << dendl;
|
||||
bool changed = false;
|
||||
|
||||
// snapshot of usage
|
||||
DataStats stats;
|
||||
get_fs_stats(stats.fs_stats, g_conf->mon_data.c_str());
|
||||
map<string,uint64_t> extra;
|
||||
uint64_t store_size = mon->store->get_estimated_size(extra);
|
||||
assert(store_size > 0);
|
||||
stats.store_stats.bytes_total = store_size;
|
||||
stats.store_stats.bytes_sst = extra["sst"];
|
||||
stats.store_stats.bytes_log = extra["log"];
|
||||
stats.store_stats.bytes_misc = extra["misc"];
|
||||
stats.last_update = ceph_clock_now();
|
||||
dout(10) << __func__ << " avail " << stats.fs_stats.avail_percent << "%"
|
||||
<< " total " << prettybyte_t(stats.fs_stats.byte_total)
|
||||
<< ", used " << prettybyte_t(stats.fs_stats.byte_used)
|
||||
<< ", avail " << prettybyte_t(stats.fs_stats.byte_avail) << dendl;
|
||||
|
||||
// MON_DISK_{LOW,CRIT,BIG}
|
||||
health_check_map_t next;
|
||||
if (stats.fs_stats.avail_percent <= g_conf->mon_data_avail_crit) {
|
||||
stringstream ss, ss2;
|
||||
ss << "mon%plurals% %names% %isorare% very low on available space";
|
||||
auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str());
|
||||
ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
|
||||
<< "% avail";
|
||||
d.detail.push_back(ss2.str());
|
||||
} else if (stats.fs_stats.avail_percent <= g_conf->mon_data_avail_warn) {
|
||||
stringstream ss, ss2;
|
||||
ss << "mon%plurals% %names% %isorare% low on available space";
|
||||
auto& d = next.add("MON_DISK_LOW", HEALTH_ERR, ss.str());
|
||||
ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
|
||||
<< "% avail";
|
||||
d.detail.push_back(ss2.str());
|
||||
}
|
||||
if (stats.store_stats.bytes_total >= g_conf->mon_data_size_warn) {
|
||||
stringstream ss, ss2;
|
||||
ss << "mon%plurals% %names% %isorare% using a lot of disk space";
|
||||
auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str());
|
||||
ss2 << "mon." << mon->name << " is "
|
||||
<< prettybyte_t(stats.store_stats.bytes_total)
|
||||
<< " >= mon_data_size_warn ("
|
||||
<< prettybyte_t(g_conf->mon_data_size_warn) << ")";
|
||||
d.detail.push_back(ss2.str());
|
||||
}
|
||||
|
||||
auto p = quorum_checks.find(mon->rank);
|
||||
if (p == quorum_checks.end() ||
|
||||
p->second != next) {
|
||||
if (mon->is_leader()) {
|
||||
// prepare to propose
|
||||
quorum_checks[mon->rank] = next;
|
||||
changed = true;
|
||||
} else {
|
||||
// tell the leader
|
||||
mon->messenger->send_message(new MMonHealthChecks(next),
|
||||
mon->monmap->get_inst(mon->get_leader()));
|
||||
}
|
||||
}
|
||||
return changed;
|
||||
}
|
||||
|
||||
bool HealthMonitor::check_leader_health()
|
||||
{
|
||||
dout(20) << __func__ << dendl;
|
||||
bool changed = false;
|
||||
|
||||
// prune quorum_health
|
||||
{
|
||||
auto& qset = mon->get_quorum();
|
||||
auto p = quorum_checks.begin();
|
||||
while (p != quorum_checks.end()) {
|
||||
if (qset.count(p->first) == 0) {
|
||||
p = quorum_checks.erase(p);
|
||||
changed = true;
|
||||
} else {
|
||||
++p;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
health_check_map_t next;
|
||||
|
||||
// MON_DOWN
|
||||
{
|
||||
int max = mon->monmap->size();
|
||||
int actual = mon->get_quorum().size();
|
||||
if (actual < max) {
|
||||
ostringstream ss;
|
||||
ss << (max-actual) << "/" << max << " mons down, quorum "
|
||||
<< mon->get_quorum_names();
|
||||
auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str());
|
||||
set<int> q = mon->get_quorum();
|
||||
for (int i=0; i<max; i++) {
|
||||
if (q.count(i) == 0) {
|
||||
ostringstream ss;
|
||||
ss << "mon." << mon->monmap->get_name(i) << " (rank " << i
|
||||
<< ") addr " << mon->monmap->get_addr(i)
|
||||
<< " is down (out of quorum)";
|
||||
d.detail.push_back(ss.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// MON_CLOCK_SKEW
|
||||
if (!mon->timecheck_skews.empty()) {
|
||||
list<string> warns;
|
||||
list<string> details;
|
||||
for (map<entity_inst_t,double>::iterator i = mon->timecheck_skews.begin();
|
||||
i != mon->timecheck_skews.end(); ++i) {
|
||||
entity_inst_t inst = i->first;
|
||||
double skew = i->second;
|
||||
double latency = mon->timecheck_latencies[inst];
|
||||
string name = mon->monmap->get_name(inst.addr);
|
||||
ostringstream tcss;
|
||||
health_status_t tcstatus = mon->timecheck_status(tcss, skew, latency);
|
||||
if (tcstatus != HEALTH_OK) {
|
||||
warns.push_back(name);
|
||||
ostringstream tmp_ss;
|
||||
tmp_ss << "mon." << name
|
||||
<< " addr " << inst.addr << " " << tcss.str()
|
||||
<< " (latency " << latency << "s)";
|
||||
details.push_back(tmp_ss.str());
|
||||
}
|
||||
}
|
||||
if (!warns.empty()) {
|
||||
ostringstream ss;
|
||||
ss << "clock skew detected on";
|
||||
while (!warns.empty()) {
|
||||
ss << " mon." << warns.front();
|
||||
warns.pop_front();
|
||||
if (!warns.empty())
|
||||
ss << ",";
|
||||
}
|
||||
auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN,
|
||||
"monitor clock skew detected");
|
||||
d.detail.swap(details);
|
||||
}
|
||||
}
|
||||
|
||||
if (next != leader_checks) {
|
||||
changed = true;
|
||||
leader_checks = next;
|
||||
}
|
||||
return changed;
|
||||
}
|
71
src/mon/HealthMonitor.h
Normal file
71
src/mon/HealthMonitor.h
Normal file
@ -0,0 +1,71 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
/*
|
||||
* Ceph - scalable distributed file system
|
||||
*
|
||||
* Copyright (C) 2013 Inktank, Inc
|
||||
*
|
||||
* This is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License version 2.1, as published by the Free Software
|
||||
* Foundation. See file COPYING.
|
||||
*
|
||||
*/
|
||||
#ifndef CEPH_HEALTH_MONITOR_H
|
||||
#define CEPH_HEALTH_MONITOR_H
|
||||
|
||||
#include "mon/PaxosService.h"
|
||||
|
||||
//forward declaration
|
||||
namespace ceph { class Formatter; }
|
||||
class HealthService;
|
||||
|
||||
class HealthMonitor : public PaxosService
|
||||
{
|
||||
map<int,HealthService*> services;
|
||||
version_t version = 0;
|
||||
map<int,health_check_map_t> quorum_checks; // for each quorum member
|
||||
health_check_map_t leader_checks; // leader only
|
||||
|
||||
public:
|
||||
HealthMonitor(Monitor *m, Paxos *p, const string& service_name);
|
||||
~HealthMonitor() override {
|
||||
assert(services.empty());
|
||||
}
|
||||
|
||||
/**
|
||||
* @defgroup HealthMonitor_Inherited_h Inherited abstract methods
|
||||
* @{
|
||||
*/
|
||||
void init() override;
|
||||
|
||||
void get_health(
|
||||
list<pair<health_status_t,string> >& summary,
|
||||
list<pair<health_status_t,string> > *detail,
|
||||
CephContext *cct) const override {}
|
||||
|
||||
bool preprocess_query(MonOpRequestRef op) override;
|
||||
bool prepare_update(MonOpRequestRef op) override;
|
||||
|
||||
bool preprocess_health_checks(MonOpRequestRef op);
|
||||
bool prepare_health_checks(MonOpRequestRef op);
|
||||
|
||||
bool check_leader_health();
|
||||
bool check_member_health();
|
||||
|
||||
void create_initial() override;
|
||||
void update_from_paxos(bool *need_bootstrap) override;
|
||||
void create_pending() override;
|
||||
void encode_pending(MonitorDBStore::TransactionRef t) override;
|
||||
version_t get_trim_to() override;
|
||||
|
||||
void encode_full(MonitorDBStore::TransactionRef t) override { }
|
||||
|
||||
void tick() override;
|
||||
|
||||
/**
|
||||
* @} // HealthMonitor_Inherited_h
|
||||
*/
|
||||
};
|
||||
|
||||
#endif // CEPH_HEALTH_MONITOR_H
|
@ -60,6 +60,8 @@ void MgrMonitor::update_from_paxos(bool *need_bootstrap)
|
||||
dout(4) << "active server: " << map.active_addr
|
||||
<< "(" << map.active_gid << ")" << dendl;
|
||||
|
||||
load_health();
|
||||
|
||||
if (map.available) {
|
||||
first_seen_inactive = utime_t();
|
||||
} else {
|
||||
@ -86,6 +88,18 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
|
||||
pending_map.encode(bl, mon->get_quorum_con_features());
|
||||
put_version(t, pending_map.epoch, bl);
|
||||
put_last_committed(t, pending_map.epoch);
|
||||
|
||||
health_check_map_t next;
|
||||
if (!pending_map.available) {
|
||||
health_status_t level = HEALTH_WARN;
|
||||
utime_t now = ceph_clock_now();
|
||||
if (first_seen_inactive != utime_t() &&
|
||||
now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) {
|
||||
level = HEALTH_ERR;
|
||||
}
|
||||
next.add("MGR_DOWN", level, "no active mgr");
|
||||
}
|
||||
encode_health(next, t);
|
||||
}
|
||||
|
||||
bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid)
|
||||
|
@ -71,7 +71,7 @@ MonPGStatService *MgrStatMonitor::get_pg_stat_service()
|
||||
|
||||
void MgrStatMonitor::create_initial()
|
||||
{
|
||||
dout(10) << dendl;
|
||||
dout(10) << __func__ << dendl;
|
||||
version = 0;
|
||||
service_map.epoch = 1;
|
||||
::encode(service_map, pending_service_map_bl, CEPH_FEATURES_ALL);
|
||||
|
@ -78,6 +78,7 @@
|
||||
#include "MgrStatMonitor.h"
|
||||
#include "mon/QuorumService.h"
|
||||
#include "mon/OldHealthMonitor.h"
|
||||
#include "mon/HealthMonitor.h"
|
||||
#include "mon/ConfigKeyService.h"
|
||||
#include "common/config.h"
|
||||
#include "common/cmdparse.h"
|
||||
@ -204,6 +205,7 @@ Monitor::Monitor(CephContext* cct_, string nm, MonitorDBStore *s,
|
||||
paxos_service[PAXOS_AUTH] = new AuthMonitor(this, paxos, "auth");
|
||||
paxos_service[PAXOS_MGR] = new MgrMonitor(this, paxos, "mgr");
|
||||
paxos_service[PAXOS_MGRSTAT] = new MgrStatMonitor(this, paxos, "mgrstat");
|
||||
paxos_service[PAXOS_HEALTH] = new HealthMonitor(this, paxos, "health");
|
||||
|
||||
health_monitor = new OldHealthMonitor(this);
|
||||
config_key_service = new ConfigKeyService(this, paxos);
|
||||
@ -2445,6 +2447,115 @@ void Monitor::do_health_to_clog(bool force)
|
||||
health_status_cache.summary = summary;
|
||||
}
|
||||
|
||||
/**
 * Aggregate the health checks of every paxos service.
 *
 * With a formatter @p f, emits a "health" object containing a "checks"
 * section and a "status" field; when mon_health_preluminous_compat is set it
 * additionally emits the "summary" array, "overall_status", and (with
 * @p want_detail) a "detail" array. Without a formatter, writes a one-line
 * plain-text result ("<status>[<sep1><summary>]\n") into @p plain.
 *
 * @param want_detail also dump per-check detail
 * @param f           formatter for structured output, or nullptr
 * @param plain       destination for plain-text output; may be nullptr, in
 *                    which case no plain text is produced here
 * @param sep1        separator between the status and the summary text
 * @param sep2        separator passed to dump_summary() for each service
 * @return the minimum of HEALTH_OK and every service's dump_summary() result
 */
health_status_t Monitor::get_health_status(
  bool want_detail,
  Formatter *f,
  std::string *plain,
  const char *sep1,
  const char *sep2)
{
  health_status_t r = HEALTH_OK;
  bool compat = g_conf->mon_health_preluminous_compat;
  if (f) {
    f->open_object_section("health");
    f->open_object_section("checks");
  }

  string summary;
  string *psummary = f ? nullptr : &summary;
  for (auto& svc : paxos_service) {
    r = std::min(r, svc->get_health_checks().dump_summary(
                   f, psummary, sep2, want_detail));
  }

  if (f) {
    f->close_section();
    f->dump_stream("status") << r;
  } else if (plain) {
    // Guarded: with neither a formatter nor a plain buffer there is nowhere
    // to write, and dereferencing plain would be a null dereference.
    // one-liner: HEALTH_FOO[ thing1[; thing2 ...]]
    *plain = stringify(r);
    if (summary.size()) {
      *plain += sep1;
      *plain += summary;
    }
    *plain += "\n";
  }

  if (f && compat) {
    f->open_array_section("summary");
    for (auto& svc : paxos_service) {
      svc->get_health_checks().dump_summary_compat(f);
    }
    f->close_section();
    f->dump_stream("overall_status") << r;
  }

  if (want_detail) {
    if (f && compat) {
      f->open_array_section("detail");
    }

    for (auto& svc : paxos_service) {
      svc->get_health_checks().dump_detail(f, plain, compat);
    }

    if (f && compat) {
      f->close_section();
    }
  }
  if (f) {
    // closes the "health" object opened at the top
    f->close_section();
  }
  return r;
}
|
||||
|
||||
void Monitor::log_health(
|
||||
const health_check_map_t& updated,
|
||||
const health_check_map_t& previous,
|
||||
MonitorDBStore::TransactionRef t)
|
||||
{
|
||||
if (!g_conf->mon_health_to_clog) {
|
||||
return;
|
||||
}
|
||||
// FIXME: log atomically as part of @t instead of using clog.
|
||||
dout(10) << __func__ << " updated " << updated.checks.size()
|
||||
<< " previous " << previous.checks.size()
|
||||
<< dendl;
|
||||
for (auto& p : updated.checks) {
|
||||
auto q = previous.checks.find(p.first);
|
||||
if (q == previous.checks.end()) {
|
||||
// new
|
||||
ostringstream ss;
|
||||
ss << p.second.severity << " " << p.first << ": "
|
||||
<< p.second.summary;
|
||||
if (p.second.severity == HEALTH_WARN)
|
||||
clog->warn() << ss.str();
|
||||
else
|
||||
clog->error() << ss.str();
|
||||
} else {
|
||||
if (p.second.summary != q->second.summary ||
|
||||
p.second.severity != q->second.severity) {
|
||||
// summary or severity changed (ignore detail changes at this level)
|
||||
ostringstream ss;
|
||||
ss << p.second.severity << " " << p.first << " (update): "
|
||||
<< p.second.summary;
|
||||
if (p.second.severity == HEALTH_WARN)
|
||||
clog->warn() << ss.str();
|
||||
else
|
||||
clog->error() << ss.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
for (auto& p : previous.checks) {
|
||||
if (!updated.checks.count(p.first)) {
|
||||
// cleared
|
||||
ostringstream ss;
|
||||
ss << HEALTH_OK << " " << p.first << ": " << p.second.summary;
|
||||
clog->info() << ss.str();
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
health_status_t Monitor::get_health(list<string>& status,
|
||||
bufferlist *detailbl,
|
||||
Formatter *f)
|
||||
@ -2550,12 +2661,9 @@ void Monitor::get_cluster_status(stringstream &ss, Formatter *f)
|
||||
if (f)
|
||||
f->open_object_section("status");
|
||||
|
||||
// reply with the status for all the components
|
||||
list<string> health;
|
||||
get_health(health, NULL, f);
|
||||
|
||||
if (f) {
|
||||
f->dump_stream("fsid") << monmap->get_fsid();
|
||||
get_health_status(false, f, nullptr);
|
||||
f->dump_unsigned("election_epoch", get_epoch());
|
||||
{
|
||||
f->open_array_section("quorum");
|
||||
@ -2579,7 +2687,6 @@ void Monitor::get_cluster_status(stringstream &ss, Formatter *f)
|
||||
f->open_object_section("fsmap");
|
||||
mdsmon()->get_fsmap().print_summary(f, NULL);
|
||||
f->close_section();
|
||||
|
||||
f->open_object_section("mgrmap");
|
||||
mgrmon()->get_map().print_summary(f, nullptr);
|
||||
f->close_section();
|
||||
@ -2587,11 +2694,21 @@ void Monitor::get_cluster_status(stringstream &ss, Formatter *f)
|
||||
f->dump_object("servicemap", mgrstatmon()->get_service_map());
|
||||
f->close_section();
|
||||
} else {
|
||||
|
||||
ss << " cluster:\n";
|
||||
ss << " id: " << monmap->get_fsid() << "\n";
|
||||
ss << " health: " << joinify(health.begin(), health.end(),
|
||||
string("\n ")) << "\n";
|
||||
|
||||
string health;
|
||||
if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
|
||||
get_health_status(false, nullptr, &health,
|
||||
"\n ", "\n ");
|
||||
} else {
|
||||
list<string> ls;
|
||||
get_health(ls, NULL, f);
|
||||
health = joinify(ls.begin(), ls.end(),
|
||||
string("\n "));
|
||||
}
|
||||
ss << " health: " << health << "\n";
|
||||
|
||||
ss << "\n \n services:\n";
|
||||
{
|
||||
size_t maxlen = 3;
|
||||
@ -3112,25 +3229,35 @@ void Monitor::handle_command(MonOpRequestRef op)
|
||||
}
|
||||
rdata.append(ds);
|
||||
} else if (prefix == "health") {
|
||||
list<string> health_str;
|
||||
get_health(health_str, detail == "detail" ? &rdata : NULL, f.get());
|
||||
if (f) {
|
||||
f->flush(ds);
|
||||
ds << '\n';
|
||||
} else {
|
||||
assert(!health_str.empty());
|
||||
ds << health_str.front();
|
||||
health_str.pop_front();
|
||||
if (!health_str.empty()) {
|
||||
ds << ' ';
|
||||
ds << joinify(health_str.begin(), health_str.end(), string("; "));
|
||||
if (osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
|
||||
string plain;
|
||||
get_health_status(detail == "detail", f.get(), f ? nullptr : &plain);
|
||||
if (f) {
|
||||
f->flush(rdata);
|
||||
} else {
|
||||
rdata.append(plain);
|
||||
}
|
||||
} else {
|
||||
list<string> health_str;
|
||||
get_health(health_str, detail == "detail" ? &rdata : NULL, f.get());
|
||||
if (f) {
|
||||
f->flush(ds);
|
||||
ds << '\n';
|
||||
} else {
|
||||
assert(!health_str.empty());
|
||||
ds << health_str.front();
|
||||
health_str.pop_front();
|
||||
if (!health_str.empty()) {
|
||||
ds << ' ';
|
||||
ds << joinify(health_str.begin(), health_str.end(), string("; "));
|
||||
}
|
||||
}
|
||||
bufferlist comb;
|
||||
comb.append(ds);
|
||||
if (detail == "detail")
|
||||
comb.append(rdata);
|
||||
rdata = comb;
|
||||
}
|
||||
bufferlist comb;
|
||||
comb.append(ds);
|
||||
if (detail == "detail")
|
||||
comb.append(rdata);
|
||||
rdata = comb;
|
||||
} else if (prefix == "df") {
|
||||
bool verbose = (detail == "detail");
|
||||
if (f)
|
||||
@ -4119,6 +4246,11 @@ void Monitor::dispatch_op(MonOpRequestRef op)
|
||||
health_monitor->dispatch(op);
|
||||
break;
|
||||
|
||||
case MSG_MON_HEALTH_CHECKS:
|
||||
op->set_type_service();
|
||||
paxos_service[PAXOS_HEALTH]->dispatch(op);
|
||||
break;
|
||||
|
||||
default:
|
||||
dealt_with = false;
|
||||
break;
|
||||
|
@ -32,6 +32,7 @@
|
||||
|
||||
#include "common/Timer.h"
|
||||
|
||||
#include "health_check.h"
|
||||
#include "MonMap.h"
|
||||
#include "Elector.h"
|
||||
#include "Paxos.h"
|
||||
@ -497,6 +498,7 @@ private:
|
||||
version_t timecheck_round;
|
||||
unsigned int timecheck_acks;
|
||||
utime_t timecheck_round_start;
|
||||
friend class HealthMonitor;
|
||||
/* When we hit a skew we will start a new round based off of
|
||||
* 'mon_timecheck_skew_interval'. Each new round will be backed off
|
||||
* until we hit 'mon_timecheck_interval' -- which is the typical
|
||||
@ -649,6 +651,10 @@ public:
|
||||
return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
|
||||
}
|
||||
|
||||
class MgrStatMonitor *healthmon() {
|
||||
return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
|
||||
}
|
||||
|
||||
friend class Paxos;
|
||||
friend class OSDMonitor;
|
||||
friend class MDSMonitor;
|
||||
@ -738,6 +744,18 @@ public:
|
||||
*/
|
||||
health_status_t get_health(list<string>& status, bufferlist *detailbl,
|
||||
Formatter *f);
|
||||
|
||||
health_status_t get_health_status(
|
||||
bool want_detail,
|
||||
Formatter *f,
|
||||
std::string *plain,
|
||||
const char *sep1 = " ",
|
||||
const char *sep2 = "; ");
|
||||
void log_health(
|
||||
const health_check_map_t& updated,
|
||||
const health_check_map_t& previous,
|
||||
MonitorDBStore::TransactionRef t);
|
||||
|
||||
void get_cluster_status(stringstream &ss, Formatter *f);
|
||||
|
||||
void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
|
||||
|
@ -275,6 +275,8 @@ void OSDMonitor::update_from_paxos(bool *need_bootstrap)
|
||||
mapping_job.reset();
|
||||
}
|
||||
|
||||
load_health();
|
||||
|
||||
/*
|
||||
* We will possibly have a stashed latest that *we* wrote, and we will
|
||||
* always be sure to have the oldest full map in the first..last range
|
||||
@ -1101,6 +1103,19 @@ void OSDMonitor::encode_pending(MonitorDBStore::TransactionRef t)
|
||||
::encode(pending_creatings, creatings_bl);
|
||||
t->put(OSD_PG_CREATING_PREFIX, "creating", creatings_bl);
|
||||
}
|
||||
|
||||
// health
|
||||
_check_health(tmp, t);
|
||||
}
|
||||
|
||||
void OSDMonitor::_check_health(
|
||||
const OSDMap& nextmap,
|
||||
MonitorDBStore::TransactionRef t)
|
||||
{
|
||||
dout(20) << __func__ << dendl;
|
||||
health_check_map_t next;
|
||||
#warning write me
|
||||
encode_health(next, t);
|
||||
}
|
||||
|
||||
void OSDMonitor::trim_creating_pgs(creating_pgs_t* creating_pgs,
|
||||
|
@ -162,6 +162,8 @@ public:
|
||||
FAST_READ_DEFAULT
|
||||
};
|
||||
|
||||
void _check_health(const OSDMap& next, MonitorDBStore::TransactionRef t);
|
||||
|
||||
// svc
|
||||
public:
|
||||
void create_initial() override;
|
||||
|
@ -431,3 +431,12 @@ void PaxosService::trim(MonitorDBStore::TransactionRef t,
|
||||
}
|
||||
}
|
||||
|
||||
void PaxosService::load_health()
|
||||
{
|
||||
bufferlist bl;
|
||||
mon->store->get("health", service_name, bl);
|
||||
if (bl.length()) {
|
||||
auto p = bl.begin();
|
||||
::decode(health_checks, p);
|
||||
}
|
||||
}
|
||||
|
@ -77,15 +77,23 @@ protected:
|
||||
*/
|
||||
bool have_pending;
|
||||
|
||||
protected:
|
||||
/**
|
||||
* health checks for this service
|
||||
*
|
||||
* Child must populate this during encode_pending() by calling encode_health().
|
||||
*/
|
||||
health_check_map_t health_checks;
|
||||
public:
|
||||
// Read-only view of this service's current health checks. Const-qualified
// so it can also be called through a const service pointer or reference.
const health_check_map_t& get_health_checks() const {
  return health_checks;
}
|
||||
|
||||
protected:
|
||||
/**
|
||||
* format of our state in leveldb, 0 for default
|
||||
*/
|
||||
version_t format_version;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @defgroup PaxosService_h_callbacks Callback classes
|
||||
* @{
|
||||
@ -428,6 +436,15 @@ public:
|
||||
list<pair<health_status_t,string> > *detail,
|
||||
CephContext *cct) const { }
|
||||
|
||||
void encode_health(const health_check_map_t& next,
|
||||
MonitorDBStore::TransactionRef t) {
|
||||
bufferlist bl;
|
||||
::encode(next, bl);
|
||||
t->put("health", service_name, bl);
|
||||
mon->log_health(next, health_checks, t);
|
||||
}
|
||||
void load_health();
|
||||
|
||||
private:
|
||||
/**
|
||||
* @defgroup PaxosService_h_store_keys Set of keys that are usually used on
|
||||
|
192
src/mon/health_check.h
Normal file
192
src/mon/health_check.h
Normal file
@ -0,0 +1,192 @@
|
||||
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
|
||||
// vim: ts=8 sw=2 smarttab
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <map>
|
||||
|
||||
#include "include/health.h"
|
||||
#include "common/Formatter.h"
|
||||
|
||||
struct health_check_t {
|
||||
health_status_t severity;
|
||||
std::string summary;
|
||||
std::list<std::string> detail;
|
||||
|
||||
DENC(health_check_t, v, p) {
|
||||
DENC_START(1, 1, p);
|
||||
denc(v.severity, p);
|
||||
denc(v.summary, p);
|
||||
denc(v.detail, p);
|
||||
DENC_FINISH(p);
|
||||
}
|
||||
|
||||
friend bool operator==(const health_check_t& l,
|
||||
const health_check_t& r) {
|
||||
return l.severity == r.severity &&
|
||||
l.summary == r.summary &&
|
||||
l.detail == r.detail;
|
||||
}
|
||||
friend bool operator!=(const health_check_t& l,
|
||||
const health_check_t& r) {
|
||||
return !(l == r);
|
||||
}
|
||||
|
||||
void dump(Formatter *f) const {
|
||||
f->dump_stream("severity") << severity;
|
||||
f->dump_string("summary", summary);
|
||||
f->open_array_section("detail");
|
||||
for (auto& p : detail) {
|
||||
f->dump_string("item", p);
|
||||
}
|
||||
f->close_section();
|
||||
}
|
||||
|
||||
static void generate_test_instances(list<health_check_t*>& ls) {
|
||||
ls.push_back(new health_check_t);
|
||||
ls.push_back(new health_check_t);
|
||||
ls.back()->severity = HEALTH_ERR;
|
||||
ls.back()->summary = "summarization";
|
||||
ls.back()->detail = {"one", "two", "three"};
|
||||
}
|
||||
};
|
||||
WRITE_CLASS_DENC(health_check_t)
|
||||
|
||||
|
||||
struct health_check_map_t {
|
||||
map<std::string,health_check_t> checks;
|
||||
|
||||
DENC(health_check_map_t, v, p) {
|
||||
DENC_START(1, 1, p);
|
||||
denc(v.checks, p);
|
||||
DENC_FINISH(p);
|
||||
}
|
||||
|
||||
void dump(Formatter *f) const {
|
||||
for (auto& p : checks) {
|
||||
f->dump_object(p.first.c_str(), p.second);
|
||||
}
|
||||
}
|
||||
|
||||
static void generate_test_instances(list<health_check_map_t*>& ls) {
|
||||
ls.push_back(new health_check_map_t);
|
||||
ls.push_back(new health_check_map_t);
|
||||
{
|
||||
auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo");
|
||||
d.detail.push_back("a");
|
||||
d.detail.push_back("b");
|
||||
}
|
||||
{
|
||||
auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!");
|
||||
d.detail.push_back("c");
|
||||
d.detail.push_back("d");
|
||||
}
|
||||
}
|
||||
|
||||
void clear() {
|
||||
checks.clear();
|
||||
}
|
||||
void swap(health_check_map_t& other) {
|
||||
checks.swap(other.checks);
|
||||
}
|
||||
|
||||
health_check_t& add(const std::string& code,
|
||||
health_status_t severity,
|
||||
const std::string& summary) {
|
||||
assert(checks.count(code) == 0);
|
||||
health_check_t& r = checks[code];
|
||||
r.severity = severity;
|
||||
r.summary = summary;
|
||||
return r;
|
||||
}
|
||||
|
||||
void merge(const health_check_map_t& o) {
|
||||
for (auto& p : o.checks) {
|
||||
auto q = checks.find(p.first);
|
||||
if (q == checks.end()) {
|
||||
// new check
|
||||
checks[p.first] = p.second;
|
||||
} else {
|
||||
// merge details, and hope the summary matches!
|
||||
q->second.detail.insert(
|
||||
q->second.detail.end(),
|
||||
p.second.detail.begin(),
|
||||
p.second.detail.end());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
health_status_t dump_summary(Formatter *f, std::string *plain,
|
||||
const char *sep, bool detail) const {
|
||||
health_status_t r = HEALTH_OK;
|
||||
for (auto& p : checks) {
|
||||
if (r > p.second.severity) {
|
||||
r = p.second.severity;
|
||||
}
|
||||
if (f) {
|
||||
f->open_object_section(p.first.c_str());
|
||||
f->dump_stream("severity") << p.second.severity;
|
||||
f->dump_string("message", p.second.summary);
|
||||
if (detail) {
|
||||
f->open_array_section("detail");
|
||||
for (auto& d : p.second.detail) {
|
||||
f->dump_string("item", d);
|
||||
}
|
||||
f->close_section();
|
||||
}
|
||||
f->close_section();
|
||||
} else {
|
||||
if (!plain->empty()) {
|
||||
*plain += sep;
|
||||
}
|
||||
*plain += p.second.summary;
|
||||
}
|
||||
}
|
||||
return r;
|
||||
}
|
||||
|
||||
void dump_summary_compat(Formatter *f) const {
|
||||
for (auto& p : checks) {
|
||||
f->open_object_section("item");
|
||||
f->dump_stream("severity") << p.second.severity;
|
||||
f->dump_string("summary", p.second.summary);
|
||||
f->close_section();
|
||||
}
|
||||
}
|
||||
|
||||
void dump_detail(Formatter *f, std::string *plain, bool compat) const {
|
||||
for (auto& p : checks) {
|
||||
if (f) {
|
||||
if (compat) {
|
||||
// this is sloppy, but the best we can do: just dump all of the
|
||||
// individual checks' details together
|
||||
for (auto& d : p.second.detail) {
|
||||
f->dump_string("item", d);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
if (!compat) {
|
||||
*plain += p.first + " " + p.second.summary + "\n";
|
||||
}
|
||||
for (auto& d : p.second.detail) {
|
||||
if (!compat) {
|
||||
*plain += " ";
|
||||
}
|
||||
*plain += d;
|
||||
*plain += "\n";
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
friend bool operator==(const health_check_map_t& l,
|
||||
const health_check_map_t& r) {
|
||||
return l.checks == r.checks;
|
||||
}
|
||||
friend bool operator!=(const health_check_map_t& l,
|
||||
const health_check_map_t& r) {
|
||||
return !(l == r);
|
||||
}
|
||||
};
|
||||
WRITE_CLASS_DENC(health_check_map_t)
|
@ -31,7 +31,8 @@
|
||||
#define PAXOS_AUTH 5
|
||||
#define PAXOS_MGR 6
|
||||
#define PAXOS_MGRSTAT 7
|
||||
#define PAXOS_NUM 8
|
||||
#define PAXOS_HEALTH 8
|
||||
#define PAXOS_NUM 9
|
||||
|
||||
inline const char *get_paxos_name(int p) {
|
||||
switch (p) {
|
||||
@ -43,6 +44,7 @@ inline const char *get_paxos_name(int p) {
|
||||
case PAXOS_AUTH: return "auth";
|
||||
case PAXOS_MGR: return "mgr";
|
||||
case PAXOS_MGRSTAT: return "mgrstat";
|
||||
case PAXOS_HEALTH: return "health";
|
||||
default: ceph_abort(); return 0;
|
||||
}
|
||||
}
|
||||
|
@ -96,6 +96,7 @@ using namespace std;
|
||||
#include "messages/MMonGetVersion.h"
|
||||
#include "messages/MMonGetVersionReply.h"
|
||||
#include "messages/MMonHealth.h"
|
||||
#include "messages/MMonHealthChecks.h"
|
||||
#include "messages/MMonMetadata.h"
|
||||
#include "messages/MDataPing.h"
|
||||
#include "messages/MAuth.h"
|
||||
@ -783,6 +784,11 @@ Message *decode_message(CephContext *cct, int crcflags,
|
||||
case MSG_MON_HEALTH:
|
||||
m = new MMonHealth();
|
||||
break;
|
||||
|
||||
case MSG_MON_HEALTH_CHECKS:
|
||||
m = new MMonHealthChecks();
|
||||
break;
|
||||
|
||||
#if defined(HAVE_XIO)
|
||||
case MSG_DATA_PING:
|
||||
m = new MDataPing();
|
||||
|
@ -183,6 +183,8 @@
|
||||
// Special
|
||||
#define MSG_NOP 0x607
|
||||
|
||||
#define MSG_MON_HEALTH_CHECKS 0x608
|
||||
|
||||
// *** ceph-mgr <-> OSD/MDS daemons ***
|
||||
#define MSG_MGR_OPEN 0x700
|
||||
#define MSG_MGR_CONFIGURE 0x701
|
||||
|
Loading…
Reference in New Issue
Block a user