mon/health_checks: associate a count with health_alert_t
A count of 0 means the alert is a singleton. Otherwise the counts can be summed, either via merge() or via get_or_add().

The count is always structured so that it moves toward zero as the cluster gets healthier: if a value is too low, we count how far below the threshold it is.

Signed-off-by: Sage Weil <sage@redhat.com>
parent 7834cd8146
commit d0eb22f3ba
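A minimal standalone sketch of the counting convention described above, using simplified stand-ins rather than the real health_check_t / health_check_map_t shown in the diff below; the enum values, example codes, and main() are illustrative only:

// Simplified stand-ins for the Ceph health check types, to show how
// add() sets a count outright while get_or_add() accumulates it.
#include <cassert>
#include <cstdint>
#include <iostream>
#include <list>
#include <map>
#include <string>

enum health_status_t { HEALTH_OK, HEALTH_WARN, HEALTH_ERR };

struct health_check_t {
  health_status_t severity = HEALTH_OK;
  std::string summary;
  std::list<std::string> detail;
  int64_t count = 0;          // 0 == singleton alert, otherwise a summable quantity
};

struct health_check_map_t {
  std::map<std::string, health_check_t> checks;

  // add() expects a new code and sets the count outright.
  health_check_t& add(const std::string& code, health_status_t sev,
                      const std::string& summary, int64_t count) {
    assert(checks.count(code) == 0);
    auto& r = checks[code];
    r.severity = sev;
    r.summary = summary;
    r.count = count;
    return r;
  }

  // get_or_add() accumulates: each caller contributes to the running total.
  health_check_t& get_or_add(const std::string& code, health_status_t sev,
                             const std::string& summary, int64_t count) {
    auto& r = checks[code];
    r.severity = sev;
    r.summary = summary;
    r.count += count;
    return r;
  }

  // merge() sums counts (and concatenates detail) for codes present in both maps.
  void merge(const health_check_map_t& o) {
    for (auto& p : o.checks) {
      auto q = checks.find(p.first);
      if (q == checks.end()) {
        checks[p.first] = p.second;
      } else {
        q->second.detail.insert(q->second.detail.end(),
                                p.second.detail.begin(), p.second.detail.end());
        q->second.count += p.second.count;
      }
    }
  }
};

int main() {
  health_check_map_t a, b;
  // Two filesystems each report one degraded fs via get_or_add(..., 1):
  a.get_or_add("FS_DEGRADED", HEALTH_WARN,
               "%num% filesystem%plurals% %isorare% degraded", 1);
  a.get_or_add("FS_DEGRADED", HEALTH_WARN,
               "%num% filesystem%plurals% %isorare% degraded", 1);
  // A "too low" value is reported as the shortfall, so it trends toward 0 as
  // the cluster heals (compare min_pg_per_osd - per in the PGMap hunk below):
  b.add("TOO_FEW_PGS", HEALTH_WARN, "too few PGs per OSD (10 < min 30)", 30 - 10);
  a.merge(b);
  std::cout << "FS_DEGRADED count = " << a.checks["FS_DEGRADED"].count << "\n";  // 2
  std::cout << "TOO_FEW_PGS count = " << a.checks["TOO_FEW_PGS"].count << "\n";  // 20
  return 0;
}

The accumulating get_or_add() is what lets several filesystems or daemons contribute to a single alert, while add() is used where the caller already knows the total.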
@@ -386,7 +386,7 @@ void FSMap::get_health_checks(health_check_map_t *checks) const
     if (!stuck_failed.empty()) {
       health_check_t& fscheck = checks->get_or_add(
         "FS_WITH_FAILED_MDS", HEALTH_WARN,
-        "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
+        "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1);
       ostringstream ss;
       ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
          << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
@@ -402,7 +402,7 @@ void FSMap::get_health_checks(health_check_map_t *checks) const
   if (standby_count_wanted) {
     std::ostringstream oss, dss;
     oss << "insufficient standby MDS daemons available";
-    auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str());
+    auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str(), 1);
     dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted
         << " more";
     d.detail.push_back(dss.str());
@@ -435,7 +435,8 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const
   // MDS_DAMAGE
   if (!damaged.empty()) {
     health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR,
-                                               "%num% mds daemon%plurals% damaged");
+                                               "%num% mds daemon%plurals% damaged",
+                                               damaged.size());
     for (auto p : damaged) {
       std::ostringstream oss;
       oss << "fs " << fs_name << " mds." << p << " is damaged";
@@ -447,7 +448,7 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const
   if (is_degraded()) {
     health_check_t& fscheck = checks->get_or_add(
       "FS_DEGRADED", HEALTH_WARN,
-      "%num% filesystem%plurals% %isorare% degraded");
+      "%num% filesystem%plurals% %isorare% degraded", 1);
     ostringstream ss;
     ss << "fs " << fs_name << " is degraded";
     fscheck.detail.push_back(ss.str());
@@ -478,7 +479,7 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const
   if ((mds_rank_t)get_num_in_mds() < get_max_mds()) {
     health_check_t& check = checks->add(
       "MDS_UP_LESS_THAN_MAX", HEALTH_WARN,
-      "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds");
+      "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1);
     stringstream ss;
     ss << "fs " << fs_name << " has " << get_num_in_mds()
        << " MDS online, but wants " << get_max_mds();
@@ -489,7 +490,7 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const
   if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) {
     health_check_t &check = checks->add(
       "MDS_ALL_DOWN", HEALTH_ERR,
-      "%num% filesystem%plurals% %isorare% offline");
+      "%num% filesystem%plurals% %isorare% offline", 1);
     stringstream ss;
     ss << "fs " << fs_name << " is offline because no MDS is active for it.";
     check.detail.push_back(ss.str());
@@ -499,7 +500,7 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const
       was_snaps_ever_allowed() && !allows_multimds_snaps()) {
     health_check_t &check = checks->add(
       "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR,
-      "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots");
+      "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1);
     stringstream ss;
     ss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
     check.detail.push_back(ss.str());
@@ -267,6 +267,7 @@ ceph_set_health_checks(BaseMgrModule *self, PyObject *args)
     health_status_t severity = HEALTH_OK;
     string summary;
     list<string> detail;
+    int64_t count = 0;
     PyObject *infols = PyDict_Items(check_info);
     for (int j = 0; j < PyList_Size(infols); ++j) {
       PyObject *pair = PyList_GET_ITEM(infols, j);
@@ -301,6 +302,14 @@ ceph_set_health_checks(BaseMgrModule *self, PyObject *args)
         } else {
           summary = std::move(vs);
         }
+      } else if (ks == "count") {
+        if (PyLong_Check(v)) {
+          count = PyLong_AsLong(v);
+        } else {
+          derr << __func__ << " check " << check_name
+               << " count value not long" << dendl;
+          continue;
+        }
       } else if (ks == "detail") {
         if (!PyList_Check(v)) {
           derr << __func__ << " check " << check_name
@@ -322,7 +331,7 @@ ceph_set_health_checks(BaseMgrModule *self, PyObject *args)
             << " unexpected key " << k << dendl;
       }
     }
-    auto& d = out_checks.add(check_name, severity, summary);
+    auto& d = out_checks.add(check_name, severity, summary, count);
     d.detail.swap(detail);
   }
@@ -18,7 +18,7 @@ class SlowOps final : public DaemonHealthMetricCollector {
     return type == daemon_metric::SLOW_OPS;
   }
   health_check_t& _get_check(health_check_map_t& cm) const override {
-    return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "");
+    return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "", 1);
   }
   bool _update(const DaemonKey& daemon,
                const DaemonHealthMetric& metric) override {
@@ -61,7 +61,7 @@ class PendingPGs final : public DaemonHealthMetricCollector {
     return type == daemon_metric::PENDING_CREATING_PGS;
  }
   health_check_t& _get_check(health_check_map_t& cm) const override {
-    return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "");
+    return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "", 1);
   }
   bool _update(const DaemonKey& osd,
                const DaemonHealthMetric& metric) override {
@@ -386,7 +386,8 @@ void PyModuleRegistry::get_health_checks(health_check_map_t *checks)
       ss << dependency_modules.size()
          << " mgr modules have failed dependencies";
     }
-    auto& d = checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str());
+    auto& d = checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str(),
+                          dependency_modules.size());
     for (auto& i : dependency_modules) {
       std::ostringstream ss;
       ss << "Module '" << i.first << "' has failed dependency: " << i.second;
@@ -402,7 +403,8 @@ void PyModuleRegistry::get_health_checks(health_check_map_t *checks)
     } else if (failed_modules.size() > 1) {
       ss << failed_modules.size() << " mgr modules have failed";
     }
-    auto& d = checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str());
+    auto& d = checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str(),
+                          failed_modules.size());
     for (auto& i : failed_modules) {
       std::ostringstream ss;
       ss << "Module '" << i.first << "' has failed: " << i.second;
@@ -418,7 +418,8 @@ void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   if (bad_detail.size()) {
     ostringstream ss;
     ss << bad_detail.size() << " auth entities have invalid capabilities";
-    health_check_t *check = &next.add("AUTH_BAD_CAPS", HEALTH_ERR, ss.str());
+    health_check_t *check = &next.add("AUTH_BAD_CAPS", HEALTH_ERR, ss.str(),
+                                      bad_detail.size());
     for (auto& i : bad_detail) {
       for (auto& j : i.second) {
         check->detail.push_back(j);
@@ -570,14 +570,14 @@ bool HealthMonitor::check_member_health()
   if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_crit) {
     stringstream ss, ss2;
     ss << "mon%plurals% %names% %isorare% very low on available space";
-    auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str());
+    auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str(), 1);
     ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
         << "% avail";
     d.detail.push_back(ss2.str());
   } else if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_warn) {
     stringstream ss, ss2;
     ss << "mon%plurals% %names% %isorare% low on available space";
-    auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str());
+    auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str(), 1);
     ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent
         << "% avail";
     d.detail.push_back(ss2.str());
@@ -585,7 +585,7 @@ bool HealthMonitor::check_member_health()
   if (stats.store_stats.bytes_total >= g_conf()->mon_data_size_warn) {
     stringstream ss, ss2;
     ss << "mon%plurals% %names% %isorare% using a lot of disk space";
-    auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str());
+    auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str(), 1);
     ss2 << "mon." << mon->name << " is "
         << byte_u_t(stats.store_stats.bytes_total)
         << " >= mon_data_size_warn ("
@@ -611,7 +611,7 @@ bool HealthMonitor::check_member_health()
       g_conf()->mon_osd_down_out_interval == 0) {
     ostringstream ss, ds;
     ss << "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0";
-    auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str());
+    auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str(), 1);
     ds << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
     d.detail.push_back(ds.str());
   }
@@ -669,7 +669,7 @@ bool HealthMonitor::check_leader_health()
     ostringstream ss;
     ss << (max-actual) << "/" << max << " mons down, quorum "
        << mon->get_quorum_names();
-    auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str());
+    auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str(), max - actual);
     set<int> q = mon->get_quorum();
     for (int i=0; i<max; i++) {
       if (q.count(i) == 0) {
@@ -710,7 +710,7 @@ bool HealthMonitor::check_leader_health()
         if (!warns.empty())
           ss << ",";
       }
-      auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN, ss.str());
+      auto& d = next.add("MON_CLOCK_SKEW", HEALTH_WARN, ss.str(), details.size());
       d.detail.swap(details);
     }
   }
@@ -732,7 +732,8 @@ bool HealthMonitor::check_leader_health()
   if (!details.empty()) {
     ostringstream ss;
     ss << details.size() << " monitors have not enabled msgr2";
-    auto& d = next.add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str());
+    auto& d = next.add("MON_MSGR2_NOT_ENABLED", HEALTH_WARN, ss.str(),
+                       details.size());
     d.detail.swap(details);
   }
 }
@@ -218,7 +218,8 @@ void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
       health_check_t *check = &new_checks.get_or_add(
         mds_metric_name(metric.type),
         metric.sev,
-        mds_metric_summary(metric.type));
+        mds_metric_summary(metric.type),
+        1);
       ostringstream ss;
       ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
       bool first = true;
@@ -322,7 +322,7 @@ void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
   if (pending_map.active_gid == 0) {
     auto level = should_warn_about_mgr_down();
     if (level != HEALTH_OK) {
-      next.add("MGR_DOWN", level, "no active mgr");
+      next.add("MGR_DOWN", level, "no active mgr", 0);
     } else {
       dout(10) << __func__ << " no health warning (never active and new cluster)"
                << dendl;
@@ -2605,19 +2605,20 @@ void PGMap::get_health_checks(
       // Compose summary message saying how many PGs in what states led
       // to this health check failing
       std::vector<std::string> pg_msgs;
+      int64_t count = 0;
       for (const auto &j : i.second.states) {
        std::ostringstream msg;
        msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first);
        pg_msgs.push_back(msg.str());
+       count += j.second;
       }
       summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", "));
 
-
-
       health_check_t *check = &checks->add(
        health_code,
        sev,
-       summary);
+       summary,
+       count);
 
       // Compose list of PGs contributing to this health check failing
       for (const auto &j : i.second.pg_messages) {
@@ -2629,7 +2630,8 @@ void PGMap::get_health_checks(
   if (pg_sum.stats.sum.num_scrub_errors) {
     ostringstream ss;
     ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors";
-    checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str());
+    checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(),
+                pg_sum.stats.sum.num_scrub_errors);
   }
 
   // LARGE_OMAP_OBJECTS
@@ -2656,7 +2658,8 @@ void PGMap::get_health_checks(
   if (!detail.empty()) {
     ostringstream ss;
     ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects";
-    auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str());
+    auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(),
+                          pg_sum.stats.sum.num_large_omap_objects);
     stringstream tip;
     tip << "Search the cluster log for 'Large omap object found' for more "
         << "details.";
@@ -2711,7 +2714,8 @@ void PGMap::get_health_checks(
   if (!detail.empty()) {
     ostringstream ss;
     ss << num_pools << " cache pools at or near target size";
-    auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str());
+    auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(),
+                          num_pools);
     d.detail.swap(detail);
   }
 }
@@ -2727,7 +2731,8 @@ void PGMap::get_health_checks(
     ostringstream ss;
     ss << "too few PGs per OSD (" << per
        << " < min " << min_pg_per_osd << ")";
-    checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str());
+    checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(),
+                min_pg_per_osd - per);
   }
 }
 
@@ -2739,7 +2744,8 @@ void PGMap::get_health_checks(
     ostringstream ss;
     ss << "too many PGs per OSD (" << per
        << " > max " << max_pg_per_osd << ")";
-    checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str());
+    checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(),
+                per - max_pg_per_osd);
   }
 }
 
@@ -2750,7 +2756,8 @@ void PGMap::get_health_checks(
     ostringstream ss;
     ss << "OSD count " << osdmap.get_num_osds()
       << " < osd_pool_default_size " << osd_pool_default_size;
-    checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str());
+    checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(),
+                osd_pool_default_size - osdmap.get_num_osds());
   }
 
   // SMALLER_PGP_NUM
@@ -2803,14 +2810,16 @@ void PGMap::get_health_checks(
   if (!pgp_detail.empty()) {
     ostringstream ss;
     ss << pgp_detail.size() << " pools have pg_num > pgp_num";
-    auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str());
+    auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(),
+                          pgp_detail.size());
     d.detail.swap(pgp_detail);
   }
   if (!many_detail.empty()) {
     ostringstream ss;
     ss << many_detail.size() << " pools have many more objects per pg than"
        << " average";
-    auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str());
+    auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(),
+                          many_detail.size());
     d.detail.swap(many_detail);
   }
 }
@@ -2880,13 +2889,13 @@ void PGMap::get_health_checks(
     if (full_pools) {
       ostringstream ss;
       ss << full_pools << " pools full";
-      auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str());
+      auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools);
       d.detail.swap(full_detail);
     }
     if (nearfull_pools) {
       ostringstream ss;
       ss << nearfull_pools << " pools nearfull";
-      auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str());
+      auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools);
       d.detail.swap(nearfull_detail);
     }
   }
@@ -2903,7 +2912,8 @@ void PGMap::get_health_checks(
     ss << pg_sum.stats.sum.num_objects_misplaced
        << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced ("
        << b << "%)";
-    checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str());
+    checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(),
+                pg_sum.stats.sum.num_objects_misplaced);
   }
 
   // OBJECT_UNFOUND
@@ -2916,7 +2926,8 @@ void PGMap::get_health_checks(
     ostringstream ss;
     ss << pg_sum.stats.sum.num_objects_unfound
        << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)";
-    auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str());
+    auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(),
+                          pg_sum.stats.sum.num_objects_unfound);
 
     for (auto& p : pg_stat) {
       if (p.second.stats.sum.num_objects_unfound) {
@@ -2988,7 +2999,7 @@ void PGMap::get_health_checks(
     ostringstream ss;
     ss << warn << " slow requests are blocked > "
        << cct->_conf->mon_osd_warn_op_age << " sec";
-    auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str());
+    auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn);
     d.detail.swap(warn_detail);
     int left = max;
     for (auto& p : warn_osd_by_max) {
@@ -3010,7 +3021,7 @@ void PGMap::get_health_checks(
     ostringstream ss;
     ss << error << " stuck requests are blocked > "
        << err_age << " sec";
-    auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str());
+    auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error);
     d.detail.swap(error_detail);
     int left = max;
     for (auto& p : error_osd_by_max) {
|
||||
} else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") {
|
||||
summary = " reporting legacy (not per-pool) BlueStore omap usage stats";
|
||||
}
|
||||
auto& d = checks->add(asum.first, HEALTH_WARN, summary);
|
||||
auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first);
|
||||
for (auto& s : asum.second.second) {
|
||||
d.detail.push_back(s);
|
||||
}
|
||||
@ -3140,7 +3151,7 @@ void PGMap::get_health_checks(
|
||||
if (detail_total) {
|
||||
ostringstream ss;
|
||||
ss << detail_total << " pgs not scrubbed in time";
|
||||
auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
|
||||
auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total);
|
||||
|
||||
if (!detail.empty()) {
|
||||
d.detail.swap(detail);
|
||||
@ -3155,7 +3166,8 @@ void PGMap::get_health_checks(
|
||||
if (deep_detail_total) {
|
||||
ostringstream ss;
|
||||
ss << deep_detail_total << " pgs not deep-scrubbed in time";
|
||||
auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
|
||||
auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(),
|
||||
deep_detail_total);
|
||||
|
||||
if (!deep_detail.empty()) {
|
||||
d.detail.swap(deep_detail);
|
||||
@@ -3196,7 +3208,8 @@ void PGMap::get_health_checks(
   if (!detail.empty()) {
     ostringstream ss;
     ss << detail.size() << " pool(s) do not have an application enabled";
-    auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str());
+    auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(),
+                          detail.size());
     stringstream tip;
     tip << "use 'ceph osd pool application enable <pool-name> "
         << "<app-name>', where <app-name> is 'cephfs', 'rbd', 'rgw', "
@@ -3244,7 +3257,8 @@ void PGMap::get_health_checks(
 
     stringstream ss;
     ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)";
-    auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str());
+    auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(),
+                          snaptrimq_exceeded);
     detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\".");
     d.detail.swap(detail);
   }
@@ -14,12 +14,16 @@ struct health_check_t {
   health_status_t severity;
   std::string summary;
   std::list<std::string> detail;
+  int64_t count = 0;
 
   DENC(health_check_t, v, p) {
-    DENC_START(1, 1, p);
+    DENC_START(2, 1, p);
     denc(v.severity, p);
     denc(v.summary, p);
     denc(v.detail, p);
+    if (struct_v >= 2) {
+      denc(v.count, p);
+    }
     DENC_FINISH(p);
   }
 
@@ -27,7 +31,8 @@ struct health_check_t {
                          const health_check_t& r) {
     return l.severity == r.severity &&
       l.summary == r.summary &&
-      l.detail == r.detail;
+      l.detail == r.detail &&
+      l.count == r.count;
   }
   friend bool operator!=(const health_check_t& l,
                          const health_check_t& r) {
@@ -39,6 +44,7 @@ struct health_check_t {
 
     f->open_object_section("summary");
     f->dump_string("message", summary);
+    f->dump_int("count", count);
     f->close_section();
 
     if (want_detail) {
@@ -58,6 +64,7 @@ struct health_check_t {
     ls.back()->severity = HEALTH_ERR;
     ls.back()->summary = "summarization";
     ls.back()->detail = {"one", "two", "three"};
+    ls.back()->count = 42;
   }
 };
 WRITE_CLASS_DENC(health_check_t)
@@ -117,14 +124,15 @@ struct health_check_map_t {
     ls.push_back(new health_check_map_t);
     ls.push_back(new health_check_map_t);
     {
-      auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo");
+      auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo", 2);
       d.detail.push_back("a");
       d.detail.push_back("b");
     }
     {
-      auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!");
+      auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!", 3);
       d.detail.push_back("c");
       d.detail.push_back("d");
       d.detail.push_back("e");
     }
   }
 
@@ -140,19 +148,23 @@ struct health_check_map_t {
 
   health_check_t& add(const std::string& code,
                       health_status_t severity,
-                      const std::string& summary) {
+                      const std::string& summary,
+                      int64_t count) {
     ceph_assert(checks.count(code) == 0);
     health_check_t& r = checks[code];
     r.severity = severity;
     r.summary = summary;
+    r.count = count;
     return r;
   }
   health_check_t& get_or_add(const std::string& code,
                              health_status_t severity,
-                             const std::string& summary) {
+                             const std::string& summary,
+                             int64_t count) {
     health_check_t& r = checks[code];
     r.severity = severity;
     r.summary = summary;
+    r.count += count;
     return r;
   }
 
@@ -168,6 +180,7 @@ struct health_check_map_t {
                               q->second.detail.end(),
                               p.second.detail.begin(),
                               p.second.detail.end());
+        q->second.count += p.second.count;
       }
     }
   }
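The DENC change above bumps the struct version to 2 and only decodes count when struct_v >= 2, so health maps encoded before this commit should still decode, with count simply left at 0. A rough standalone illustration of that version-gating idea follows; it is not the real Ceph DENC machinery, and the blob helper, the encode/decode signatures, and main() are invented for this example:

// Toy version-gated encode/decode, mimicking the struct_v >= 2 guard.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

struct blob {
  std::vector<uint8_t> data;
  size_t off = 0;
  void put(const void* p, size_t n) {
    auto b = static_cast<const uint8_t*>(p);
    data.insert(data.end(), b, b + n);
  }
  void get(void* p, size_t n) { std::memcpy(p, &data[off], n); off += n; }
};

struct check {
  int64_t count = 0;

  void encode(blob& b, uint8_t struct_v) const {
    b.put(&struct_v, 1);              // version header, like DENC_START(2, 1, p)
    if (struct_v >= 2)
      b.put(&count, sizeof(count));   // the new field only exists in v2 encodings
  }
  void decode(blob& b) {
    uint8_t struct_v;
    b.get(&struct_v, 1);
    if (struct_v >= 2)                // old (v1) encodings simply leave count == 0
      b.get(&count, sizeof(count));
  }
};

int main() {
  check old_style;
  old_style.count = 7;
  blob b;
  old_style.encode(b, 1);             // pretend a pre-upgrade daemon encoded this
  check decoded;
  decoded.decode(b);
  std::cout << decoded.count << "\n"; // prints 0: the field is absent from v1 data
  return 0;
}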
@@ -5568,7 +5568,8 @@ void OSDMap::check_health(health_check_map_t *checks) const
       string err = string("OSD_") +
         string(crush->get_type_name(type)) + "_DOWN";
       boost::to_upper(err);
-      auto& d = checks->add(err, HEALTH_WARN, ss.str());
+      auto& d = checks->add(err, HEALTH_WARN, ss.str(),
+                            subtree_type_down[type].size());
       for (auto j = subtree_type_down[type].rbegin();
            j != subtree_type_down[type].rend();
            ++j) {
@@ -5591,7 +5592,8 @@ void OSDMap::check_health(health_check_map_t *checks) const
     }
     ostringstream ss;
     ss << down_in_osds.size() << " osds down";
-    auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str());
+    auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(),
+                          down_in_osds.size());
     for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) {
       ostringstream ss;
       ss << "osd." << *it << " (";
@@ -5604,7 +5606,8 @@ void OSDMap::check_health(health_check_map_t *checks) const
   if (!osds.empty()) {
     ostringstream ss;
     ss << osds.size() << " osds exist in the crush map but not in the osdmap";
-    auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str());
+    auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(),
+                          osds.size());
     for (auto osd : osds) {
       ostringstream ss;
       ss << "osd." << osd << " exists in crush map but not in osdmap";
@@ -5634,7 +5637,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
     out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : "";
     out += nodeepscrub ? "nodeep-scrub" : "";
     auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK,
-                          "Some pool(s) have the " + out + " flag(s) set");
+                          "Some pool(s) have the " + out + " flag(s) set", 0);
     d.detail.splice(d.detail.end(), scrub_messages);
   }
 
@@ -5673,7 +5676,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
   }
   if (!detail.empty()) {
     auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR,
-                          "full ratio(s) out of order");
+                          "full ratio(s) out of order", 0);
     d.detail.swap(detail);
   }
 }
@@ -5688,7 +5691,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
   if (full.size()) {
     ostringstream ss;
     ss << full.size() << " full osd(s)";
-    auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str());
+    auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size());
     for (auto& i: full) {
       ostringstream ss;
       ss << "osd." << i << " is full";
@@ -5698,7 +5701,8 @@ void OSDMap::check_health(health_check_map_t *checks) const
   if (backfillfull.size()) {
     ostringstream ss;
     ss << backfillfull.size() << " backfillfull osd(s)";
-    auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str());
+    auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(),
+                          backfillfull.size());
     for (auto& i: backfillfull) {
       ostringstream ss;
       ss << "osd." << i << " is backfill full";
@@ -5708,7 +5712,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
   if (nearfull.size()) {
     ostringstream ss;
     ss << nearfull.size() << " nearfull osd(s)";
-    auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str());
+    auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size());
     for (auto& i: nearfull) {
       ostringstream ss;
       ss << "osd." << i << " is near full";
@@ -5739,9 +5743,10 @@ void OSDMap::check_health(health_check_map_t *checks) const
     CEPH_OSDMAP_NOREBALANCE;
   if (test_flag(warn_flags)) {
     ostringstream ss;
-    ss << get_flag_string(get_flags() & warn_flags)
-       << " flag(s) set";
-    checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str());
+    string s = get_flag_string(get_flags() & warn_flags);
+    ss << s << " flag(s) set";
+    checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(),
+                s.size() /* kludgey but sufficient */);
   }
 }
 
@@ -5787,7 +5792,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
   if (!detail.empty()) {
     ostringstream ss;
     ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set";
-    auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str());
+    auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size());
     d.detail.swap(detail);
   }
 }
@@ -5799,7 +5804,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
     ostringstream ss;
     ss << "crush map has legacy tunables (require " << min
        << ", min is " << g_conf()->mon_crush_min_required_version << ")";
-    auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str());
+    auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0);
     d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
   }
 }
@@ -5809,7 +5814,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
   if (crush->get_straw_calc_version() == 0) {
     ostringstream ss;
     ss << "crush map has straw_calc_version=0";
-    auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str());
+    auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0);
     d.detail.push_back(
       "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables");
   }
@@ -5832,7 +5837,8 @@ void OSDMap::check_health(health_check_map_t *checks) const
   if (!detail.empty()) {
     ostringstream ss;
     ss << detail.size() << " cache pools are missing hit_sets";
-    auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str());
+    auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(),
+                          detail.size());
     d.detail.swap(detail);
   }
 }
@@ -5841,7 +5847,7 @@ void OSDMap::check_health(health_check_map_t *checks) const
   if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) {
     ostringstream ss;
     ss << "'sortbitwise' flag is not set";
-    checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str());
+    checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0);
   }
 
   // OSD_UPGRADE_FINISHED
@@ -5876,19 +5882,21 @@ void OSDMap::check_health(health_check_map_t *checks) const
   if (!full_detail.empty()) {
     ostringstream ss;
     ss << full_detail.size() << " pool(s) full";
-    auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str());
+    auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size());
     d.detail.swap(full_detail);
   }
   if (!backfillfull_detail.empty()) {
     ostringstream ss;
     ss << backfillfull_detail.size() << " pool(s) backfillfull";
-    auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str());
+    auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(),
+                          backfillfull_detail.size());
     d.detail.swap(backfillfull_detail);
   }
   if (!nearfull_detail.empty()) {
     ostringstream ss;
     ss << nearfull_detail.size() << " pool(s) nearfull";
-    auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str());
+    auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(),
+                          nearfull_detail.size());
     d.detail.swap(nearfull_detail);
   }
 }
@@ -87,6 +87,7 @@ class Module(MgrModule):
             health_checks['RECENT_CRASH'] = {
                 'severity': 'warning',
                 'summary': '%d daemons have recently crashed' % (num),
+                'count': num,
                 'detail': detail,
             }
         self.set_health_checks(health_checks)
@@ -553,6 +553,7 @@ class Module(MgrModule):
            checks[warning] = {
                'severity': 'warning',
                'summary': HEALTH_MESSAGES[warning] % n,
+               'count': len(ls),
                'detail': ls,
            }
        self.set_health_checks(checks)
@@ -896,6 +896,7 @@ class MgrModule(ceph_module.BaseMgrModule):
               'CHECK_FOO': {
                 'severity': 'warning',           # or 'error'
                 'summary': 'summary string',
+                'count': 4,                      # quantify badness
                 'detail': [ 'list', 'of', 'detail', 'strings' ],
               },
               'CHECK_BAR': {
@@ -401,6 +401,7 @@ class PgAutoscaler(MgrModule):
                health_checks['POOL_TOO_FEW_PGS'] = {
                    'severity': 'warning',
                    'summary': summary,
+                   'count': len(too_few),
                    'detail': too_few
                }
            if too_many:
@@ -409,6 +410,7 @@ class PgAutoscaler(MgrModule):
                health_checks['POOL_TOO_MANY_PGS'] = {
                    'severity': 'warning',
                    'summary': summary,
+                   'count': len(too_many),
                    'detail': too_many
                }
 
@@ -436,6 +438,7 @@ class PgAutoscaler(MgrModule):
                health_checks['POOL_TARGET_SIZE_RATIO_OVERCOMMITTED'] = {
                    'severity': 'warning',
                    'summary': "%d subtrees have overcommitted pool target_size_ratio" % len(too_much_target_ratio),
+                   'count': len(too_much_target_ratio),
                    'detail': too_much_target_ratio,
                }
 
@@ -465,6 +468,7 @@ class PgAutoscaler(MgrModule):
                health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = {
                    'severity': 'warning',
                    'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes),
+                   'count': len(too_much_target_bytes),
                    'detail': too_much_target_bytes,
                }
 
@@ -177,6 +177,7 @@ class Module(MgrModule):
                self._health[check] = {
                    "severity": str(info["severity"]),
                    "summary": str(info["summary"]),
+                   "count": 123,
                    "detail": [str(m) for m in info["detail"]]
                }
        except Exception as e: