diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc index 0ba39f2f7be..0776caed54c 100644 --- a/src/mds/FSMap.cc +++ b/src/mds/FSMap.cc @@ -386,7 +386,7 @@ void FSMap::get_health_checks(health_check_map_t *checks) const if (!stuck_failed.empty()) { health_check_t& fscheck = checks->get_or_add( "FS_WITH_FAILED_MDS", HEALTH_WARN, - "%num% filesystem%plurals% %hasorhave% a failed mds daemon"); + "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1); ostringstream ss; ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size() << " failed mds" << (stuck_failed.size() > 1 ? "s" : ""); @@ -402,7 +402,7 @@ void FSMap::get_health_checks(health_check_map_t *checks) const if (standby_count_wanted) { std::ostringstream oss, dss; oss << "insufficient standby MDS daemons available"; - auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str()); + auto& d = checks->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN, oss.str(), 1); dss << "have " << standby_daemons.size() << "; want " << standby_count_wanted << " more"; d.detail.push_back(dss.str()); diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index 4de759fa11f..fb695d0f4d9 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -435,7 +435,8 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const // MDS_DAMAGE if (!damaged.empty()) { health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR, - "%num% mds daemon%plurals% damaged"); + "%num% mds daemon%plurals% damaged", + damaged.size()); for (auto p : damaged) { std::ostringstream oss; oss << "fs " << fs_name << " mds." << p << " is damaged"; @@ -447,7 +448,7 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const if (is_degraded()) { health_check_t& fscheck = checks->get_or_add( "FS_DEGRADED", HEALTH_WARN, - "%num% filesystem%plurals% %isorare% degraded"); + "%num% filesystem%plurals% %isorare% degraded", 1); ostringstream ss; ss << "fs " << fs_name << " is degraded"; fscheck.detail.push_back(ss.str()); @@ -478,7 +479,7 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const if ((mds_rank_t)get_num_in_mds() < get_max_mds()) { health_check_t& check = checks->add( "MDS_UP_LESS_THAN_MAX", HEALTH_WARN, - "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds"); + "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1); stringstream ss; ss << "fs " << fs_name << " has " << get_num_in_mds() << " MDS online, but wants " << get_max_mds(); @@ -489,7 +490,7 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) { health_check_t &check = checks->add( "MDS_ALL_DOWN", HEALTH_ERR, - "%num% filesystem%plurals% %isorare% offline"); + "%num% filesystem%plurals% %isorare% offline", 1); stringstream ss; ss << "fs " << fs_name << " is offline because no MDS is active for it."; check.detail.push_back(ss.str()); @@ -499,7 +500,7 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const was_snaps_ever_allowed() && !allows_multimds_snaps()) { health_check_t &check = checks->add( "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR, - "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots"); + "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1); stringstream ss; ss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS"; check.detail.push_back(ss.str()); diff --git a/src/mgr/BaseMgrModule.cc b/src/mgr/BaseMgrModule.cc index ddc387f6f55..d18069236f4 100644 --- a/src/mgr/BaseMgrModule.cc +++ b/src/mgr/BaseMgrModule.cc @@ -267,6 +267,7 @@ ceph_set_health_checks(BaseMgrModule *self, PyObject *args) health_status_t severity = HEALTH_OK; string summary; list detail; + int64_t count = 0; PyObject *infols = PyDict_Items(check_info); for (int j = 0; j < PyList_Size(infols); ++j) { PyObject *pair = PyList_GET_ITEM(infols, j); @@ -301,6 +302,14 @@ ceph_set_health_checks(BaseMgrModule *self, PyObject *args) } else { summary = std::move(vs); } + } else if (ks == "count") { + if (PyLong_Check(v)) { + count = PyLong_AsLong(v); + } else { + derr << __func__ << " check " << check_name + << " count value not long" << dendl; + continue; + } } else if (ks == "detail") { if (!PyList_Check(v)) { derr << __func__ << " check " << check_name @@ -322,7 +331,7 @@ ceph_set_health_checks(BaseMgrModule *self, PyObject *args) << " unexpected key " << k << dendl; } } - auto& d = out_checks.add(check_name, severity, summary); + auto& d = out_checks.add(check_name, severity, summary, count); d.detail.swap(detail); } diff --git a/src/mgr/DaemonHealthMetricCollector.cc b/src/mgr/DaemonHealthMetricCollector.cc index beba5dad815..0f4d5bd643e 100644 --- a/src/mgr/DaemonHealthMetricCollector.cc +++ b/src/mgr/DaemonHealthMetricCollector.cc @@ -18,7 +18,7 @@ class SlowOps final : public DaemonHealthMetricCollector { return type == daemon_metric::SLOW_OPS; } health_check_t& _get_check(health_check_map_t& cm) const override { - return cm.get_or_add("SLOW_OPS", HEALTH_WARN, ""); + return cm.get_or_add("SLOW_OPS", HEALTH_WARN, "", 1); } bool _update(const DaemonKey& daemon, const DaemonHealthMetric& metric) override { @@ -61,7 +61,7 @@ class PendingPGs final : public DaemonHealthMetricCollector { return type == daemon_metric::PENDING_CREATING_PGS; } health_check_t& _get_check(health_check_map_t& cm) const override { - return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, ""); + return cm.get_or_add("PENDING_CREATING_PGS", HEALTH_WARN, "", 1); } bool _update(const DaemonKey& osd, const DaemonHealthMetric& metric) override { diff --git a/src/mgr/PyModuleRegistry.cc b/src/mgr/PyModuleRegistry.cc index 16c43e40e07..ac92c9282cc 100644 --- a/src/mgr/PyModuleRegistry.cc +++ b/src/mgr/PyModuleRegistry.cc @@ -386,7 +386,8 @@ void PyModuleRegistry::get_health_checks(health_check_map_t *checks) ss << dependency_modules.size() << " mgr modules have failed dependencies"; } - auto& d = checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str()); + auto& d = checks->add("MGR_MODULE_DEPENDENCY", HEALTH_WARN, ss.str(), + dependency_modules.size()); for (auto& i : dependency_modules) { std::ostringstream ss; ss << "Module '" << i.first << "' has failed dependency: " << i.second; @@ -402,7 +403,8 @@ void PyModuleRegistry::get_health_checks(health_check_map_t *checks) } else if (failed_modules.size() > 1) { ss << failed_modules.size() << " mgr modules have failed"; } - auto& d = checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str()); + auto& d = checks->add("MGR_MODULE_ERROR", HEALTH_ERR, ss.str(), + failed_modules.size()); for (auto& i : failed_modules) { std::ostringstream ss; ss << "Module '" << i.first << "' has failed: " << i.second; diff --git a/src/mon/AuthMonitor.cc b/src/mon/AuthMonitor.cc index 89f88b65cda..4025a926ee5 100644 --- a/src/mon/AuthMonitor.cc +++ b/src/mon/AuthMonitor.cc @@ -418,7 +418,8 @@ void AuthMonitor::encode_pending(MonitorDBStore::TransactionRef t) if (bad_detail.size()) { ostringstream ss; ss << bad_detail.size() << " auth entities have invalid capabilities"; - health_check_t *check = &next.add("AUTH_BAD_CAPS", HEALTH_ERR, ss.str()); + health_check_t *check = &next.add("AUTH_BAD_CAPS", HEALTH_ERR, ss.str(), + bad_detail.size()); for (auto& i : bad_detail) { for (auto& j : i.second) { check->detail.push_back(j); diff --git a/src/mon/HealthMonitor.cc b/src/mon/HealthMonitor.cc index 18600acbe25..a750eba9ef0 100644 --- a/src/mon/HealthMonitor.cc +++ b/src/mon/HealthMonitor.cc @@ -570,14 +570,14 @@ bool HealthMonitor::check_member_health() if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_crit) { stringstream ss, ss2; ss << "mon%plurals% %names% %isorare% very low on available space"; - auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str()); + auto& d = next.add("MON_DISK_CRIT", HEALTH_ERR, ss.str(), 1); ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent << "% avail"; d.detail.push_back(ss2.str()); } else if (stats.fs_stats.avail_percent <= g_conf()->mon_data_avail_warn) { stringstream ss, ss2; ss << "mon%plurals% %names% %isorare% low on available space"; - auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str()); + auto& d = next.add("MON_DISK_LOW", HEALTH_WARN, ss.str(), 1); ss2 << "mon." << mon->name << " has " << stats.fs_stats.avail_percent << "% avail"; d.detail.push_back(ss2.str()); @@ -585,7 +585,7 @@ bool HealthMonitor::check_member_health() if (stats.store_stats.bytes_total >= g_conf()->mon_data_size_warn) { stringstream ss, ss2; ss << "mon%plurals% %names% %isorare% using a lot of disk space"; - auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str()); + auto& d = next.add("MON_DISK_BIG", HEALTH_WARN, ss.str(), 1); ss2 << "mon." << mon->name << " is " << byte_u_t(stats.store_stats.bytes_total) << " >= mon_data_size_warn (" @@ -611,7 +611,7 @@ bool HealthMonitor::check_member_health() g_conf()->mon_osd_down_out_interval == 0) { ostringstream ss, ds; ss << "mon%plurals% %names% %hasorhave% mon_osd_down_out_interval set to 0"; - auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str()); + auto& d = next.add("OSD_NO_DOWN_OUT_INTERVAL", HEALTH_WARN, ss.str(), 1); ds << "mon." << mon->name << " has mon_osd_down_out_interval set to 0"; d.detail.push_back(ds.str()); } @@ -669,7 +669,7 @@ bool HealthMonitor::check_leader_health() ostringstream ss; ss << (max-actual) << "/" << max << " mons down, quorum " << mon->get_quorum_names(); - auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str()); + auto& d = next.add("MON_DOWN", HEALTH_WARN, ss.str(), max - actual); set q = mon->get_quorum(); for (int i=0; i pg_msgs; + int64_t count = 0; for (const auto &j : i.second.states) { std::ostringstream msg; msg << j.second << (j.second > 1 ? " pgs " : " pg ") << state_name(j.first); pg_msgs.push_back(msg.str()); + count += j.second; } summary += joinify(pg_msgs.begin(), pg_msgs.end(), std::string(", ")); - - health_check_t *check = &checks->add( health_code, sev, - summary); + summary, + count); // Compose list of PGs contributing to this health check failing for (const auto &j : i.second.pg_messages) { @@ -2629,7 +2630,8 @@ void PGMap::get_health_checks( if (pg_sum.stats.sum.num_scrub_errors) { ostringstream ss; ss << pg_sum.stats.sum.num_scrub_errors << " scrub errors"; - checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str()); + checks->add("OSD_SCRUB_ERRORS", HEALTH_ERR, ss.str(), + pg_sum.stats.sum.num_scrub_errors); } // LARGE_OMAP_OBJECTS @@ -2656,7 +2658,8 @@ void PGMap::get_health_checks( if (!detail.empty()) { ostringstream ss; ss << pg_sum.stats.sum.num_large_omap_objects << " large omap objects"; - auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str()); + auto& d = checks->add("LARGE_OMAP_OBJECTS", HEALTH_WARN, ss.str(), + pg_sum.stats.sum.num_large_omap_objects); stringstream tip; tip << "Search the cluster log for 'Large omap object found' for more " << "details."; @@ -2711,7 +2714,8 @@ void PGMap::get_health_checks( if (!detail.empty()) { ostringstream ss; ss << num_pools << " cache pools at or near target size"; - auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str()); + auto& d = checks->add("CACHE_POOL_NEAR_FULL", HEALTH_WARN, ss.str(), + num_pools); d.detail.swap(detail); } } @@ -2727,7 +2731,8 @@ void PGMap::get_health_checks( ostringstream ss; ss << "too few PGs per OSD (" << per << " < min " << min_pg_per_osd << ")"; - checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str()); + checks->add("TOO_FEW_PGS", HEALTH_WARN, ss.str(), + min_pg_per_osd - per); } } @@ -2739,7 +2744,8 @@ void PGMap::get_health_checks( ostringstream ss; ss << "too many PGs per OSD (" << per << " > max " << max_pg_per_osd << ")"; - checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str()); + checks->add("TOO_MANY_PGS", HEALTH_WARN, ss.str(), + per - max_pg_per_osd); } } @@ -2750,7 +2756,8 @@ void PGMap::get_health_checks( ostringstream ss; ss << "OSD count " << osdmap.get_num_osds() << " < osd_pool_default_size " << osd_pool_default_size; - checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str()); + checks->add("TOO_FEW_OSDS", HEALTH_WARN, ss.str(), + osd_pool_default_size - osdmap.get_num_osds()); } // SMALLER_PGP_NUM @@ -2803,14 +2810,16 @@ void PGMap::get_health_checks( if (!pgp_detail.empty()) { ostringstream ss; ss << pgp_detail.size() << " pools have pg_num > pgp_num"; - auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str()); + auto& d = checks->add("SMALLER_PGP_NUM", HEALTH_WARN, ss.str(), + pgp_detail.size()); d.detail.swap(pgp_detail); } if (!many_detail.empty()) { ostringstream ss; ss << many_detail.size() << " pools have many more objects per pg than" << " average"; - auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str()); + auto& d = checks->add("MANY_OBJECTS_PER_PG", HEALTH_WARN, ss.str(), + many_detail.size()); d.detail.swap(many_detail); } } @@ -2880,13 +2889,13 @@ void PGMap::get_health_checks( if (full_pools) { ostringstream ss; ss << full_pools << " pools full"; - auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str()); + auto& d = checks->add("POOL_FULL", HEALTH_ERR, ss.str(), full_pools); d.detail.swap(full_detail); } if (nearfull_pools) { ostringstream ss; ss << nearfull_pools << " pools nearfull"; - auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str()); + auto& d = checks->add("POOL_NEAR_FULL", HEALTH_WARN, ss.str(), nearfull_pools); d.detail.swap(nearfull_detail); } } @@ -2903,7 +2912,8 @@ void PGMap::get_health_checks( ss << pg_sum.stats.sum.num_objects_misplaced << "/" << pg_sum.stats.sum.num_object_copies << " objects misplaced (" << b << "%)"; - checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str()); + checks->add("OBJECT_MISPLACED", HEALTH_WARN, ss.str(), + pg_sum.stats.sum.num_objects_misplaced); } // OBJECT_UNFOUND @@ -2916,7 +2926,8 @@ void PGMap::get_health_checks( ostringstream ss; ss << pg_sum.stats.sum.num_objects_unfound << "/" << pg_sum.stats.sum.num_objects << " objects unfound (" << b << "%)"; - auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str()); + auto& d = checks->add("OBJECT_UNFOUND", HEALTH_WARN, ss.str(), + pg_sum.stats.sum.num_objects_unfound); for (auto& p : pg_stat) { if (p.second.stats.sum.num_objects_unfound) { @@ -2988,7 +2999,7 @@ void PGMap::get_health_checks( ostringstream ss; ss << warn << " slow requests are blocked > " << cct->_conf->mon_osd_warn_op_age << " sec"; - auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str()); + auto& d = checks->add("REQUEST_SLOW", HEALTH_WARN, ss.str(), warn); d.detail.swap(warn_detail); int left = max; for (auto& p : warn_osd_by_max) { @@ -3010,7 +3021,7 @@ void PGMap::get_health_checks( ostringstream ss; ss << error << " stuck requests are blocked > " << err_age << " sec"; - auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str()); + auto& d = checks->add("REQUEST_STUCK", HEALTH_ERR, ss.str(), error); d.detail.swap(error_detail); int left = max; for (auto& p : error_osd_by_max) { @@ -3071,7 +3082,7 @@ void PGMap::get_health_checks( } else if (asum.first == "BLUESTORE_NO_PER_POOL_OMAP") { summary = " reporting legacy (not per-pool) BlueStore omap usage stats"; } - auto& d = checks->add(asum.first, HEALTH_WARN, summary); + auto& d = checks->add(asum.first, HEALTH_WARN, summary, asum.second.first); for (auto& s : asum.second.second) { d.detail.push_back(s); } @@ -3140,7 +3151,7 @@ void PGMap::get_health_checks( if (detail_total) { ostringstream ss; ss << detail_total << " pgs not scrubbed in time"; - auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str()); + auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str(), detail_total); if (!detail.empty()) { d.detail.swap(detail); @@ -3155,7 +3166,8 @@ void PGMap::get_health_checks( if (deep_detail_total) { ostringstream ss; ss << deep_detail_total << " pgs not deep-scrubbed in time"; - auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str()); + auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str(), + deep_detail_total); if (!deep_detail.empty()) { d.detail.swap(deep_detail); @@ -3196,7 +3208,8 @@ void PGMap::get_health_checks( if (!detail.empty()) { ostringstream ss; ss << detail.size() << " pool(s) do not have an application enabled"; - auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str()); + auto& d = checks->add("POOL_APP_NOT_ENABLED", HEALTH_WARN, ss.str(), + detail.size()); stringstream tip; tip << "use 'ceph osd pool application enable " << "', where is 'cephfs', 'rbd', 'rgw', " @@ -3244,7 +3257,8 @@ void PGMap::get_health_checks( stringstream ss; ss << "snap trim queue for " << snaptrimq_exceeded << " pg(s) >= " << snapthreshold << " (mon_osd_snap_trim_queue_warn_on)"; - auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str()); + auto& d = checks->add("PG_SLOW_SNAP_TRIMMING", HEALTH_WARN, ss.str(), + snaptrimq_exceeded); detail.push_back("try decreasing \"osd snap trim sleep\" and/or increasing \"osd pg max concurrent snap trims\"."); d.detail.swap(detail); } diff --git a/src/mon/health_check.h b/src/mon/health_check.h index 4215a8bdbca..107f67293d3 100644 --- a/src/mon/health_check.h +++ b/src/mon/health_check.h @@ -14,12 +14,16 @@ struct health_check_t { health_status_t severity; std::string summary; std::list detail; + int64_t count = 0; DENC(health_check_t, v, p) { - DENC_START(1, 1, p); + DENC_START(2, 1, p); denc(v.severity, p); denc(v.summary, p); denc(v.detail, p); + if (struct_v >= 2) { + denc(v.count, p); + } DENC_FINISH(p); } @@ -27,7 +31,8 @@ struct health_check_t { const health_check_t& r) { return l.severity == r.severity && l.summary == r.summary && - l.detail == r.detail; + l.detail == r.detail && + l.count == r.count; } friend bool operator!=(const health_check_t& l, const health_check_t& r) { @@ -39,6 +44,7 @@ struct health_check_t { f->open_object_section("summary"); f->dump_string("message", summary); + f->dump_int("count", count); f->close_section(); if (want_detail) { @@ -58,6 +64,7 @@ struct health_check_t { ls.back()->severity = HEALTH_ERR; ls.back()->summary = "summarization"; ls.back()->detail = {"one", "two", "three"}; + ls.back()->count = 42; } }; WRITE_CLASS_DENC(health_check_t) @@ -117,14 +124,15 @@ struct health_check_map_t { ls.push_back(new health_check_map_t); ls.push_back(new health_check_map_t); { - auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo"); + auto& d = ls.back()->add("FOO", HEALTH_WARN, "foo", 2); d.detail.push_back("a"); d.detail.push_back("b"); } { - auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!"); + auto& d = ls.back()->add("BAR", HEALTH_ERR, "bar!", 3); d.detail.push_back("c"); d.detail.push_back("d"); + d.detail.push_back("e"); } } @@ -140,19 +148,23 @@ struct health_check_map_t { health_check_t& add(const std::string& code, health_status_t severity, - const std::string& summary) { + const std::string& summary, + int64_t count) { ceph_assert(checks.count(code) == 0); health_check_t& r = checks[code]; r.severity = severity; r.summary = summary; + r.count = count; return r; } health_check_t& get_or_add(const std::string& code, health_status_t severity, - const std::string& summary) { + const std::string& summary, + int64_t count) { health_check_t& r = checks[code]; r.severity = severity; r.summary = summary; + r.count += count; return r; } @@ -168,6 +180,7 @@ struct health_check_map_t { q->second.detail.end(), p.second.detail.begin(), p.second.detail.end()); + q->second.count += p.second.count; } } } diff --git a/src/osd/OSDMap.cc b/src/osd/OSDMap.cc index 6e6732cb4e2..3982dae9827 100644 --- a/src/osd/OSDMap.cc +++ b/src/osd/OSDMap.cc @@ -5568,7 +5568,8 @@ void OSDMap::check_health(health_check_map_t *checks) const string err = string("OSD_") + string(crush->get_type_name(type)) + "_DOWN"; boost::to_upper(err); - auto& d = checks->add(err, HEALTH_WARN, ss.str()); + auto& d = checks->add(err, HEALTH_WARN, ss.str(), + subtree_type_down[type].size()); for (auto j = subtree_type_down[type].rbegin(); j != subtree_type_down[type].rend(); ++j) { @@ -5591,7 +5592,8 @@ void OSDMap::check_health(health_check_map_t *checks) const } ostringstream ss; ss << down_in_osds.size() << " osds down"; - auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str()); + auto& d = checks->add("OSD_DOWN", HEALTH_WARN, ss.str(), + down_in_osds.size()); for (auto it = down_in_osds.begin(); it != down_in_osds.end(); ++it) { ostringstream ss; ss << "osd." << *it << " ("; @@ -5604,7 +5606,8 @@ void OSDMap::check_health(health_check_map_t *checks) const if (!osds.empty()) { ostringstream ss; ss << osds.size() << " osds exist in the crush map but not in the osdmap"; - auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str()); + auto& d = checks->add("OSD_ORPHAN", HEALTH_WARN, ss.str(), + osds.size()); for (auto osd : osds) { ostringstream ss; ss << "osd." << osd << " exists in crush map but not in osdmap"; @@ -5634,7 +5637,7 @@ void OSDMap::check_health(health_check_map_t *checks) const out += noscrub ? string("noscrub") + (nodeepscrub ? ", " : "") : ""; out += nodeepscrub ? "nodeep-scrub" : ""; auto& d = checks->add("POOL_SCRUB_FLAGS", HEALTH_OK, - "Some pool(s) have the " + out + " flag(s) set"); + "Some pool(s) have the " + out + " flag(s) set", 0); d.detail.splice(d.detail.end(), scrub_messages); } @@ -5673,7 +5676,7 @@ void OSDMap::check_health(health_check_map_t *checks) const } if (!detail.empty()) { auto& d = checks->add("OSD_OUT_OF_ORDER_FULL", HEALTH_ERR, - "full ratio(s) out of order"); + "full ratio(s) out of order", 0); d.detail.swap(detail); } } @@ -5688,7 +5691,7 @@ void OSDMap::check_health(health_check_map_t *checks) const if (full.size()) { ostringstream ss; ss << full.size() << " full osd(s)"; - auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str()); + auto& d = checks->add("OSD_FULL", HEALTH_ERR, ss.str(), full.size()); for (auto& i: full) { ostringstream ss; ss << "osd." << i << " is full"; @@ -5698,7 +5701,8 @@ void OSDMap::check_health(health_check_map_t *checks) const if (backfillfull.size()) { ostringstream ss; ss << backfillfull.size() << " backfillfull osd(s)"; - auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str()); + auto& d = checks->add("OSD_BACKFILLFULL", HEALTH_WARN, ss.str(), + backfillfull.size()); for (auto& i: backfillfull) { ostringstream ss; ss << "osd." << i << " is backfill full"; @@ -5708,7 +5712,7 @@ void OSDMap::check_health(health_check_map_t *checks) const if (nearfull.size()) { ostringstream ss; ss << nearfull.size() << " nearfull osd(s)"; - auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str()); + auto& d = checks->add("OSD_NEARFULL", HEALTH_WARN, ss.str(), nearfull.size()); for (auto& i: nearfull) { ostringstream ss; ss << "osd." << i << " is near full"; @@ -5739,9 +5743,10 @@ void OSDMap::check_health(health_check_map_t *checks) const CEPH_OSDMAP_NOREBALANCE; if (test_flag(warn_flags)) { ostringstream ss; - ss << get_flag_string(get_flags() & warn_flags) - << " flag(s) set"; - checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str()); + string s = get_flag_string(get_flags() & warn_flags); + ss << s << " flag(s) set"; + checks->add("OSDMAP_FLAGS", HEALTH_WARN, ss.str(), + s.size() /* kludgey but sufficient */); } } @@ -5787,7 +5792,7 @@ void OSDMap::check_health(health_check_map_t *checks) const if (!detail.empty()) { ostringstream ss; ss << detail.size() << " OSDs or CRUSH {nodes, device-classes} have {NOUP,NODOWN,NOIN,NOOUT} flags set"; - auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str()); + auto& d = checks->add("OSD_FLAGS", HEALTH_WARN, ss.str(), detail.size()); d.detail.swap(detail); } } @@ -5799,7 +5804,7 @@ void OSDMap::check_health(health_check_map_t *checks) const ostringstream ss; ss << "crush map has legacy tunables (require " << min << ", min is " << g_conf()->mon_crush_min_required_version << ")"; - auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str()); + auto& d = checks->add("OLD_CRUSH_TUNABLES", HEALTH_WARN, ss.str(), 0); d.detail.push_back("see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables"); } } @@ -5809,7 +5814,7 @@ void OSDMap::check_health(health_check_map_t *checks) const if (crush->get_straw_calc_version() == 0) { ostringstream ss; ss << "crush map has straw_calc_version=0"; - auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str()); + auto& d = checks->add("OLD_CRUSH_STRAW_CALC_VERSION", HEALTH_WARN, ss.str(), 0); d.detail.push_back( "see http://docs.ceph.com/docs/master/rados/operations/crush-map/#tunables"); } @@ -5832,7 +5837,8 @@ void OSDMap::check_health(health_check_map_t *checks) const if (!detail.empty()) { ostringstream ss; ss << detail.size() << " cache pools are missing hit_sets"; - auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str()); + auto& d = checks->add("CACHE_POOL_NO_HIT_SET", HEALTH_WARN, ss.str(), + detail.size()); d.detail.swap(detail); } } @@ -5841,7 +5847,7 @@ void OSDMap::check_health(health_check_map_t *checks) const if (!test_flag(CEPH_OSDMAP_SORTBITWISE)) { ostringstream ss; ss << "'sortbitwise' flag is not set"; - checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str()); + checks->add("OSD_NO_SORTBITWISE", HEALTH_WARN, ss.str(), 0); } // OSD_UPGRADE_FINISHED @@ -5876,19 +5882,21 @@ void OSDMap::check_health(health_check_map_t *checks) const if (!full_detail.empty()) { ostringstream ss; ss << full_detail.size() << " pool(s) full"; - auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str()); + auto& d = checks->add("POOL_FULL", HEALTH_WARN, ss.str(), full_detail.size()); d.detail.swap(full_detail); } if (!backfillfull_detail.empty()) { ostringstream ss; ss << backfillfull_detail.size() << " pool(s) backfillfull"; - auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str()); + auto& d = checks->add("POOL_BACKFILLFULL", HEALTH_WARN, ss.str(), + backfillfull_detail.size()); d.detail.swap(backfillfull_detail); } if (!nearfull_detail.empty()) { ostringstream ss; ss << nearfull_detail.size() << " pool(s) nearfull"; - auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str()); + auto& d = checks->add("POOL_NEARFULL", HEALTH_WARN, ss.str(), + nearfull_detail.size()); d.detail.swap(nearfull_detail); } } diff --git a/src/pybind/mgr/crash/module.py b/src/pybind/mgr/crash/module.py index d605dd34fe8..ba3931e6c44 100644 --- a/src/pybind/mgr/crash/module.py +++ b/src/pybind/mgr/crash/module.py @@ -87,6 +87,7 @@ class Module(MgrModule): health_checks['RECENT_CRASH'] = { 'severity': 'warning', 'summary': '%d daemons have recently crashed' % (num), + 'count': num, 'detail': detail, } self.set_health_checks(health_checks) diff --git a/src/pybind/mgr/devicehealth/module.py b/src/pybind/mgr/devicehealth/module.py index b657dc55f1b..febb14ee9b3 100644 --- a/src/pybind/mgr/devicehealth/module.py +++ b/src/pybind/mgr/devicehealth/module.py @@ -553,6 +553,7 @@ class Module(MgrModule): checks[warning] = { 'severity': 'warning', 'summary': HEALTH_MESSAGES[warning] % n, + 'count': len(ls), 'detail': ls, } self.set_health_checks(checks) diff --git a/src/pybind/mgr/mgr_module.py b/src/pybind/mgr/mgr_module.py index 731108397c8..b34bc523c92 100644 --- a/src/pybind/mgr/mgr_module.py +++ b/src/pybind/mgr/mgr_module.py @@ -896,6 +896,7 @@ class MgrModule(ceph_module.BaseMgrModule): 'CHECK_FOO': { 'severity': 'warning', # or 'error' 'summary': 'summary string', + 'count': 4, # quantify badness 'detail': [ 'list', 'of', 'detail', 'strings' ], }, 'CHECK_BAR': { diff --git a/src/pybind/mgr/pg_autoscaler/module.py b/src/pybind/mgr/pg_autoscaler/module.py index e21ecfdacdc..2c45c992174 100644 --- a/src/pybind/mgr/pg_autoscaler/module.py +++ b/src/pybind/mgr/pg_autoscaler/module.py @@ -401,6 +401,7 @@ class PgAutoscaler(MgrModule): health_checks['POOL_TOO_FEW_PGS'] = { 'severity': 'warning', 'summary': summary, + 'count': len(too_few), 'detail': too_few } if too_many: @@ -409,6 +410,7 @@ class PgAutoscaler(MgrModule): health_checks['POOL_TOO_MANY_PGS'] = { 'severity': 'warning', 'summary': summary, + 'count': len(too_many), 'detail': too_many } @@ -436,6 +438,7 @@ class PgAutoscaler(MgrModule): health_checks['POOL_TARGET_SIZE_RATIO_OVERCOMMITTED'] = { 'severity': 'warning', 'summary': "%d subtrees have overcommitted pool target_size_ratio" % len(too_much_target_ratio), + 'count': len(too_much_target_ratio), 'detail': too_much_target_ratio, } @@ -465,6 +468,7 @@ class PgAutoscaler(MgrModule): health_checks['POOL_TARGET_SIZE_BYTES_OVERCOMMITTED'] = { 'severity': 'warning', 'summary': "%d subtrees have overcommitted pool target_size_bytes" % len(too_much_target_bytes), + 'count': len(too_much_target_bytes), 'detail': too_much_target_bytes, } diff --git a/src/pybind/mgr/selftest/module.py b/src/pybind/mgr/selftest/module.py index a5a2cc10273..a3fb51cde13 100644 --- a/src/pybind/mgr/selftest/module.py +++ b/src/pybind/mgr/selftest/module.py @@ -177,6 +177,7 @@ class Module(MgrModule): self._health[check] = { "severity": str(info["severity"]), "summary": str(info["summary"]), + "count": 123, "detail": [str(m) for m in info["detail"]] } except Exception as e: