mirror of
https://github.com/ceph/ceph
synced 2025-03-30 07:19:14 +00:00
mon: Fix scrub health warning handling and change config to a ratio
Make this mon_warn code clearer since it involves 2 values. Code used mon scrub interval instead of pg scrub interval. Rename config values to include _pg_ and ratio to make it more clear. Fix scrub warning handling. Use per-pool intervals when specified. Fixes: http://tracker.ceph.com/issues/37264 Signed-off-by: David Zafman <dzafman@redhat.com>
This commit is contained in:
parent
aa4fc19785
commit
6a9895b97a
@ -160,6 +160,11 @@
|
||||
* The 'cephfs-data-scan scan_links' command now automatically repairs inotables and
|
||||
snaptable.
|
||||
|
||||
* Configuration values mon_warn_not_scrubbed/mon_warn_not_deep_scrubbed have been
|
||||
renamed. They are now mon_warn_pg_not_scrubbed_ratio/mon_warn_pg_not_deep_scrubbed_ratio
|
||||
respectively. This is to clarify that these warnings are related to pg scrubbing
|
||||
and are a ratio of the related interval. These options are now enabled by default.
|
||||
|
||||
>=13.1.0
|
||||
--------
|
||||
|
||||
|
@ -741,8 +741,8 @@ _______________
|
||||
|
||||
One or more PGs has not been scrubbed recently. PGs are normally
|
||||
scrubbed every ``osd_scrub_max_interval`` seconds, and this warning
|
||||
triggers when ``mon_warn_not_scrubbed`` such intervals have elapsed
|
||||
without a scrub.
|
||||
triggers when a ``mon_warn_pg_not_scrubbed_ratio`` fraction of that interval has elapsed
|
||||
without a scrub since it was due.
|
||||
|
||||
PGs will not scrub if they are not flagged as *clean*, which may
|
||||
happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
|
||||
@ -757,8 +757,8 @@ ____________________
|
||||
|
||||
One or more PGs has not been deep scrubbed recently. PGs are normally
|
||||
scrubbed every ``osd_deep_scrub_interval`` seconds, and this warning
|
||||
triggers when ``mon_warn_not_deep_scrubbed`` such intervals have elapsed
|
||||
without a scrub.
|
||||
triggers when a ``mon_warn_pg_not_deep_scrubbed_ratio`` fraction of that interval has elapsed
|
||||
without a scrub since it was due.
|
||||
|
||||
PGs will not (deep) scrub if they are not flagged as *clean*, which may
|
||||
happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
|
||||
|
@ -274,8 +274,8 @@ OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE)
|
||||
OPTION(mon_data_avail_crit, OPT_INT)
|
||||
OPTION(mon_data_avail_warn, OPT_INT)
|
||||
OPTION(mon_data_size_warn, OPT_U64) // issue a warning when the monitor's data store goes over 15GB (in bytes)
|
||||
OPTION(mon_warn_not_scrubbed, OPT_INT)
|
||||
OPTION(mon_warn_not_deep_scrubbed, OPT_INT)
|
||||
OPTION(mon_warn_pg_not_scrubbed_ratio, OPT_FLOAT)
|
||||
OPTION(mon_warn_pg_not_deep_scrubbed_ratio, OPT_FLOAT)
|
||||
OPTION(mon_scrub_interval, OPT_INT) // once a day
|
||||
OPTION(mon_scrub_timeout, OPT_INT) // let's give it 5 minutes; why not.
|
||||
OPTION(mon_scrub_max_keys, OPT_INT) // max number of keys to scrub each time
|
||||
|
@ -1715,16 +1715,18 @@ std::vector<Option> get_global_options() {
|
||||
.add_service("mon")
|
||||
.set_description("issue MON_DISK_BIG health warning when mon database is above this size"),
|
||||
|
||||
Option("mon_warn_not_scrubbed", Option::TYPE_INT, Option::LEVEL_ADVANCED)
|
||||
.set_default(0)
|
||||
.add_service("mon")
|
||||
.set_description("if non-zero, issue PG_NOT_SCRUBBED when PG(s) have not been scrubbed for more than this long beyond the configured mon_scrub_interval (seconds)")
|
||||
.add_see_also("osd_scrub_min_interval"),
|
||||
Option("mon_warn_pg_not_scrubbed_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
|
||||
.set_default(0.5)
|
||||
.set_min(0)
|
||||
.set_description("Percentage of the scrub max interval past the scrub max interval to warn")
|
||||
.set_long_description("")
|
||||
.add_see_also("osd_scrub_max_interval"),
|
||||
|
||||
Option("mon_warn_not_deep_scrubbed", Option::TYPE_INT, Option::LEVEL_ADVANCED)
|
||||
.set_default(0)
|
||||
.add_service("mon")
|
||||
.set_description("if non-zero, issue PG_NOT_DEEP_SCRUBBED when PG(s) have not been scrubbed for more than this long beyond the configured mon_scrub_interval (seconds)")
|
||||
Option("mon_warn_pg_not_deep_scrubbed_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
|
||||
.set_default(0.75)
|
||||
.set_min(0)
|
||||
.set_description("Percentage of the deep scrub interval past the deep scrub interval to warn")
|
||||
.set_long_description("")
|
||||
.add_see_also("osd_deep_scrub_interval"),
|
||||
|
||||
Option("mon_scrub_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)
|
||||
|
106
src/mon/PGMap.cc
106
src/mon/PGMap.cc
@ -2941,24 +2941,28 @@ void PGMap::get_health_checks(
|
||||
|
||||
// PG_NOT_SCRUBBED
|
||||
// PG_NOT_DEEP_SCRUBBED
|
||||
{
|
||||
if (cct->_conf->mon_warn_not_scrubbed ||
|
||||
cct->_conf->mon_warn_not_deep_scrubbed) {
|
||||
list<string> detail, deep_detail;
|
||||
int detail_max = max, deep_detail_max = max;
|
||||
int detail_more = 0, deep_detail_more = 0;
|
||||
int detail_total = 0, deep_detail_total = 0;
|
||||
const double age = cct->_conf->mon_warn_not_scrubbed +
|
||||
cct->_conf->mon_scrub_interval;
|
||||
utime_t cutoff = now;
|
||||
cutoff -= age;
|
||||
const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
|
||||
cct->_conf->osd_deep_scrub_interval;
|
||||
utime_t deep_cutoff = now;
|
||||
deep_cutoff -= deep_age;
|
||||
for (auto& p : pg_stat) {
|
||||
if (cct->_conf->mon_warn_not_scrubbed &&
|
||||
p.second.last_scrub_stamp < cutoff) {
|
||||
if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
|
||||
cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
|
||||
list<string> detail, deep_detail;
|
||||
int detail_max = max, deep_detail_max = max;
|
||||
int detail_more = 0, deep_detail_more = 0;
|
||||
int detail_total = 0, deep_detail_total = 0;
|
||||
for (auto& p : pg_stat) {
|
||||
int64_t pnum = p.first.pool();
|
||||
auto pool = osdmap.get_pg_pool(pnum);
|
||||
if (!pool)
|
||||
continue;
|
||||
if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
|
||||
double scrub_max_interval = 0;
|
||||
pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
|
||||
if (scrub_max_interval <= 0) {
|
||||
scrub_max_interval = cct->_conf->osd_scrub_max_interval;
|
||||
}
|
||||
const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
|
||||
scrub_max_interval;
|
||||
utime_t cutoff = now;
|
||||
cutoff -= age;
|
||||
if (p.second.last_scrub_stamp < cutoff) {
|
||||
if (detail_max > 0) {
|
||||
ostringstream ss;
|
||||
ss << "pg " << p.first << " not scrubbed since "
|
||||
@ -2970,8 +2974,18 @@ void PGMap::get_health_checks(
|
||||
}
|
||||
++detail_total;
|
||||
}
|
||||
if (cct->_conf->mon_warn_not_deep_scrubbed &&
|
||||
p.second.last_deep_scrub_stamp < deep_cutoff) {
|
||||
}
|
||||
if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
|
||||
double deep_scrub_interval = 0;
|
||||
pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
|
||||
if (deep_scrub_interval <= 0) {
|
||||
deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
|
||||
}
|
||||
double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
|
||||
deep_scrub_interval;
|
||||
utime_t deep_cutoff = now;
|
||||
deep_cutoff -= deep_age;
|
||||
if (p.second.last_deep_scrub_stamp < deep_cutoff) {
|
||||
if (deep_detail_max > 0) {
|
||||
ostringstream ss;
|
||||
ss << "pg " << p.first << " not deep-scrubbed since "
|
||||
@ -2982,36 +2996,36 @@ void PGMap::get_health_checks(
|
||||
++deep_detail_more;
|
||||
}
|
||||
++deep_detail_total;
|
||||
}
|
||||
}
|
||||
if (detail_total) {
|
||||
ostringstream ss;
|
||||
ss << detail_total << " pgs not scrubbed for " << age;
|
||||
auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
|
||||
|
||||
if (!detail.empty()) {
|
||||
d.detail.swap(detail);
|
||||
|
||||
if (detail_more) {
|
||||
ostringstream ss;
|
||||
ss << detail_more << " more pgs... ";
|
||||
d.detail.push_back(ss.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (deep_detail_total) {
|
||||
ostringstream ss;
|
||||
ss << deep_detail_total << " pgs not deep-scrubbed for " << deep_age;
|
||||
auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
|
||||
}
|
||||
if (detail_total) {
|
||||
ostringstream ss;
|
||||
ss << detail_total << " pgs not scrubbed in time";
|
||||
auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
|
||||
|
||||
if (!deep_detail.empty()) {
|
||||
d.detail.swap(deep_detail);
|
||||
if (!detail.empty()) {
|
||||
d.detail.swap(detail);
|
||||
|
||||
if (deep_detail_more) {
|
||||
ostringstream ss;
|
||||
ss << deep_detail_more << " more pgs... ";
|
||||
d.detail.push_back(ss.str());
|
||||
}
|
||||
if (detail_more) {
|
||||
ostringstream ss;
|
||||
ss << detail_more << " more pgs... ";
|
||||
d.detail.push_back(ss.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
if (deep_detail_total) {
|
||||
ostringstream ss;
|
||||
ss << deep_detail_total << " pgs not deep-scrubbed in time";
|
||||
auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
|
||||
|
||||
if (!deep_detail.empty()) {
|
||||
d.detail.swap(deep_detail);
|
||||
|
||||
if (deep_detail_more) {
|
||||
ostringstream ss;
|
||||
ss << deep_detail_more << " more pgs... ";
|
||||
d.detail.push_back(ss.str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user