1
0
mirror of https://github.com/ceph/ceph synced 2025-03-30 07:19:14 +00:00

mon: Fix scrub health warning handling and change config to a ratio

Make this mon_warn code clearer since it involves 2 values
Code used mon scrub interval instead of pg scrub interval
Rename config values to include _pg_ and ratio to make it more clear
Fix scrub warning handling to use per-pool intervals when specified

Fixes: http://tracker.ceph.com/issues/37264

Signed-off-by: David Zafman <dzafman@redhat.com>
This commit is contained in:
David Zafman 2019-01-15 12:12:39 -08:00
parent aa4fc19785
commit 6a9895b97a
5 changed files with 82 additions and 61 deletions

View File

@ -160,6 +160,11 @@
* The 'cephfs-data-scan scan_links' command now automatically repairs inotables and
snaptable.
* Configuration values mon_warn_not_scrubbed/mon_warn_not_deep_scrubbed have been
renamed. They are now mon_warn_pg_not_scrubbed_ratio/mon_warn_pg_not_deep_scrubbed_ratio
respectively. This is to clarify that these warnings are related to pg scrubbing
and are a ratio of the related interval. These options are now enabled by default.
>=13.1.0
--------

View File

@ -741,8 +741,8 @@ _______________
One or more PGs has not been scrubbed recently. PGs are normally
scrubbed every ``osd_scrub_max_interval`` seconds (or the pool's own scrub max interval, when set), and this warning
triggers when ``mon_warn_not_scrubbed`` such intervals have elapsed
without a scrub.
triggers when a further ``mon_warn_pg_not_scrubbed_ratio`` fraction of that interval has elapsed
without a scrub since it was due.
PGs will not scrub if they are not flagged as *clean*, which may
happen if they are misplaced or degraded (see *PG_AVAILABILITY* and
@ -757,8 +757,8 @@ ____________________
One or more PGs has not been deep scrubbed recently. PGs are normally
scrubbed every ``osd_deep_scrub_interval`` seconds, and this warning
triggers when ``mon_warn_not_deep_scrubbed`` such intervals have elapsed
without a scrub.
triggers when a further ``mon_warn_pg_not_deep_scrubbed_ratio`` fraction of that interval has elapsed
without a deep scrub since it was due.
PGs will not (deep) scrub if they are not flagged as *clean*, which may
happen if they are misplaced or degraded (see *PG_AVAILABILITY* and

View File

@ -274,8 +274,8 @@ OPTION(mon_health_to_clog_tick_interval, OPT_DOUBLE)
OPTION(mon_data_avail_crit, OPT_INT)
OPTION(mon_data_avail_warn, OPT_INT)
OPTION(mon_data_size_warn, OPT_U64) // issue a warning when the monitor's data store goes over 15GB (in bytes)
OPTION(mon_warn_not_scrubbed, OPT_INT)
OPTION(mon_warn_not_deep_scrubbed, OPT_INT)
OPTION(mon_warn_pg_not_scrubbed_ratio, OPT_FLOAT)
OPTION(mon_warn_pg_not_deep_scrubbed_ratio, OPT_FLOAT)
OPTION(mon_scrub_interval, OPT_INT) // once a day
OPTION(mon_scrub_timeout, OPT_INT) // let's give it 5 minutes; why not.
OPTION(mon_scrub_max_keys, OPT_INT) // max number of keys to scrub each time

View File

@ -1715,16 +1715,18 @@ std::vector<Option> get_global_options() {
.add_service("mon")
.set_description("issue MON_DISK_BIG health warning when mon database is above this size"),
Option("mon_warn_not_scrubbed", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
.add_service("mon")
.set_description("if non-zero, issue PG_NOT_SCRUBBED when PG(s) have not been scrubbed for more than this long beyond the configured mon_scrub_interval (seconds)")
.add_see_also("osd_scrub_min_interval"),
Option("mon_warn_pg_not_scrubbed_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0.5)
.set_min(0)
.set_description("Percentage of the scrub max interval past the scrub max interval to warn")
.set_long_description("")
.add_see_also("osd_scrub_max_interval"),
Option("mon_warn_not_deep_scrubbed", Option::TYPE_INT, Option::LEVEL_ADVANCED)
.set_default(0)
.add_service("mon")
.set_description("if non-zero, issue PG_NOT_DEEP_SCRUBBED when PG(s) have not been scrubbed for more than this long beyond the configured mon_scrub_interval (seconds)")
Option("mon_warn_pg_not_deep_scrubbed_ratio", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(0.75)
.set_min(0)
.set_description("Percentage of the deep scrub interval past the deep scrub interval to warn")
.set_long_description("")
.add_see_also("osd_deep_scrub_interval"),
Option("mon_scrub_interval", Option::TYPE_INT, Option::LEVEL_ADVANCED)

View File

@ -2941,24 +2941,28 @@ void PGMap::get_health_checks(
// PG_NOT_SCRUBBED
// PG_NOT_DEEP_SCRUBBED
{
if (cct->_conf->mon_warn_not_scrubbed ||
cct->_conf->mon_warn_not_deep_scrubbed) {
list<string> detail, deep_detail;
int detail_max = max, deep_detail_max = max;
int detail_more = 0, deep_detail_more = 0;
int detail_total = 0, deep_detail_total = 0;
const double age = cct->_conf->mon_warn_not_scrubbed +
cct->_conf->mon_scrub_interval;
utime_t cutoff = now;
cutoff -= age;
const double deep_age = cct->_conf->mon_warn_not_deep_scrubbed +
cct->_conf->osd_deep_scrub_interval;
utime_t deep_cutoff = now;
deep_cutoff -= deep_age;
for (auto& p : pg_stat) {
if (cct->_conf->mon_warn_not_scrubbed &&
p.second.last_scrub_stamp < cutoff) {
if (cct->_conf->mon_warn_pg_not_scrubbed_ratio ||
cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
list<string> detail, deep_detail;
int detail_max = max, deep_detail_max = max;
int detail_more = 0, deep_detail_more = 0;
int detail_total = 0, deep_detail_total = 0;
for (auto& p : pg_stat) {
int64_t pnum = p.first.pool();
auto pool = osdmap.get_pg_pool(pnum);
if (!pool)
continue;
if (cct->_conf->mon_warn_pg_not_scrubbed_ratio) {
double scrub_max_interval = 0;
pool->opts.get(pool_opts_t::SCRUB_MAX_INTERVAL, &scrub_max_interval);
if (scrub_max_interval <= 0) {
scrub_max_interval = cct->_conf->osd_scrub_max_interval;
}
const double age = (cct->_conf->mon_warn_pg_not_scrubbed_ratio * scrub_max_interval) +
scrub_max_interval;
utime_t cutoff = now;
cutoff -= age;
if (p.second.last_scrub_stamp < cutoff) {
if (detail_max > 0) {
ostringstream ss;
ss << "pg " << p.first << " not scrubbed since "
@ -2970,8 +2974,18 @@ void PGMap::get_health_checks(
}
++detail_total;
}
if (cct->_conf->mon_warn_not_deep_scrubbed &&
p.second.last_deep_scrub_stamp < deep_cutoff) {
}
if (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio) {
double deep_scrub_interval = 0;
pool->opts.get(pool_opts_t::DEEP_SCRUB_INTERVAL, &deep_scrub_interval);
if (deep_scrub_interval <= 0) {
deep_scrub_interval = cct->_conf->osd_deep_scrub_interval;
}
double deep_age = (cct->_conf->mon_warn_pg_not_deep_scrubbed_ratio * deep_scrub_interval) +
deep_scrub_interval;
utime_t deep_cutoff = now;
deep_cutoff -= deep_age;
if (p.second.last_deep_scrub_stamp < deep_cutoff) {
if (deep_detail_max > 0) {
ostringstream ss;
ss << "pg " << p.first << " not deep-scrubbed since "
@ -2982,36 +2996,36 @@ void PGMap::get_health_checks(
++deep_detail_more;
}
++deep_detail_total;
}
}
if (detail_total) {
ostringstream ss;
ss << detail_total << " pgs not scrubbed for " << age;
auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
if (!detail.empty()) {
d.detail.swap(detail);
if (detail_more) {
ostringstream ss;
ss << detail_more << " more pgs... ";
d.detail.push_back(ss.str());
}
}
}
if (deep_detail_total) {
ostringstream ss;
ss << deep_detail_total << " pgs not deep-scrubbed for " << deep_age;
auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
}
if (detail_total) {
ostringstream ss;
ss << detail_total << " pgs not scrubbed in time";
auto& d = checks->add("PG_NOT_SCRUBBED", HEALTH_WARN, ss.str());
if (!deep_detail.empty()) {
d.detail.swap(deep_detail);
if (!detail.empty()) {
d.detail.swap(detail);
if (deep_detail_more) {
ostringstream ss;
ss << deep_detail_more << " more pgs... ";
d.detail.push_back(ss.str());
}
if (detail_more) {
ostringstream ss;
ss << detail_more << " more pgs... ";
d.detail.push_back(ss.str());
}
}
}
if (deep_detail_total) {
ostringstream ss;
ss << deep_detail_total << " pgs not deep-scrubbed in time";
auto& d = checks->add("PG_NOT_DEEP_SCRUBBED", HEALTH_WARN, ss.str());
if (!deep_detail.empty()) {
d.detail.swap(deep_detail);
if (deep_detail_more) {
ostringstream ss;
ss << deep_detail_more << " more pgs... ";
d.detail.push_back(ss.str());
}
}
}