From b9a675a293b25ffe07293c73c93843166c005e81 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Mon, 27 Feb 2012 17:49:13 -0800 Subject: [PATCH] mon: report pgs stuck inactive/unclean/stale in health check Signed-off-by: Josh Durgin Reviewed-by: Sage Weil --- src/common/config_opts.h | 1 + src/mon/PGMonitor.cc | 30 +++++++++++++++++++++++++++++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/src/common/config_opts.h b/src/common/config_opts.h index 942eae5cb31..e2a84d152c5 100644 --- a/src/common/config_opts.h +++ b/src/common/config_opts.h @@ -100,6 +100,7 @@ OPTION(mon_clock_drift_allowed, OPT_FLOAT, .050) // allowed clock drift between OPTION(mon_clock_drift_warn_backoff, OPT_FLOAT, 5) // exponential backoff for clock drift warnings OPTION(mon_accept_timeout, OPT_FLOAT, 10.0) // on leader, if paxos update isn't accepted OPTION(mon_pg_create_interval, OPT_FLOAT, 30.0) // no more than every 30s +OPTION(mon_pg_stuck_threshold, OPT_INT, 300) // number of seconds after which pgs can be considered inactive, unclean, or stale (see doc/control.rst under dump_stuck for more info) OPTION(mon_osd_full_ratio, OPT_FLOAT, .95) // what % full makes an OSD "full" OPTION(mon_osd_nearfull_ratio, OPT_FLOAT, .85) // what % full makes an OSD near full OPTION(mon_globalid_prealloc, OPT_INT, 100) // how many globalids to prealloc diff --git a/src/mon/PGMonitor.cc b/src/mon/PGMonitor.cc index ce64eb09989..69cf6d1e4ef 100644 --- a/src/mon/PGMonitor.cc +++ b/src/mon/PGMonitor.cc @@ -1151,7 +1151,35 @@ enum health_status_t PGMonitor::get_health(std::ostream &ss) const note["repair"] += p->second; if (p->first & PG_STATE_SPLITTING) note["splitting"] += p->second; + if (p->first & PG_STATE_RECOVERING) + note["recovering"] += p->second; + if (p->first & PG_STATE_INCOMPLETE) + note["incomplete"] += p->second; + if (p->first & PG_STATE_BACKFILL) + note["backfill"] += p->second; } + + hash_map stuck_pgs; + utime_t now(ceph_clock_now(g_ceph_context)); + utime_t cutoff = now - utime_t(g_conf->mon_pg_stuck_threshold, 0); + + pg_map.get_stuck_stats(PGMap::STUCK_INACTIVE, cutoff, stuck_pgs); + if (!stuck_pgs.empty()) { + note["stuck inactive"] = stuck_pgs.size(); + } + stuck_pgs.clear(); + + pg_map.get_stuck_stats(PGMap::STUCK_UNCLEAN, cutoff, stuck_pgs); + if (!stuck_pgs.empty()) { + note["stuck unclean"] = stuck_pgs.size(); + } + stuck_pgs.clear(); + + pg_map.get_stuck_stats(PGMap::STUCK_STALE, cutoff, stuck_pgs); + if (!stuck_pgs.empty()) { + note["stuck stale"] = stuck_pgs.size(); + } + if (!note.empty()) { ret = HEALTH_WARN; for (map::iterator p = note.begin(); p != note.end(); p++) { @@ -1192,7 +1220,7 @@ int PGMonitor::dump_stuck_pg_stats(ostream& ss, { string format = "plain"; string val; - int threshold = 300; + int threshold = g_conf->mon_pg_stuck_threshold; int seconds; ostringstream err;