From d0192e59623f0ad06496f7ddfd647c028607c665 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sun, 16 Jul 2017 12:08:32 -0400 Subject: [PATCH] mds: revise health checks for _FAILED/_DEGRADED No longer output MDS_* versions as well as FS_* versions, because it was noisy and the important message is about the availability (or not) of the filesystem. Revise the _FAILED check to only raise the message if there are no suitable replacements available for failed ranks. This avoids a spurious health check failure when a rank has been failed (e.g. by the admin) but it will be replaced at the next tick(). After this change, doing a "ceph mds fail" when a standby is available just gives you a single FS_DEGRADED health check from the point of the "fail" to when the replacement is active. Signed-off-by: John Spray --- src/mds/FSMap.cc | 27 +++++++++++++++++++++++++++ src/mds/MDSMap.cc | 27 --------------------------- 2 files changed, 27 insertions(+), 27 deletions(-) diff --git a/src/mds/FSMap.cc b/src/mds/FSMap.cc index 10abd5d2c0d..f44eadf3c3f 100644 --- a/src/mds/FSMap.cc +++ b/src/mds/FSMap.cc @@ -341,7 +341,34 @@ void FSMap::get_health_checks(health_check_map_t *checks) const for (const auto &i : filesystems) { const auto &fs = i.second; health_check_map_t fschecks; + fs->mds_map.get_health_checks(&fschecks); + + // Some of the failed ranks might be transient (i.e. there are standbys + // ready to replace them). We will report only on "stuck" failed, i.e. + // ranks which are failed and have no standby replacement available. 
+ std::set stuck_failed; + + for (const auto &rank : fs->mds_map.failed) { + const mds_gid_t replacement = find_replacement_for( + {fs->fscid, rank}, {}, g_conf->mon_force_standby_active); + if (replacement == MDS_GID_NONE) { + stuck_failed.insert(rank); + } + } + + // FS_WITH_FAILED_MDS + // MDS_FAILED + if (!stuck_failed.empty()) { + health_check_t& fscheck = checks->add( + "FS_WITH_FAILED_MDS", HEALTH_WARN, + "%num% filesystem%plurals% %isorare% have a failed mds daemon"); + ostringstream ss; + ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size() + << " failed mds" << (stuck_failed.size() > 1 ? "s" : ""); + fscheck.detail.push_back(ss.str()); + } + checks->merge(fschecks); standby_count_wanted = std::max( standby_count_wanted, diff --git a/src/mds/MDSMap.cc b/src/mds/MDSMap.cc index bd54469756f..e53338f656f 100644 --- a/src/mds/MDSMap.cc +++ b/src/mds/MDSMap.cc @@ -408,26 +408,6 @@ void MDSMap::get_health(list >& summary, void MDSMap::get_health_checks(health_check_map_t *checks) const { - // FS_WITH_FAILED_MDS - // MDS_FAILED - if (!failed.empty()) { - health_check_t& fscheck = checks->add( - "FS_WITH_FAILED_MDS", HEALTH_WARN, - "%num% filesystem%plurals% %isorare% have a failed mds daemon"); - ostringstream ss; - ss << "fs " << fs_name << " has " << failed.size() << " failed mds" - << (failed.size() > 1 ? "s" : ""); - fscheck.detail.push_back(ss.str()); - - health_check_t& check = checks->add("MDS_FAILED", HEALTH_ERR, - "%num% mds daemon%plurals% down"); - for (auto p : failed) { - std::ostringstream oss; - oss << "fs " << fs_name << " mds." 
<< p << " has failed"; - check.detail.push_back(oss.str()); - } - } - // MDS_DAMAGED if (!damaged.empty()) { health_check_t& check = checks->add("MDS_DAMAGED", HEALTH_ERR, @@ -440,7 +420,6 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const } // FS_DEGRADED - // MDS_DEGRADED if (is_degraded()) { health_check_t& fscheck = checks->add( "FS_DEGRADED", HEALTH_WARN, @@ -469,12 +448,6 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const if (ss.str().length()) detail.push_back(ss.str()); } - if (!detail.empty()) { - health_check_t& check = checks->add( - "MDS_DEGRADED", HEALTH_WARN, - "%num% mds daemon%plurals% %isorare% degraded"); - check.detail.insert(check.detail.end(), detail.begin(), detail.end()); - } } }