mds: revise health checks for _FAILED/_DEGRADED

No longer output the MDS_* versions of these checks alongside
the FS_* versions: the duplication was noisy, and the important
message is about the availability (or not) of the filesystem.

Revise the _FAILED check to only raise the message if
there are no suitable replacements available for failed
ranks.  This avoids a spurious health check failure when
a rank has been failed (e.g. by the admin) but will
be replaced at the next tick().

After this change, doing a "ceph mds fail" when a standby
is available just gives you a single FS_DEGRADED health
check from the point of the "fail" to when the replacement
is active.

Signed-off-by: John Spray <john.spray@redhat.com>
This commit is contained in:
John Spray 2017-07-16 12:08:32 -04:00
parent beb4c38ae7
commit d0192e5962
2 changed files with 27 additions and 27 deletions

View File

@ -341,7 +341,34 @@ void FSMap::get_health_checks(health_check_map_t *checks) const
for (const auto &i : filesystems) {
const auto &fs = i.second;
health_check_map_t fschecks;
fs->mds_map.get_health_checks(&fschecks);
// Some of the failed ranks might be transient (i.e. there are standbys
// ready to replace them). We will report only on "stuck" failed, i.e.
// ranks which are failed and have no standby replacement available.
std::set<mds_rank_t> stuck_failed;
for (const auto &rank : fs->mds_map.failed) {
const mds_gid_t replacement = find_replacement_for(
{fs->fscid, rank}, {}, g_conf->mon_force_standby_active);
if (replacement == MDS_GID_NONE) {
stuck_failed.insert(rank);
}
}
// FS_WITH_FAILED_MDS
// MDS_FAILED
if (!stuck_failed.empty()) {
health_check_t& fscheck = checks->add(
"FS_WITH_FAILED_MDS", HEALTH_WARN,
"%num% filesystem%plurals% %isorare% have a failed mds daemon");
ostringstream ss;
ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
<< " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
fscheck.detail.push_back(ss.str());
}
checks->merge(fschecks);
standby_count_wanted = std::max(
standby_count_wanted,

View File

@ -408,26 +408,6 @@ void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
void MDSMap::get_health_checks(health_check_map_t *checks) const
{
// FS_WITH_FAILED_MDS
// MDS_FAILED
if (!failed.empty()) {
health_check_t& fscheck = checks->add(
"FS_WITH_FAILED_MDS", HEALTH_WARN,
"%num% filesystem%plurals% %isorare% have a failed mds daemon");
ostringstream ss;
ss << "fs " << fs_name << " has " << failed.size() << " failed mds"
<< (failed.size() > 1 ? "s" : "");
fscheck.detail.push_back(ss.str());
health_check_t& check = checks->add("MDS_FAILED", HEALTH_ERR,
"%num% mds daemon%plurals% down");
for (auto p : failed) {
std::ostringstream oss;
oss << "fs " << fs_name << " mds." << p << " has failed";
check.detail.push_back(oss.str());
}
}
// MDS_DAMAGED
if (!damaged.empty()) {
health_check_t& check = checks->add("MDS_DAMAGED", HEALTH_ERR,
@ -440,7 +420,6 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const
}
// FS_DEGRADED
// MDS_DEGRADED
if (is_degraded()) {
health_check_t& fscheck = checks->add(
"FS_DEGRADED", HEALTH_WARN,
@ -469,12 +448,6 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const
if (ss.str().length())
detail.push_back(ss.str());
}
if (!detail.empty()) {
health_check_t& check = checks->add(
"MDS_DEGRADED", HEALTH_WARN,
"%num% mds daemon%plurals% %isorare% degraded");
check.detail.insert(check.detail.end(), detail.begin(), detail.end());
}
}
}