mds: revise health checks for _FAILED/_DEGRADED
No longer output MDS_* versions as well as FS_* versions, because it was noisy and the important message is about the availability (or not) of the filesystem.

Revise the _FAILED check to only raise the message if there are no suitable replacements available for the failed ranks. This avoids a spurious health check failure when a rank has been failed (e.g. by the admin) but will be replaced at the next tick().

After this change, doing a "ceph mds fail" when a standby is available just gives you a single FS_DEGRADED health check from the point of the "fail" to when the replacement is active.

Signed-off-by: John Spray <john.spray@redhat.com>
parent beb4c38ae7
commit d0192e5962
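The core of the change is the split between transient failures (a standby is ready to take over) and "stuck" failures (no replacement exists). A minimal standalone sketch of that rule, using hypothetical names (stuck_failed_ranks, has_replacement) and plain std types rather than the actual FSMap/MDSMap code:

#include <functional>
#include <set>

// Illustrative sketch only; not the actual FSMap code.
// Given the set of failed ranks and a predicate saying whether a standby
// could take over a rank, keep only the "stuck" failures, i.e. those with
// no replacement available, which are the ones worth a health check.
std::set<int> stuck_failed_ranks(
    const std::set<int> &failed,
    const std::function<bool(int)> &has_replacement)
{
  std::set<int> stuck;
  for (int rank : failed) {
    if (!has_replacement(rank)) {
      stuck.insert(rank);  // no standby can cover this rank
    }
  }
  return stuck;
}

Only the ranks such a filter keeps feed the FS_WITH_FAILED_MDS detail lines in the diff below; a failure that a standby can cover is simply left for the next tick() to repair, so no warning is raised for it.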
@@ -341,7 +341,34 @@ void FSMap::get_health_checks(health_check_map_t *checks) const
   for (const auto &i : filesystems) {
     const auto &fs = i.second;
     health_check_map_t fschecks;
+
     fs->mds_map.get_health_checks(&fschecks);
+
+    // Some of the failed ranks might be transient (i.e. there are standbys
+    // ready to replace them).  We will report only on "stuck" failed, i.e.
+    // ranks which are failed and have no standby replacement available.
+    std::set<mds_rank_t> stuck_failed;
+
+    for (const auto &rank : fs->mds_map.failed) {
+      const mds_gid_t replacement = find_replacement_for(
+          {fs->fscid, rank}, {}, g_conf->mon_force_standby_active);
+      if (replacement == MDS_GID_NONE) {
+        stuck_failed.insert(rank);
+      }
+    }
+
+    // FS_WITH_FAILED_MDS
+    // MDS_FAILED
+    if (!stuck_failed.empty()) {
+      health_check_t& fscheck = checks->add(
+        "FS_WITH_FAILED_MDS", HEALTH_WARN,
+        "%num% filesystem%plurals% %isorare% have a failed mds daemon");
+      ostringstream ss;
+      ss << "fs " << fs->mds_map.fs_name << " has " << stuck_failed.size()
+         << " failed mds" << (stuck_failed.size() > 1 ? "s" : "");
+      fscheck.detail.push_back(ss.str());
+    }
+
     checks->merge(fschecks);
     standby_count_wanted = std::max(
       standby_count_wanted,
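The per-filesystem checks are gathered via health_check_map_t::add() and folded into the cluster-wide map with checks->merge(fschecks). The following is a rough, self-contained sketch of that add()/detail/merge() pattern using simplified stand-in types; these are not the real definitions from the monitor's health_check.h, and the "cephfs_a" name and the FS_DEGRADED summary text are made up for illustration:

#include <iostream>
#include <list>
#include <map>
#include <string>

// Simplified stand-in types for illustration only; the real definitions
// carry more state than shown here.
enum health_status_t { HEALTH_OK, HEALTH_WARN, HEALTH_ERR };

struct health_check_t {
  health_status_t severity = HEALTH_OK;
  std::string summary;
  std::list<std::string> detail;
};

struct health_check_map_t {
  std::map<std::string, health_check_t> checks;

  // Repeated add() calls with the same code share one entry, so every
  // filesystem can append its own detail line to a single check.
  health_check_t& add(const std::string &code, health_status_t sev,
                      const std::string &summary) {
    health_check_t &c = checks[code];
    c.severity = sev;
    c.summary = summary;
    return c;
  }

  // Fold another map (e.g. one MDSMap's checks) into this one. This sketch
  // just concatenates detail lines and takes the incoming severity/summary.
  void merge(const health_check_map_t &other) {
    for (const auto &p : other.checks) {
      health_check_t &c = checks[p.first];
      c.severity = p.second.severity;
      c.summary = p.second.summary;
      c.detail.insert(c.detail.end(), p.second.detail.begin(),
                      p.second.detail.end());
    }
  }
};

int main() {
  health_check_map_t cluster;   // what FSMap::get_health_checks() fills in
  health_check_map_t fschecks;  // what one MDSMap contributes

  cluster.add("FS_WITH_FAILED_MDS", HEALTH_WARN,
              "%num% filesystem%plurals% %isorare% have a failed mds daemon")
      .detail.push_back("fs cephfs_a has 1 failed mds");

  fschecks.add("FS_DEGRADED", HEALTH_WARN,
               "filesystem is degraded")  // illustrative summary text
      .detail.push_back("fs cephfs_a is degraded");

  cluster.merge(fschecks);

  for (const auto &p : cluster.checks)
    std::cout << p.first << ": " << p.second.detail.size()
              << " detail line(s)\n";
  return 0;
}

Presumably the monitor later expands the %num%/%plurals%/%isorare% placeholders from the accumulated counts when rendering the summary; that step is outside this sketch.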
@@ -408,26 +408,6 @@ void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
 
 void MDSMap::get_health_checks(health_check_map_t *checks) const
 {
-  // FS_WITH_FAILED_MDS
-  // MDS_FAILED
-  if (!failed.empty()) {
-    health_check_t& fscheck = checks->add(
-      "FS_WITH_FAILED_MDS", HEALTH_WARN,
-      "%num% filesystem%plurals% %isorare% have a failed mds daemon");
-    ostringstream ss;
-    ss << "fs " << fs_name << " has " << failed.size() << " failed mds"
-       << (failed.size() > 1 ? "s" : "");
-    fscheck.detail.push_back(ss.str());
-
-    health_check_t& check = checks->add("MDS_FAILED", HEALTH_ERR,
-                                        "%num% mds daemon%plurals% down");
-    for (auto p : failed) {
-      std::ostringstream oss;
-      oss << "fs " << fs_name << " mds." << p << " has failed";
-      check.detail.push_back(oss.str());
-    }
-  }
-
   // MDS_DAMAGED
   if (!damaged.empty()) {
     health_check_t& check = checks->add("MDS_DAMAGED", HEALTH_ERR,
@@ -440,7 +420,6 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const
   }
 
   // FS_DEGRADED
-  // MDS_DEGRADED
   if (is_degraded()) {
     health_check_t& fscheck = checks->add(
       "FS_DEGRADED", HEALTH_WARN,
@@ -469,12 +448,6 @@ void MDSMap::get_health_checks(health_check_map_t *checks) const
       if (ss.str().length())
        detail.push_back(ss.str());
     }
-    if (!detail.empty()) {
-      health_check_t& check = checks->add(
-        "MDS_DEGRADED", HEALTH_WARN,
-        "%num% mds daemon%plurals% %isorare% degraded");
-      check.detail.insert(check.detail.end(), detail.begin(), detail.end());
-    }
   }
 }
 