Merge pull request #1692 from ceph/wip-7784

mon: OSDMonitor: HEALTH_WARN on 'mon osd down out interval == 0'

Reviewed-by: Sage Weil <sage@inktank.com>
This commit is contained in:
Sage Weil 2014-04-18 11:47:59 -07:00
commit d07ce84148
3 changed files with 29 additions and 0 deletions

View File

@ -5,3 +5,8 @@ v0.80
* OSDMap's json-formatted dump changed for keys 'full' and 'nearfull'.
What was previously output as 'true' or 'false' strings is
now output as 'true' and 'false' booleans, according to JSON syntax.
* HEALTH_WARN on 'mon osd down out interval == 0'. Having this option set
to zero on the leader acts much like having the 'noout' flag set. This
warning will only be reported if the monitor getting the 'health' or
'status' request has this option set to zero.

View File

@ -176,6 +176,7 @@ OPTION(mon_osd_report_timeout, OPT_INT, 900) // grace period before declaring
OPTION(mon_force_standby_active, OPT_BOOL, true) // should mons force standby-replay mds to be active
OPTION(mon_warn_on_old_mons, OPT_BOOL, true) // should mons set health to WARN if part of quorum is old?
OPTION(mon_warn_on_legacy_crush_tunables, OPT_BOOL, true) // warn if crush tunables are not optimal
OPTION(mon_warn_on_osd_down_out_interval_zero, OPT_BOOL, true) // warn if 'mon_osd_down_out_interval == 0'
OPTION(mon_min_osdmap_epochs, OPT_INT, 500)
OPTION(mon_max_pgmap_epochs, OPT_INT, 500)
OPTION(mon_max_log_epochs, OPT_INT, 500)

View File

@ -2062,6 +2062,29 @@ void OSDMonitor::get_health(list<pair<health_status_t,string> >& summary,
}
}
// Warn if 'mon_osd_down_out_interval' is set to zero.
// Having this option set to zero on the leader acts much like the
// 'noout' flag. It's hard to figure out what's going wrong with clusters
// without the 'noout' flag set but acting like that just the same, so
// we report a HEALTH_WARN in case this option is set to zero.
// This is an ugly hack to get the warning out, but until we find a way
// to spread global options throughout the mon cluster and have all mons
// using a base set of the same options, we need to work around this sort
// of thing.
// There's also the obvious drawback that if this is set on a single
// monitor on a 3-monitor cluster, this warning will only be shown every
// third monitor connection.
if (g_conf->mon_warn_on_osd_down_out_interval_zero &&
g_conf->mon_osd_down_out_interval == 0) {
ostringstream ss;
ss << "mon." << mon->name << " has mon_osd_down_out_interval set to 0";
summary.push_back(make_pair(HEALTH_WARN, ss.str()));
if (detail) {
ss << "; this has the same effect as the 'noout' flag";
detail->push_back(make_pair(HEALTH_WARN, ss.str()));
}
}
get_pools_health(summary, detail);
}
}