Merge pull request #11 from digitalocean/add_explicit_health_state

collectors/health: include a definitive ceph health check
Vaibhav Bhembre 2016-02-05 17:50:40 -05:00
commit b974fe5d96
2 changed files with 97 additions and 14 deletions


@@ -30,6 +30,9 @@ type ClusterHealthCollector struct {
// conn holds connection to the Ceph cluster
conn Conn
// HealthStatus shows the overall health status of a given cluster.
HealthStatus prometheus.Gauge
// DegradedPGs shows the number of PGs that have some of their replicas
// missing.
DegradedPGs prometheus.Gauge
@@ -66,12 +69,31 @@ type ClusterHealthCollector struct {
RemappedPGs prometheus.Gauge
}
const (
// CephHealthOK denotes the status of the Ceph cluster when it is healthy.
CephHealthOK = "HEALTH_OK"
// CephHealthWarn denotes the status of the Ceph cluster when it is unhealthy but recovering.
CephHealthWarn = "HEALTH_WARN"
// CephHealthErr denotes the status of the Ceph cluster when it is unhealthy and usually
// needs manual intervention.
CephHealthErr = "HEALTH_ERR"
)
// NewClusterHealthCollector creates a new ClusterHealthCollector that collects health
// metrics for the cluster behind the given connection.
func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
return &ClusterHealthCollector{
conn: conn,
HealthStatus: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "health_status",
Help: "Health status of the cluster; can vary only between 3 states (err:2, warn:1, ok:0)",
},
),
DegradedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
@@ -147,6 +169,7 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
return []prometheus.Metric{
c.HealthStatus,
c.DegradedPGs,
c.UncleanPGs,
c.UndersizedPGs,
@@ -166,6 +189,7 @@ type cephHealthStats struct {
Severity string `json:"severity"`
Summary string `json:"summary"`
} `json:"summary"`
OverallStatus string `json:"overall_status"`
} `json:"health"`
OSDMap struct {
OSDMap struct {
@@ -195,6 +219,17 @@ func (c *ClusterHealthCollector) collect() error {
}
}
switch stats.Health.OverallStatus {
case CephHealthOK:
c.HealthStatus.Set(0)
case CephHealthWarn:
c.HealthStatus.Set(1)
case CephHealthErr:
c.HealthStatus.Set(2)
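// Anything other than the three known states is also reported as an error.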
default:
c.HealthStatus.Set(2)
}
var (
degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`)
uncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
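
For reference (not part of this change), a self-contained sketch of how the overall_status reported by Ceph drives the numeric value exported by health_status. The JSON below is an illustrative, trimmed-down status payload, and healthValue simply mirrors the switch in collect():

package main

import (
	"encoding/json"
	"fmt"
)

// healthValue mirrors the switch in collect(): HEALTH_OK -> 0,
// HEALTH_WARN -> 1, HEALTH_ERR or anything unrecognized -> 2.
func healthValue(status string) float64 {
	switch status {
	case "HEALTH_OK":
		return 0
	case "HEALTH_WARN":
		return 1
	default:
		return 2
	}
}

func main() {
	// Illustrative, trimmed-down output of a Ceph status call.
	raw := `{"health": {"overall_status": "HEALTH_WARN"}}`

	var stats struct {
		Health struct {
			OverallStatus string `json:"overall_status"`
		} `json:"health"`
	}
	if err := json.Unmarshal([]byte(raw), &stats); err != nil {
		panic(err)
	}

	fmt.Println(healthValue(stats.Health.OverallStatus)) // prints 1
}

The test file changes follow.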


@@ -30,7 +30,7 @@ func TestClusterHealthCollector(t *testing.T) {
regexes []*regexp.Regexp
}{
{
input: `
{
"osdmap": {
"osdmap": {
@@ -42,12 +42,12 @@ func TestClusterHealthCollector(t *testing.T) {
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "5 pgs degraded"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`degraded_pgs 5`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
@@ -59,12 +59,12 @@ func TestClusterHealthCollector(t *testing.T) {
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`unclean_pgs 6`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
@@ -76,12 +76,12 @@ func TestClusterHealthCollector(t *testing.T) {
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "7 pgs undersized"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`undersized_pgs 7`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
@@ -93,12 +93,12 @@ func TestClusterHealthCollector(t *testing.T) {
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "8 pgs stale"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`stale_pgs 8`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
@@ -110,12 +110,12 @@ func TestClusterHealthCollector(t *testing.T) {
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "recovery 10/20 objects degraded"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`degraded_objects 10`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
@@ -127,12 +127,12 @@ func TestClusterHealthCollector(t *testing.T) {
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`osds_down 3`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
@@ -144,13 +144,61 @@ func TestClusterHealthCollector(t *testing.T) {
},
"health": {"summary": []}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`osds 1200`),
regexp.MustCompile(`osds_up 1200`),
regexp.MustCompile(`osds_in 1190`),
regexp.MustCompile(`pgs_remapped 10`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
"num_osds": 1200,
"num_up_osds": 1200,
"num_in_osds": 1190,
"num_remapped_pgs": 10
}
},
"health": { "overall_status": "HEALTH_OK" } }`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`health_status 0`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
"num_osds": 1200,
"num_up_osds": 1200,
"num_in_osds": 1190,
"num_remapped_pgs": 10
}
},
"health": { "overall_status": "HEALTH_WARN" } }`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`health_status 1`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
"num_osds": 1200,
"num_up_osds": 1200,
"num_in_osds": 1190,
"num_remapped_pgs": 10
}
},
"health": { "overall_status": "HEALTH_ERR" } }`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`health_status 2`),
},
},
} {
func() {
collector := NewClusterHealthCollector(NewNoopConn(tt.input))
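
For context on how these collectors are consumed (outside the scope of this diff), a minimal sketch of wiring the health collector into an exporter process. The collectors import path, the Conn setup, and the listen address are assumptions for illustration, and the sketch presumes ClusterHealthCollector satisfies prometheus.Collector:

package main

import (
	"log"
	"net/http"

	"github.com/digitalocean/ceph_exporter/collectors"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

func main() {
	// Assumption: conn is an established connection to the Ceph cluster
	// that satisfies collectors.Conn; building it is out of scope here.
	var conn collectors.Conn

	// Registering the collector makes every scrape of /metrics run its
	// collect path, which now includes the health_status gauge.
	prometheus.MustRegister(collectors.NewClusterHealthCollector(conn))

	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":9128", nil))
}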