From 99c0dbcfc6377ddd91c400482a1bf98403f6bb4b Mon Sep 17 00:00:00 2001 From: Vaibhav Bhembre Date: Fri, 5 Feb 2016 21:56:55 +0000 Subject: [PATCH] collectors/health: include a definitive ceph health check --- collectors/health.go | 35 ++++++++++++++++++ collectors/health_test.go | 76 +++++++++++++++++++++++++++++++-------- 2 files changed, 97 insertions(+), 14 deletions(-) diff --git a/collectors/health.go b/collectors/health.go index f124a45..ea569dd 100644 --- a/collectors/health.go +++ b/collectors/health.go @@ -30,6 +30,9 @@ type ClusterHealthCollector struct { // conn holds connection to the Ceph cluster conn Conn + // HealthStatus shows the overall health status of a given cluster. + HealthStatus prometheus.Gauge + // DegradedPGs shows the no. of PGs that have some of the replicas // missing. DegradedPGs prometheus.Gauge @@ -66,12 +69,31 @@ type ClusterHealthCollector struct { RemappedPGs prometheus.Gauge } +const ( + // CephHealthOK denotes the status of ceph cluster when healthy. + CephHealthOK = "HEALTH_OK" + + // CephHealthWarn denotes the status of ceph cluster when unhealthy but recovering. + CephHealthWarn = "HEALTH_WARN" + + // CephHealthErr denotes the status of ceph cluster when unhealthy but usually needs + // manual intervention. + CephHealthErr = "HEALTH_ERR" +) + // NewClusterHealthCollector creates a new instance of ClusterHealthCollector to collect health // metrics on. func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector { return &ClusterHealthCollector{ conn: conn, + HealthStatus: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "health_status", + Help: "Health status of Cluster, can vary only between 3 states (err:2, warn:1, ok:0)", + }, + ), DegradedPGs: prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: cephNamespace, @@ -147,6 +169,7 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector { func (c *ClusterHealthCollector) metricsList() []prometheus.Metric { return []prometheus.Metric{ + c.HealthStatus, c.DegradedPGs, c.UncleanPGs, c.UndersizedPGs, @@ -166,6 +189,7 @@ type cephHealthStats struct { Severity string `json:"severity"` Summary string `json:"summary"` } `json:"summary"` + OverallStatus string `json:"overall_status"` } `json:"health"` OSDMap struct { OSDMap struct { @@ -195,6 +219,17 @@ func (c *ClusterHealthCollector) collect() error { } } + switch stats.Health.OverallStatus { + case CephHealthOK: + c.HealthStatus.Set(0) + case CephHealthWarn: + c.HealthStatus.Set(1) + case CephHealthErr: + c.HealthStatus.Set(2) + default: + c.HealthStatus.Set(2) + } + var ( degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`) uncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`) diff --git a/collectors/health_test.go b/collectors/health_test.go index 04befc1..11a8b76 100644 --- a/collectors/health_test.go +++ b/collectors/health_test.go @@ -30,7 +30,7 @@ func TestClusterHealthCollector(t *testing.T) { regexes []*regexp.Regexp }{ { - ` + input: ` { "osdmap": { "osdmap": { @@ -42,12 +42,12 @@ func TestClusterHealthCollector(t *testing.T) { }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "5 pgs degraded"}]} }`, - []*regexp.Regexp{ + regexes: []*regexp.Regexp{ regexp.MustCompile(`degraded_pgs 5`), }, }, { - ` + input: ` { "osdmap": { "osdmap": { @@ -59,12 +59,12 @@ func TestClusterHealthCollector(t *testing.T) { }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]} }`, - []*regexp.Regexp{ + regexes: []*regexp.Regexp{ regexp.MustCompile(`unclean_pgs 6`), }, }, { - ` + input: ` { "osdmap": { "osdmap": { @@ -76,12 +76,12 @@ func TestClusterHealthCollector(t *testing.T) { }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "7 pgs undersized"}]} }`, - []*regexp.Regexp{ + regexes: []*regexp.Regexp{ regexp.MustCompile(`undersized_pgs 7`), }, }, { - ` + input: ` { "osdmap": { "osdmap": { @@ -93,12 +93,12 @@ func TestClusterHealthCollector(t *testing.T) { }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "8 pgs stale"}]} }`, - []*regexp.Regexp{ + regexes: []*regexp.Regexp{ regexp.MustCompile(`stale_pgs 8`), }, }, { - ` + input: ` { "osdmap": { "osdmap": { @@ -110,12 +110,12 @@ func TestClusterHealthCollector(t *testing.T) { }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "recovery 10/20 objects degraded"}]} }`, - []*regexp.Regexp{ + regexes: []*regexp.Regexp{ regexp.MustCompile(`degraded_objects 10`), }, }, { - ` + input: ` { "osdmap": { "osdmap": { @@ -127,12 +127,12 @@ func TestClusterHealthCollector(t *testing.T) { }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]} }`, - []*regexp.Regexp{ + regexes: []*regexp.Regexp{ regexp.MustCompile(`osds_down 3`), }, }, { - ` + input: ` { "osdmap": { "osdmap": { @@ -144,13 +144,61 @@ func TestClusterHealthCollector(t *testing.T) { }, "health": {"summary": []} }`, - []*regexp.Regexp{ + regexes: []*regexp.Regexp{ regexp.MustCompile(`osds 1200`), regexp.MustCompile(`osds_up 1200`), regexp.MustCompile(`osds_in 1190`), regexp.MustCompile(`pgs_remapped 10`), }, }, + { + input: ` +{ + "osdmap": { + "osdmap": { + "num_osds": 1200, + "num_up_osds": 1200, + "num_in_osds": 1190, + "num_remapped_pgs": 10 + } + }, + "health": { "overall_status": "HEALTH_OK" } }`, + regexes: []*regexp.Regexp{ + regexp.MustCompile(`health_status 0`), + }, + }, + { + input: ` +{ + "osdmap": { + "osdmap": { + "num_osds": 1200, + "num_up_osds": 1200, + "num_in_osds": 1190, + "num_remapped_pgs": 10 + } + }, + "health": { "overall_status": "HEALTH_WARN" } }`, + regexes: []*regexp.Regexp{ + regexp.MustCompile(`health_status 1`), + }, + }, + { + input: ` +{ + "osdmap": { + "osdmap": { + "num_osds": 1200, + "num_up_osds": 1200, + "num_in_osds": 1190, + "num_remapped_pgs": 10 + } + }, + "health": { "overall_status": "HEALTH_ERR" } }`, + regexes: []*regexp.Regexp{ + regexp.MustCompile(`health_status 2`), + }, + }, } { func() { collector := NewClusterHealthCollector(NewNoopConn(tt.input))