Merge pull request #110 from digitalocean/luminous_pg_down

health: add stats for down PGs
2018-11-22 11:25:46 -05:00 · 2018-11-22 11:25:46 -05:00 · f91b1241dc
parent 3efbebdb4f 31bb74f8ea
commit f91b1241dc
2 changed files with 26 additions and 0 deletions
--- a/collectors/health.go
+++ b/collectors/health.go
@ -130,6 +130,9 @@ type ClusterHealthCollector struct {
 	// ForcedBackfillPGs depicts no. of PGs that are undergoing forced backfill.
 	ForcedBackfillPGs prometheus.Gauge

+	// DownPGs depicts no. of PGs that are currently down and not able to serve traffic.
+	DownPGs prometheus.Gauge
+
 	// SlowRequests depicts no. of total slow requests in the cluster
 	// This stat exists only for backwards compatibility.
 	SlowRequests prometheus.Gauge
@ -309,6 +312,14 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto
 				ConstLabels: labels,
 			},
 		),
+		DownPGs: prometheus.NewGauge(
+			prometheus.GaugeOpts{
+				Namespace:   cephNamespace,
+				Name:        "down_pgs",
+				Help:        "No. of PGs in the cluster in down state",
+				ConstLabels: labels,
+			},
+		),
 		SlowRequests: prometheus.NewGauge(
 			prometheus.GaugeOpts{
 				Namespace:   cephNamespace,
@ -564,6 +575,7 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
 		c.BackfillWaitPGs,
 		c.ForcedRecoveryPGs,
 		c.ForcedBackfillPGs,
+		c.DownPGs,
 		c.SlowRequests,
 		c.DegradedObjectsCount,
 		c.MisplacedObjectsCount,
@ -848,6 +860,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
 		backfillWaitPGs   float64
 		forcedRecoveryPGs float64
 		forcedBackfillPGs float64
+		downPGs           float64

 		pgStateMap = map[string]*float64{
 			"degraded":        &degradedPGs,
@ -864,6 +877,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
 			"backfill_wait":   &backfillWaitPGs,
 			"forced_recovery": &forcedRecoveryPGs,
 			"forced_backfill": &forcedBackfillPGs,
+			"down":            &downPGs,
 		}
 	)

@ -917,6 +931,9 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
 	if *pgStateMap["forced_backfill"] > 0 {
 		c.ForcedBackfillPGs.Set(*pgStateMap["forced_backfill"])
 	}
+	if *pgStateMap["down"] > 0 {
+		c.DownPGs.Set(*pgStateMap["down"])
+	}

 	c.ClientReadBytesPerSec.Set(stats.PGMap.ReadBytePerSec)
 	c.ClientWriteBytesPerSec.Set(stats.PGMap.WriteBytePerSec)
--- a/collectors/health_test.go
+++ b/collectors/health_test.go
@ -640,6 +640,14 @@ $ sudo ceph -s
                "state_name": "active+undersized+remapped+backfill_wait+forced_backfill",
                "count": 10
            },
+            {
+                "state_name": "down",
+                "count": 6
+            },
+            {
+                "state_name": "down+remapped",
+                "count": 31
+            },
            {
                "state_name": "active+forced_recovery+undersized",
                "count": 1
@ -675,6 +683,7 @@ $ sudo ceph -s
 				regexp.MustCompile(`backfill_wait_pgs{cluster="ceph"} 11`),
 				regexp.MustCompile(`forced_recovery_pgs{cluster="ceph"} 1`),
 				regexp.MustCompile(`forced_backfill_pgs{cluster="ceph"} 10`),
+				regexp.MustCompile(`down_pgs{cluster="ceph"} 37`),
 				regexp.MustCompile(`recovery_io_bytes{cluster="ceph"} 65536`),
 				regexp.MustCompile(`recovery_io_keys{cluster="ceph"} 25`),
 				regexp.MustCompile(`recovery_io_objects{cluster="ceph"} 140`),