From 31bb74f8eac0f09354ed75c08127bc1cd4f6ebd6 Mon Sep 17 00:00:00 2001 From: Vaibhav Bhembre Date: Thu, 22 Nov 2018 11:22:11 -0500 Subject: [PATCH] health: add stats for down PGs --- collectors/health.go | 17 +++++++++++++++++ collectors/health_test.go | 9 +++++++++ 2 files changed, 26 insertions(+) diff --git a/collectors/health.go b/collectors/health.go index fef2920..6f0d3bc 100644 --- a/collectors/health.go +++ b/collectors/health.go @@ -130,6 +130,9 @@ type ClusterHealthCollector struct { // ForcedBackfillPGs depicts no. of PGs that are undergoing forced backfill. ForcedBackfillPGs prometheus.Gauge + // DownPGs depicts no. of PGs that are currently down and not able to serve traffic. + DownPGs prometheus.Gauge + // SlowRequests depicts no. of total slow requests in the cluster // This stat exists only for backwards compatbility. SlowRequests prometheus.Gauge @@ -309,6 +312,14 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto ConstLabels: labels, }, ), + DownPGs: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "down_pgs", + Help: "No. 
of PGs in the cluster in down state", + ConstLabels: labels, + }, + ), SlowRequests: prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: cephNamespace, @@ -564,6 +575,7 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric { c.BackfillWaitPGs, c.ForcedRecoveryPGs, c.ForcedBackfillPGs, + c.DownPGs, c.SlowRequests, c.DegradedObjectsCount, c.MisplacedObjectsCount, @@ -848,6 +860,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { backfillWaitPGs float64 forcedRecoveryPGs float64 forcedBackfillPGs float64 + downPGs float64 pgStateMap = map[string]*float64{ "degraded": &degradedPGs, @@ -864,6 +877,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { "backfill_wait": &backfillWaitPGs, "forced_recovery": &forcedRecoveryPGs, "forced_backfill": &forcedBackfillPGs, + "down": &downPGs, } ) @@ -917,6 +931,9 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { if *pgStateMap["forced_backfill"] > 0 { c.ForcedBackfillPGs.Set(*pgStateMap["forced_backfill"]) } + if *pgStateMap["down"] > 0 { + c.DownPGs.Set(*pgStateMap["down"]) + } c.ClientReadBytesPerSec.Set(stats.PGMap.ReadBytePerSec) c.ClientWriteBytesPerSec.Set(stats.PGMap.WriteBytePerSec) diff --git a/collectors/health_test.go b/collectors/health_test.go index 29ab293..8c98904 100644 --- a/collectors/health_test.go +++ b/collectors/health_test.go @@ -640,6 +640,14 @@ $ sudo ceph -s "state_name": "active+undersized+remapped+backfill_wait+forced_backfill", "count": 10 }, + { + "state_name": "down", + "count": 6 + }, + { + "state_name": "down+remapped", + "count": 31 + }, { "state_name": "active+forced_recovery+undersized", "count": 1 @@ -675,6 +683,7 @@ $ sudo ceph -s regexp.MustCompile(`backfill_wait_pgs{cluster="ceph"} 11`), regexp.MustCompile(`forced_recovery_pgs{cluster="ceph"} 1`), regexp.MustCompile(`forced_backfill_pgs{cluster="ceph"} 10`), + regexp.MustCompile(`down_pgs{cluster="ceph"} 37`), 
regexp.MustCompile(`recovery_io_bytes{cluster="ceph"} 65536`), regexp.MustCompile(`recovery_io_keys{cluster="ceph"} 25`), regexp.MustCompile(`recovery_io_objects{cluster="ceph"} 140`),