From 47b7ae2ed638587cd87b6de7397359d2406a635d Mon Sep 17 00:00:00 2001 From: Matt Vandersomething Date: Tue, 21 Dec 2021 14:24:47 -0400 Subject: [PATCH] collectors/health: add repair state checking --- collectors/health.go | 15 +++++++++++++++ collectors/health_test.go | 5 +++++ 2 files changed, 20 insertions(+) diff --git a/collectors/health.go b/collectors/health.go index 3a9b0f8..701ab65 100644 --- a/collectors/health.go +++ b/collectors/health.go @@ -158,6 +158,9 @@ type ClusterHealthCollector struct { // SnaptrimWaitPGs depicts no. of PGs that are currently waiting to snaptrim SnaptrimWaitPGs prometheus.Gauge + // RepairingPGs depicts no. of PGs that are currently repairing + RepairingPGs prometheus.Gauge + // SlowOps depicts no. of total slow ops in the cluster SlowOps prometheus.Gauge @@ -507,6 +510,14 @@ func NewClusterHealthCollector(conn Conn, cluster string, logger *logrus.Logger) ConstLabels: labels, }, ), + RepairingPGs: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "repairing_pgs", + Help: "No. 
of PGs in the cluster with repair state", + ConstLabels: labels, + }, + ), // with Nautilus, SLOW_OPS has replaced both REQUEST_SLOW and REQUEST_STUCK // therefore slow_requests is deprecated, but for backwards compatibility // the metric name will be kept the same for the time being @@ -916,6 +927,7 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric { c.InconsistentPGs, c.SnaptrimPGs, c.SnaptrimWaitPGs, + c.RepairingPGs, c.SlowOps, c.DegradedObjectsCount, c.MisplacedObjectsCount, @@ -1252,6 +1264,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { inconsistentPGs float64 snaptrimPGs float64 snaptrimWaitPGs float64 + repairingPGs float64 pgStateCounterMap = map[string]*float64{ "degraded": &degradedPGs, @@ -1273,6 +1286,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { "inconsistent": &inconsistentPGs, "snaptrim": &snaptrimPGs, "snaptrim_wait": &snaptrimWaitPGs, + "repair": &repairingPGs, } pgStateGaugeMap = map[string]prometheus.Gauge{ "degraded": c.DegradedPGs, @@ -1294,6 +1308,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error { "inconsistent": c.InconsistentPGs, "snaptrim": c.SnaptrimPGs, "snaptrim_wait": c.SnaptrimWaitPGs, + "repair": c.RepairingPGs, } ) diff --git a/collectors/health_test.go b/collectors/health_test.go index 64aba9c..876f6cf 100644 --- a/collectors/health_test.go +++ b/collectors/health_test.go @@ -340,6 +340,10 @@ $ sudo ceph -s "state_name": "active+clean+inconsistent", "count": 1 }, + { + "state_name": "active+clean+repair", + "count": 1 + }, { "state_name": "active+clean+snaptrim", "count": 15 @@ -362,6 +366,7 @@ $ sudo ceph -s regexp.MustCompile(`cluster_objects{cluster="ceph"} 13156`), regexp.MustCompile(`snaptrim_pgs{cluster="ceph"} 15`), regexp.MustCompile(`snaptrim_wait_pgs{cluster="ceph"} 25`), + regexp.MustCompile(`repair{cluster="ceph"} 1`), }, }, {