Merge pull request #204 from digitalocean/repair-counter

collectors/health: add repair state checking
This commit is contained in:
Matt1360 2021-12-21 15:07:04 -04:00 committed by GitHub
commit e8ea7d7e66
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 20 additions and 0 deletions

View File

@ -158,6 +158,9 @@ type ClusterHealthCollector struct {
// SnaptrimWaitPGs depicts no. of PGs that are currently waiting to snaptrim
SnaptrimWaitPGs prometheus.Gauge
// RepairingPGs depicts no. of PGs that are currently repairing
RepairingPGs prometheus.Gauge
// SlowOps depicts no. of total slow ops in the cluster
SlowOps prometheus.Gauge
@ -507,6 +510,14 @@ func NewClusterHealthCollector(conn Conn, cluster string, logger *logrus.Logger)
ConstLabels: labels,
},
),
RepairingPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "repairing_pgs",
Help: "No. of PGs in the cluster with repair state",
ConstLabels: labels,
},
),
// with Nautilus, SLOW_OPS has replaced both REQUEST_SLOW and REQUEST_STUCK
// therefore slow_requests is deprecated, but for backwards compatibility
// the metric name will be kept the same for the time being
@ -916,6 +927,7 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
c.InconsistentPGs,
c.SnaptrimPGs,
c.SnaptrimWaitPGs,
c.RepairingPGs,
c.SlowOps,
c.DegradedObjectsCount,
c.MisplacedObjectsCount,
@ -1252,6 +1264,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
inconsistentPGs float64
snaptrimPGs float64
snaptrimWaitPGs float64
repairingPGs float64
pgStateCounterMap = map[string]*float64{
"degraded": &degradedPGs,
@ -1273,6 +1286,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
"inconsistent": &inconsistentPGs,
"snaptrim": &snaptrimPGs,
"snaptrim_wait": &snaptrimWaitPGs,
"repair": &repairingPGs,
}
pgStateGaugeMap = map[string]prometheus.Gauge{
"degraded": c.DegradedPGs,
@ -1294,6 +1308,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
"inconsistent": c.InconsistentPGs,
"snaptrim": c.SnaptrimPGs,
"snaptrim_wait": c.SnaptrimWaitPGs,
"repair": c.RepairingPGs,
}
)

View File

@ -340,6 +340,10 @@ $ sudo ceph -s
"state_name": "active+clean+inconsistent",
"count": 1
},
{
"state_name": "active+clean+repair",
"count": 1
},
{
"state_name": "active+clean+snaptrim",
"count": 15
@ -362,6 +366,7 @@ $ sudo ceph -s
regexp.MustCompile(`cluster_objects{cluster="ceph"} 13156`),
regexp.MustCompile(`snaptrim_pgs{cluster="ceph"} 15`),
regexp.MustCompile(`snaptrim_wait_pgs{cluster="ceph"} 25`),
// Assert on the exported metric name ("repairing_pgs"), not the raw PG state key ("repair").
regexp.MustCompile(`repairing_pgs{cluster="ceph"} 1`),
},
},
{