Merge pull request #108 from digitalocean/add-recovery-backfill-stats

luminous: add recovery/backfill stats
This commit is contained in:
Vaibhav Bhembre 2018-11-01 12:50:28 -04:00 committed by GitHub
commit 3efbebdb4f
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 161 additions and 19 deletions

View File

@ -106,6 +106,30 @@ type ClusterHealthCollector struct {
// Deep scrubbing reads the data and uses checksums to ensure data integrity.
DeepScrubbingPGs prometheus.Gauge
// RecoveringPGs depicts no. of PGs that are in recovering state.
// The PGs in this state have been dequeued from recovery_wait queue and are
// actively undergoing recovery.
RecoveringPGs prometheus.Gauge
// RecoveryWaitPGs depicts no. of PGs that are in recovery_wait state.
// The PGs in this state are still in queue to start recovery on them.
RecoveryWaitPGs prometheus.Gauge
// BackfillingPGs depicts no. of PGs that are in backfilling state.
// The PGs in this state have been dequeued from backfill_wait queue and are
// actively undergoing backfill.
BackfillingPGs prometheus.Gauge
// BackfillWaitPGs depicts no. of PGs that are in backfill_wait state.
// The PGs in this state are still in queue to start backfill on them.
BackfillWaitPGs prometheus.Gauge
// ForcedRecoveryPGs depicts no. of PGs that are undergoing forced recovery.
ForcedRecoveryPGs prometheus.Gauge
// ForcedBackfillPGs depicts no. of PGs that are undergoing forced backfill.
ForcedBackfillPGs prometheus.Gauge
// SlowRequests depicts no. of total slow requests in the cluster.
// This stat exists only for backwards compatibility.
SlowRequests prometheus.Gauge
@ -237,6 +261,54 @@ func NewClusterHealthCollector(conn Conn, cluster string) *ClusterHealthCollecto
ConstLabels: labels,
},
),
RecoveringPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "recovering_pgs",
Help: "No. of recovering PGs in the cluster",
ConstLabels: labels,
},
),
RecoveryWaitPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "recovery_wait_pgs",
Help: "No. of PGs in the cluster with recovery_wait state",
ConstLabels: labels,
},
),
BackfillingPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "backfilling_pgs",
Help: "No. of backfilling PGs in the cluster",
ConstLabels: labels,
},
),
BackfillWaitPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "backfill_wait_pgs",
Help: "No. of PGs in the cluster with backfill_wait state",
ConstLabels: labels,
},
),
ForcedRecoveryPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "forced_recovery_pgs",
Help: "No. of PGs in the cluster with forced_recovery state",
ConstLabels: labels,
},
),
ForcedBackfillPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "forced_backfill_pgs",
Help: "No. of PGs in the cluster with forced_backfill state",
ConstLabels: labels,
},
),
SlowRequests: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
@ -486,6 +558,12 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
c.PeeringPGs,
c.ScrubbingPGs,
c.DeepScrubbingPGs,
c.RecoveringPGs,
c.RecoveryWaitPGs,
c.BackfillingPGs,
c.BackfillWaitPGs,
c.ForcedRecoveryPGs,
c.ForcedBackfillPGs,
c.SlowRequests,
c.DegradedObjectsCount,
c.MisplacedObjectsCount,
@ -756,24 +834,36 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
}
var (
degradedPGs float64
activePGs float64
uncleanPGs float64
undersizedPGs float64
peeringPGs float64
stalePGs float64
scrubbingPGs float64
deepScrubbingPGs float64
degradedPGs float64
activePGs float64
uncleanPGs float64
undersizedPGs float64
peeringPGs float64
stalePGs float64
scrubbingPGs float64
deepScrubbingPGs float64
recoveringPGs float64
recoveryWaitPGs float64
backfillingPGs float64
backfillWaitPGs float64
forcedRecoveryPGs float64
forcedBackfillPGs float64
pgStateMap = map[string]*float64{
"degraded": &degradedPGs,
"active": &activePGs,
"unclean": &uncleanPGs,
"undersized": &undersizedPGs,
"peering": &peeringPGs,
"stale": &stalePGs,
"scrubbing": &scrubbingPGs,
"scrubbing+deep": &deepScrubbingPGs,
"degraded": &degradedPGs,
"active": &activePGs,
"unclean": &uncleanPGs,
"undersized": &undersizedPGs,
"peering": &peeringPGs,
"stale": &stalePGs,
"scrubbing": &scrubbingPGs,
"scrubbing+deep": &deepScrubbingPGs,
"recovering": &recoveringPGs,
"recovery_wait": &recoveryWaitPGs,
"backfilling": &backfillingPGs,
"backfill_wait": &backfillWaitPGs,
"forced_recovery": &forcedRecoveryPGs,
"forced_backfill": &forcedBackfillPGs,
}
)
@ -809,6 +899,24 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
if *pgStateMap["scrubbing+deep"] > 0 {
c.DeepScrubbingPGs.Set(*pgStateMap["scrubbing+deep"])
}
if *pgStateMap["recovering"] > 0 {
c.RecoveringPGs.Set(*pgStateMap["recovering"])
}
if *pgStateMap["recovery_wait"] > 0 {
c.RecoveryWaitPGs.Set(*pgStateMap["recovery_wait"])
}
if *pgStateMap["backfilling"] > 0 {
c.BackfillingPGs.Set(*pgStateMap["backfilling"])
}
if *pgStateMap["backfill_wait"] > 0 {
c.BackfillWaitPGs.Set(*pgStateMap["backfill_wait"])
}
if *pgStateMap["forced_recovery"] > 0 {
c.ForcedRecoveryPGs.Set(*pgStateMap["forced_recovery"])
}
if *pgStateMap["forced_backfill"] > 0 {
c.ForcedBackfillPGs.Set(*pgStateMap["forced_backfill"])
}
c.ClientReadBytesPerSec.Set(stats.PGMap.ReadBytePerSec)
c.ClientWriteBytesPerSec.Set(stats.PGMap.WriteBytePerSec)

View File

@ -615,7 +615,35 @@ $ sudo ceph -s
{
"count": 10,
"state_name": "scrubbing+deep"
}
},
{
"state_name": "remapped+recovering",
"count": 5
},
{
"state_name": "active+remapped+backfilling",
"count": 2
},
{
"state_name": "recovery_wait+inconsistent",
"count": 2
},
{
"state_name": "recovery_wait+remapped",
"count": 1
},
{
"state_name": "active+undersized+remapped+backfill_wait",
"count": 1
},
{
"state_name": "active+undersized+remapped+backfill_wait+forced_backfill",
"count": 10
},
{
"state_name": "active+forced_recovery+undersized",
"count": 1
}
],
"num_pgs": 9208,
"num_pools": 29,
@ -633,14 +661,20 @@ $ sudo ceph -s
}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`active_pgs{cluster="ceph"} 30`),
regexp.MustCompile(`active_pgs{cluster="ceph"} 44`),
regexp.MustCompile(`degraded_pgs{cluster="ceph"} 40`),
regexp.MustCompile(`unclean_pgs{cluster="ceph"} 30`),
regexp.MustCompile(`undersized_pgs{cluster="ceph"} 40`),
regexp.MustCompile(`undersized_pgs{cluster="ceph"} 52`),
regexp.MustCompile(`stale_pgs{cluster="ceph"} 30`),
regexp.MustCompile(`peering_pgs{cluster="ceph"} 10`),
regexp.MustCompile(`scrubbing_pgs{cluster="ceph"} 20`),
regexp.MustCompile(`deep_scrubbing_pgs{cluster="ceph"} 10`),
regexp.MustCompile(`recovering_pgs{cluster="ceph"} 5`),
regexp.MustCompile(`recovery_wait_pgs{cluster="ceph"} 3`),
regexp.MustCompile(`backfilling_pgs{cluster="ceph"} 2`),
regexp.MustCompile(`backfill_wait_pgs{cluster="ceph"} 11`),
regexp.MustCompile(`forced_recovery_pgs{cluster="ceph"} 1`),
regexp.MustCompile(`forced_backfill_pgs{cluster="ceph"} 10`),
regexp.MustCompile(`recovery_io_bytes{cluster="ceph"} 65536`),
regexp.MustCompile(`recovery_io_keys{cluster="ceph"} 25`),
regexp.MustCompile(`recovery_io_objects{cluster="ceph"} 140`),