mirror of
https://github.com/digitalocean/ceph_exporter
synced 2025-02-21 13:56:48 +00:00
health: add osds_too_many_repair gauge
This commit is contained in:
parent
d33169f435
commit
ef8b362842
@ -175,6 +175,9 @@ type ClusterHealthCollector struct {
|
||||
// NewCrashReportCount reports if new Ceph daemon crash reports are available
|
||||
NewCrashReportCount prometheus.Gauge
|
||||
|
||||
// TooManyRepairs reports the number of OSDs exceeding mon_osd_warn_num_repaired
|
||||
TooManyRepairs prometheus.Gauge
|
||||
|
||||
// Objects shows the total number of RADOS objects that are currently allocated
|
||||
Objects prometheus.Gauge
|
||||
|
||||
@ -327,6 +330,7 @@ func NewClusterHealthCollector(conn Conn, cluster string, logger *logrus.Logger)
|
||||
"OSD_ROOT_DOWN": 1,
|
||||
"OSD_ROW_DOWN": 1,
|
||||
"OSD_SCRUB_ERRORS": 2,
|
||||
"OSD_TOO_MANY_REPAIRS": 1,
|
||||
"PG_AVAILABILITY": 1,
|
||||
"PG_BACKFILL_FULL": 2,
|
||||
"PG_DAMAGED": 2,
|
||||
@ -610,6 +614,14 @@ func NewClusterHealthCollector(conn Conn, cluster string, logger *logrus.Logger)
|
||||
ConstLabels: labels,
|
||||
},
|
||||
),
|
||||
TooManyRepairs: prometheus.NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: cephNamespace,
|
||||
Name: "osds_too_many_repair",
|
||||
Help: "Number of OSDs with too many repaired reads",
|
||||
ConstLabels: labels,
|
||||
},
|
||||
),
|
||||
Objects: prometheus.NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: cephNamespace,
|
||||
@ -908,6 +920,7 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
|
||||
c.DegradedObjectsCount,
|
||||
c.MisplacedObjectsCount,
|
||||
c.NewCrashReportCount,
|
||||
c.TooManyRepairs,
|
||||
c.Objects,
|
||||
c.OSDMapFlagFull,
|
||||
c.OSDMapFlagPauseRd,
|
||||
@ -1078,6 +1091,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
|
||||
stuckStaleRegex = regexp.MustCompile(`([\d]+) pgs stuck stale`)
|
||||
slowOpsRegexNautilus = regexp.MustCompile(`([\d]+) slow ops, oldest one blocked for ([\d]+) sec`)
|
||||
newCrashreportRegex = regexp.MustCompile(`([\d]+) daemons have recently crashed`)
|
||||
tooManyRepairs = regexp.MustCompile(`Too many repaired reads on ([\d]+) OSDs`)
|
||||
osdmapFlagsRegex = regexp.MustCompile(`([^ ]+) flag\(s\) set`)
|
||||
)
|
||||
|
||||
@ -1164,6 +1178,17 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
|
||||
}
|
||||
}
|
||||
|
||||
if k == "OSD_TOO_MANY_REPAIRS" {
|
||||
matched := tooManyRepairs.FindStringSubmatch(check.Summary.Message)
|
||||
if len(matched) == 2 {
|
||||
v, err := strconv.Atoi(matched[1])
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
c.TooManyRepairs.Set(float64(v))
|
||||
}
|
||||
}
|
||||
|
||||
if k == "OSDMAP_FLAGS" {
|
||||
matched := osdmapFlagsRegex.FindStringSubmatch(check.Summary.Message)
|
||||
if len(matched) > 0 {
|
||||
|
@ -475,6 +475,25 @@ $ sudo ceph -s
|
||||
},
|
||||
{
|
||||
input: `
|
||||
{
|
||||
"health": {
|
||||
"checks": {
|
||||
"OSD_TOO_MANY_REPAIRS": {
|
||||
"severity": "HEALTH_WARN",
|
||||
"summary": {
|
||||
"message": "Too many repaired reads on 25 OSDs"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}`,
|
||||
reMatch: []*regexp.Regexp{
|
||||
regexp.MustCompile(`osds_too_many_repair{cluster="ceph"} 25`),
|
||||
regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
|
||||
},
|
||||
},
|
||||
{
|
||||
input: `
|
||||
{
|
||||
"health": {
|
||||
"checks": {
|
||||
|
Loading…
Reference in New Issue
Block a user