health: add osds_too_many_repair gauge

This commit is contained in:
Alex Marangone 2021-12-02 09:34:23 -08:00
parent d33169f435
commit ef8b362842
2 changed files with 44 additions and 0 deletions

View File

@ -175,6 +175,9 @@ type ClusterHealthCollector struct {
// NewCrashReportCount reports if new Ceph daemon crash reports are available
NewCrashReportCount prometheus.Gauge
// TooManyRepairs reports the number of OSDs whose repaired-read count exceeds mon_osd_warn_num_repaired
TooManyRepairs prometheus.Gauge
// Objects shows the total number of RADOS objects that are currently allocated
Objects prometheus.Gauge
@ -327,6 +330,7 @@ func NewClusterHealthCollector(conn Conn, cluster string, logger *logrus.Logger)
"OSD_ROOT_DOWN": 1,
"OSD_ROW_DOWN": 1,
"OSD_SCRUB_ERRORS": 2,
"OSD_TOO_MANY_REPAIRS": 1,
"PG_AVAILABILITY": 1,
"PG_BACKFILL_FULL": 2,
"PG_DAMAGED": 2,
@ -610,6 +614,14 @@ func NewClusterHealthCollector(conn Conn, cluster string, logger *logrus.Logger)
ConstLabels: labels,
},
),
TooManyRepairs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osds_too_many_repair",
Help: "Number of OSDs with too many repaired reads",
ConstLabels: labels,
},
),
Objects: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
@ -908,6 +920,7 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
c.DegradedObjectsCount,
c.MisplacedObjectsCount,
c.NewCrashReportCount,
c.TooManyRepairs,
c.Objects,
c.OSDMapFlagFull,
c.OSDMapFlagPauseRd,
@ -1078,6 +1091,7 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
stuckStaleRegex = regexp.MustCompile(`([\d]+) pgs stuck stale`)
slowOpsRegexNautilus = regexp.MustCompile(`([\d]+) slow ops, oldest one blocked for ([\d]+) sec`)
newCrashreportRegex = regexp.MustCompile(`([\d]+) daemons have recently crashed`)
tooManyRepairs = regexp.MustCompile(`Too many repaired reads on ([\d]+) OSDs`)
osdmapFlagsRegex = regexp.MustCompile(`([^ ]+) flag\(s\) set`)
)
@ -1164,6 +1178,17 @@ func (c *ClusterHealthCollector) collect(ch chan<- prometheus.Metric) error {
}
}
if k == "OSD_TOO_MANY_REPAIRS" {
matched := tooManyRepairs.FindStringSubmatch(check.Summary.Message)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.TooManyRepairs.Set(float64(v))
}
}
if k == "OSDMAP_FLAGS" {
matched := osdmapFlagsRegex.FindStringSubmatch(check.Summary.Message)
if len(matched) > 0 {

View File

@ -475,6 +475,25 @@ $ sudo ceph -s
},
{
input: `
{
"health": {
"checks": {
"OSD_TOO_MANY_REPAIRS": {
"severity": "HEALTH_WARN",
"summary": {
"message": "Too many repaired reads on 25 OSDs"
}
}
}
}
}`,
reMatch: []*regexp.Regexp{
regexp.MustCompile(`osds_too_many_repair{cluster="ceph"} 25`),
regexp.MustCompile(`health_status_interp{cluster="ceph"} 1`),
},
},
{
input: `
{
"health": {
"checks": {