health: fix osds_down metric

2025-02-18 04:16:49 +00:00 · 2016-12-01 13:23:05 +05:30 · 2016-12-01 13:23:05 +05:30 · 2649185010
commit 2649185010
parent 0cf254f41d
2 changed files with 9 additions and 15 deletions
--- a/collectors/health.go
+++ b/collectors/health.go
@@ -446,7 +446,6 @@ func (c *ClusterHealthCollector) collect() error {
 		stuckStaleRegex       = regexp.MustCompile(`([\d]+) pgs stuck stale`)
 		degradedObjectsRegex  = regexp.MustCompile(`recovery ([\d]+)/([\d]+) objects degraded`)
 		misplacedObjectsRegex = regexp.MustCompile(`recovery ([\d]+)/([\d]+) objects misplaced`)
-		osdsDownRegex         = regexp.MustCompile(`([\d]+)/([\d]+) in osds are down`)
 	)

 	for _, s := range stats.Health.Summary {
@@ -539,15 +538,6 @@ func (c *ClusterHealthCollector) collect() error {
 			}
 			c.MisplacedObjectsCount.Set(float64(v))
 		}
-
-		matched = osdsDownRegex.FindStringSubmatch(s.Summary)
-		if len(matched) == 3 {
-			v, err := strconv.Atoi(matched[1])
-			if err != nil {
-				return err
-			}
-			c.OSDsDown.Set(float64(v))
-		}
 	}

 	osdsUp, err := stats.OSDMap.OSDMap.NumUpOSDs.Float64()
@@ -568,6 +558,10 @@ func (c *ClusterHealthCollector) collect() error {
 	}
 	c.OSDsNum.Set(osdsNum)

+	// Ceph (until v10.2.3) doesn't expose the number of down OSDs in its
+	// status, so we compute it as the total OSD count minus the up OSD count.
+	c.OSDsDown.Set(osdsNum - osdsUp)
+
 	remappedPGs, err := stats.OSDMap.OSDMap.NumRemappedPGs.Float64()
 	if err != nil {
 		return err
@@ -822,6 +816,7 @@ func (c *ClusterHealthCollector) collectCacheIO(clientStr string) error {
 	}
 	return nil
 }
+
 // Describe sends all the descriptions of individual metrics of ClusterHealthCollector
 // to the provided prometheus channel.
 func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
--- a/collectors/health_test.go
+++ b/collectors/health_test.go
@@ -204,16 +204,15 @@ func TestClusterHealthCollector(t *testing.T) {
 {
 	"osdmap": {
 		"osdmap": {
-			"num_osds": 0,
-			"num_up_osds": 0,
+			"num_osds": 20,
+			"num_up_osds": 10,
 			"num_in_osds": 0,
 			"num_remapped_pgs": 0
 		}
-	},
-	"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]}
+	}
 }`,
 			regexes: []*regexp.Regexp{
-				regexp.MustCompile(`osds_down 3`),
+				regexp.MustCompile(`osds_down 10`),
 			},
 		},
 		{