diff --git a/collectors/health.go b/collectors/health.go index ea569dd..68f1b15 100644 --- a/collectors/health.go +++ b/collectors/health.go @@ -37,18 +37,35 @@ type ClusterHealthCollector struct { // missing. DegradedPGs prometheus.Gauge + // StuckDegradedPGs shows the no. of PGs that have some of the replicas + // missing, and are stuck in that state. + StuckDegradedPGs prometheus.Gauge + // UncleanPGs shows the no. of PGs that do not have all objects in the PG // that are supposed to be in it. UncleanPGs prometheus.Gauge + // StuckUncleanPGs shows the no. of PGs that do not have all objects in the PG + // that are supposed to be in it, and are stuck in that state. + StuckUncleanPGs prometheus.Gauge + // UndersizedPGs depicts the no. of PGs that have fewer copies than configured // replication level. UndersizedPGs prometheus.Gauge + // StuckUndersizedPGs depicts the no. of PGs that have fewer copies than configured + // replication level, and are stuck in that state. + StuckUndersizedPGs prometheus.Gauge + // StalePGs depicts no. of PGs that are in an unknown state i.e. monitors do not know // anything about their latest state since their pg mapping was modified. StalePGs prometheus.Gauge + // StuckStalePGs depicts no. of PGs that are in an unknown state i.e. monitors do not know + // anything about their latest state since their pg mapping was modified, and are stuck + // in that state. + StuckStalePGs prometheus.Gauge + // DegradedObjectsCount gives the no. of RADOS objects are constitute the degraded PGs. DegradedObjectsCount prometheus.Gauge @@ -101,6 +118,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector { Help: "No. of PGs in a degraded state", }, ), + StuckDegradedPGs: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "stuck_degraded_pgs", + Help: "No. of PGs stuck in a degraded state", + }, + ), UncleanPGs: prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: cephNamespace, @@ -108,6 +132,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector { Help: "No. of PGs in an unclean state", }, ), + StuckUncleanPGs: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "stuck_unclean_pgs", + Help: "No. of PGs stuck in an unclean state", + }, + ), UndersizedPGs: prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: cephNamespace, @@ -115,6 +146,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector { Help: "No. of undersized PGs in the cluster", }, ), + StuckUndersizedPGs: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "stuck_undersized_pgs", + Help: "No. of stuck undersized PGs in the cluster", + }, + ), StalePGs: prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: cephNamespace, @@ -122,6 +160,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector { Help: "No. of stale PGs in the cluster", }, ), + StuckStalePGs: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "stuck_stale_pgs", + Help: "No. of stuck stale PGs in the cluster", + }, + ), DegradedObjectsCount: prometheus.NewGauge( prometheus.GaugeOpts{ Namespace: cephNamespace, @@ -171,9 +216,13 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric { return []prometheus.Metric{ c.HealthStatus, c.DegradedPGs, + c.StuckDegradedPGs, c.UncleanPGs, + c.StuckUncleanPGs, c.UndersizedPGs, + c.StuckUndersizedPGs, c.StalePGs, + c.StuckStalePGs, c.DegradedObjectsCount, c.OSDsDown, c.OSDsUp, @@ -232,9 +281,13 @@ func (c *ClusterHealthCollector) collect() error { var ( degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`) - uncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`) + stuckDegradedRegex = regexp.MustCompile(`([\d]+) pgs stuck degraded`) + uncleanRegex = regexp.MustCompile(`([\d]+) pgs unclean`) + stuckUncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`) undersizedRegex = regexp.MustCompile(`([\d]+) pgs undersized`) + stuckUndersizedRegex = regexp.MustCompile(`([\d]+) pgs stuck undersized`) staleRegex = regexp.MustCompile(`([\d]+) pgs stale`) + stuckStaleRegex = regexp.MustCompile(`([\d]+) pgs stuck stale`) degradedObjectsRegex = regexp.MustCompile(`recovery ([\d]+)/([\d]+) objects degraded`) osdsDownRegex = regexp.MustCompile(`([\d]+)/([\d]+) in osds are down`) ) @@ -249,6 +302,15 @@ func (c *ClusterHealthCollector) collect() error { c.DegradedPGs.Set(float64(v)) } + matched = stuckDegradedRegex.FindStringSubmatch(s.Summary) + if len(matched) == 2 { + v, err := strconv.Atoi(matched[1]) + if err != nil { + return err + } + c.StuckDegradedPGs.Set(float64(v)) + } + matched = uncleanRegex.FindStringSubmatch(s.Summary) if len(matched) == 2 { v, err := strconv.Atoi(matched[1]) @@ -258,6 +320,15 @@ func (c *ClusterHealthCollector) collect() error { c.UncleanPGs.Set(float64(v)) } + matched = stuckUncleanRegex.FindStringSubmatch(s.Summary) + if len(matched) == 2 { + v, err := strconv.Atoi(matched[1]) + if err != nil { + return err + } + c.StuckUncleanPGs.Set(float64(v)) + } + matched = undersizedRegex.FindStringSubmatch(s.Summary) if len(matched) == 2 { v, err := strconv.Atoi(matched[1]) @@ -267,6 +338,15 @@ func (c *ClusterHealthCollector) collect() error { c.UndersizedPGs.Set(float64(v)) } + matched = stuckUndersizedRegex.FindStringSubmatch(s.Summary) + if len(matched) == 2 { + v, err := strconv.Atoi(matched[1]) + if err != nil { + return err + } + c.StuckUndersizedPGs.Set(float64(v)) + } + matched = staleRegex.FindStringSubmatch(s.Summary) if len(matched) == 2 { v, err := strconv.Atoi(matched[1]) @@ -276,6 +356,15 @@ func (c *ClusterHealthCollector) collect() error { c.StalePGs.Set(float64(v)) } + matched = stuckStaleRegex.FindStringSubmatch(s.Summary) + if len(matched) == 2 { + v, err := strconv.Atoi(matched[1]) + if err != nil { + return err + } + c.StuckStalePGs.Set(float64(v)) + } + matched = degradedObjectsRegex.FindStringSubmatch(s.Summary) if len(matched) == 3 { v, err := strconv.Atoi(matched[1]) diff --git a/collectors/health_test.go b/collectors/health_test.go index 11a8b76..308d89d 100644 --- a/collectors/health_test.go +++ b/collectors/health_test.go @@ -57,7 +57,24 @@ func TestClusterHealthCollector(t *testing.T) { "num_remapped_pgs": 0 } }, - "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]} + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "15 pgs stuck degraded"}]} +}`, + regexes: []*regexp.Regexp{ + regexp.MustCompile(`stuck_degraded_pgs 15`), + }, + }, + { + input: ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs unclean"}]} }`, regexes: []*regexp.Regexp{ regexp.MustCompile(`unclean_pgs 6`), @@ -65,6 +82,23 @@ func TestClusterHealthCollector(t *testing.T) { }, { input: ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "16 pgs stuck unclean"}]} +}`, + regexes: []*regexp.Regexp{ + regexp.MustCompile(`stuck_unclean_pgs 16`), + }, + }, + { + input: ` { "osdmap": { "osdmap": { @@ -82,6 +116,23 @@ func TestClusterHealthCollector(t *testing.T) { }, { input: ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "17 pgs stuck undersized"}]} +}`, + regexes: []*regexp.Regexp{ + regexp.MustCompile(`stuck_undersized_pgs 17`), + }, + }, + { + input: ` { "osdmap": { "osdmap": { @@ -99,6 +150,23 @@ func TestClusterHealthCollector(t *testing.T) { }, { input: ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "18 pgs stuck stale"}]} +}`, + regexes: []*regexp.Regexp{ + regexp.MustCompile(`stuck_stale_pgs 18`), + }, + }, + { + input: ` { "osdmap": { "osdmap": {