From 102b2aca8df77ef57229c368dfc913c17f40dfb2 Mon Sep 17 00:00:00 2001 From: Vaibhav Bhembre Date: Fri, 8 Jan 2016 20:37:25 +0000 Subject: [PATCH 1/2] Adds OSD metrics - New metrics are added for tracking total no. of OSDs and their state for e.g. IN, UP, etc. - The count of PGs being moved around is surfaced. - Comments are added for all the health types. --- collectors/health.go | 119 +++++++++++++++++++++++++++++++++----- collectors/health_test.go | 98 +++++++++++++++++++++++++++++-- 2 files changed, 195 insertions(+), 22 deletions(-) diff --git a/collectors/health.go b/collectors/health.go index 682c3f5..f83b90c 100644 --- a/collectors/health.go +++ b/collectors/health.go @@ -27,16 +27,43 @@ import ( // It surfaces changes in the ceph parameters unlike data usage that ClusterUsageCollector // does. type ClusterHealthCollector struct { + // conn holds connection to the Ceph cluster conn Conn - DegradedPGs prometheus.Gauge - UncleanPGs prometheus.Gauge - UndersizedPGs prometheus.Gauge - StalePGs prometheus.Gauge + // DegradedPGs shows the no. of PGs that have some of the replicas + // missing. + DegradedPGs prometheus.Gauge + // UncleanPGs shows the no. of PGs that do not have all objects in the PG + // that are supposed to be in it. + UncleanPGs prometheus.Gauge + + // UndersizedPGs depicts the count of PGs that have fewer copies than configured + // replication level. + UndersizedPGs prometheus.Gauge + + // StalePGs depicts the no. of PGs that are in an unknown state i.e. monitors do not know + // anything about their latest state since their pg mapping was modified. + StalePGs prometheus.Gauge + + // DegradedObjectsCount gives the no. of RADOS objects that constitute the degraded PGs. DegradedObjectsCount prometheus.Gauge + // OSDsDown shows the no. of OSDs that are in the DOWN state. OSDsDown prometheus.Gauge + + // OSDsUp shows the no. of OSDs that are in the UP state and are able to serve requests. + OSDsUp prometheus.Gauge + + // OSDsIn shows the no. 
of OSDs that are marked as IN in the cluster. + OSDsIn prometheus.Gauge + + // OSDsNum shows the count of total OSDs the cluster has. + OSDsNum prometheus.Gauge + + // RemappedPGs show the count of PGs that are currently remapped and needs to be moved + // to newer OSDs. + RemappedPGs prometheus.Gauge } // NewClusterHealthCollector creates a new instance of ClusterHealthCollector to collect health @@ -87,6 +114,34 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector { Help: "Count of OSDs that are in DOWN state", }, ), + OSDsUp: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "osds_up", + Help: "Count of OSDs that are in UP state", + }, + ), + OSDsIn: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "osds_in", + Help: "Count of OSDs that are in IN state and available to serve requests", + }, + ), + OSDsNum: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "osds", + Help: "Count of total OSDs in the cluster", + }, + ), + RemappedPGs: prometheus.NewGauge( + prometheus.GaugeOpts{ + Namespace: cephNamespace, + Name: "pgs_remapped", + Help: "No. 
of PGs that are remapped and incurring cluster-wide movement", + }, + ), } } @@ -98,14 +153,28 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric { c.StalePGs, c.DegradedObjectsCount, c.OSDsDown, + c.OSDsUp, + c.OSDsIn, + c.OSDsNum, + c.RemappedPGs, } } type cephHealthStats struct { - Summary []struct { - Severity string `json:"severity"` - Summary string `json:"summary"` - } `json:"summary"` + Health struct { + Summary []struct { + Severity string `json:"severity"` + Summary string `json:"summary"` + } `json:"summary"` + } `json:"health"` + OSDMap struct { + OSDMap struct { + NumOSDs json.Number `json:"num_osds"` + NumUpOSDs json.Number `json:"num_up_osds"` + NumInOSDs json.Number `json:"num_in_osds"` + NumRemappedPGs json.Number `json:"num_remapped_pgs"` + } `json:"osdmap"` + } `json:"osdmap"` } func (c *ClusterHealthCollector) collect() error { @@ -126,10 +195,6 @@ func (c *ClusterHealthCollector) collect() error { } } - if len(stats.Summary) < 1 { - return nil - } - var ( degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`) uncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`) @@ -139,7 +204,7 @@ func (c *ClusterHealthCollector) collect() error { osdsDownRegex = regexp.MustCompile(`([\d]+)/([\d]+) in osds are down`) ) - for _, s := range stats.Summary { + for _, s := range stats.Health.Summary { matched := degradedRegex.FindStringSubmatch(s.Summary) if len(matched) == 2 { v, err := strconv.Atoi(matched[1]) @@ -193,16 +258,38 @@ func (c *ClusterHealthCollector) collect() error { } c.OSDsDown.Set(float64(v)) } - } + osdsUp, err := stats.OSDMap.OSDMap.NumUpOSDs.Float64() + if err != nil { + return err + } + c.OSDsUp.Set(osdsUp) + + osdsIn, err := stats.OSDMap.OSDMap.NumInOSDs.Float64() + if err != nil { + return err + } + c.OSDsIn.Set(osdsIn) + + osdsNum, err := stats.OSDMap.OSDMap.NumOSDs.Float64() + if err != nil { + return err + } + c.OSDsNum.Set(osdsNum) + + remappedPGs, err := stats.OSDMap.OSDMap.NumRemappedPGs.Float64() + if 
err != nil { + return err + } + c.RemappedPGs.Set(remappedPGs) + return nil } func (c *ClusterHealthCollector) cephUsageCommand() []byte { cmd, err := json.Marshal(map[string]interface{}{ - "prefix": "health", - "detail": "detail", + "prefix": "status", "format": "json", }) if err != nil { diff --git a/collectors/health_test.go b/collectors/health_test.go index fdbc802..6f48b17 100644 --- a/collectors/health_test.go +++ b/collectors/health_test.go @@ -30,41 +30,127 @@ func TestClusterHealthCollector(t *testing.T) { regexes []*regexp.Regexp }{ { - `{"summary": [{"severity": "HEALTH_WARN", "summary": "5 pgs degraded"}]}`, + ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "5 pgs degraded"}]} +}`, []*regexp.Regexp{ regexp.MustCompile(`degraded_pgs 5`), }, }, { - `{"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]}`, + ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]} +}`, []*regexp.Regexp{ regexp.MustCompile(`unclean_pgs 6`), }, }, { - `{"summary": [{"severity": "HEALTH_WARN", "summary": "7 pgs undersized"}]}`, + ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "7 pgs undersized"}]} +}`, []*regexp.Regexp{ regexp.MustCompile(`undersized_pgs 7`), }, }, { - `{"summary": [{"severity": "HEALTH_WARN", "summary": "8 pgs stale"}]}`, + ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "8 pgs stale"}]} +}`, []*regexp.Regexp{ regexp.MustCompile(`stale_pgs 8`), }, }, { - `{"summary": 
[{"severity": "HEALTH_WARN", "summary": "recovery 10/20 objects degraded"}]}`, + ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "recovery 10/20 objects degraded"}]} +}`, []*regexp.Regexp{ regexp.MustCompile(`degraded_objects 10`), }, }, { - `{"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]}`, + ` +{ + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, + "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]} +}`, []*regexp.Regexp{ regexp.MustCompile(`osds_down 3`), }, }, + { + ` +{ + "osdmap": { + "osdmap": { + "num_osds": 1200, + "num_up_osds": 1200, + "num_in_osds": 1190, + "num_remapped_pgs": 10 + } + }, + "health": {"summary": []} +}`, + []*regexp.Regexp{ + regexp.MustCompile(`osds 1200`), + regexp.MustCompile(`osds_up 1200`), + regexp.MustCompile(`osds_in 1190`), + regexp.MustCompile(`pgs_remapped 10`), + }, + }, } { func() { collector := NewClusterHealthCollector(NewNoopConn(tt.input)) From bc5c61b8c6051257e22ec98b7f5a5cfc4e47a4ad Mon Sep 17 00:00:00 2001 From: Vaibhav Bhembre Date: Fri, 8 Jan 2016 21:10:12 +0000 Subject: [PATCH 2/2] Use no. instead of count in the comments of metrics --- collectors/health.go | 6 +- collectors/health_test.go | 112 +++++++++++++++++++------------------- 2 files changed, 59 insertions(+), 59 deletions(-) diff --git a/collectors/health.go b/collectors/health.go index f83b90c..849c5fc 100644 --- a/collectors/health.go +++ b/collectors/health.go @@ -38,7 +38,7 @@ type ClusterHealthCollector struct { // that are supposed to be in it. UncleanPGs prometheus.Gauge - // UndersizedPGs depicts the count of PGs that have fewer copies than configured + // UndersizedPGs depicts the no. of PGs that have fewer copies than configured // replication level. 
UndersizedPGs prometheus.Gauge @@ -58,10 +58,10 @@ type ClusterHealthCollector struct { // OSDsIn shows the no. of OSDs that are marked as IN in the cluster. OSDsIn prometheus.Gauge - // OSDsNum shows the count of total OSDs the cluster has. + // OSDsNum shows the total no. of OSDs the cluster has. OSDsNum prometheus.Gauge - // RemappedPGs show the count of PGs that are currently remapped and needs to be moved + // RemappedPGs shows the no. of PGs that are currently remapped and need to be moved // to newer OSDs. RemappedPGs prometheus.Gauge } diff --git a/collectors/health_test.go b/collectors/health_test.go index 6f48b17..04befc1 100644 --- a/collectors/health_test.go +++ b/collectors/health_test.go @@ -32,14 +32,14 @@ func TestClusterHealthCollector(t *testing.T) { { ` { - "osdmap": { - "osdmap": { - "num_osds": 0, - "num_up_osds": 0, - "num_in_osds": 0, - "num_remapped_pgs": 0 - } - }, + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "5 pgs degraded"}]} }`, []*regexp.Regexp{ @@ -49,14 +49,14 @@ func TestClusterHealthCollector(t *testing.T) { { ` { - "osdmap": { - "osdmap": { - "num_osds": 0, - "num_up_osds": 0, - "num_in_osds": 0, - "num_remapped_pgs": 0 - } - }, + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]} }`, []*regexp.Regexp{ @@ -66,14 +66,14 @@ func TestClusterHealthCollector(t *testing.T) { { ` { - "osdmap": { - "osdmap": { - "num_osds": 0, - "num_up_osds": 0, - "num_in_osds": 0, - "num_remapped_pgs": 0 - } - }, + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "7 pgs undersized"}]} }`, []*regexp.Regexp{ @@ -83,14 +83,14 @@ func 
TestClusterHealthCollector(t *testing.T) { { ` { - "osdmap": { - "osdmap": { - "num_osds": 0, - "num_up_osds": 0, - "num_in_osds": 0, - "num_remapped_pgs": 0 - } - }, + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "8 pgs stale"}]} }`, []*regexp.Regexp{ @@ -100,14 +100,14 @@ func TestClusterHealthCollector(t *testing.T) { { ` { - "osdmap": { - "osdmap": { - "num_osds": 0, - "num_up_osds": 0, - "num_in_osds": 0, - "num_remapped_pgs": 0 - } - }, + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "recovery 10/20 objects degraded"}]} }`, []*regexp.Regexp{ @@ -117,14 +117,14 @@ func TestClusterHealthCollector(t *testing.T) { { ` { - "osdmap": { - "osdmap": { - "num_osds": 0, - "num_up_osds": 0, - "num_in_osds": 0, - "num_remapped_pgs": 0 - } - }, + "osdmap": { + "osdmap": { + "num_osds": 0, + "num_up_osds": 0, + "num_in_osds": 0, + "num_remapped_pgs": 0 + } + }, "health": {"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]} }`, []*regexp.Regexp{ @@ -134,14 +134,14 @@ func TestClusterHealthCollector(t *testing.T) { { ` { - "osdmap": { - "osdmap": { - "num_osds": 1200, - "num_up_osds": 1200, - "num_in_osds": 1190, - "num_remapped_pgs": 10 - } - }, + "osdmap": { + "osdmap": { + "num_osds": 1200, + "num_up_osds": 1200, + "num_in_osds": 1190, + "num_remapped_pgs": 10 + } + }, "health": {"summary": []} }`, []*regexp.Regexp{