mirror of
https://github.com/digitalocean/ceph_exporter
synced 2025-02-19 21:06:49 +00:00
collectors/health: include a definitive ceph health check
This commit is contained in:
parent
d7b527366c
commit
99c0dbcfc6
@ -30,6 +30,9 @@ type ClusterHealthCollector struct {
|
||||
// conn holds connection to the Ceph cluster
|
||||
conn Conn
|
||||
|
||||
// HealthStatus shows the overall health status of a given cluster.
|
||||
HealthStatus prometheus.Gauge
|
||||
|
||||
// DegradedPGs shows the number of PGs that have some of the replicas
|
||||
// missing.
|
||||
DegradedPGs prometheus.Gauge
|
||||
@ -66,12 +69,31 @@ type ClusterHealthCollector struct {
|
||||
RemappedPGs prometheus.Gauge
|
||||
}
|
||||
|
||||
// Health status strings that collect() compares against the "overall_status"
// field of the health section; each one maps to a value of the health_status gauge.
const (
|
||||
// CephHealthOK denotes the status of a ceph cluster when it is healthy (health_status 0).
|
||||
CephHealthOK = "HEALTH_OK"
|
||||
|
||||
// CephHealthWarn denotes the status of a ceph cluster when it is unhealthy but recovering (health_status 1).
|
||||
CephHealthWarn = "HEALTH_WARN"
|
||||
|
||||
// CephHealthErr denotes the status of a ceph cluster when it is unhealthy and usually needs
|
||||
// manual intervention (health_status 2; unrecognized statuses are also reported as 2).
|
||||
CephHealthErr = "HEALTH_ERR"
|
||||
)
|
||||
|
||||
// NewClusterHealthCollector creates a new instance of ClusterHealthCollector to collect health
|
||||
// metrics.
|
||||
func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
|
||||
return &ClusterHealthCollector{
|
||||
conn: conn,
|
||||
|
||||
HealthStatus: prometheus.NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: cephNamespace,
|
||||
Name: "health_status",
|
||||
Help: "Health status of Cluster, can vary only between 3 states (err:2, warn:1, ok:0)",
|
||||
},
|
||||
),
|
||||
DegradedPGs: prometheus.NewGauge(
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: cephNamespace,
|
||||
@ -147,6 +169,7 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
|
||||
|
||||
func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
|
||||
return []prometheus.Metric{
|
||||
c.HealthStatus,
|
||||
c.DegradedPGs,
|
||||
c.UncleanPGs,
|
||||
c.UndersizedPGs,
|
||||
@ -166,6 +189,7 @@ type cephHealthStats struct {
|
||||
Severity string `json:"severity"`
|
||||
Summary string `json:"summary"`
|
||||
} `json:"summary"`
|
||||
OverallStatus string `json:"overall_status"`
|
||||
} `json:"health"`
|
||||
OSDMap struct {
|
||||
OSDMap struct {
|
||||
@ -195,6 +219,17 @@ func (c *ClusterHealthCollector) collect() error {
|
||||
}
|
||||
}
|
||||
|
||||
switch stats.Health.OverallStatus {
|
||||
case CephHealthOK:
|
||||
c.HealthStatus.Set(0)
|
||||
case CephHealthWarn:
|
||||
c.HealthStatus.Set(1)
|
||||
case CephHealthErr:
|
||||
c.HealthStatus.Set(2)
|
||||
default:
|
||||
c.HealthStatus.Set(2)
|
||||
}
|
||||
|
||||
var (
|
||||
degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`)
|
||||
uncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
|
||||
|
@ -30,7 +30,7 @@ func TestClusterHealthCollector(t *testing.T) {
|
||||
regexes []*regexp.Regexp
|
||||
}{
|
||||
{
|
||||
`
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
@ -42,12 +42,12 @@ func TestClusterHealthCollector(t *testing.T) {
|
||||
},
|
||||
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "5 pgs degraded"}]}
|
||||
}`,
|
||||
[]*regexp.Regexp{
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`degraded_pgs 5`),
|
||||
},
|
||||
},
|
||||
{
|
||||
`
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
@ -59,12 +59,12 @@ func TestClusterHealthCollector(t *testing.T) {
|
||||
},
|
||||
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]}
|
||||
}`,
|
||||
[]*regexp.Regexp{
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`unclean_pgs 6`),
|
||||
},
|
||||
},
|
||||
{
|
||||
`
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
@ -76,12 +76,12 @@ func TestClusterHealthCollector(t *testing.T) {
|
||||
},
|
||||
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "7 pgs undersized"}]}
|
||||
}`,
|
||||
[]*regexp.Regexp{
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`undersized_pgs 7`),
|
||||
},
|
||||
},
|
||||
{
|
||||
`
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
@ -93,12 +93,12 @@ func TestClusterHealthCollector(t *testing.T) {
|
||||
},
|
||||
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "8 pgs stale"}]}
|
||||
}`,
|
||||
[]*regexp.Regexp{
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`stale_pgs 8`),
|
||||
},
|
||||
},
|
||||
{
|
||||
`
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
@ -110,12 +110,12 @@ func TestClusterHealthCollector(t *testing.T) {
|
||||
},
|
||||
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "recovery 10/20 objects degraded"}]}
|
||||
}`,
|
||||
[]*regexp.Regexp{
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`degraded_objects 10`),
|
||||
},
|
||||
},
|
||||
{
|
||||
`
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
@ -127,12 +127,12 @@ func TestClusterHealthCollector(t *testing.T) {
|
||||
},
|
||||
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]}
|
||||
}`,
|
||||
[]*regexp.Regexp{
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`osds_down 3`),
|
||||
},
|
||||
},
|
||||
{
|
||||
`
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
@ -144,13 +144,61 @@ func TestClusterHealthCollector(t *testing.T) {
|
||||
},
|
||||
"health": {"summary": []}
|
||||
}`,
|
||||
[]*regexp.Regexp{
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`osds 1200`),
|
||||
regexp.MustCompile(`osds_up 1200`),
|
||||
regexp.MustCompile(`osds_in 1190`),
|
||||
regexp.MustCompile(`pgs_remapped 10`),
|
||||
},
|
||||
},
|
||||
{
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
"num_osds": 1200,
|
||||
"num_up_osds": 1200,
|
||||
"num_in_osds": 1190,
|
||||
"num_remapped_pgs": 10
|
||||
}
|
||||
},
|
||||
"health": { "overall_status": "HEALTH_OK" } }`,
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`health_status 0`),
|
||||
},
|
||||
},
|
||||
{
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
"num_osds": 1200,
|
||||
"num_up_osds": 1200,
|
||||
"num_in_osds": 1190,
|
||||
"num_remapped_pgs": 10
|
||||
}
|
||||
},
|
||||
"health": { "overall_status": "HEALTH_WARN" } }`,
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`health_status 1`),
|
||||
},
|
||||
},
|
||||
{
|
||||
input: `
|
||||
{
|
||||
"osdmap": {
|
||||
"osdmap": {
|
||||
"num_osds": 1200,
|
||||
"num_up_osds": 1200,
|
||||
"num_in_osds": 1190,
|
||||
"num_remapped_pgs": 10
|
||||
}
|
||||
},
|
||||
"health": { "overall_status": "HEALTH_ERR" } }`,
|
||||
regexes: []*regexp.Regexp{
|
||||
regexp.MustCompile(`health_status 2`),
|
||||
},
|
||||
},
|
||||
} {
|
||||
func() {
|
||||
collector := NewClusterHealthCollector(NewNoopConn(tt.input))
|
||||
|
Loading…
Reference in New Issue
Block a user