mirror of
https://github.com/digitalocean/ceph_exporter
synced 2025-03-01 17:50:23 +00:00
health/pg_stats: stuck pg stats should be reported separately
This commit is contained in:
parent
b974fe5d96
commit
17a2040a59
@ -37,18 +37,35 @@ type ClusterHealthCollector struct {
|
|||||||
// missing.
|
// missing.
|
||||||
DegradedPGs prometheus.Gauge
|
DegradedPGs prometheus.Gauge
|
||||||
|
|
||||||
|
// StuckDegradedPGs shows the no. of PGs that have some of the replicas
|
||||||
|
// missing, and are stuck in that state.
|
||||||
|
StuckDegradedPGs prometheus.Gauge
|
||||||
|
|
||||||
// UncleanPGs shows the no. of PGs that do not have all objects in the PG
|
// UncleanPGs shows the no. of PGs that do not have all objects in the PG
|
||||||
// that are supposed to be in it.
|
// that are supposed to be in it.
|
||||||
UncleanPGs prometheus.Gauge
|
UncleanPGs prometheus.Gauge
|
||||||
|
|
||||||
|
// StuckUncleanPGs shows the no. of PGs that do not have all objects in the PG
|
||||||
|
// that are supposed to be in it, and are stuck in that state.
|
||||||
|
StuckUncleanPGs prometheus.Gauge
|
||||||
|
|
||||||
// UndersizedPGs depicts the no. of PGs that have fewer copies than configured
|
// UndersizedPGs depicts the no. of PGs that have fewer copies than configured
|
||||||
// replication level.
|
// replication level.
|
||||||
UndersizedPGs prometheus.Gauge
|
UndersizedPGs prometheus.Gauge
|
||||||
|
|
||||||
|
// StuckUndersizedPGs depicts the no. of PGs that have fewer copies than configured
|
||||||
|
// replication level, and are stuck in that state.
|
||||||
|
StuckUndersizedPGs prometheus.Gauge
|
||||||
|
|
||||||
// StalePGs depicts no. of PGs that are in an unknown state i.e. monitors do not know
|
// StalePGs depicts no. of PGs that are in an unknown state i.e. monitors do not know
|
||||||
// anything about their latest state since their pg mapping was modified.
|
// anything about their latest state since their pg mapping was modified.
|
||||||
StalePGs prometheus.Gauge
|
StalePGs prometheus.Gauge
|
||||||
|
|
||||||
|
// StuckStalePGs depicts no. of PGs that are in an unknown state i.e. monitors do not know
|
||||||
|
// anything about their latest state since their pg mapping was modified, and are stuck
|
||||||
|
// in that state.
|
||||||
|
StuckStalePGs prometheus.Gauge
|
||||||
|
|
||||||
// DegradedObjectsCount gives the no. of RADOS objects that constitute the degraded PGs.
|
// DegradedObjectsCount gives the no. of RADOS objects that constitute the degraded PGs.
|
||||||
DegradedObjectsCount prometheus.Gauge
|
DegradedObjectsCount prometheus.Gauge
|
||||||
|
|
||||||
@ -101,6 +118,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
|
|||||||
Help: "No. of PGs in a degraded state",
|
Help: "No. of PGs in a degraded state",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
|
StuckDegradedPGs: prometheus.NewGauge(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: cephNamespace,
|
||||||
|
Name: "stuck_degraded_pgs",
|
||||||
|
Help: "No. of PGs stuck in a degraded state",
|
||||||
|
},
|
||||||
|
),
|
||||||
UncleanPGs: prometheus.NewGauge(
|
UncleanPGs: prometheus.NewGauge(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Namespace: cephNamespace,
|
Namespace: cephNamespace,
|
||||||
@ -108,6 +132,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
|
|||||||
Help: "No. of PGs in an unclean state",
|
Help: "No. of PGs in an unclean state",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
|
StuckUncleanPGs: prometheus.NewGauge(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: cephNamespace,
|
||||||
|
Name: "stuck_unclean_pgs",
|
||||||
|
Help: "No. of PGs stuck in an unclean state",
|
||||||
|
},
|
||||||
|
),
|
||||||
UndersizedPGs: prometheus.NewGauge(
|
UndersizedPGs: prometheus.NewGauge(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Namespace: cephNamespace,
|
Namespace: cephNamespace,
|
||||||
@ -115,6 +146,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
|
|||||||
Help: "No. of undersized PGs in the cluster",
|
Help: "No. of undersized PGs in the cluster",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
|
StuckUndersizedPGs: prometheus.NewGauge(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: cephNamespace,
|
||||||
|
Name: "stuck_undersized_pgs",
|
||||||
|
Help: "No. of stuck undersized PGs in the cluster",
|
||||||
|
},
|
||||||
|
),
|
||||||
StalePGs: prometheus.NewGauge(
|
StalePGs: prometheus.NewGauge(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Namespace: cephNamespace,
|
Namespace: cephNamespace,
|
||||||
@ -122,6 +160,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
|
|||||||
Help: "No. of stale PGs in the cluster",
|
Help: "No. of stale PGs in the cluster",
|
||||||
},
|
},
|
||||||
),
|
),
|
||||||
|
StuckStalePGs: prometheus.NewGauge(
|
||||||
|
prometheus.GaugeOpts{
|
||||||
|
Namespace: cephNamespace,
|
||||||
|
Name: "stuck_stale_pgs",
|
||||||
|
Help: "No. of stuck stale PGs in the cluster",
|
||||||
|
},
|
||||||
|
),
|
||||||
DegradedObjectsCount: prometheus.NewGauge(
|
DegradedObjectsCount: prometheus.NewGauge(
|
||||||
prometheus.GaugeOpts{
|
prometheus.GaugeOpts{
|
||||||
Namespace: cephNamespace,
|
Namespace: cephNamespace,
|
||||||
@ -171,9 +216,13 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
|
|||||||
return []prometheus.Metric{
|
return []prometheus.Metric{
|
||||||
c.HealthStatus,
|
c.HealthStatus,
|
||||||
c.DegradedPGs,
|
c.DegradedPGs,
|
||||||
|
c.StuckDegradedPGs,
|
||||||
c.UncleanPGs,
|
c.UncleanPGs,
|
||||||
|
c.StuckUncleanPGs,
|
||||||
c.UndersizedPGs,
|
c.UndersizedPGs,
|
||||||
|
c.StuckUndersizedPGs,
|
||||||
c.StalePGs,
|
c.StalePGs,
|
||||||
|
c.StuckStalePGs,
|
||||||
c.DegradedObjectsCount,
|
c.DegradedObjectsCount,
|
||||||
c.OSDsDown,
|
c.OSDsDown,
|
||||||
c.OSDsUp,
|
c.OSDsUp,
|
||||||
@ -232,9 +281,13 @@ func (c *ClusterHealthCollector) collect() error {
|
|||||||
|
|
||||||
var (
|
var (
|
||||||
degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`)
|
degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`)
|
||||||
uncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
|
stuckDegradedRegex = regexp.MustCompile(`([\d]+) pgs stuck degraded`)
|
||||||
|
uncleanRegex = regexp.MustCompile(`([\d]+) pgs unclean`)
|
||||||
|
stuckUncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
|
||||||
undersizedRegex = regexp.MustCompile(`([\d]+) pgs undersized`)
|
undersizedRegex = regexp.MustCompile(`([\d]+) pgs undersized`)
|
||||||
|
stuckUndersizedRegex = regexp.MustCompile(`([\d]+) pgs stuck undersized`)
|
||||||
staleRegex = regexp.MustCompile(`([\d]+) pgs stale`)
|
staleRegex = regexp.MustCompile(`([\d]+) pgs stale`)
|
||||||
|
stuckStaleRegex = regexp.MustCompile(`([\d]+) pgs stuck stale`)
|
||||||
degradedObjectsRegex = regexp.MustCompile(`recovery ([\d]+)/([\d]+) objects degraded`)
|
degradedObjectsRegex = regexp.MustCompile(`recovery ([\d]+)/([\d]+) objects degraded`)
|
||||||
osdsDownRegex = regexp.MustCompile(`([\d]+)/([\d]+) in osds are down`)
|
osdsDownRegex = regexp.MustCompile(`([\d]+)/([\d]+) in osds are down`)
|
||||||
)
|
)
|
||||||
@ -249,6 +302,15 @@ func (c *ClusterHealthCollector) collect() error {
|
|||||||
c.DegradedPGs.Set(float64(v))
|
c.DegradedPGs.Set(float64(v))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
matched = stuckDegradedRegex.FindStringSubmatch(s.Summary)
|
||||||
|
if len(matched) == 2 {
|
||||||
|
v, err := strconv.Atoi(matched[1])
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
c.StuckDegradedPGs.Set(float64(v))
|
||||||
|
}
|
||||||
|
|
||||||
matched = uncleanRegex.FindStringSubmatch(s.Summary)
|
matched = uncleanRegex.FindStringSubmatch(s.Summary)
|
||||||
if len(matched) == 2 {
|
if len(matched) == 2 {
|
||||||
v, err := strconv.Atoi(matched[1])
|
v, err := strconv.Atoi(matched[1])
|
||||||
@ -258,6 +320,15 @@ func (c *ClusterHealthCollector) collect() error {
|
|||||||
c.UncleanPGs.Set(float64(v))
|
c.UncleanPGs.Set(float64(v))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
matched = stuckUncleanRegex.FindStringSubmatch(s.Summary)
|
||||||
|
if len(matched) == 2 {
|
||||||
|
v, err := strconv.Atoi(matched[1])
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
c.StuckUncleanPGs.Set(float64(v))
|
||||||
|
}
|
||||||
|
|
||||||
matched = undersizedRegex.FindStringSubmatch(s.Summary)
|
matched = undersizedRegex.FindStringSubmatch(s.Summary)
|
||||||
if len(matched) == 2 {
|
if len(matched) == 2 {
|
||||||
v, err := strconv.Atoi(matched[1])
|
v, err := strconv.Atoi(matched[1])
|
||||||
@ -267,6 +338,15 @@ func (c *ClusterHealthCollector) collect() error {
|
|||||||
c.UndersizedPGs.Set(float64(v))
|
c.UndersizedPGs.Set(float64(v))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
matched = stuckUndersizedRegex.FindStringSubmatch(s.Summary)
|
||||||
|
if len(matched) == 2 {
|
||||||
|
v, err := strconv.Atoi(matched[1])
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
c.StuckUndersizedPGs.Set(float64(v))
|
||||||
|
}
|
||||||
|
|
||||||
matched = staleRegex.FindStringSubmatch(s.Summary)
|
matched = staleRegex.FindStringSubmatch(s.Summary)
|
||||||
if len(matched) == 2 {
|
if len(matched) == 2 {
|
||||||
v, err := strconv.Atoi(matched[1])
|
v, err := strconv.Atoi(matched[1])
|
||||||
@ -276,6 +356,15 @@ func (c *ClusterHealthCollector) collect() error {
|
|||||||
c.StalePGs.Set(float64(v))
|
c.StalePGs.Set(float64(v))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
matched = stuckStaleRegex.FindStringSubmatch(s.Summary)
|
||||||
|
if len(matched) == 2 {
|
||||||
|
v, err := strconv.Atoi(matched[1])
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
c.StuckStalePGs.Set(float64(v))
|
||||||
|
}
|
||||||
|
|
||||||
matched = degradedObjectsRegex.FindStringSubmatch(s.Summary)
|
matched = degradedObjectsRegex.FindStringSubmatch(s.Summary)
|
||||||
if len(matched) == 3 {
|
if len(matched) == 3 {
|
||||||
v, err := strconv.Atoi(matched[1])
|
v, err := strconv.Atoi(matched[1])
|
||||||
|
@ -57,7 +57,24 @@ func TestClusterHealthCollector(t *testing.T) {
|
|||||||
"num_remapped_pgs": 0
|
"num_remapped_pgs": 0
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]}
|
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "15 pgs stuck degraded"}]}
|
||||||
|
}`,
|
||||||
|
regexes: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`stuck_degraded_pgs 15`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: `
|
||||||
|
{
|
||||||
|
"osdmap": {
|
||||||
|
"osdmap": {
|
||||||
|
"num_osds": 0,
|
||||||
|
"num_up_osds": 0,
|
||||||
|
"num_in_osds": 0,
|
||||||
|
"num_remapped_pgs": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs unclean"}]}
|
||||||
}`,
|
}`,
|
||||||
regexes: []*regexp.Regexp{
|
regexes: []*regexp.Regexp{
|
||||||
regexp.MustCompile(`unclean_pgs 6`),
|
regexp.MustCompile(`unclean_pgs 6`),
|
||||||
@ -65,6 +82,23 @@ func TestClusterHealthCollector(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
input: `
|
input: `
|
||||||
|
{
|
||||||
|
"osdmap": {
|
||||||
|
"osdmap": {
|
||||||
|
"num_osds": 0,
|
||||||
|
"num_up_osds": 0,
|
||||||
|
"num_in_osds": 0,
|
||||||
|
"num_remapped_pgs": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "16 pgs stuck unclean"}]}
|
||||||
|
}`,
|
||||||
|
regexes: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`stuck_unclean_pgs 16`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: `
|
||||||
{
|
{
|
||||||
"osdmap": {
|
"osdmap": {
|
||||||
"osdmap": {
|
"osdmap": {
|
||||||
@ -82,6 +116,23 @@ func TestClusterHealthCollector(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
input: `
|
input: `
|
||||||
|
{
|
||||||
|
"osdmap": {
|
||||||
|
"osdmap": {
|
||||||
|
"num_osds": 0,
|
||||||
|
"num_up_osds": 0,
|
||||||
|
"num_in_osds": 0,
|
||||||
|
"num_remapped_pgs": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "17 pgs stuck undersized"}]}
|
||||||
|
}`,
|
||||||
|
regexes: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`stuck_undersized_pgs 17`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: `
|
||||||
{
|
{
|
||||||
"osdmap": {
|
"osdmap": {
|
||||||
"osdmap": {
|
"osdmap": {
|
||||||
@ -99,6 +150,23 @@ func TestClusterHealthCollector(t *testing.T) {
|
|||||||
},
|
},
|
||||||
{
|
{
|
||||||
input: `
|
input: `
|
||||||
|
{
|
||||||
|
"osdmap": {
|
||||||
|
"osdmap": {
|
||||||
|
"num_osds": 0,
|
||||||
|
"num_up_osds": 0,
|
||||||
|
"num_in_osds": 0,
|
||||||
|
"num_remapped_pgs": 0
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "18 pgs stuck stale"}]}
|
||||||
|
}`,
|
||||||
|
regexes: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`stuck_stale_pgs 18`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
input: `
|
||||||
{
|
{
|
||||||
"osdmap": {
|
"osdmap": {
|
||||||
"osdmap": {
|
"osdmap": {
|
||||||
|
Loading…
Reference in New Issue
Block a user