health/pg_stats: stuck pg stats should be reported separately

This commit is contained in:
Vaibhav Bhembre 2016-02-08 15:48:17 +00:00
parent b974fe5d96
commit 17a2040a59
2 changed files with 159 additions and 2 deletions

View File

@ -37,18 +37,35 @@ type ClusterHealthCollector struct {
// missing.
DegradedPGs prometheus.Gauge
// StuckDegradedPGs shows the no. of PGs that have some of the replicas
// missing, and are stuck in that state.
StuckDegradedPGs prometheus.Gauge
// UncleanPGs shows the no. of PGs that do not have all objects in the PG
// that are supposed to be in it.
UncleanPGs prometheus.Gauge
// StuckUncleanPGs shows the no. of PGs that do not have all objects in the PG
// that are supposed to be in it, and are stuck in that state.
StuckUncleanPGs prometheus.Gauge
// UndersizedPGs depicts the no. of PGs that have fewer copies than configured
// replication level.
UndersizedPGs prometheus.Gauge
// StuckUndersizedPGs depicts the no. of PGs that have fewer copies than configured
// replication level, and are stuck in that state.
StuckUndersizedPGs prometheus.Gauge
// StalePGs depicts no. of PGs that are in an unknown state i.e. monitors do not know
// anything about their latest state since their pg mapping was modified.
StalePGs prometheus.Gauge
// StuckStalePGs depicts no. of PGs that are in an unknown state i.e. monitors do not know
// anything about their latest state since their pg mapping was modified, and are stuck
// in that state.
StuckStalePGs prometheus.Gauge
// DegradedObjectsCount gives the no. of RADOS objects that constitute the degraded PGs.
DegradedObjectsCount prometheus.Gauge
@ -101,6 +118,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
Help: "No. of PGs in a degraded state",
},
),
StuckDegradedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stuck_degraded_pgs",
Help: "No. of PGs stuck in a degraded state",
},
),
UncleanPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
@ -108,6 +132,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
Help: "No. of PGs in an unclean state",
},
),
StuckUncleanPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stuck_unclean_pgs",
Help: "No. of PGs stuck in an unclean state",
},
),
UndersizedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
@ -115,6 +146,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
Help: "No. of undersized PGs in the cluster",
},
),
StuckUndersizedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stuck_undersized_pgs",
Help: "No. of stuck undersized PGs in the cluster",
},
),
StalePGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
@ -122,6 +160,13 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
Help: "No. of stale PGs in the cluster",
},
),
StuckStalePGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "stuck_stale_pgs",
Help: "No. of stuck stale PGs in the cluster",
},
),
DegradedObjectsCount: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
@ -171,9 +216,13 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
return []prometheus.Metric{
c.HealthStatus,
c.DegradedPGs,
c.StuckDegradedPGs,
c.UncleanPGs,
c.StuckUncleanPGs,
c.UndersizedPGs,
c.StuckUndersizedPGs,
c.StalePGs,
c.StuckStalePGs,
c.DegradedObjectsCount,
c.OSDsDown,
c.OSDsUp,
@ -232,9 +281,13 @@ func (c *ClusterHealthCollector) collect() error {
var (
degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`)
uncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
stuckDegradedRegex = regexp.MustCompile(`([\d]+) pgs stuck degraded`)
uncleanRegex = regexp.MustCompile(`([\d]+) pgs unclean`)
stuckUncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
undersizedRegex = regexp.MustCompile(`([\d]+) pgs undersized`)
stuckUndersizedRegex = regexp.MustCompile(`([\d]+) pgs stuck undersized`)
staleRegex = regexp.MustCompile(`([\d]+) pgs stale`)
stuckStaleRegex = regexp.MustCompile(`([\d]+) pgs stuck stale`)
degradedObjectsRegex = regexp.MustCompile(`recovery ([\d]+)/([\d]+) objects degraded`)
osdsDownRegex = regexp.MustCompile(`([\d]+)/([\d]+) in osds are down`)
)
@ -249,6 +302,15 @@ func (c *ClusterHealthCollector) collect() error {
c.DegradedPGs.Set(float64(v))
}
matched = stuckDegradedRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.StuckDegradedPGs.Set(float64(v))
}
matched = uncleanRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
@ -258,6 +320,15 @@ func (c *ClusterHealthCollector) collect() error {
c.UncleanPGs.Set(float64(v))
}
matched = stuckUncleanRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.StuckUncleanPGs.Set(float64(v))
}
matched = undersizedRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
@ -267,6 +338,15 @@ func (c *ClusterHealthCollector) collect() error {
c.UndersizedPGs.Set(float64(v))
}
matched = stuckUndersizedRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.StuckUndersizedPGs.Set(float64(v))
}
matched = staleRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
@ -276,6 +356,15 @@ func (c *ClusterHealthCollector) collect() error {
c.StalePGs.Set(float64(v))
}
matched = stuckStaleRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
if err != nil {
return err
}
c.StuckStalePGs.Set(float64(v))
}
matched = degradedObjectsRegex.FindStringSubmatch(s.Summary)
if len(matched) == 3 {
v, err := strconv.Atoi(matched[1])

View File

@ -57,7 +57,24 @@ func TestClusterHealthCollector(t *testing.T) {
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]}
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "15 pgs stuck degraded"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`stuck_degraded_pgs 15`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs unclean"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`unclean_pgs 6`),
@ -65,6 +82,23 @@ func TestClusterHealthCollector(t *testing.T) {
},
{
input: `
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "16 pgs stuck unclean"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`stuck_unclean_pgs 16`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
@ -82,6 +116,23 @@ func TestClusterHealthCollector(t *testing.T) {
},
{
input: `
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "17 pgs stuck undersized"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`stuck_undersized_pgs 17`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {
@ -99,6 +150,23 @@ func TestClusterHealthCollector(t *testing.T) {
},
{
input: `
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "18 pgs stuck stale"}]}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`stuck_stale_pgs 18`),
},
},
{
input: `
{
"osdmap": {
"osdmap": {