Merge pull request #4 from digitalocean/add_osd_stats

Adds new OSD metrics
This commit is contained in:
Vaibhav Bhembre 2016-01-08 16:50:43 -05:00
commit e5567a13d9
2 changed files with 195 additions and 22 deletions
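In effect, the collector stops parsing "ceph health detail" and instead consumes the "ceph status" JSON document, whose nested osdmap section carries the OSD counts (total, up, in) and the remapped-PG count. Assuming the exporter's metric namespace constant cephNamespace is "ceph" (its definition is not part of this diff), the new gauges would surface in the exposition roughly as:

    ceph_osds 1200
    ceph_osds_up 1200
    ceph_osds_in 1190
    ceph_pgs_remapped 10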

@@ -27,16 +27,43 @@ import (
// It surfaces changes in the ceph parameters, unlike the data usage that
// ClusterUsageCollector reports.
type ClusterHealthCollector struct {
// conn holds the connection to the Ceph cluster
conn Conn
DegradedPGs prometheus.Gauge
UncleanPGs prometheus.Gauge
UndersizedPGs prometheus.Gauge
StalePGs prometheus.Gauge
// DegradedPGs shows the no. of PGs that have some of the replicas
// missing.
DegradedPGs prometheus.Gauge
// UncleanPGs shows the no. of PGs that do not yet hold all of the objects
// they are supposed to contain.
UncleanPGs prometheus.Gauge
// UndersizedPGs depicts the no. of PGs that have fewer copies than the
// configured replication level.
UndersizedPGs prometheus.Gauge
// StalePGs depicts the no. of PGs that are in an unknown state, i.e. the
// monitors do not know anything about their latest state since their PG
// mapping was modified.
StalePGs prometheus.Gauge
// DegradedObjectsCount gives the no. of RADOS objects that constitute the degraded PGs.
DegradedObjectsCount prometheus.Gauge
// OSDsDown shows the no. of OSDs that are in the DOWN state.
OSDsDown prometheus.Gauge
// OSDsUp shows the no. of OSDs that are in the UP state and are able to serve requests.
OSDsUp prometheus.Gauge
// OSDsIn shows the no. of OSDs that are marked as IN in the cluster.
OSDsIn prometheus.Gauge
// OSDsNum shows the total no. of OSDs the cluster has.
OSDsNum prometheus.Gauge
// RemappedPGs shows the no. of PGs that are currently remapped and need to
// be moved over to newer OSDs.
RemappedPGs prometheus.Gauge
}
// NewClusterHealthCollector creates a new instance of ClusterHealthCollector to collect health
@@ -87,6 +114,34 @@ func NewClusterHealthCollector(conn Conn) *ClusterHealthCollector {
Help: "Count of OSDs that are in DOWN state",
},
),
OSDsUp: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osds_up",
Help: "Count of OSDs that are in UP state",
},
),
OSDsIn: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osds_in",
Help: "Count of OSDs that are in IN state and available to serve requests",
},
),
OSDsNum: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osds",
Help: "Count of total OSDs in the cluster",
},
),
RemappedPGs: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "pgs_remapped",
Help: "No. of PGs that are remapped and incurring cluster-wide movement",
},
),
}
}
@@ -98,14 +153,28 @@ func (c *ClusterHealthCollector) metricsList() []prometheus.Metric {
c.StalePGs,
c.DegradedObjectsCount,
c.OSDsDown,
c.OSDsUp,
c.OSDsIn,
c.OSDsNum,
c.RemappedPGs,
}
}
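metricsList is what ties these gauges into the Prometheus collector interface. The Describe and Collect methods themselves fall outside this diff; a minimal sketch of how they are typically wired over such a list, assuming the standard collector pattern:

    // Sketch only: the repository's actual Describe/Collect may differ.
    func (c *ClusterHealthCollector) Describe(ch chan<- *prometheus.Desc) {
        for _, metric := range c.metricsList() {
            ch <- metric.Desc()
        }
    }

    func (c *ClusterHealthCollector) Collect(ch chan<- prometheus.Metric) {
        if err := c.collect(); err != nil {
            return // skip this scrape's metrics if the cluster query failed
        }
        for _, metric := range c.metricsList() {
            ch <- metric
        }
    }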
type cephHealthStats struct {
Summary []struct {
Severity string `json:"severity"`
Summary string `json:"summary"`
} `json:"summary"`
Health struct {
Summary []struct {
Severity string `json:"severity"`
Summary string `json:"summary"`
} `json:"summary"`
} `json:"health"`
OSDMap struct {
OSDMap struct {
NumOSDs json.Number `json:"num_osds"`
NumUpOSDs json.Number `json:"num_up_osds"`
NumInOSDs json.Number `json:"num_in_osds"`
NumRemappedPGs json.Number `json:"num_remapped_pgs"`
} `json:"osdmap"`
} `json:"osdmap"`
}
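Using json.Number instead of a concrete numeric type keeps the raw token from the decoder and defers conversion until collect calls Float64, sidestepping any int-versus-float mismatch in what Ceph emits. A standalone sketch of the nested decode, trimmed to one field:

    // Illustrative decode of the doubly nested "osdmap" document.
    var stats cephHealthStats
    doc := []byte(`{"osdmap": {"osdmap": {"num_osds": 1200}}}`)
    if err := json.Unmarshal(doc, &stats); err != nil {
        log.Fatal(err)
    }
    n, _ := stats.OSDMap.OSDMap.NumOSDs.Float64() // n == 1200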
func (c *ClusterHealthCollector) collect() error {
@@ -126,10 +195,6 @@ func (c *ClusterHealthCollector) collect() error {
}
}
if len(stats.Summary) < 1 {
return nil
}
var (
degradedRegex = regexp.MustCompile(`([\d]+) pgs degraded`)
uncleanRegex = regexp.MustCompile(`([\d]+) pgs stuck unclean`)
@@ -139,7 +204,7 @@ func (c *ClusterHealthCollector) collect() error {
osdsDownRegex = regexp.MustCompile(`([\d]+)/([\d]+) in osds are down`)
)
for _, s := range stats.Summary {
for _, s := range stats.Health.Summary {
matched := degradedRegex.FindStringSubmatch(s.Summary)
if len(matched) == 2 {
v, err := strconv.Atoi(matched[1])
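FindStringSubmatch returns the whole match followed by the single captured group, so a successful match always has length 2. For example, with illustrative values:

    matched := degradedRegex.FindStringSubmatch("5 pgs degraded")
    // matched == []string{"5 pgs degraded", "5"}
    v, _ := strconv.Atoi(matched[1]) // v == 5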
@@ -193,16 +258,38 @@ func (c *ClusterHealthCollector) collect() error {
}
c.OSDsDown.Set(float64(v))
}
}
osdsUp, err := stats.OSDMap.OSDMap.NumUpOSDs.Float64()
if err != nil {
return err
}
c.OSDsUp.Set(osdsUp)
osdsIn, err := stats.OSDMap.OSDMap.NumInOSDs.Float64()
if err != nil {
return err
}
c.OSDsIn.Set(osdsIn)
osdsNum, err := stats.OSDMap.OSDMap.NumOSDs.Float64()
if err != nil {
return err
}
c.OSDsNum.Set(osdsNum)
remappedPGs, err := stats.OSDMap.OSDMap.NumRemappedPGs.Float64()
if err != nil {
return err
}
c.RemappedPGs.Set(remappedPGs)
return nil
}
func (c *ClusterHealthCollector) cephUsageCommand() []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "health",
"detail": "detail",
"prefix": "status",
"format": "json",
})
if err != nil {
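The prefix change means the exporter now issues the equivalent of "ceph status --format json" as a mon command rather than "ceph health detail". Since encoding/json marshals map keys in sorted order, the payload sent to the monitors is:

    {"format":"json","prefix":"status"}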


@@ -30,41 +30,127 @@ func TestClusterHealthCollector(t *testing.T) {
regexes []*regexp.Regexp
}{
{
`{"summary": [{"severity": "HEALTH_WARN", "summary": "5 pgs degraded"}]}`,
`
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "5 pgs degraded"}]}
}`,
[]*regexp.Regexp{
regexp.MustCompile(`degraded_pgs 5`),
},
},
{
`{"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]}`,
`
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "6 pgs stuck unclean"}]}
}`,
[]*regexp.Regexp{
regexp.MustCompile(`unclean_pgs 6`),
},
},
{
`{"summary": [{"severity": "HEALTH_WARN", "summary": "7 pgs undersized"}]}`,
`
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "7 pgs undersized"}]}
}`,
[]*regexp.Regexp{
regexp.MustCompile(`undersized_pgs 7`),
},
},
{
`{"summary": [{"severity": "HEALTH_WARN", "summary": "8 pgs stale"}]}`,
`
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "8 pgs stale"}]}
}`,
[]*regexp.Regexp{
regexp.MustCompile(`stale_pgs 8`),
},
},
{
`{"summary": [{"severity": "HEALTH_WARN", "summary": "recovery 10/20 objects degraded"}]}`,
`
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "recovery 10/20 objects degraded"}]}
}`,
[]*regexp.Regexp{
regexp.MustCompile(`degraded_objects 10`),
},
},
{
`{"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]}`,
`
{
"osdmap": {
"osdmap": {
"num_osds": 0,
"num_up_osds": 0,
"num_in_osds": 0,
"num_remapped_pgs": 0
}
},
"health": {"summary": [{"severity": "HEALTH_WARN", "summary": "3/20 in osds are down"}]}
}`,
[]*regexp.Regexp{
regexp.MustCompile(`osds_down 3`),
},
},
{
`
{
"osdmap": {
"osdmap": {
"num_osds": 1200,
"num_up_osds": 1200,
"num_in_osds": 1190,
"num_remapped_pgs": 10
}
},
"health": {"summary": []}
}`,
[]*regexp.Regexp{
regexp.MustCompile(`osds 1200`),
regexp.MustCompile(`osds_up 1200`),
regexp.MustCompile(`osds_in 1190`),
regexp.MustCompile(`pgs_remapped 10`),
},
},
} {
func() {
collector := NewClusterHealthCollector(NewNoopConn(tt.input))
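Each table entry pipes its canned JSON through NewNoopConn, so the collector exercises the same parsing path it would against a live cluster. The test double itself is not part of this diff; a minimal sketch of what it plausibly looks like, assuming Conn mirrors go-ceph's MonCommand signature:

    // Hypothetical sketch; the repository's actual NoopConn may differ.
    type NoopConn struct{ output string }

    func NewNoopConn(output string) *NoopConn {
        return &NoopConn{output: output}
    }

    // MonCommand ignores the requested command and returns the canned JSON,
    // just as a real mon would return the status document.
    func (n *NoopConn) MonCommand(_ []byte) ([]byte, string, error) {
        return []byte(n.output), "", nil
    }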