osd: add metrics for down and destroyed OSD

Vaibhav Bhembre 2018-06-24 13:16:09 -04:00
parent 9a39cc64ed
commit aa5abdc470
2 changed files with 247 additions and 4 deletions


@@ -6,8 +6,9 @@ import (
"fmt"
"log"
"github.com/prometheus/client_golang/prometheus"
"strings"
"github.com/prometheus/client_golang/prometheus"
)
const (
@@ -78,6 +79,9 @@ type OSDCollector struct {
// OSDUp displays the Up state of the OSD
OSDUp *prometheus.GaugeVec
// OSDDownDesc displays OSDs present in the cluster in "down" state
OSDDownDesc *prometheus.Desc
// TotalBytes displays total bytes in all OSDs
TotalBytes prometheus.Gauge
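
Note the asymmetry with the gauges above: OSDDownDesc is a bare *prometheus.Desc rather than a GaugeVec. Down and destroyed OSDs change from scrape to scrape, so the collector builds const metrics from this descriptor on every scrape (see collectOSDTreeDown below) instead of mutating a long-lived vector, which would otherwise keep series for since-recovered OSDs alive.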
@@ -269,6 +273,12 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
},
[]string{"osd"},
),
OSDDownDesc: prometheus.NewDesc(
fmt.Sprintf("%s_osd_down", cephNamespace),
"No. of OSDs down in the cluster",
[]string{"osd", "status"},
labels,
),
ScrubbingStateDesc: prometheus.NewDesc(
fmt.Sprintf("%s_osd_scrub_state", cephNamespace),
"State of OSDs involved in a scrub",
@@ -340,7 +350,22 @@ type cephOSDDump struct {
} `json:"osds"`
}
type cephOSDTreeDown struct {
Nodes []struct {
ID int64 `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Status string `json:"status"`
} `json:"nodes"`
Stray []struct {
ID int64 `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Status string `json:"status"`
} `json:"stray"`
}
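
As a minimal, self-contained sketch (not part of this commit), this is how the struct decodes a trimmed `ceph osd tree down` reply; keys the struct does not declare, such as crush_weight, are silently ignored by encoding/json:

package main

import (
	"encoding/json"
	"fmt"
)

// osdTreeDown mirrors cephOSDTreeDown above, trimmed to the
// fields the collector actually reads.
type osdTreeDown struct {
	Stray []struct {
		Name   string `json:"name"`
		Type   string `json:"type"`
		Status string `json:"status"`
	} `json:"stray"`
}

func main() {
	buf := []byte(`{"nodes":[],"stray":[{"id":524,"name":"osd.524","type":"osd","status":"destroyed"}]}`)
	var tree osdTreeDown
	if err := json.Unmarshal(buf, &tree); err != nil {
		panic(err)
	}
	fmt.Println(tree.Stray[0].Name, tree.Stray[0].Status) // prints: osd.524 destroyed
}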
func (o *OSDCollector) collectOSDDF() error {
cmd := o.cephOSDDFCommand()
buf, _, err := o.conn.MonCommand(cmd)
@@ -493,6 +518,34 @@ func (o *OSDCollector) collectOSDPerf() error {
return nil
}
func (o *OSDCollector) collectOSDTreeDown(ch chan<- prometheus.Metric) error {
osdDownCmd := o.cephOSDTreeCommand("down")
buff, _, err := o.conn.MonCommand(osdDownCmd)
if err != nil {
log.Println("[ERROR] Unable to collect data from ceph osd tree down", err)
return err
}
osdDown := &cephOSDTreeDown{}
if err := json.Unmarshal(buff, osdDown); err != nil {
return err
}
downItems := append(osdDown.Nodes, osdDown.Stray...)
for _, downItem := range downItems {
if downItem.Type != "osd" {
continue
}
osdName := downItem.Name
ch <- prometheus.MustNewConstMetric(o.OSDDownDesc, prometheus.GaugeValue, 1, osdName, downItem.Status)
}
return nil
}
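
Each down or destroyed OSD thus becomes one ceph_osd_down sample per scrape, value fixed at 1, with the OSD's state carried in the status label. With cluster="ceph", the scrape output looks like:

ceph_osd_down{cluster="ceph",osd="osd.524",status="destroyed"} 1
ceph_osd_down{cluster="ceph",osd="osd.525",status="down"} 1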
func (o *OSDCollector) collectOSDDump() error {
osdDumpCmd := o.cephOSDDump()
buff, _, err := o.conn.MonCommand(osdDumpCmd)
@@ -541,7 +594,6 @@ func (o *OSDCollector) collectOSDScrubState(ch chan<- prometheus.Metric) error {
stats := cephPGDumpBriefResponse{}
if err := json.Unmarshal(buf, &stats); err != nil {
log.Println("Unmarshal:", string(buf[:100]))
return err
}
@@ -610,6 +662,18 @@ func (o *OSDCollector) cephOSDPerfCommand() []byte {
return cmd
}
func (o *OSDCollector) cephOSDTreeCommand(states ...string) []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "osd tree",
"states": states,
"format": "json",
})
if err != nil {
panic(err)
}
return cmd
}
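
Since encoding/json marshals map keys in sorted order, the payload handed to MonCommand for states = ["down"] is exactly:

{"format":"json","prefix":"osd tree","states":["down"]}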
func (o *OSDCollector) cephPGDumpCommand() []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "pg dump",
@@ -644,7 +708,11 @@ func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) {
log.Println("failed collecting osd dump:", err)
}
if err := o.collectOSDDF(); err != nil {
log.Println("failed collecting osd metrics:", err)
}
if err := o.collectOSDTreeDown(ch); err != nil {
log.Println("failed collecting osd metrics:", err)
}
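
For context, a hedged sketch of how the collector ends up on a /metrics endpoint; conn and the :9128 listen address are assumptions standing in for the exporter's real wiring:

// Hedged sketch, assuming this runs where NewOSDCollector and a
// working Conn (here: conn) are in scope. Assumed imports: log,
// net/http, github.com/prometheus/client_golang/prometheus, and
// github.com/prometheus/client_golang/prometheus/promhttp.
reg := prometheus.NewRegistry()
reg.MustRegister(NewOSDCollector(conn, "ceph"))
http.Handle("/metrics", promhttp.HandlerFor(reg, promhttp.HandlerOpts{}))
log.Fatal(http.ListenAndServe(":9128", nil)) // listen address is an assumption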


@@ -310,6 +310,181 @@ func TestOSDCollector(t *testing.T) {
regexp.MustCompile(`ceph_osd_scrub_state{cluster="ceph",osd="osd.23"} 2`),
},
},
{
input: `
{
"nodes": [],
"stray": [
{
"id": 524,
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 0.000000,
"depth": 0,
"exists": 1,
"status": "destroyed",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
]
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.524",status="destroyed"} 1`),
},
},
{
input: `
{
"nodes": [],
"stray": [
{
"id": 524,
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 0.000000,
"depth": 0,
"exists": 1,
"status": "down",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
]
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.524",status="down"} 1`),
},
},
{
input: `
{
"nodes": [
{
"id": -18,
"name": "data",
"type": "root",
"type_id": 10,
"children": [
-20
]
},
{
"id": -20,
"name": "R1-data",
"type": "rack",
"type_id": 3,
"pool_weights": {},
"children": [
-8
]
},
{
"id": -8,
"name": "test-data03-object01",
"type": "host",
"type_id": 1,
"pool_weights": {},
"children": [
97
]
},
{
"id": 524,
"device_class": "hdd",
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 7.265991,
"depth": 3,
"pool_weights": {},
"exists": 1,
"status": "destroyed",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
],
"stray": []
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.524",status="destroyed"} 1`),
},
},
{
input: `
{
"nodes": [
{
"id": -18,
"name": "data",
"type": "root",
"type_id": 10,
"children": [
-20
]
},
{
"id": -20,
"name": "R1-data",
"type": "rack",
"type_id": 3,
"pool_weights": {},
"children": [
-8
]
},
{
"id": -8,
"name": "test-data03-object01",
"type": "host",
"type_id": 1,
"pool_weights": {},
"children": [
97
]
},
{
"id": 524,
"device_class": "hdd",
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 7.265991,
"depth": 3,
"pool_weights": {},
"exists": 1,
"status": "destroyed",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
],
"stray": [
{
"id": 525,
"name": "osd.525",
"type": "osd",
"type_id": 0,
"crush_weight": 0.000000,
"depth": 0,
"exists": 1,
"status": "down",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
]
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.524",status="destroyed"} 1`),
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.525",status="down"} 1`),
},
},
{
input: `
{
"nodes": []}}
}`,
regexes: []*regexp.Regexp{},
},
} {
func() {
collector := NewOSDCollector(NewNoopConn(tt.input), "ceph")
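
The last table case above feeds input that is not valid JSON (note the doubled closing braces), so json.Unmarshal fails, collectOSDTreeDown returns the error, and no ceph_osd_down series are emitted, which is why its expected-regex list is empty.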