ceph_exporter/collectors/osd.go

519 lines
11 KiB
Go
Raw Normal View History

2016-07-14 05:01:19 +00:00
package collectors
import (
"encoding/json"
"fmt"
"log"
"github.com/prometheus/client_golang/prometheus"
)
//OsdCollector sample comment
2016-07-14 05:01:19 +00:00
type OsdCollector struct {
conn Conn
//CrushWeight is a persistent setting, and it affects how CRUSH assigns data to OSDs.
//It displays the CRUSH weight for the OSD
2016-07-14 05:01:19 +00:00
CrushWeight *prometheus.GaugeVec
//Depth displays the OSD's level of hierarchy in the CRUSH map
2016-07-14 05:01:19 +00:00
Depth *prometheus.GaugeVec
//Reweight sets an override weight on the OSD.
//It displays value within 0 to 1.
2016-07-14 05:01:19 +00:00
Reweight *prometheus.GaugeVec
//Bytes displays the total bytes available in the OSD
Bytes *prometheus.GaugeVec
2016-07-14 05:01:19 +00:00
//UsedBytes displays the total used bytes in the OSD
UsedBytes *prometheus.GaugeVec
2016-07-14 05:01:19 +00:00
//AvailBytes displays the total available bytes in the OSD
AvailBytes *prometheus.GaugeVec
2016-07-14 05:01:19 +00:00
//Utilization displays current utilization of the OSD
2016-07-14 05:01:19 +00:00
Utilization *prometheus.GaugeVec
//Pgs displays total no. of placement groups in the OSD.
//Available in Ceph Jewel version.
2016-07-14 05:01:19 +00:00
Pgs *prometheus.GaugeVec
//CommitLatency displays in seconds how long it takes for an operation to be applied to disk
2016-07-14 05:01:19 +00:00
CommitLatency *prometheus.GaugeVec
//ApplyLatency displays in seconds how long it takes to get applied to the backing filesystem
2016-07-14 05:01:19 +00:00
ApplyLatency *prometheus.GaugeVec
//OsdsIn displays the In state of the OSD
2016-07-14 05:01:19 +00:00
OsdIn *prometheus.GaugeVec
//OsdsUP displays the Up state of the OSD
2016-07-14 05:01:19 +00:00
OsdUp *prometheus.GaugeVec
//TotalBytes displays total bytes in all OSDs
TotalBytes prometheus.Gauge
2016-07-14 05:01:19 +00:00
//TotalUsedBytes displays total used bytes in all OSDs
TotalUsedBytes prometheus.Gauge
2016-07-14 05:01:19 +00:00
//TotalAvailBytes displays total available bytes in all OSDs
TotalAvailBytes prometheus.Gauge
2016-07-14 05:01:19 +00:00
//AverageUtil displays average utilization in all OSDs
2016-07-14 05:01:19 +00:00
AverageUtil prometheus.Gauge
}
//NewOsdCollector creates an instance of the OsdCollector and instantiates
// the individual metrics that show information about the osd.
2016-07-14 05:01:19 +00:00
func NewOsdCollector(conn Conn) *OsdCollector {
return &OsdCollector{
conn: conn,
CrushWeight: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_crush_weight",
Help: "OSD Crush Weight",
},
[]string{"osd"},
),
Depth: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_depth",
Help: "OSD Depth",
},
[]string{"osd"},
),
Reweight: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_reweight",
Help: "OSD Reweight",
},
[]string{"osd"},
),
Bytes: prometheus.NewGaugeVec(
2016-07-14 05:01:19 +00:00
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_bytes",
Help: "OSD Total Bytes",
2016-07-14 05:01:19 +00:00
},
[]string{"osd"},
),
UsedBytes: prometheus.NewGaugeVec(
2016-07-14 05:01:19 +00:00
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_used_bytes",
Help: "OSD Used Storage in Bytes",
2016-07-14 05:01:19 +00:00
},
[]string{"osd"},
),
AvailBytes: prometheus.NewGaugeVec(
2016-07-14 05:01:19 +00:00
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_avail_bytes",
Help: "OSD Available Storage in Bytes",
2016-07-14 05:01:19 +00:00
},
[]string{"osd"},
),
Utilization: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_utilization",
Help: "OSD Utilization",
},
[]string{"osd"},
),
Pgs: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_pgs",
Help: "OSD Placement Group Count",
},
[]string{"osd"},
),
TotalBytes: prometheus.NewGauge(
2016-07-14 05:01:19 +00:00
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_total_bytes",
Help: "OSD Total Storage Bytes",
2016-07-14 05:01:19 +00:00
},
),
TotalUsedBytes: prometheus.NewGauge(
2016-07-14 05:01:19 +00:00
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_total_used_bytes",
Help: "OSD Total Used Storage Bytes",
2016-07-14 05:01:19 +00:00
},
),
TotalAvailBytes: prometheus.NewGauge(
2016-07-14 05:01:19 +00:00
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_total_avail_bytes",
Help: "OSD Total Available Storage Bytes ",
2016-07-14 05:01:19 +00:00
},
),
AverageUtil: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_average_utilization",
Help: "OSD Average Utilization",
},
),
CommitLatency: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_perf_commit_latency_seconds",
2016-07-14 05:01:19 +00:00
Help: "OSD Perf Commit Latency",
},
[]string{"osd"},
),
ApplyLatency: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_perf_apply_latency_seconds",
2016-07-14 05:01:19 +00:00
Help: "OSD Perf Apply Latency",
},
[]string{"osd"},
),
OsdIn: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_in",
Help: "OSD In Status",
},
[]string{"osd"},
),
OsdUp: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_up",
Help: "OSD Up Status",
},
[]string{"osd"},
),
}
}
func (o *OsdCollector) collectorList() []prometheus.Collector {
return []prometheus.Collector{
o.CrushWeight,
o.Depth,
o.Reweight,
o.Bytes,
o.UsedBytes,
o.AvailBytes,
2016-07-14 05:01:19 +00:00
o.Utilization,
o.Pgs,
o.TotalBytes,
o.TotalUsedBytes,
o.TotalAvailBytes,
2016-07-14 05:01:19 +00:00
o.AverageUtil,
o.CommitLatency,
o.ApplyLatency,
o.OsdIn,
o.OsdUp,
}
}
// cephOsdDf is the decoding target for the JSON reply to the "osd df"
// monitor command. Numeric values are kept as json.Number so each consumer
// decides how to convert them.
type cephOsdDf struct {
	// OsdNodes carries one entry per element of the reply's "nodes" array.
	OsdNodes []struct {
		Name        string      `json:"name"`
		CrushWeight json.Number `json:"crush_weight"`
		Depth       json.Number `json:"depth"`
		Reweight    json.Number `json:"reweight"`
		KB          json.Number `json:"kb"`
		UsedKB      json.Number `json:"kb_used"`
		AvailKB     json.Number `json:"kb_avail"`
		Utilization json.Number `json:"utilization"`
		Pgs         json.Number `json:"pgs"`
	} `json:"nodes"`

	// Summary carries the aggregate figures from the reply's "summary" object.
	Summary struct {
		TotalKB      json.Number `json:"total_kb"`
		TotalUsedKB  json.Number `json:"total_kb_used"`
		TotalAvailKB json.Number `json:"total_kb_avail"`
		AverageUtil  json.Number `json:"average_utilization"`
	} `json:"summary"`
}
// cephPerfStat is the decoding target for the JSON reply to the "osd perf"
// monitor command.
type cephPerfStat struct {
	// PerfInfo carries one entry per element of the "osd_perf_infos" array.
	PerfInfo []struct {
		// ID is the numeric OSD identifier.
		ID json.Number `json:"id"`

		// Stats holds the latencies as reported, in milliseconds
		// (per the "_ms" suffix of the JSON keys).
		Stats struct {
			CommitLatency json.Number `json:"commit_latency_ms"`
			ApplyLatency  json.Number `json:"apply_latency_ms"`
		} `json:"perf_stats"`
	} `json:"osd_perf_infos"`
}
// cephOsdDump is the decoding target for the JSON reply to the "osd dump"
// monitor command; only the fields needed for the in/up gauges are mapped.
type cephOsdDump struct {
	// Osds carries one entry per element of the reply's "osds" array.
	Osds []struct {
		Osd json.Number `json:"osd"` // numeric OSD identifier
		Up  json.Number `json:"up"`  // value of the "up" key
		In  json.Number `json:"in"`  // value of the "in" key
	} `json:"osds"`
}
func (o *OsdCollector) collect() error {
2016-07-20 12:16:11 +00:00
cmd := o.cephOSDDfCommand()
2016-07-15 09:37:38 +00:00
2016-07-14 05:01:19 +00:00
buf, _, err := o.conn.MonCommand(cmd)
2016-07-15 09:37:38 +00:00
if err != nil {
log.Println("[ERROR] Unable to collect data from ceph osd df", err)
return err
}
2016-07-14 05:01:19 +00:00
osdDf := &cephOsdDf{}
if err := json.Unmarshal(buf, osdDf); err != nil {
return err
}
for _, node := range osdDf.OsdNodes {
crushWeight, err := node.CrushWeight.Float64()
if err != nil {
return err
}
o.CrushWeight.WithLabelValues(node.Name).Set(crushWeight)
depth, err := node.Depth.Float64()
if err != nil {
2016-07-15 09:37:38 +00:00
2016-07-14 05:01:19 +00:00
return err
}
o.Depth.WithLabelValues(node.Name).Set(depth)
reweight, err := node.Reweight.Float64()
if err != nil {
return err
}
o.Reweight.WithLabelValues(node.Name).Set(reweight)
osdKB, err := node.KB.Float64()
2016-07-14 05:01:19 +00:00
if err != nil {
return nil
}
o.Bytes.WithLabelValues(node.Name).Set(osdKB * 1e3)
2016-07-14 05:01:19 +00:00
usedKB, err := node.UsedKB.Float64()
2016-07-14 05:01:19 +00:00
if err != nil {
return err
}
o.UsedBytes.WithLabelValues(node.Name).Set(usedKB * 1e3)
2016-07-14 05:01:19 +00:00
availKB, err := node.AvailKB.Float64()
2016-07-14 05:01:19 +00:00
if err != nil {
return err
}
o.AvailBytes.WithLabelValues(node.Name).Set(availKB * 1e3)
2016-07-14 05:01:19 +00:00
util, err := node.Utilization.Float64()
if err != nil {
return err
}
o.Utilization.WithLabelValues(node.Name).Set(util)
pgs, err := node.Pgs.Float64()
if err != nil {
continue
2016-07-14 05:01:19 +00:00
}
o.Pgs.WithLabelValues(node.Name).Set(pgs)
}
totalKB, err := osdDf.Summary.TotalKB.Float64()
2016-07-14 05:01:19 +00:00
if err != nil {
2016-07-15 09:37:38 +00:00
return err
2016-07-14 05:01:19 +00:00
}
o.TotalBytes.Set(totalKB * 1e3)
2016-07-14 05:01:19 +00:00
totalUsedKB, err := osdDf.Summary.TotalUsedKB.Float64()
2016-07-14 05:01:19 +00:00
if err != nil {
return err
}
o.TotalUsedBytes.Set(totalUsedKB * 1e3)
2016-07-14 05:01:19 +00:00
totalAvailKB, err := osdDf.Summary.TotalAvailKB.Float64()
2016-07-14 05:01:19 +00:00
if err != nil {
return err
}
o.TotalAvailBytes.Set(totalAvailKB * 1e3)
2016-07-14 05:01:19 +00:00
averageUtil, err := osdDf.Summary.AverageUtil.Float64()
if err != nil {
2016-07-15 09:37:38 +00:00
return err
2016-07-14 05:01:19 +00:00
}
o.AverageUtil.Set(averageUtil)
return nil
}
func (o *OsdCollector) collectOsdPerf() error {
osdPerfCmd := o.cephOSDPerfCommand()
2016-07-15 09:37:38 +00:00
buf, _, err := o.conn.MonCommand(osdPerfCmd)
if err != nil {
log.Println("[ERROR] Unable to collect data from ceph osd perf", err)
return err
}
2016-07-14 05:01:19 +00:00
osdPerf := &cephPerfStat{}
if err := json.Unmarshal(buf, osdPerf); err != nil {
return err
}
for _, perfStat := range osdPerf.PerfInfo {
osdID, err := perfStat.ID.Int64()
2016-07-14 05:01:19 +00:00
if err != nil {
return err
}
osdName := fmt.Sprintf("osd.%v", osdID)
2016-07-14 05:01:19 +00:00
commitLatency, err := perfStat.Stats.CommitLatency.Float64()
if err != nil {
return err
}
o.CommitLatency.WithLabelValues(osdName).Set(commitLatency / 1e3)
2016-07-14 05:01:19 +00:00
applyLatency, err := perfStat.Stats.ApplyLatency.Float64()
if err != nil {
return err
}
o.ApplyLatency.WithLabelValues(osdName).Set(applyLatency / 1e3)
2016-07-14 05:01:19 +00:00
}
return nil
}
func (o *OsdCollector) collectOsdDump() error {
osdDumpCmd := o.cephOsdDump()
2016-07-15 09:37:38 +00:00
buff, _, err := o.conn.MonCommand(osdDumpCmd)
if err != nil {
log.Println("[ERROR] Unable to collect data from ceph osd dump", err)
return err
}
2016-07-14 05:01:19 +00:00
osdDump := &cephOsdDump{}
if err := json.Unmarshal(buff, osdDump); err != nil {
return err
}
for _, dumpInfo := range osdDump.Osds {
osdID, err := dumpInfo.Osd.Int64()
2016-07-14 05:01:19 +00:00
if err != nil {
return err
}
osdName := fmt.Sprintf("osd.%v", osdID)
2016-07-14 05:01:19 +00:00
in, err := dumpInfo.In.Float64()
if err != nil {
return err
}
o.OsdIn.WithLabelValues(osdName).Set(in)
up, err := dumpInfo.Up.Float64()
if err != nil {
return err
}
o.OsdUp.WithLabelValues(osdName).Set(up)
}
return nil
}
// cephOsdDump returns the JSON-encoded monitor command requesting
// "osd dump" output in JSON format. It panics if the request cannot be
// marshalled.
func (o *OsdCollector) cephOsdDump() []byte {
	request := map[string]interface{}{
		"format": "json",
		"prefix": "osd dump",
	}

	encoded, marshalErr := json.Marshal(request)
	if marshalErr != nil {
		panic(marshalErr)
	}
	return encoded
}
2016-07-20 12:16:11 +00:00
func (o *OsdCollector) cephOSDDfCommand() []byte {
2016-07-14 05:01:19 +00:00
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "osd df",
"format": "json",
})
if err != nil {
2016-07-20 12:16:11 +00:00
panic(err)
2016-07-14 05:01:19 +00:00
}
2016-07-20 12:16:11 +00:00
return cmd
2016-07-14 05:01:19 +00:00
}
// cephOSDPerfCommand returns the JSON-encoded monitor command requesting
// "osd perf" output in JSON format. It panics if the request cannot be
// marshalled.
func (o *OsdCollector) cephOSDPerfCommand() []byte {
	request := map[string]interface{}{
		"format": "json",
		"prefix": "osd perf",
	}

	encoded, marshalErr := json.Marshal(request)
	if marshalErr != nil {
		panic(marshalErr)
	}
	return encoded
}
// Describe sends the descriptors of each OsdCollector related metrics we have defined
// to the provided prometheus channel.
2016-07-14 05:01:19 +00:00
func (o *OsdCollector) Describe(ch chan<- *prometheus.Desc) {
for _, metric := range o.collectorList() {
metric.Describe(ch)
}
}
// Collect sends all the collected metrics to the provided prometheus channel.
// It requires the caller to handle synchronization.
2016-07-14 05:01:19 +00:00
func (o *OsdCollector) Collect(ch chan<- prometheus.Metric) {
if err := o.collectOsdPerf(); err != nil {
log.Println("failed collecting cluster osd perf stats:", err)
}
if err := o.collectOsdDump(); err != nil {
log.Println("failed collecting cluster osd dump", err)
}
2016-07-15 09:37:38 +00:00
if err := o.collect(); err != nil {
log.Println("failed collecting osd metrics:", err)
}
2016-07-14 05:01:19 +00:00
for _, metric := range o.collectorList() {
metric.Collect(ch)
}
}