Merge pull request #26 from AcalephStorage/feature/osd

ceph osd metrics
Vaibhav Bhembre 2016-07-29 15:45:18 -04:00 committed by GitHub
commit 95e41e62eb
4 changed files with 772 additions and 1 deletion


@@ -6,7 +6,7 @@ ENV GOPATH /go
ENV PATH $GOROOT/bin:$PATH
ENV APPLOC $GOPATH/src/github.com/digitalocean/ceph_exporter
RUN apt-get install -y apt-transport-https
RUN apt-get update && apt-get install -y apt-transport-https
RUN echo "deb https://download.ceph.com/debian-jewel trusty main" >> /etc/apt/sources.list

collectors/osd.go (new file, 520 lines)

@@ -0,0 +1,520 @@
package collectors
import (
"encoding/json"
"fmt"
"log"
"github.com/prometheus/client_golang/prometheus"
)
// OSDCollector displays statistics about each OSD in the Ceph cluster.
// An important aspect of monitoring OSDs is to ensure that, when the cluster
// is up and running, all OSDs that are in the cluster are up and running, too
type OSDCollector struct {
conn Conn
// CrushWeight is a persistent setting, and it affects how CRUSH assigns data to OSDs.
// It displays the CRUSH weight for the OSD
CrushWeight *prometheus.GaugeVec
// Depth displays the OSD's level of hierarchy in the CRUSH map
Depth *prometheus.GaugeVec
// Reweight sets an override weight on the OSD.
// It displays a value in the range 0 to 1.
Reweight *prometheus.GaugeVec
// Bytes displays the total capacity of the OSD in bytes
Bytes *prometheus.GaugeVec
// UsedBytes displays the total used bytes in the OSD
UsedBytes *prometheus.GaugeVec
// AvailBytes displays the total available bytes in the OSD
AvailBytes *prometheus.GaugeVec
// Utilization displays current utilization of the OSD
Utilization *prometheus.GaugeVec
// Pgs displays the total number of placement groups on the OSD.
// Available in the Ceph Jewel release.
Pgs *prometheus.GaugeVec
// CommitLatency displays in seconds how long it takes for an operation to be committed to disk
CommitLatency *prometheus.GaugeVec
// ApplyLatency displays in seconds how long it takes for an operation to be applied to the backing filesystem
ApplyLatency *prometheus.GaugeVec
// OSDIn displays the In state of the OSD
OSDIn *prometheus.GaugeVec
// OSDUp displays the Up state of the OSD
OSDUp *prometheus.GaugeVec
// TotalBytes displays total bytes in all OSDs
TotalBytes prometheus.Gauge
// TotalUsedBytes displays total used bytes in all OSDs
TotalUsedBytes prometheus.Gauge
// TotalAvailBytes displays total available bytes in all OSDs
TotalAvailBytes prometheus.Gauge
// AverageUtil displays average utilization in all OSDs
AverageUtil prometheus.Gauge
}
// NewOSDCollector creates an instance of the OSDCollector and instantiates
// the individual metrics that show information about the OSD.
func NewOSDCollector(conn Conn) *OSDCollector {
return &OSDCollector{
conn: conn,
CrushWeight: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_crush_weight",
Help: "OSD Crush Weight",
},
[]string{"osd"},
),
Depth: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_depth",
Help: "OSD Depth",
},
[]string{"osd"},
),
Reweight: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_reweight",
Help: "OSD Reweight",
},
[]string{"osd"},
),
Bytes: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_bytes",
Help: "OSD Total Bytes",
},
[]string{"osd"},
),
UsedBytes: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_used_bytes",
Help: "OSD Used Storage in Bytes",
},
[]string{"osd"},
),
AvailBytes: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_avail_bytes",
Help: "OSD Available Storage in Bytes",
},
[]string{"osd"},
),
Utilization: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_utilization",
Help: "OSD Utilization",
},
[]string{"osd"},
),
Pgs: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_pgs",
Help: "OSD Placement Group Count",
},
[]string{"osd"},
),
TotalBytes: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_total_bytes",
Help: "OSD Total Storage Bytes",
},
),
TotalUsedBytes: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_total_used_bytes",
Help: "OSD Total Used Storage Bytes",
},
),
TotalAvailBytes: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_total_avail_bytes",
Help: "OSD Total Available Storage Bytes ",
},
),
AverageUtil: prometheus.NewGauge(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_average_utilization",
Help: "OSD Average Utilization",
},
),
CommitLatency: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_perf_commit_latency_seconds",
Help: "OSD Perf Commit Latency",
},
[]string{"osd"},
),
ApplyLatency: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_perf_apply_latency_seconds",
Help: "OSD Perf Apply Latency",
},
[]string{"osd"},
),
OSDIn: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_in",
Help: "OSD In Status",
},
[]string{"osd"},
),
OSDUp: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Namespace: cephNamespace,
Name: "osd_up",
Help: "OSD Up Status",
},
[]string{"osd"},
),
}
}
func (o *OSDCollector) collectorList() []prometheus.Collector {
return []prometheus.Collector{
o.CrushWeight,
o.Depth,
o.Reweight,
o.Bytes,
o.UsedBytes,
o.AvailBytes,
o.Utilization,
o.Pgs,
o.TotalBytes,
o.TotalUsedBytes,
o.TotalAvailBytes,
o.AverageUtil,
o.CommitLatency,
o.ApplyLatency,
o.OSDIn,
o.OSDUp,
}
}
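// cephOSDDF mirrors the JSON document returned by the monitor for the
// "osd df" command (built by cephOSDDFCommand below); a sample payload
// appears in collectors/osd_test.go.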
type cephOSDDF struct {
OSDNodes []struct {
Name string `json:"name"`
CrushWeight json.Number `json:"crush_weight"`
Depth json.Number `json:"depth"`
Reweight json.Number `json:"reweight"`
KB json.Number `json:"kb"`
UsedKB json.Number `json:"kb_used"`
AvailKB json.Number `json:"kb_avail"`
Utilization json.Number `json:"utilization"`
Pgs json.Number `json:"pgs"`
} `json:"nodes"`
Summary struct {
TotalKB json.Number `json:"total_kb"`
TotalUsedKB json.Number `json:"total_kb_used"`
TotalAvailKB json.Number `json:"total_kb_avail"`
AverageUtil json.Number `json:"average_utilization"`
} `json:"summary"`
}
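// cephPerfStat mirrors the per-OSD entries returned by the "osd perf"
// command; latencies come back in milliseconds.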
type cephPerfStat struct {
PerfInfo []struct {
ID json.Number `json:"id"`
Stats struct {
CommitLatency json.Number `json:"commit_latency_ms"`
ApplyLatency json.Number `json:"apply_latency_ms"`
} `json:"perf_stats"`
} `json:"osd_perf_infos"`
}
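// cephOSDDump mirrors the subset of "osd dump" output consumed here,
// namely the numeric up/in flags for each OSD.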
type cephOSDDump struct {
OSDs []struct {
OSD json.Number `json:"osd"`
Up json.Number `json:"up"`
In json.Number `json:"in"`
} `json:"osds"`
}
func (o *OSDCollector) collect() error {
cmd := o.cephOSDDFCommand()
buf, _, err := o.conn.MonCommand(cmd)
if err != nil {
log.Println("[ERROR] Unable to collect data from ceph osd df", err)
return err
}
osdDF := &cephOSDDF{}
if err := json.Unmarshal(buf, osdDF); err != nil {
return err
}
for _, node := range osdDF.OSDNodes {
crushWeight, err := node.CrushWeight.Float64()
if err != nil {
return err
}
o.CrushWeight.WithLabelValues(node.Name).Set(crushWeight)
depth, err := node.Depth.Float64()
if err != nil {
return err
}
o.Depth.WithLabelValues(node.Name).Set(depth)
reweight, err := node.Reweight.Float64()
if err != nil {
return err
}
o.Reweight.WithLabelValues(node.Name).Set(reweight)
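// The df output reports capacities in KB; the gauges below scale them to
// bytes (1 KB treated as 1e3 bytes here).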
osdKB, err := node.KB.Float64()
if err != nil {
return err
}
o.Bytes.WithLabelValues(node.Name).Set(osdKB * 1e3)
usedKB, err := node.UsedKB.Float64()
if err != nil {
return err
}
o.UsedBytes.WithLabelValues(node.Name).Set(usedKB * 1e3)
availKB, err := node.AvailKB.Float64()
if err != nil {
return err
}
o.AvailBytes.WithLabelValues(node.Name).Set(availKB * 1e3)
util, err := node.Utilization.Float64()
if err != nil {
return err
}
o.Utilization.WithLabelValues(node.Name).Set(util)
pgs, err := node.Pgs.Float64()
if err != nil {
continue
}
o.Pgs.WithLabelValues(node.Name).Set(pgs)
}
totalKB, err := osdDF.Summary.TotalKB.Float64()
if err != nil {
return err
}
o.TotalBytes.Set(totalKB * 1e3)
totalUsedKB, err := osdDF.Summary.TotalUsedKB.Float64()
if err != nil {
return err
}
o.TotalUsedBytes.Set(totalUsedKB * 1e3)
totalAvailKB, err := osdDF.Summary.TotalAvailKB.Float64()
if err != nil {
return err
}
o.TotalAvailBytes.Set(totalAvailKB * 1e3)
averageUtil, err := osdDF.Summary.AverageUtil.Float64()
if err != nil {
return err
}
o.AverageUtil.Set(averageUtil)
return nil
}
func (o *OSDCollector) collectOSDPerf() error {
osdPerfCmd := o.cephOSDPerfCommand()
buf, _, err := o.conn.MonCommand(osdPerfCmd)
if err != nil {
log.Println("[ERROR] Unable to collect data from ceph osd perf", err)
return err
}
osdPerf := &cephPerfStat{}
if err := json.Unmarshal(buf, osdPerf); err != nil {
return err
}
for _, perfStat := range osdPerf.PerfInfo {
osdID, err := perfStat.ID.Int64()
if err != nil {
return err
}
osdName := fmt.Sprintf("osd.%v", osdID)
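// The monitor reports latencies in milliseconds; convert to seconds to
// match the *_seconds metric names.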
commitLatency, err := perfStat.Stats.CommitLatency.Float64()
if err != nil {
return err
}
o.CommitLatency.WithLabelValues(osdName).Set(commitLatency / 1e3)
applyLatency, err := perfStat.Stats.ApplyLatency.Float64()
if err != nil {
return err
}
o.ApplyLatency.WithLabelValues(osdName).Set(applyLatency / 1e3)
}
return nil
}
func (o *OSDCollector) collectOSDDump() error {
osdDumpCmd := o.cephOSDDump()
buff, _, err := o.conn.MonCommand(osdDumpCmd)
if err != nil {
log.Println("[ERROR] Unable to collect data from ceph osd dump", err)
return err
}
osdDump := &cephOSDDump{}
if err := json.Unmarshal(buff, osdDump); err != nil {
return err
}
for _, dumpInfo := range osdDump.OSDs {
osdID, err := dumpInfo.OSD.Int64()
if err != nil {
return err
}
osdName := fmt.Sprintf("osd.%v", osdID)
in, err := dumpInfo.In.Float64()
if err != nil {
return err
}
o.OSDIn.WithLabelValues(osdName).Set(in)
up, err := dumpInfo.Up.Float64()
if err != nil {
return err
}
o.OSDUp.WithLabelValues(osdName).Set(up)
}
return nil
}
func (o *OSDCollector) cephOSDDump() []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "osd dump",
"format": "json",
})
if err != nil {
panic(err)
}
return cmd
}
func (o *OSDCollector) cephOSDDFCommand() []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "osd df",
"format": "json",
})
if err != nil {
panic(err)
}
return cmd
}
func (o *OSDCollector) cephOSDPerfCommand() []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "osd perf",
"format": "json",
})
if err != nil {
panic(err)
}
return cmd
}
// Describe sends the descriptors of each metric the OSDCollector defines
// to the provided prometheus channel.
func (o *OSDCollector) Describe(ch chan<- *prometheus.Desc) {
for _, metric := range o.collectorList() {
metric.Describe(ch)
}
}
// Collect sends all the collected metrics to the provided prometheus channel.
// It requires the caller to handle synchronization.
func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) {
if err := o.collectOSDPerf(); err != nil {
log.Println("failed collecting osd perf stats:", err)
}
if err := o.collectOSDDump(); err != nil {
log.Println("failed collecting osd dump:", err)
}
if err := o.collect(); err != nil {
log.Println("failed collecting osd metrics:", err)
}
for _, metric := range o.collectorList() {
metric.Collect(ch)
}
}
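
As a usage illustration only, here is a minimal sketch (not part of this change set) of registering the collector directly against the default Prometheus registry. It assumes the go-ceph rados bindings and an arbitrary listen port; within this repository the wiring actually happens through NewCephExporter, as the final hunk below shows.

package main

import (
	"log"
	"net/http"

	"github.com/ceph/go-ceph/rados"
	"github.com/digitalocean/ceph_exporter/collectors"
	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// Open a rados connection using the default ceph.conf.
	conn, err := rados.NewConn()
	if err != nil {
		log.Fatal(err)
	}
	if err := conn.ReadDefaultConfigFile(); err != nil {
		log.Fatal(err)
	}
	if err := conn.Connect(); err != nil {
		log.Fatal(err)
	}

	// Register the OSD collector and expose its metrics over HTTP.
	// The listen port is an arbitrary choice for this example.
	prometheus.MustRegister(collectors.NewOSDCollector(conn))
	http.Handle("/metrics", prometheus.Handler())
	log.Fatal(http.ListenAndServe(":9128", nil))
}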

collectors/osd_test.go (new file, 250 lines)

@@ -0,0 +1,250 @@
package collectors
import (
"io/ioutil"
"net/http"
"net/http/httptest"
"regexp"
"testing"
"github.com/prometheus/client_golang/prometheus"
)
func TestOSDCollector(t *testing.T) {
for _, tt := range []struct {
input string
regexes []*regexp.Regexp
}{
{
input: `
{
"nodes": [
{
"id": 0,
"name": "osd.0",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 40772,
"kb_avail": 11109544,
"utilization": 0.365658,
"var": 1.053676,
"pgs": 283
},
{
"id": 2,
"name": "osd.2",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 36712,
"kb_avail": 11113604,
"utilization": 0.329246,
"var": 0.948753,
"pgs": 162
},
{
"id": 1,
"name": "osd.1",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 40512,
"kb_avail": 11109804,
"utilization": 0.363326,
"var": 1.046957,
"pgs": 279
},
{
"id": 3,
"name": "osd.3",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 36784,
"kb_avail": 11113532,
"utilization": 0.329892,
"var": 0.950614,
"pgs": 164
}
],
"stray": [],
"summary": {
"total_kb": 44601264,
"total_kb_used": 154780,
"total_kb_avail": 44446484,
"average_utilization": 0.347031,
"min_var": 0.948753,
"max_var": 1.053676,
"dev": 0.017482
}
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_crush_weight{osd="osd.0"} 0.010391`),
regexp.MustCompile(`ceph_osd_crush_weight{osd="osd.1"} 0.010391`),
regexp.MustCompile(`ceph_osd_crush_weight{osd="osd.2"} 0.010391`),
regexp.MustCompile(`ceph_osd_crush_weight{osd="osd.3"} 0.010391`),
regexp.MustCompile(`ceph_osd_depth{osd="osd.0"} 2`),
regexp.MustCompile(`ceph_osd_depth{osd="osd.1"} 2`),
regexp.MustCompile(`ceph_osd_depth{osd="osd.2"} 2`),
regexp.MustCompile(`ceph_osd_depth{osd="osd.3"} 2`),
regexp.MustCompile(`ceph_osd_reweight{osd="osd.0"} 1`),
regexp.MustCompile(`ceph_osd_reweight{osd="osd.1"} 1`),
regexp.MustCompile(`ceph_osd_reweight{osd="osd.2"} 1`),
regexp.MustCompile(`ceph_osd_reweight{osd="osd.3"} 1`),
regexp.MustCompile(`ceph_osd_bytes{osd="osd.0"} 1.1150316e`),
regexp.MustCompile(`ceph_osd_bytes{osd="osd.1"} 1.1150316e`),
regexp.MustCompile(`ceph_osd_bytes{osd="osd.2"} 1.1150316e`),
regexp.MustCompile(`ceph_osd_bytes{osd="osd.3"} 1.1150316e`),
regexp.MustCompile(`ceph_osd_used_bytes{osd="osd.0"} 4.0772e`),
regexp.MustCompile(`ceph_osd_used_bytes{osd="osd.1"} 4.0512e`),
regexp.MustCompile(`ceph_osd_used_bytes{osd="osd.2"} 3.6712e`),
regexp.MustCompile(`ceph_osd_used_bytes{osd="osd.3"} 3.6784e`),
regexp.MustCompile(`ceph_osd_avail_bytes{osd="osd.0"} 1.1109544e`),
regexp.MustCompile(`ceph_osd_avail_bytes{osd="osd.1"} 1.1109804e`),
regexp.MustCompile(`ceph_osd_avail_bytes{osd="osd.2"} 1.1113604e`),
regexp.MustCompile(`ceph_osd_avail_bytes{osd="osd.3"} 1.1113532e`),
regexp.MustCompile(`ceph_osd_utilization{osd="osd.0"} 0.365658`),
regexp.MustCompile(`ceph_osd_utilization{osd="osd.1"} 0.363326`),
regexp.MustCompile(`ceph_osd_utilization{osd="osd.2"} 0.329246`),
regexp.MustCompile(`ceph_osd_utilization{osd="osd.3"} 0.329892`),
regexp.MustCompile(`ceph_osd_pgs{osd="osd.0"} 283`),
regexp.MustCompile(`ceph_osd_pgs{osd="osd.1"} 279`),
regexp.MustCompile(`ceph_osd_pgs{osd="osd.2"} 162`),
regexp.MustCompile(`ceph_osd_pgs{osd="osd.3"} 164`),
regexp.MustCompile(`ceph_osd_total_bytes 4.4601264e`),
regexp.MustCompile(`ceph_osd_total_used_bytes 1.5478e`),
regexp.MustCompile(`ceph_osd_total_avail_bytes 4.4446484e`),
regexp.MustCompile(`ceph_osd_average_utilization 0.347031`),
},
},
{
input: `
{
"osd_perf_infos": [
{
"id": 3,
"perf_stats": {
"commit_latency_ms": 1,
"apply_latency_ms": 64
}
},
{
"id": 2,
"perf_stats": {
"commit_latency_ms": 2,
"apply_latency_ms": 79
}
},
{
"id": 1,
"perf_stats": {
"commit_latency_ms": 2,
"apply_latency_ms": 39
}
},
{
"id": 0,
"perf_stats": {
"commit_latency_ms": 2,
"apply_latency_ms": 31
}
}
]
}`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_perf_commit_latency_seconds{osd="osd.0"} 0.002`),
regexp.MustCompile(`ceph_osd_perf_commit_latency_seconds{osd="osd.1"} 0.002`),
regexp.MustCompile(`ceph_osd_perf_commit_latency_seconds{osd="osd.2"} 0.002`),
regexp.MustCompile(`ceph_osd_perf_commit_latency_seconds{osd="osd.3"} 0.001`),
regexp.MustCompile(`ceph_osd_perf_apply_latency_seconds{osd="osd.0"} 0.031`),
regexp.MustCompile(`ceph_osd_perf_apply_latency_seconds{osd="osd.1"} 0.039`),
regexp.MustCompile(`ceph_osd_perf_apply_latency_seconds{osd="osd.2"} 0.079`),
regexp.MustCompile(`ceph_osd_perf_apply_latency_seconds{osd="osd.3"} 0.064`),
},
},
{
input: `
{
"osds": [
{
"osd": 0,
"uuid": "135b53c3",
"up": 1,
"in": 1
},
{
"osd": 1,
"uuid": "370a33f2",
"up": 1,
"in": 1
},
{
"osd": 2,
"uuid": "ca9ab3de",
"up": 1,
"in": 1
},
{
"osd": 3,
"uuid": "bef98b10",
"up": 1,
"in": 1
}
]
}
`,
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_in{osd="osd.0"} 1`),
regexp.MustCompile(`ceph_osd_in{osd="osd.1"} 1`),
regexp.MustCompile(`ceph_osd_in{osd="osd.2"} 1`),
regexp.MustCompile(`ceph_osd_in{osd="osd.3"} 1`),
regexp.MustCompile(`ceph_osd_up{osd="osd.0"} 1`),
regexp.MustCompile(`ceph_osd_up{osd="osd.1"} 1`),
regexp.MustCompile(`ceph_osd_up{osd="osd.2"} 1`),
regexp.MustCompile(`ceph_osd_up{osd="osd.3"} 1`),
},
},
} {
func() {
collector := NewOSDCollector(NewNoopConn(tt.input))
if err := prometheus.Register(collector); err != nil {
t.Fatalf("collector failed to register: %s", err)
}
defer prometheus.Unregister(collector)
server := httptest.NewServer(prometheus.Handler())
defer server.Close()
resp, err := http.Get(server.URL)
if err != nil {
t.Fatalf("unexpected failed response from prometheus: %s", err)
}
defer resp.Body.Close()
buf, err := ioutil.ReadAll(resp.Body)
if err != nil {
t.Fatalf("failed reading server response: %s", err)
}
for _, re := range tt.regexes {
if !re.Match(buf) {
t.Errorf("failed matching: %q", re)
}
}
}()
}
}


@@ -50,6 +50,7 @@ func NewCephExporter(conn *rados.Conn) *CephExporter {
collectors.NewPoolUsageCollector(conn),
collectors.NewClusterHealthCollector(conn),
collectors.NewMonitorCollector(conn),
collectors.NewOSDCollector(conn),
},
}
}
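
With the collector registered, a scrape of the exporter's /metrics endpoint should include per-OSD series along these lines (values are illustrative, taken from the test fixtures above):

ceph_osd_crush_weight{osd="osd.0"} 0.010391
ceph_osd_utilization{osd="osd.0"} 0.365658
ceph_osd_perf_commit_latency_seconds{osd="osd.0"} 0.002
ceph_osd_perf_apply_latency_seconds{osd="osd.0"} 0.031
ceph_osd_up{osd="osd.0"} 1
ceph_osd_in{osd="osd.0"} 1
ceph_osd_average_utilization 0.347031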