// ceph_exporter/collectors/monitors.go
// Copyright 2016 DigitalOcean
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package collectors
import (
"encoding/json"
"log"
"github.com/prometheus/client_golang/prometheus"
)
// MonitorCollector is used to extract stats related to monitors
// running within Ceph cluster. As we extract information pertaining
// to each monitor instance, there are various vector metrics we
// need to use.
//
// Each GaugeVec carries a single variable label, "monitor", naming the
// individual monitor daemon the sample belongs to.
type MonitorCollector struct {
	// conn is the handle used to issue mon commands to the cluster.
	conn Conn

	// TotalKBs display the total storage a given monitor node has.
	// NOTE(review): despite the KB-based field name, collect() scales the
	// raw kb_total value by 1e3 and exports it as monitor_capacity_bytes.
	TotalKBs *prometheus.GaugeVec

	// UsedKBs depict how much of the total storage our monitor process
	// has utilized. Exported in bytes (see note on TotalKBs).
	UsedKBs *prometheus.GaugeVec

	// AvailKBs shows the space left unused. Exported in bytes
	// (see note on TotalKBs).
	AvailKBs *prometheus.GaugeVec

	// PercentAvail shows the amount of unused space as a percentage of total
	// space.
	PercentAvail *prometheus.GaugeVec

	// Store exposes information about internal backing store.
	Store Store

	// ClockSkew shows how far the monitor clocks have skewed from each other. This
	// is an important metric because the functioning of Ceph's paxos depends on
	// the clocks being aligned as close to each other as possible.
	ClockSkew *prometheus.GaugeVec

	// Latency displays the time the monitors take to communicate between themselves.
	Latency *prometheus.GaugeVec

	// NodesinQuorum show the size of the working monitor quorum. Any change in this
	// metric can imply a significant issue in the cluster if it is not manually changed.
	NodesinQuorum prometheus.Gauge
}
// Store displays information about Monitor's FileStore. It is responsible for
// storing all the meta information about the cluster, including monmaps, osdmaps,
// pgmaps, etc. along with logs and other data.
//
// All values are reported in bytes, taken directly from the monitor's
// store_stats section of the health report.
type Store struct {
	// TotalBytes displays the current size of the FileStore.
	TotalBytes *prometheus.GaugeVec

	// SSTBytes shows the amount used by LevelDB's sorted-string tables.
	SSTBytes *prometheus.GaugeVec

	// LogBytes shows the amount used by logs.
	LogBytes *prometheus.GaugeVec

	// MiscBytes shows the amount used by miscellaneous information.
	MiscBytes *prometheus.GaugeVec
}
// NewMonitorCollector creates an instance of the MonitorCollector and instantiates
// the individual metrics that show information about the monitor processes.
//
// Every metric carries a constant "cluster" label; all vector metrics
// additionally carry a variable "monitor" label identifying the daemon.
func NewMonitorCollector(conn Conn, cluster string) *MonitorCollector {
	labels := prometheus.Labels{"cluster": cluster}

	// gauge builds a per-monitor GaugeVec. Every vector in this collector
	// shares the same namespace, const labels and single "monitor" label,
	// so only the metric name and help text vary.
	gauge := func(name, help string) *prometheus.GaugeVec {
		return prometheus.NewGaugeVec(
			prometheus.GaugeOpts{
				Namespace:   cephNamespace,
				Name:        name,
				Help:        help,
				ConstLabels: labels,
			},
			[]string{"monitor"},
		)
	}

	return &MonitorCollector{
		conn: conn,

		TotalKBs:     gauge("monitor_capacity_bytes", "Total storage capacity of the monitor node"),
		UsedKBs:      gauge("monitor_used_bytes", "Storage of the monitor node that is currently allocated for use"),
		AvailKBs:     gauge("monitor_avail_bytes", "Total unused storage capacity that the monitor node has left"),
		PercentAvail: gauge("monitor_avail_percent", "Percentage of total unused storage capacity that the monitor node has left"),
		Store: Store{
			TotalBytes: gauge("monitor_store_capacity_bytes", "Total capacity of the FileStore backing the monitor daemon"),
			SSTBytes:   gauge("monitor_store_sst_bytes", "Capacity of the FileStore used only for raw SSTs"),
			LogBytes:   gauge("monitor_store_log_bytes", "Capacity of the FileStore used only for logging"),
			MiscBytes:  gauge("monitor_store_misc_bytes", "Capacity of the FileStore used only for storing miscellaneous information"),
		},
		ClockSkew: gauge("monitor_clock_skew_seconds", "Clock skew the monitor node is incurring"),
		Latency:   gauge("monitor_latency_seconds", "Latency the monitor node is incurring"),

		// NodesinQuorum is cluster-wide, so it is a plain Gauge with no
		// "monitor" label.
		NodesinQuorum: prometheus.NewGauge(
			prometheus.GaugeOpts{
				Namespace:   cephNamespace,
				Name:        "monitor_quorum_count",
				Help:        "The total size of the monitor quorum",
				ConstLabels: labels,
			},
		),
	}
}
// collectorList gathers every vector metric tracked by this collector so
// Describe and Collect can fan out over them with a single loop.
func (m *MonitorCollector) collectorList() []prometheus.Collector {
	collectors := []prometheus.Collector{
		m.TotalKBs,
		m.UsedKBs,
		m.AvailKBs,
		m.PercentAvail,
		m.Store.TotalBytes,
		m.Store.SSTBytes,
		m.Store.LogBytes,
		m.Store.MiscBytes,
		m.ClockSkew,
		m.Latency,
	}
	return collectors
}
// metricsList gathers the scalar (non-vector) metrics of this collector;
// currently that is only the quorum-size gauge.
func (m *MonitorCollector) metricsList() []prometheus.Metric {
	metrics := []prometheus.Metric{
		m.NodesinQuorum,
	}
	return metrics
}
// cephMonitorStats mirrors the subset of the "ceph status" JSON reply that
// this collector consumes. Numeric fields are decoded as json.Number so
// parsing failures surface as explicit errors in collect() rather than as
// silent float64 conversions.
type cephMonitorStats struct {
	Health struct {
		Health struct {
			HealthServices []struct {
				Mons []struct {
					Name string `json:"name"`
					// Capacity figures as reported per monitor node.
					KBTotal      json.Number `json:"kb_total"`
					KBUsed       json.Number `json:"kb_used"`
					KBAvail      json.Number `json:"kb_avail"`
					AvailPercent json.Number `json:"avail_percent"`
					// StoreStats breaks down the monitor's backing
					// store usage, in bytes.
					StoreStats struct {
						BytesTotal json.Number `json:"bytes_total"`
						BytesSST   json.Number `json:"bytes_sst"`
						BytesLog   json.Number `json:"bytes_log"`
						BytesMisc  json.Number `json:"bytes_misc"`
					} `json:"store_stats"`
				} `json:"mons"`
			} `json:"health_services"`
		} `json:"health"`
		// TimeChecks carries per-monitor clock skew and latency readings.
		TimeChecks struct {
			Mons []struct {
				Name    string      `json:"name"`
				Skew    json.Number `json:"skew"`
				Latency json.Number `json:"latency"`
			} `json:"mons"`
		} `json:"timechecks"`
	} `json:"health"`
	// Quorum lists the ranks of monitors currently in quorum; only its
	// length is used.
	Quorum []int `json:"quorum"`
}
// collect issues the status command against the cluster, decodes the reply
// and refreshes every per-monitor gauge plus the quorum-size gauge. The
// first command, decode or number-parse error aborts the whole run.
func (m *MonitorCollector) collect() error {
	buf, _, err := m.conn.MonCommand(m.cephUsageCommand())
	if err != nil {
		return err
	}

	stats := &cephMonitorStats{}
	if err := json.Unmarshal(buf, stats); err != nil {
		return err
	}

	// setGauge parses a json.Number, applies the given scale factor and
	// records the result for the named monitor. Parse errors propagate to
	// the caller unchanged.
	setGauge := func(vec *prometheus.GaugeVec, raw json.Number, scale float64, monitor string) error {
		value, err := raw.Float64()
		if err != nil {
			return err
		}
		vec.WithLabelValues(monitor).Set(value * scale)
		return nil
	}

	for _, svc := range stats.Health.Health.HealthServices {
		for _, mon := range svc.Mons {
			// The kb_* figures are scaled by 1e3 to export bytes.
			if err := setGauge(m.TotalKBs, mon.KBTotal, 1e3, mon.Name); err != nil {
				return err
			}
			if err := setGauge(m.UsedKBs, mon.KBUsed, 1e3, mon.Name); err != nil {
				return err
			}
			if err := setGauge(m.AvailKBs, mon.KBAvail, 1e3, mon.Name); err != nil {
				return err
			}
			if err := setGauge(m.PercentAvail, mon.AvailPercent, 1, mon.Name); err != nil {
				return err
			}
			// Store stats are already in bytes; no scaling.
			if err := setGauge(m.Store.TotalBytes, mon.StoreStats.BytesTotal, 1, mon.Name); err != nil {
				return err
			}
			if err := setGauge(m.Store.SSTBytes, mon.StoreStats.BytesSST, 1, mon.Name); err != nil {
				return err
			}
			if err := setGauge(m.Store.LogBytes, mon.StoreStats.BytesLog, 1, mon.Name); err != nil {
				return err
			}
			if err := setGauge(m.Store.MiscBytes, mon.StoreStats.BytesMisc, 1, mon.Name); err != nil {
				return err
			}
		}
	}

	for _, mon := range stats.Health.TimeChecks.Mons {
		if err := setGauge(m.ClockSkew, mon.Skew, 1, mon.Name); err != nil {
			return err
		}
		if err := setGauge(m.Latency, mon.Latency, 1, mon.Name); err != nil {
			return err
		}
	}

	m.NodesinQuorum.Set(float64(len(stats.Quorum)))

	return nil
}
// cephUsageCommand builds the JSON payload for the "status" mon command
// that collect() sends to the cluster.
func (m *MonitorCollector) cephUsageCommand() []byte {
	payload := map[string]interface{}{
		"prefix": "status",
		"format": "json",
	}
	out, err := json.Marshal(payload)
	if err != nil {
		// Marshaling a fixed literal of plain strings cannot fail; a
		// panic here would indicate a programming error, not bad input.
		panic(err)
	}
	return out
}
// Describe sends the descriptors of each Monitor related metric we have defined
// to the channel provided, satisfying the prometheus.Collector interface.
func (m *MonitorCollector) Describe(ch chan<- *prometheus.Desc) {
	for _, collector := range m.collectorList() {
		collector.Describe(ch)
	}
	for _, gauge := range m.metricsList() {
		ch <- gauge.Desc()
	}
}
// Collect extracts the given metrics from the Monitors and sends it to the prometheus
// channel. A failed scrape is logged and skipped entirely so that stale
// partial values are not emitted.
func (m *MonitorCollector) Collect(ch chan<- prometheus.Metric) {
	if err := m.collect(); err != nil {
		log.Println("failed collecting monitor metrics:", err)
		return
	}

	for _, collector := range m.collectorList() {
		collector.Collect(ch)
	}
	for _, gauge := range m.metricsList() {
		ch <- gauge
	}
}