Port over luminous changes for labels to nautilus branch
This commit is contained in:
parent 0031656370
commit 8c7e4dde45
@@ -43,8 +43,9 @@ var _ Conn = &rados.Conn{}
// we can deprecate output, because cmdOut is able to hold the outputs we desire
// for multiple commands for "go test".
type NoopConn struct {
output string // deprecated
cmdOut map[string]string
output string // deprecated
cmdOut []map[string]string
iteration int
}

// The stub we use for testing should also satisfy the interface properties.

@@ -54,17 +55,22 @@ var _ Conn = &NoopConn{}
// at the end of the command we issue to Ceph is fixed and should be specified
// in the only input parameter.
func NewNoopConn(output string) *NoopConn {
return &NoopConn{output: output}
}

// NewNoopConnWithCmdOut returns an instance of *NoopConn. The string that we
// want output at the end of the command we issue to Ceph can be various and
// should be specified by the map in the only input parameter.
func NewNoopConnWithCmdOut(cmdOut []map[string]string) *NoopConn {
return &NoopConn{
output: output,
cmdOut: make(map[string]string),
cmdOut: cmdOut,
iteration: 0,
}
}

// NewNoopConnWithCmdOut returns an instance of *NoopConn. THe string that we
// want output at the end of the command we issue to Ceph can be various and
// should be specified by the map in the only input parameter.
func NewNoopConnWithCmdOut(cmdOut map[string]string) *NoopConn {
return &NoopConn{cmdOut: cmdOut}
// IncIteration increments iteration by 1.
func (n *NoopConn) IncIteration() {
n.iteration++
}

// ReadDefaultConfigFile does not need to return an error. It satisfies

@@ -107,13 +113,13 @@ func (n *NoopConn) MonCommand(args []byte) ([]byte, string, error) {

switch dc[0] {
case "pgs_brief":
return []byte(n.cmdOut["ceph pg dump pgs_brief"]), "", nil
return []byte(n.cmdOut[n.iteration]["ceph pg dump pgs_brief"]), "", nil
}

case "osd tree":
val, ok := cmd["states"]
if !ok {
break
return []byte(n.cmdOut[n.iteration]["ceph osd tree"]), "", nil
}

st, ok := val.([]interface{})

@@ -123,22 +129,24 @@ func (n *NoopConn) MonCommand(args []byte) ([]byte, string, error) {

switch st[0] {
case "down":
return []byte(n.cmdOut["ceph osd tree down"]), "", nil
return []byte(n.cmdOut[n.iteration]["ceph osd tree down"]), "", nil
}

case "osd df":
return []byte(n.cmdOut["ceph osd df"]), "", nil
return []byte(n.cmdOut[n.iteration]["ceph osd df"]), "", nil

case "osd perf":
return []byte(n.cmdOut["ceph osd perf"]), "", nil
return []byte(n.cmdOut[n.iteration]["ceph osd perf"]), "", nil

case "osd dump":
return []byte(n.cmdOut["ceph osd dump"]), "", nil
return []byte(n.cmdOut[n.iteration]["ceph osd dump"]), "", nil
}

return []byte(n.output), "", nil
}

// PGCommand returns the provided output string to NoopConn as is, making
// it seem like it actually ran something and producted that string as a result.
// it seem like it actually ran something and produced that string as a result.
func (n *NoopConn) PGCommand(pgid, args []byte) ([]byte, string, error) {
// Unmarshal the input command and see if we need to intercept
cmd := map[string]interface{}{}

@@ -150,7 +158,7 @@ func (n *NoopConn) PGCommand(pgid, args []byte) ([]byte, string, error) {
// Intercept and mock the output
switch prefix := cmd["prefix"]; prefix {
case "query":
return []byte(n.cmdOut[fmt.Sprintf("ceph tell %s query", string(pgid))]), "", nil
return []byte(n.cmdOut[n.iteration][fmt.Sprintf("ceph tell %s query", string(pgid))]), "", nil
}

return []byte(n.output), "", nil
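The new []map[string]string cmdOut plus IncIteration let one stub serve different canned outputs on successive scrapes. A minimal sketch of how a test might drive this; the package name and JSON payloads are assumptions, not part of the commit:

package collectors

import "testing"

func TestNoopConnIterations(t *testing.T) {
	// One map of canned outputs per scrape; keys match the command names
	// that NoopConn.MonCommand intercepts.
	conn := NewNoopConnWithCmdOut([]map[string]string{
		{"ceph osd df": `{"nodes": []}`}, // served while iteration == 0
		{"ceph osd df": `{"nodes": []}`}, // served after IncIteration
	})

	collector := NewOSDCollector(conn, "ceph")
	_ = collector

	// ... scrape once against conn, then advance to the second output set.
	conn.IncIteration()
}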
@@ -5,6 +5,9 @@ import (
"encoding/json"
"fmt"
"log"
"math"
"regexp"
"strconv"
"strings"

@@ -21,23 +24,33 @@ const (
scrubStateDeepScrubbing = 2
)

// OSDCollector displays statistics about OSD in the ceph cluster.
// OSDCollector displays statistics about OSD in the Ceph cluster.
// An important aspect of monitoring OSDs is to ensure that when the cluster is
// up and running that all OSDs that are in the cluster are up and running, too
type OSDCollector struct {
conn Conn

// initalCollect flags if it is the first time for this OSDCollector to
// collect metrics
initialCollect bool

// osdScrubCache holds the cache of previous PG scrubs
osdScrubCache map[int]int

// osdLabelsCache holds a cache of osd labels
osdLabelsCache map[int64]*cephOSDLabel

// osdObjectsBackfilledCache holds the cache of previous increase in number
// of objects backfilled of all OSDs
osdObjectsBackfilledCache map[int64]int64

// pgStateCache holds the cache of previous states of all PGs
pgStateCache map[string]string

// pgObjectsRecoveredCache holds the cache of previous number of objects
// recovered of all PGs
pgObjectsRecoveredCache map[string]int64

// pgBackfillTargetsCache holds the cache of previous backfill targets OSDs
// of all PGs
pgBackfillTargetsCache map[string]map[int64]int64

// pgDumpBrief holds the content of PG dump brief
pgDumpBrief cephPGDumpBrief
@@ -113,6 +126,9 @@ type OSDCollector struct {

// PGObjectsRecoveredDesc displays total number of objects recovered in a PG
PGObjectsRecoveredDesc *prometheus.Desc

// OSDObjectsBackfilled displays average number of objects backfilled in an OSD
OSDObjectsBackfilled *prometheus.CounterVec
}

// This ensures OSDCollector implements interface prometheus.Collector.

@@ -123,12 +139,17 @@ var _ prometheus.Collector = &OSDCollector{}
func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
labels := make(prometheus.Labels)
labels["cluster"] = cluster
osdLabels := []string{"osd", "device_class", "host", "rack", "root"}

return &OSDCollector{
conn: conn,
initialCollect: true,
osdScrubCache: make(map[int]int),
pgObjectsRecoveredCache: make(map[string]int64),
conn: conn,

osdScrubCache: make(map[int]int),
osdLabelsCache: make(map[int64]*cephOSDLabel),
osdObjectsBackfilledCache: make(map[int64]int64),
pgStateCache: make(map[string]string),
pgObjectsRecoveredCache: make(map[string]int64),
pgBackfillTargetsCache: make(map[string]map[int64]int64),

CrushWeight: prometheus.NewGaugeVec(
prometheus.GaugeOpts{
@@ -137,7 +158,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Crush Weight",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

Depth: prometheus.NewGaugeVec(

@@ -147,7 +168,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Depth",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

Reweight: prometheus.NewGaugeVec(

@@ -157,7 +178,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Reweight",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

Bytes: prometheus.NewGaugeVec(

@@ -167,7 +188,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Total Bytes",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

UsedBytes: prometheus.NewGaugeVec(

@@ -177,7 +198,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Used Storage in Bytes",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

AvailBytes: prometheus.NewGaugeVec(

@@ -187,7 +208,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Available Storage in Bytes",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

Utilization: prometheus.NewGaugeVec(

@@ -197,7 +218,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Utilization",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

Variance: prometheus.NewGaugeVec(

@@ -207,7 +228,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Variance",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

Pgs: prometheus.NewGaugeVec(

@@ -217,7 +238,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Placement Group Count",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

TotalBytes: prometheus.NewGauge(

@@ -262,7 +283,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Perf Commit Latency",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

ApplyLatency: prometheus.NewGaugeVec(

@@ -272,7 +293,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Perf Apply Latency",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

OSDIn: prometheus.NewGaugeVec(

@@ -282,7 +303,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD In Status",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

OSDUp: prometheus.NewGaugeVec(

@@ -292,7 +313,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Up Status",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

OSDFull: prometheus.NewGaugeVec(

@@ -302,7 +323,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Full Status",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

OSDNearFull: prometheus.NewGaugeVec(

@@ -312,7 +333,7 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Near Full Status",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

OSDBackfillFull: prometheus.NewGaugeVec(

@@ -322,20 +343,20 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
Help: "OSD Backfill Full Status",
ConstLabels: labels,
},
[]string{"osd"},
osdLabels,
),

OSDDownDesc: prometheus.NewDesc(
fmt.Sprintf("%s_osd_down", cephNamespace),
"Number of OSDs down in the cluster",
[]string{"osd", "status"},
append([]string{"status"}, osdLabels...),
labels,
),

ScrubbingStateDesc: prometheus.NewDesc(
fmt.Sprintf("%s_osd_scrub_state", cephNamespace),
"State of OSDs involved in a scrub",
[]string{"osd"},
osdLabels,
labels,
),
@@ -345,6 +366,16 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
[]string{"pgid"},
labels,
),

OSDObjectsBackfilled: prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: cephNamespace,
Name: "osd_objects_backfilled",
Help: "Average number of objects backfilled in an OSD",
ConstLabels: labels,
},
append([]string{"pgid"}, osdLabels...),
),
}
}

@@ -370,6 +401,7 @@ func (o *OSDCollector) collectorList() []prometheus.Collector {
o.OSDFull,
o.OSDNearFull,
o.OSDBackfillFull,
o.OSDObjectsBackfilled,
}
}
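Throughout NewOSDCollector the per-OSD vectors switch from the single "osd" label to osdLabels, so the WithLabelValues calls later in this diff pass values in that same order: osd, device_class, host, rack, root. A minimal sketch of the pattern, assuming the standard Prometheus Go client and made-up label values:

package main

import "github.com/prometheus/client_golang/prometheus"

func main() {
	// Variable labels in the same order the collector uses.
	osdLabels := []string{"osd", "device_class", "host", "rack", "root"}
	up := prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Namespace: "ceph",
		Name:      "osd_up",
		Help:      "OSD Up Status",
	}, osdLabels)

	// Values must line up with osdLabels positionally; these are examples only.
	up.WithLabelValues("osd.1", "ssd", "node-a", "rack-1", "default").Set(1)
}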
@@ -418,6 +450,26 @@ type cephOSDDump struct {
} `json:"osds"`
}

type cephOSDTree struct {
Nodes []struct {
ID int64 `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Status string `json:"status"`
Class string `json:"device_class"`
CrushWeight float64 `json:"crush_weight"`
Children []int64 `json:"children"`
} `json:"nodes"`
Stray []struct {
ID int64 `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Status string `json:"status"`
CrushWeight float64 `json:"crush_weight"`
Children []int `json:"children"`
} `json:"stray"`
}

type cephOSDTreeDown struct {
Nodes []struct {
ID int64 `json:"id"`
@@ -460,7 +512,52 @@ type cephPGQuery struct {
} `json:"recovery_state"`
}

// backfillTargets function would go here - not sure if it is needed
type cephOSDLabel struct {
ID int64 `json:"id"`
Name string `json:"name"`
Type string `json:"type"`
Status string `json:"status"`
DeviceClass string `json:"device_class"`
CrushWeight float64 `json:"crush_weight"`
Root string `json:"host"`
Rack string `json:"rack"`
Host string `json:"host"`
parent int64 // parent id when building tables
}

// backfillTargets returns a map from PG query result containing OSDs and
// corresponding shards that are being backfilled.
func (c cephPGQuery) backfillTargets() map[int64]int64 {
osdRegExp := regexp.MustCompile(`^(\d+)\((\d+)\)$`)
targets := make(map[int64]int64)

for _, state := range c.RecoveryState {
if state.RecoverProgress == nil {
continue
}

for _, osd := range state.RecoverProgress.BackfillTargets {
m := osdRegExp.FindStringSubmatch(osd)
if m == nil {
continue
}

osdID, err := strconv.ParseInt(m[1], 10, 64)
if err != nil {
continue
}

shard, err := strconv.ParseInt(m[2], 10, 64)
if err != nil {
continue
}

targets[osdID] = shard
}
}

return targets
}
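To make the parsing above concrete: entries in backfill_targets come back as strings of the form OSD id plus shard in parentheses, which the regexp splits into the two numbers. A standalone sketch with an illustrative value ("3(1)" is made up):

package main

import (
	"fmt"
	"regexp"
)

func main() {
	// Same pattern backfillTargets uses; "3(1)" means OSD 3, shard 1.
	osdRegExp := regexp.MustCompile(`^(\d+)\((\d+)\)$`)
	m := osdRegExp.FindStringSubmatch("3(1)")
	fmt.Println(m[1], m[2]) // prints: 3 1
}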
func (o *OSDCollector) collectOSDDF() error {
cmd := o.cephOSDDFCommand()

@@ -480,70 +577,70 @@ func (o *OSDCollector) collectOSDDF() error {
}

for _, node := range osdDF.OSDNodes {
lb := o.getOSDLabelFromName(node.Name)

crushWeight, err := node.CrushWeight.Float64()
if err != nil {
return err
}

o.CrushWeight.WithLabelValues(node.Name).Set(crushWeight)

o.CrushWeight.WithLabelValues(node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(crushWeight)
depth, err := node.Depth.Float64()
if err != nil {

return err
}

o.Depth.WithLabelValues(node.Name).Set(depth)
o.Depth.WithLabelValues(node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(depth)

reweight, err := node.Reweight.Float64()
if err != nil {
return err
}

o.Reweight.WithLabelValues(node.Name).Set(reweight)
o.Reweight.WithLabelValues(node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(reweight)

osdKB, err := node.KB.Float64()
if err != nil {
return nil
}

o.Bytes.WithLabelValues(node.Name).Set(osdKB * 1e3)
o.Bytes.WithLabelValues(node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(osdKB * 1e3)

usedKB, err := node.UsedKB.Float64()
if err != nil {
return err
}

o.UsedBytes.WithLabelValues(node.Name).Set(usedKB * 1e3)
o.UsedBytes.WithLabelValues(node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(usedKB * 1e3)

availKB, err := node.AvailKB.Float64()
if err != nil {
return err
}

o.AvailBytes.WithLabelValues(node.Name).Set(availKB * 1e3)
o.AvailBytes.WithLabelValues(node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(availKB * 1e3)

util, err := node.Utilization.Float64()
if err != nil {
return err
}

o.Utilization.WithLabelValues(node.Name).Set(util)
o.Utilization.WithLabelValues(node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(util)

variance, err := node.Variance.Float64()
if err != nil {
return err
}

o.Variance.WithLabelValues(node.Name).Set(variance)
o.Variance.WithLabelValues(node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(variance)

pgs, err := node.Pgs.Float64()
if err != nil {
continue
}

o.Pgs.WithLabelValues(node.Name).Set(pgs)
o.Pgs.WithLabelValues(node.Name, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(pgs)

}
@@ -599,22 +696,120 @@ func (o *OSDCollector) collectOSDPerf() error {
}
osdName := fmt.Sprintf(osdLabelFormat, osdID)

lb := o.getOSDLabelFromID(osdID)

commitLatency, err := perfStat.Stats.CommitLatency.Float64()
if err != nil {
return err
}
o.CommitLatency.WithLabelValues(osdName).Set(commitLatency / 1e3)
o.CommitLatency.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(commitLatency / 1e3)

applyLatency, err := perfStat.Stats.ApplyLatency.Float64()
if err != nil {
return err
}
o.ApplyLatency.WithLabelValues(osdName).Set(applyLatency / 1e3)
o.ApplyLatency.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(applyLatency / 1e3)
}

return nil
}

func buildOSDLabels(data []byte) (map[int64]*cephOSDLabel, error) {
nodeList := &cephOSDTree{}
if err := json.Unmarshal(data, nodeList); err != nil {
return nil, err
}

nodeMap := make(map[int64]*cephOSDLabel)
for _, node := range nodeList.Nodes {
label := cephOSDLabel{
ID: node.ID,
Name: node.Name,
Type: node.Type,
Status: node.Status,
DeviceClass: node.Class,
CrushWeight: node.CrushWeight,
parent: math.MaxInt64,
}
nodeMap[node.ID] = &label
}
// now that we built a lookup table, fill in the parents
for _, node := range nodeList.Nodes {
for _, child := range node.Children {
if label, ok := nodeMap[child]; ok {
label.parent = node.ID
}
}
}

var findParent func(from *cephOSDLabel, kind string) (*cephOSDLabel, bool)
findParent = func(from *cephOSDLabel, kind string) (*cephOSDLabel, bool) {
if parent, ok := nodeMap[from.parent]; ok {
if parent.Type == kind {
return parent, true
}
return findParent(parent, kind)
}
return nil, false
}

// Now that we have parents filled in walk our map, and build a map of just osds.
for k := range nodeMap {
osdLabel := nodeMap[k]
if host, ok := findParent(osdLabel, "host"); ok {
osdLabel.Host = host.Name
}
if rack, ok := findParent(osdLabel, "rack"); ok {
osdLabel.Rack = rack.Name
}
if root, ok := findParent(osdLabel, "root"); ok {
osdLabel.Root = root.Name
}
}

for k := range nodeMap {
osdLabel := nodeMap[k]
if osdLabel.Type != "osd" {
delete(nodeMap, k)
}
}
return nodeMap, nil
}

func (o *OSDCollector) buildOSDLabelCache() error {
cmd := o.cephOSDTreeCommand()
data, _, err := o.conn.MonCommand(cmd)
if err != nil {
log.Printf("failed sending Mon command %s: %s", cmd, err)
return err
}

cache, err := buildOSDLabels(data)
if err != nil {
log.Printf("failed to decode OSD lables: %s", err)
return err
}
o.osdLabelsCache = cache
return nil
}

func (o *OSDCollector) getOSDLabelFromID(id int64) *cephOSDLabel {
if label, ok := o.osdLabelsCache[id]; ok {
return label
}
return &cephOSDLabel{}
}

func (o *OSDCollector) getOSDLabelFromName(osdid string) *cephOSDLabel {
var id int64
c, err := fmt.Sscanf(osdid, "osd.%d", &id)
if err != nil || c != 1 {
return &cephOSDLabel{}
}

return o.getOSDLabelFromID(id)
}

func (o *OSDCollector) collectOSDTreeDown(ch chan<- prometheus.Metric) error {
cmd := o.cephOSDTreeCommand("down")
buff, _, err := o.conn.MonCommand(cmd)

@@ -636,8 +831,15 @@ func (o *OSDCollector) collectOSDTreeDown(ch chan<- prometheus.Metric) error {
}

osdName := downItem.Name
lb := o.getOSDLabelFromName(osdName)

ch <- prometheus.MustNewConstMetric(o.OSDDownDesc, prometheus.GaugeValue, 1, osdName, downItem.Status)
ch <- prometheus.MustNewConstMetric(o.OSDDownDesc, prometheus.GaugeValue, 1,
downItem.Status,
osdName,
lb.DeviceClass,
lb.Host,
lb.Root,
lb.Rack)
}

return nil
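getOSDLabelFromName, used just above to look up labels for a down OSD, relies on the "osd.N" naming convention parsed with fmt.Sscanf. A tiny standalone sketch with an example name (the value "osd.12" is illustrative only):

package main

import "fmt"

func main() {
	// Same format string the collector uses to extract the numeric id.
	var id int64
	c, err := fmt.Sscanf("osd.12", "osd.%d", &id)
	fmt.Println(id, c, err) // prints: 12 1 <nil>
}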
@@ -662,32 +864,33 @@ func (o *OSDCollector) collectOSDDump() error {
return err
}
osdName := fmt.Sprintf(osdLabelFormat, osdID)
lb := o.getOSDLabelFromID(osdID)

in, err := dumpInfo.In.Float64()
if err != nil {
return err
}

o.OSDIn.WithLabelValues(osdName).Set(in)
o.OSDIn.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(in)

up, err := dumpInfo.Up.Float64()
if err != nil {
return err
}

o.OSDUp.WithLabelValues(osdName).Set(up)
o.OSDUp.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(up)

o.OSDFull.WithLabelValues(osdName).Set(0)
o.OSDNearFull.WithLabelValues(osdName).Set(0)
o.OSDBackfillFull.WithLabelValues(osdName).Set(0)
o.OSDFull.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(0)
o.OSDNearFull.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(0)
o.OSDBackfillFull.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(0)
for _, state := range dumpInfo.State {
switch state {
case "full":
o.OSDFull.WithLabelValues(osdName).Set(1)
o.OSDFull.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(1)
case "nearfull":
o.OSDNearFull.WithLabelValues(osdName).Set(1)
o.OSDNearFull.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(1)
case "backfillfull":
o.OSDBackfillFull.WithLabelValues(osdName).Set(1)
o.OSDBackfillFull.WithLabelValues(osdName, lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Set(1)
}
}
}
@@ -752,11 +955,16 @@ func (o *OSDCollector) collectOSDScrubState(ch chan<- prometheus.Metric) error {
}

for i, v := range o.osdScrubCache {
lb := o.getOSDLabelFromID(int64(i))
ch <- prometheus.MustNewConstMetric(
o.ScrubbingStateDesc,
prometheus.GaugeValue,
float64(v),
fmt.Sprintf(osdLabelFormat, i))
fmt.Sprintf(osdLabelFormat, i),
lb.DeviceClass,
lb.Host,
lb.Root,
lb.Root)
}

return nil
@@ -764,21 +972,54 @@ func (o *OSDCollector) collectOSDScrubState(ch chan<- prometheus.Metric) error {

func (o *OSDCollector) collectPGRecoveryState(ch chan<- prometheus.Metric) error {
for _, pg := range o.pgDumpBrief.PGStats {
if o.initialCollect || strings.Contains(pg.State, "recovering") {

// We need previous PG state in order to update the metric when a PG has
// completed recovery or backfill. Or it could be an empty string if
// unknown.
prevPGState, prevPGStateFound := o.pgStateCache[pg.PGID]
prevNumObjectsRecovered := o.pgObjectsRecoveredCache[pg.PGID]
prevBackfillTargets := o.pgBackfillTargetsCache[pg.PGID]

if !prevPGStateFound || strings.Contains(prevPGState, "recovering") || strings.Contains(pg.State, "recovering") ||
strings.Contains(prevPGState, "backfilling") || strings.Contains(pg.State, "backfilling") {
query, err := o.performPGQuery(pg.PGID)
if err != nil {
continue
}

o.pgStateCache[pg.PGID] = pg.State
o.pgObjectsRecoveredCache[pg.PGID] = query.Info.Stats.StatSum.NumObjectsRecovered
o.pgBackfillTargetsCache[pg.PGID] = query.backfillTargets()

// There is no previous backfill_targets, and we have just cached
// it. Wait for the next time so that we can know the increased
// number of objects backfilled for this entire PG and compute the
// average increased number of objects backfilled for each OSD in
// the backfill_targets.
if prevBackfillTargets == nil || len(prevBackfillTargets) == 0 {
continue
}

// Average out the total number of objects backfilled to each OSD
// The average number rounds to the nearest integer, rounding half
// away from zero.
eachOSDIncrease := math.Round(float64(o.pgObjectsRecoveredCache[pg.PGID]-prevNumObjectsRecovered) / float64(len(prevBackfillTargets)))

for osdID := range prevBackfillTargets {
lb := o.getOSDLabelFromID(osdID)
// It is possible that osdID has gone from the backfill_targets
// this time if backfill has completed on it. In this case we
// still count the increase to this OSD.
o.OSDObjectsBackfilled.WithLabelValues(pg.PGID, fmt.Sprintf(osdLabelFormat, osdID), lb.DeviceClass, lb.Host, lb.Rack, lb.Root).Add(eachOSDIncrease)
}
}
}

for pgid, val := range o.pgObjectsRecoveredCache {
for pgid, v := range o.pgObjectsRecoveredCache {
ch <- prometheus.MustNewConstMetric(
o.PGObjectsRecoveredDesc,
prometheus.GaugeValue,
float64(val),
float64(v),
pgid)
}
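The averaging step above spreads a PG's newly recovered objects evenly over the OSDs cached in its previous backfill_targets. A standalone sketch with made-up numbers to show the arithmetic:

package main

import (
	"fmt"
	"math"
)

func main() {
	// Example values only: the PG recovered 301 more objects since the last
	// scrape and previously had 2 backfill target OSDs, so each OSD's
	// counter is increased by round(301 / 2) = 151 (half rounds away from zero).
	prev, cur := int64(1000), int64(1301)
	targets := 2
	eachOSDIncrease := math.Round(float64(cur-prev) / float64(targets))
	fmt.Println(eachOSDIncrease) // 151
}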
@@ -819,11 +1060,15 @@ func (o *OSDCollector) cephOSDPerfCommand() []byte {
}

func (o *OSDCollector) cephOSDTreeCommand(states ...string) []byte {
cmd, err := json.Marshal(map[string]interface{}{
req := map[string]interface{}{
"prefix": "osd tree",
"states": states,
"format": jsonFormat,
})
}
if len(states) > 0 {
req["states"] = states
}

cmd, err := json.Marshal(req)
if err != nil {
panic(err)
}
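With this change the mon command only carries a "states" field when at least one state is requested. A rough sketch of the two request bodies, assuming jsonFormat is "json":

package main

import (
	"encoding/json"
	"fmt"
)

func main() {
	// cephOSDTreeCommand() with no states, e.g. when building the label cache.
	withoutStates, _ := json.Marshal(map[string]interface{}{
		"prefix": "osd tree",
		"format": "json",
	})
	// cephOSDTreeCommand("down"), as used by collectOSDTreeDown.
	withStates, _ := json.Marshal(map[string]interface{}{
		"prefix": "osd tree",
		"format": "json",
		"states": []string{"down"},
	})
	fmt.Println(string(withoutStates))
	fmt.Println(string(withStates))
}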
@@ -883,6 +1128,7 @@ func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) {
o.ApplyLatency.Reset()
o.OSDIn.Reset()
o.OSDUp.Reset()
o.buildOSDLabelCache()

if err := o.collectOSDPerf(); err != nil {
log.Println("failed collecting OSD perf metrics:", err)

@@ -900,10 +1146,6 @@ func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) {
log.Println("failed collecting OSD tree down metrics:", err)
}

for _, metric := range o.collectorList() {
metric.Collect(ch)
}

if err := o.performPGDumpBrief(); err != nil {
log.Println("failed performing PG dump brief:", err)
}

@@ -916,7 +1158,7 @@ func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) {
log.Println("failed collecting PG recovery metrics:", err)
}

if o.initialCollect {
o.initialCollect = false
for _, metric := range o.collectorList() {
metric.Collect(ch)
}
}
File diff suppressed because it is too large