Add metric ceph_pg_objects_recovered_total

This commit is contained in:
Yue Zhu 2019-09-23 17:51:50 -04:00
parent 6b4bbc8724
commit cc479e019a
3 changed files with 563 additions and 378 deletions

View File

@ -14,7 +14,12 @@
package collectors
import "github.com/ceph/go-ceph/rados"
import (
"encoding/json"
"fmt"
"github.com/ceph/go-ceph/rados"
)
// Conn interface implements only necessary methods that are used
// in this repository of *rados.Conn. This keeps rest of the implementation
@ -36,16 +41,27 @@ var _ Conn = &rados.Conn{}
// each individual collectors becomes a lot easier after that.
type NoopConn struct {
// output is the fixed reply; MonCommand and PGCommand return it for
// commands they do not intercept.
output string
// cmdOut maps a command line (for example "ceph osd df") to the mocked
// reply used when MonCommand or PGCommand intercepts that command.
cmdOut map[string]string
}
// The stub we use for testing should also satisfy the interface properties.
var _ Conn = &NoopConn{}
// NewNoopConn returns an instance of *NoopConn. The string that we want
// outputted at the end of the command we issue to ceph, should be
// specified in the only input parameter.
// NewNoopConn returns an instance of *NoopConn. The string that we want output
// at the end of the command we issue to Ceph is fixed and should be specified
// in the only input parameter.
func NewNoopConn(output string) *NoopConn {
return &NoopConn{output}
return &NoopConn{
output: output,
cmdOut: make(map[string]string),
}
}
// NewNoopConnWithCmdOut returns an instance of *NoopConn whose replies differ
// per command. The only input parameter maps a command line (for example
// "ceph osd df") to the output that should be returned for it. The map is
// stored as is, not copied, and the fixed output string is left empty.
func NewNoopConnWithCmdOut(cmdOut map[string]string) *NoopConn {
	conn := new(NoopConn)
	conn.cmdOut = cmdOut
	return conn
}
// ReadDefaultConfigFile does not need to return an error. It satisfies
@ -65,12 +81,66 @@ func (n *NoopConn) Shutdown() {}
// MonCommand returns the provided output string to NoopConn as is, making
// it seem like it actually ran something and produced that string as a result.
func (n *NoopConn) MonCommand(_ []byte) ([]byte, string, error) {
// MonCommand mocks a monitor command. args is the JSON-encoded command; its
// "prefix" (plus "dumpcontents" / "states" where relevant) selects a key in
// cmdOut:
//
//	pg dump  + dumpcontents[0] == "pgs_brief" -> "ceph pg dump pgs_brief"
//	osd tree + states[0] == "down"            -> "ceph osd tree down"
//	osd df / osd perf / osd dump              -> "ceph osd df" / "ceph osd perf" / "ceph osd dump"
//
// When the selected key is present in cmdOut its value is returned. In every
// other case (unrecognised command, or no mocked output registered for it) the
// fixed output string is returned, so a conn built with NewNoopConn keeps
// answering every command with that string. If args is not valid JSON the
// fixed output is returned together with the decoding error. The status string
// is always empty.
func (n *NoopConn) MonCommand(args []byte) ([]byte, string, error) {
	// Decode the command so that the prefix and its qualifiers can be inspected.
	cmd := map[string]interface{}{}
	if err := json.Unmarshal(args, &cmd); err != nil {
		return []byte(n.output), "", err
	}

	// firstArg yields the first element of a JSON array argument, or nil when
	// the argument is absent, is not an array, or is empty.
	firstArg := func(name string) interface{} {
		list, ok := cmd[name].([]interface{})
		if !ok || len(list) == 0 {
			return nil
		}
		return list[0]
	}

	// Work out which mocked command, if any, this request corresponds to.
	var key string
	switch cmd["prefix"] {
	case "pg dump":
		if firstArg("dumpcontents") == "pgs_brief" {
			key = "ceph pg dump pgs_brief"
		}
	case "osd tree":
		if firstArg("states") == "down" {
			key = "ceph osd tree down"
		}
	case "osd df":
		key = "ceph osd df"
	case "osd perf":
		key = "ceph osd perf"
	case "osd dump":
		key = "ceph osd dump"
	}

	// Only use the mocked output when one was actually registered; otherwise
	// fall back to the fixed output instead of returning an empty reply.
	if key != "" {
		if out, ok := n.cmdOut[key]; ok {
			return []byte(out), "", nil
		}
	}
	return []byte(n.output), "", nil
}
// PGCommand returns the provided output string to NoopConn as is, making
// it seem like it actually ran something and produced that string as a result.
func (n *NoopConn) PGCommand(_, _ []byte) ([]byte, string, error) {
// PGCommand mocks a command sent to a single placement group. pgid is the
// target PG and args is the JSON-encoded command. For the "query" prefix the
// reply is looked up in cmdOut under "ceph tell <pgid> query". When that key is
// present its value is returned. In every other case (a different prefix, or
// no mocked output registered for this PG) the fixed output string is returned.
// If args is not valid JSON the fixed output is returned together with the
// decoding error. The status string is always empty.
func (n *NoopConn) PGCommand(pgid, args []byte) ([]byte, string, error) {
	// Decode the command so that the prefix can be inspected.
	cmd := map[string]interface{}{}
	if err := json.Unmarshal(args, &cmd); err != nil {
		return []byte(n.output), "", err
	}

	if cmd["prefix"] == "query" {
		key := fmt.Sprintf("ceph tell %s query", string(pgid))
		// Only use the mocked output when one was registered for this PG;
		// otherwise fall back to the fixed output below.
		if out, ok := n.cmdOut[key]; ok {
			return []byte(out), "", nil
		}
	}
	return []byte(n.output), "", nil
}

View File

@ -31,7 +31,6 @@ type cephPGDumpBrief []struct {
type cephPGQuery struct {
State string `json:"state"`
Info struct {
PGID string `json:"pgid"`
Stats struct {
StatSum struct {
NumObjectsRecovered int64 `json:"num_objects_recovered"`
@ -128,6 +127,9 @@ type OSDCollector struct {
// ScrubbingStateDesc depicts if an osd is being scrubbed
// labelled by OSD
ScrubbingStateDesc *prometheus.Desc
// PGObjectsRecoveredDesc displays total number of objects recovered in a PG
PGObjectsRecoveredDesc *prometheus.Desc
}
// NewOSDCollector creates an instance of the OSDCollector and instantiates
@ -348,6 +350,13 @@ func NewOSDCollector(conn Conn, cluster string) *OSDCollector {
[]string{"osd"},
labels,
),
PGObjectsRecoveredDesc: prometheus.NewDesc(
fmt.Sprintf("%s_pg_objects_recovered_total", cephNamespace),
"Number of objects recovered in a PG",
[]string{"pgid"},
labels,
),
}
}
@ -729,6 +738,25 @@ func (o *OSDCollector) collectOSDScrubState(ch chan<- prometheus.Metric) error {
return nil
}
// collectPGRecoveryState walks the PGs held in o.pgDumpBrief and, for every PG
// whose state contains "recovering", queries that PG and sends one counter
// sample on ch using PGObjectsRecoveredDesc: the value is the PG's
// num_objects_recovered and the label is the PG ID. A PG whose query fails is
// skipped without aborting the loop. The returned error is always nil.
func (o *OSDCollector) collectPGRecoveryState(ch chan<- prometheus.Metric) error {
	for i := range o.pgDumpBrief {
		entry := o.pgDumpBrief[i]

		// Only PGs that are currently recovering are worth querying.
		if !strings.Contains(entry.State, "recovering") {
			continue
		}

		result, queryErr := o.performPGQuery(entry.PGID)
		if queryErr != nil {
			// Best effort: skip this PG and carry on with the rest.
			continue
		}

		recovered := float64(result.Info.Stats.StatSum.NumObjectsRecovered)
		ch <- prometheus.MustNewConstMetric(
			o.PGObjectsRecoveredDesc,
			prometheus.CounterValue,
			recovered,
			entry.PGID,
		)
	}

	return nil
}
func (o *OSDCollector) cephOSDDump() []byte {
cmd, err := json.Marshal(map[string]interface{}{
"prefix": "osd dump",
@ -806,7 +834,9 @@ func (o *OSDCollector) Describe(ch chan<- *prometheus.Desc) {
for _, metric := range o.collectorList() {
metric.Describe(ch)
}
ch <- o.OSDDownDesc
ch <- o.ScrubbingStateDesc
ch <- o.PGObjectsRecoveredDesc
}
// Collect sends all the collected metrics to the provided prometheus channel.
@ -829,19 +859,19 @@ func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) {
o.OSDUp.Reset()
if err := o.collectOSDPerf(); err != nil {
log.Println("failed collecting osd perf stats:", err)
log.Println("failed collecting OSD perf stats:", err)
}
if err := o.collectOSDDump(); err != nil {
log.Println("failed collecting osd dump:", err)
log.Println("failed collecting OSD dump:", err)
}
if err := o.collectOSDDF(); err != nil {
log.Println("failed collecting osd metrics:", err)
log.Println("failed collecting OSD metrics:", err)
}
if err := o.collectOSDTreeDown(ch); err != nil {
log.Println("failed collecting osd metrics:", err)
log.Println("failed collecting OSD metrics:", err)
}
for _, metric := range o.collectorList() {
@ -849,10 +879,14 @@ func (o *OSDCollector) Collect(ch chan<- prometheus.Metric) {
}
if err := o.performPGDumpBrief(); err != nil {
log.Println("failed performing pg dump brief:", err)
log.Println("failed performing PG dump brief:", err)
}
if err := o.collectOSDScrubState(ch); err != nil {
log.Println("failed collecting osd scrub state:", err)
log.Println("failed collecting OSD scrub state:", err)
}
if err := o.collectPGRecoveryState(ch); err != nil {
log.Println("failed collecting PG recovery state:", err)
}
}

View File

@ -12,100 +12,102 @@ import (
func TestOSDCollector(t *testing.T) {
for _, tt := range []struct {
input string
cmdOut map[string]string
regexes []*regexp.Regexp
}{
{
input: `
cmdOut: map[string]string{
"ceph osd df": `
{
"nodes": [
{
"id": 0,
"name": "osd.0",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 40772,
"kb_avail": 11109544,
"utilization": 0.365658,
"var": 1.053676,
"pgs": 283
},
{
"id": 2,
"name": "osd.2",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 36712,
"kb_avail": 11113604,
"utilization": 0.329246,
"var": 0.948753,
"pgs": 162
},
{
"id": 1,
"name": "osd.1",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 40512,
"kb_avail": 11109804,
"utilization": 0.363326,
"var": 1.046957,
"pgs": 279
},
{
"id": 3,
"name": "osd.3",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 36784,
"kb_avail": 11113532,
"utilization": 0.329892,
"var": 0.950614,
"pgs": 164
},
{
"id": 4,
"name": "osd.4",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 0,
"kb": 0,
"kb_used": 0,
"kb_avail": 0,
"utilization": -nan,
"var": -nan,
"pgs": 0
}
],
"stray": [],
"summary": {
"total_kb": 44601264,
"total_kb_used": 154780,
"total_kb_avail": 44446484,
"average_utilization": 0.347031,
"min_var": 0.948753,
"max_var": 1.053676,
"dev": 0.017482
"nodes": [
{
"id": 0,
"name": "osd.0",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 40772,
"kb_avail": 11109544,
"utilization": 0.365658,
"var": 1.053676,
"pgs": 283
},
{
"id": 2,
"name": "osd.2",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 36712,
"kb_avail": 11113604,
"utilization": 0.329246,
"var": 0.948753,
"pgs": 162
},
{
"id": 1,
"name": "osd.1",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 40512,
"kb_avail": 11109804,
"utilization": 0.363326,
"var": 1.046957,
"pgs": 279
},
{
"id": 3,
"name": "osd.3",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 1.000000,
"kb": 11150316,
"kb_used": 36784,
"kb_avail": 11113532,
"utilization": 0.329892,
"var": 0.950614,
"pgs": 164
},
{
"id": 4,
"name": "osd.4",
"type": "osd",
"type_id": 0,
"crush_weight": 0.010391,
"depth": 2,
"reweight": 0,
"kb": 0,
"kb_used": 0,
"kb_avail": 0,
"utilization": -nan,
"var": -nan,
"pgs": 0
}
],
"stray": [],
"summary": {
"total_kb": 44601264,
"total_kb_used": 154780,
"total_kb_avail": 44446484,
"average_utilization": 0.347031,
"min_var": 0.948753,
"max_var": 1.053676,
"dev": 0.017482
}
}`,
},
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_crush_weight{cluster="ceph",osd="osd.0"} 0.010391`),
regexp.MustCompile(`ceph_osd_crush_weight{cluster="ceph",osd="osd.1"} 0.010391`),
@ -159,46 +161,48 @@ func TestOSDCollector(t *testing.T) {
},
},
{
input: `
cmdOut: map[string]string{
"ceph osd perf": `
{
"osd_perf_infos": [
{
"id": 4,
"perf_stats": {
"commit_latency_ms": 0,
"apply_latency_ms": 0
}
},
{
"id": 3,
"perf_stats": {
"commit_latency_ms": 1,
"apply_latency_ms": 64
}
},
{
"id": 2,
"perf_stats": {
"commit_latency_ms": 2,
"apply_latency_ms": 79
}
},
{
"id": 1,
"perf_stats": {
"commit_latency_ms": 2,
"apply_latency_ms": 39
}
},
{
"id": 0,
"perf_stats": {
"commit_latency_ms": 2,
"apply_latency_ms": 31
}
}
]
"osd_perf_infos": [
{
"id": 4,
"perf_stats": {
"commit_latency_ms": 0,
"apply_latency_ms": 0
}
},
{
"id": 3,
"perf_stats": {
"commit_latency_ms": 1,
"apply_latency_ms": 64
}
},
{
"id": 2,
"perf_stats": {
"commit_latency_ms": 2,
"apply_latency_ms": 79
}
},
{
"id": 1,
"perf_stats": {
"commit_latency_ms": 2,
"apply_latency_ms": 39
}
},
{
"id": 0,
"perf_stats": {
"commit_latency_ms": 2,
"apply_latency_ms": 31
}
}
]
}`,
},
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_perf_commit_latency_seconds{cluster="ceph",osd="osd.0"} 0.002`),
regexp.MustCompile(`ceph_osd_perf_commit_latency_seconds{cluster="ceph",osd="osd.1"} 0.002`),
@ -213,58 +217,59 @@ func TestOSDCollector(t *testing.T) {
},
},
{
input: `
cmdOut: map[string]string{
"ceph osd dump": `
{
"osds": [
{
"osd": 0,
"uuid": "135b53c3",
"up": 1,
"in": 1
},
{
"osd": 1,
"uuid": "370a33f2",
"up": 1,
"in": 1
},
{
"osd": 2,
"uuid": "ca9ab3de",
"up": 1,
"in": 1,
"state": [
"nearfull",
"exists",
"up"
]
},
{
"osd": 3,
"uuid": "bef98b10",
"up": 1,
"in": 1,
"state": [
"full",
"backfillfull",
"exists",
"up"
]
},
{
"osd": 4,
"uuid": "5936c9e8",
"up": 0,
"in": 0,
"state": [
"backfillfull",
"exists",
"up"
]
}
]
}
`,
"osds": [
{
"osd": 0,
"uuid": "135b53c3",
"up": 1,
"in": 1
},
{
"osd": 1,
"uuid": "370a33f2",
"up": 1,
"in": 1
},
{
"osd": 2,
"uuid": "ca9ab3de",
"up": 1,
"in": 1,
"state": [
"nearfull",
"exists",
"up"
]
},
{
"osd": 3,
"uuid": "bef98b10",
"up": 1,
"in": 1,
"state": [
"full",
"backfillfull",
"exists",
"up"
]
},
{
"osd": 4,
"uuid": "5936c9e8",
"up": 0,
"in": 0,
"state": [
"backfillfull",
"exists",
"up"
]
}
]
}`,
},
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_in{cluster="ceph",osd="osd.0"} 1`),
regexp.MustCompile(`ceph_osd_in{cluster="ceph",osd="osd.1"} 1`),
@ -294,42 +299,44 @@ func TestOSDCollector(t *testing.T) {
},
},
{
input: `
cmdOut: map[string]string{
"ceph pg dump pgs_brief": `
[
{
"acting": [
1,
2,
3,
4
],
"acting_primary": 1,
"pgid": "81.1fff",
"state": "active+clean"
},
{
"acting": [
10,
11,
12,
13
],
"acting_primary": 10,
"pgid": "82.1fff",
"state": "active+clean+scrubbing"
},
{
"acting": [
20,
21,
22,
23
],
"acting_primary": 20,
"pgid": "83.1fff",
"state": "active+clean+scrubbing+deep"
}
{
"acting": [
1,
2,
3,
4
],
"acting_primary": 1,
"pgid": "81.1fff",
"state": "active+clean"
},
{
"acting": [
10,
11,
12,
13
],
"acting_primary": 10,
"pgid": "82.1fff",
"state": "active+clean+scrubbing"
},
{
"acting": [
20,
21,
22,
23
],
"acting_primary": 20,
"pgid": "83.1fff",
"state": "active+clean+scrubbing+deep"
}
]`,
},
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_scrub_state{cluster="ceph",osd="osd.10"} 1`),
regexp.MustCompile(`ceph_osd_scrub_state{cluster="ceph",osd="osd.11"} 1`),
@ -342,183 +349,258 @@ func TestOSDCollector(t *testing.T) {
},
},
{
input: `
{
"nodes": [],
"stray": [
{
"id": 524,
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 0.000000,
"depth": 0,
"exists": 1,
"status": "destroyed",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
]
}`,
cmdOut: map[string]string{
"ceph osd tree down": `
{
"nodes": [],
"stray": [
{
"id": 524,
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 0.000000,
"depth": 0,
"exists": 1,
"status": "destroyed",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
]
}`,
},
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.524",status="destroyed"} 1`),
},
},
{
input: `
{
"nodes": [],
"stray": [
{
"id": 524,
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 0.000000,
"depth": 0,
"exists": 1,
"status": "down",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
]
}`,
cmdOut: map[string]string{
"ceph osd tree down": `
{
"nodes": [],
"stray": [
{
"id": 524,
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 0.000000,
"depth": 0,
"exists": 1,
"status": "down",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
]
}`,
},
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.524",status="down"} 1`),
},
},
{
input: `
{
"nodes": [
{
"id": -18,
"name": "data",
"type": "root",
"type_id": 10,
"children": [
-20
]
},
{
"id": -20,
"name": "R1-data",
"type": "rack",
"type_id": 3,
"pool_weights": {},
"children": [
-8
]
},
{
"id": -8,
"name": "test-data03-object01",
"type": "host",
"type_id": 1,
"pool_weights": {},
"children": [
97
]
},
{
"id": 524,
"device_class": "hdd",
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 7.265991,
"depth": 3,
"pool_weights": {},
"exists": 1,
"status": "destroyed",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
],
"stray": []
}`,
cmdOut: map[string]string{
"ceph osd tree down": `
{
"nodes": [
{
"id": -18,
"name": "data",
"type": "root",
"type_id": 10,
"children": [
-20
]
},
{
"id": -20,
"name": "R1-data",
"type": "rack",
"type_id": 3,
"pool_weights": {},
"children": [
-8
]
},
{
"id": -8,
"name": "test-data03-object01",
"type": "host",
"type_id": 1,
"pool_weights": {},
"children": [
97
]
},
{
"id": 524,
"device_class": "hdd",
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 7.265991,
"depth": 3,
"pool_weights": {},
"exists": 1,
"status": "destroyed",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
],
"stray": []
}`,
},
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.524",status="destroyed"} 1`),
},
},
{
input: `
{
"nodes": [
{
"id": -18,
"name": "data",
"type": "root",
"type_id": 10,
"children": [
-20
]
},
{
"id": -20,
"name": "R1-data",
"type": "rack",
"type_id": 3,
"pool_weights": {},
"children": [
-8
]
},
{
"id": -8,
"name": "test-data03-object01",
"type": "host",
"type_id": 1,
"pool_weights": {},
"children": [
97
]
},
{
"id": 524,
"device_class": "hdd",
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 7.265991,
"depth": 3,
"pool_weights": {},
"exists": 1,
"status": "destroyed",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
],
"stray": [
{
"id": 525,
"name": "osd.525",
"type": "osd",
"type_id": 0,
"crush_weight": 0.000000,
"depth": 0,
"exists": 1,
"status": "down",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
]
}`,
cmdOut: map[string]string{
"ceph osd tree down": `
{
"nodes": [
{
"id": -18,
"name": "data",
"type": "root",
"type_id": 10,
"children": [
-20
]
},
{
"id": -20,
"name": "R1-data",
"type": "rack",
"type_id": 3,
"pool_weights": {},
"children": [
-8
]
},
{
"id": -8,
"name": "test-data03-object01",
"type": "host",
"type_id": 1,
"pool_weights": {},
"children": [
97
]
},
{
"id": 524,
"device_class": "hdd",
"name": "osd.524",
"type": "osd",
"type_id": 0,
"crush_weight": 7.265991,
"depth": 3,
"pool_weights": {},
"exists": 1,
"status": "destroyed",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
],
"stray": [
{
"id": 525,
"name": "osd.525",
"type": "osd",
"type_id": 0,
"crush_weight": 0.000000,
"depth": 0,
"exists": 1,
"status": "down",
"reweight": 0.000000,
"primary_affinity": 1.000000
}
]
}`,
},
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.524",status="destroyed"} 1`),
regexp.MustCompile(`ceph_osd_down{cluster="ceph",osd="osd.525",status="down"} 1`),
},
},
{
input: `
{
"nodes": []}}
}`,
cmdOut: map[string]string{
"ceph osd tree down": `
{
"nodes": []}}
}`,
},
regexes: []*regexp.Regexp{},
},
{
cmdOut: map[string]string{
"ceph pg dump pgs_brief": `
[
{
"acting": [
1,
2,
3,
4
],
"acting_primary": 1,
"pgid": "81.1fff",
"state": "active+clean"
},
{
"acting": [
10,
11,
12,
13
],
"acting_primary": 10,
"pgid": "82.1fff",
"state": "active+clean+scrubbing"
},
{
"acting": [
20,
21,
22,
23
],
"acting_primary": 20,
"pgid": "83.1fff",
"state": "active+clean+scrubbing+deep"
},
{
"acting": [
30,
31,
32,
33
],
"acting_primary": 30,
"pgid": "84.1fff",
"state": "active+recovering+degraded"
}
]`,
"ceph tell 84.1fff query": `
{
"state": "active+recovering+degraded",
"info": {
"stats": {
"stat_sum": {
"num_objects_recovered": 123
}
}
}
}`,
},
regexes: []*regexp.Regexp{
regexp.MustCompile(`ceph_pg_objects_recovered_total{cluster="ceph",pgid="84.1fff"} 123`),
},
},
} {
func() {
collector := NewOSDCollector(NewNoopConn(tt.input), "ceph")
collector := NewOSDCollector(NewNoopConnWithCmdOut(tt.cmdOut), "ceph")
if err := prometheus.Register(collector); err != nil {
t.Fatalf("collector failed to register: %s", err)
}
@ -537,7 +619,6 @@ func TestOSDCollector(t *testing.T) {
if err != nil {
t.Fatalf("failed reading server response: %s", err)
}
for _, re := range tt.regexes {
if !re.Match(buf) {
t.Errorf("failed matching: %q", re)