Use ConstMetrics for ceph_crash_reports
Makes the code simpler since we're not tracking state anymore. Also rewrote the tests to be more in-line with the rest.
This commit is contained in:
parent
74c89af225
commit
adf792c3e8
|
@ -18,6 +18,7 @@ import (
|
||||||
"bufio"
|
"bufio"
|
||||||
"bytes"
|
"bytes"
|
||||||
"encoding/json"
|
"encoding/json"
|
||||||
|
"fmt"
|
||||||
"regexp"
|
"regexp"
|
||||||
|
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
|
@ -39,11 +40,7 @@ type CrashesCollector struct {
|
||||||
logger *logrus.Logger
|
logger *logrus.Logger
|
||||||
version *Version
|
version *Version
|
||||||
|
|
||||||
// We keep track of which daemons we've seen so that their error count
|
crashReportsDesc *prometheus.Desc
|
||||||
// can be reset to zero if the errors get purged.
|
|
||||||
knownEntities map[string]bool
|
|
||||||
|
|
||||||
CrashReports prometheus.GaugeVec
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// NewCrashesCollector creates a new CrashesCollector instance
|
// NewCrashesCollector creates a new CrashesCollector instance
|
||||||
|
@ -56,16 +53,11 @@ func NewCrashesCollector(exporter *Exporter) *CrashesCollector {
|
||||||
logger: exporter.Logger,
|
logger: exporter.Logger,
|
||||||
version: exporter.Version,
|
version: exporter.Version,
|
||||||
|
|
||||||
knownEntities: map[string]bool{},
|
crashReportsDesc: prometheus.NewDesc(
|
||||||
|
fmt.Sprintf("%s_crash_reports", cephNamespace),
|
||||||
CrashReports: *prometheus.NewGaugeVec(
|
"Count of crashes reports per daemon, according to `ceph crash ls`",
|
||||||
prometheus.GaugeOpts{
|
|
||||||
Namespace: cephNamespace,
|
|
||||||
Name: "crash_reports",
|
|
||||||
Help: "Count of crashes reports per daemon, according to `ceph crash ls`",
|
|
||||||
ConstLabels: labels,
|
|
||||||
},
|
|
||||||
[]string{"daemon", "status"},
|
[]string{"daemon", "status"},
|
||||||
|
labels,
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -78,8 +70,8 @@ type crashEntry struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
// getCrashLs runs the 'crash ls' command and parses its results
|
// getCrashLs runs the 'crash ls' command and parses its results
|
||||||
func (c *CrashesCollector) getCrashLs() ([]crashEntry, error) {
|
func (c *CrashesCollector) getCrashLs() (map[crashEntry]int, error) {
|
||||||
crashes := make([]crashEntry, 0)
|
crashes := make(map[crashEntry]int)
|
||||||
|
|
||||||
// We parse the plain format because it is quite compact.
|
// We parse the plain format because it is quite compact.
|
||||||
// The JSON output of this command is very verbose and might be too slow
|
// The JSON output of this command is very verbose and might be too slow
|
||||||
|
@ -101,39 +93,19 @@ func (c *CrashesCollector) getCrashLs() ([]crashEntry, error) {
|
||||||
for scanner.Scan() {
|
for scanner.Scan() {
|
||||||
matched := crashLsLineRegex.FindStringSubmatch(scanner.Text())
|
matched := crashLsLineRegex.FindStringSubmatch(scanner.Text())
|
||||||
if len(matched) == 3 {
|
if len(matched) == 3 {
|
||||||
crashes = append(crashes, crashEntry{matched[1], matched[2] == "*"})
|
crashes[crashEntry{matched[1], matched[2] == "*"}]++
|
||||||
} else if len(matched) == 2 {
|
} else if len(matched) == 2 {
|
||||||
// Just in case the line-end spaces were stripped
|
// Just in case the line-end spaces were stripped
|
||||||
crashes = append(crashes, crashEntry{matched[1], false})
|
crashes[crashEntry{matched[1], false}]++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return crashes, nil
|
return crashes, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// processCrashLs takes the parsed results from getCrashLs and counts them
|
|
||||||
// in a map. It also keeps track of which daemons we've see in the past, and
|
|
||||||
// initializes all counts to zero where needed.
|
|
||||||
func (c *CrashesCollector) processCrashLs(crashes []crashEntry) map[crashEntry]int {
|
|
||||||
crashMap := make(map[crashEntry]int)
|
|
||||||
|
|
||||||
for _, crash := range crashes {
|
|
||||||
c.knownEntities[crash.entity] = true
|
|
||||||
}
|
|
||||||
for entity := range c.knownEntities {
|
|
||||||
crashMap[crashEntry{entity, true}] = 0
|
|
||||||
crashMap[crashEntry{entity, false}] = 0
|
|
||||||
}
|
|
||||||
for _, crash := range crashes {
|
|
||||||
crashMap[crash]++
|
|
||||||
}
|
|
||||||
|
|
||||||
return crashMap
|
|
||||||
}
|
|
||||||
|
|
||||||
// Describe provides the metrics descriptions to Prometheus
|
// Describe provides the metrics descriptions to Prometheus
|
||||||
func (c *CrashesCollector) Describe(ch chan<- *prometheus.Desc) {
|
func (c *CrashesCollector) Describe(ch chan<- *prometheus.Desc) {
|
||||||
c.CrashReports.Describe(ch)
|
ch <- c.crashReportsDesc
|
||||||
}
|
}
|
||||||
|
|
||||||
// Collect sends all the collected metrics Prometheus.
|
// Collect sends all the collected metrics Prometheus.
|
||||||
|
@ -142,11 +114,14 @@ func (c *CrashesCollector) Collect(ch chan<- prometheus.Metric) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
c.logger.WithError(err).Error("failed to run 'ceph crash ls'")
|
c.logger.WithError(err).Error("failed to run 'ceph crash ls'")
|
||||||
}
|
}
|
||||||
crashMap := c.processCrashLs(crashes)
|
|
||||||
|
|
||||||
for crash, count := range crashMap {
|
for crash, count := range crashes {
|
||||||
c.CrashReports.WithLabelValues(crash.entity, statusNames[crash.isNew]).Set(float64(count))
|
ch <- prometheus.MustNewConstMetric(
|
||||||
|
c.crashReportsDesc,
|
||||||
|
prometheus.GaugeValue,
|
||||||
|
float64(count),
|
||||||
|
crash.entity,
|
||||||
|
statusNames[crash.isNew],
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
c.CrashReports.Collect(ch)
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,7 +18,6 @@ import (
|
||||||
"io/ioutil"
|
"io/ioutil"
|
||||||
"net/http"
|
"net/http"
|
||||||
"net/http/httptest"
|
"net/http/httptest"
|
||||||
"reflect"
|
|
||||||
"regexp"
|
"regexp"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
@ -31,129 +30,102 @@ import (
|
||||||
|
|
||||||
func TestCrashesCollector(t *testing.T) {
|
func TestCrashesCollector(t *testing.T) {
|
||||||
|
|
||||||
const outputCephCrashLs string = `
|
for _, tt := range []struct {
|
||||||
ID ENTITY NEW
|
name string
|
||||||
2022-01-01_18:57:51.184156Z_02d9b659-69d1-4dd6-8495-ee2345208568 client.admin
|
input string
|
||||||
2022-01-01_19:02:01.401852Z_9100163b-4cd1-479f-b3a8-0dc2d288eaea mgr.mgr-node-01
|
reMatch []*regexp.Regexp
|
||||||
2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 client.admin *
|
}{
|
||||||
2022-02-03_04:03:38.371403Z_bd756324-27c0-494e-adfb-9f5f6e3db000 osd.3 *
|
{
|
||||||
2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a osd.3 *
|
name: "single new crash",
|
||||||
`
|
input: `
|
||||||
|
ID ENTITY NEW
|
||||||
t.Run(
|
2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 *
|
||||||
"full test",
|
`,
|
||||||
func(t *testing.T) {
|
reMatch: []*regexp.Regexp{
|
||||||
conn := &MockConn{}
|
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 1`),
|
||||||
conn.On("MonCommand", mock.Anything).Return(
|
},
|
||||||
[]byte(outputCephCrashLs), "", nil,
|
},
|
||||||
)
|
{
|
||||||
|
name: "single archived crash",
|
||||||
collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: Pacific})
|
input: `
|
||||||
err := prometheus.Register(collector)
|
ID ENTITY NEW
|
||||||
require.NoError(t, err)
|
2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0
|
||||||
defer prometheus.Unregister(collector)
|
`,
|
||||||
|
reMatch: []*regexp.Regexp{
|
||||||
server := httptest.NewServer(promhttp.Handler())
|
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="archived"} 1`),
|
||||||
defer server.Close()
|
},
|
||||||
|
},
|
||||||
resp, err := http.Get(server.URL)
|
{
|
||||||
require.NoError(t, err)
|
name: "two new crashes same entity",
|
||||||
defer resp.Body.Close()
|
input: `
|
||||||
|
ID ENTITY NEW
|
||||||
buf, err := ioutil.ReadAll(resp.Body)
|
2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 *
|
||||||
require.NoError(t, err)
|
2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a osd.0 *
|
||||||
|
`,
|
||||||
reMatches := []*regexp.Regexp{
|
reMatch: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 2`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "mix of crashes same entity",
|
||||||
|
input: `
|
||||||
|
ID ENTITY NEW
|
||||||
|
2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0
|
||||||
|
2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a osd.0 *
|
||||||
|
`,
|
||||||
|
reMatch: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 1`),
|
||||||
|
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="archived"} 1`),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
name: "mix of crashes different entities",
|
||||||
|
input: `
|
||||||
|
ID ENTITY NEW
|
||||||
|
2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 mgr.mgr-node-01 *
|
||||||
|
2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a client.admin *
|
||||||
|
`,
|
||||||
|
reMatch: []*regexp.Regexp{
|
||||||
|
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="new"} 1`),
|
||||||
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="client.admin",status="new"} 1`),
|
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="client.admin",status="new"} 1`),
|
||||||
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="client.admin",status="archived"} 1`),
|
},
|
||||||
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="new"} 0`),
|
},
|
||||||
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="archived"} 1`),
|
{
|
||||||
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.3",status="new"} 2`),
|
// At least code shouldn't panic
|
||||||
regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.3",status="archived"} 0`),
|
name: "no crashes",
|
||||||
}
|
input: ``,
|
||||||
|
reMatch: []*regexp.Regexp{},
|
||||||
|
},
|
||||||
|
} {
|
||||||
|
t.Run(
|
||||||
|
tt.name,
|
||||||
|
func(t *testing.T) {
|
||||||
|
conn := &MockConn{}
|
||||||
|
conn.On("MonCommand", mock.Anything).Return(
|
||||||
|
[]byte(tt.input), "", nil,
|
||||||
|
)
|
||||||
|
|
||||||
// t.Log(string(buf))
|
collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: Pacific})
|
||||||
for _, re := range reMatches {
|
err := prometheus.Register(collector)
|
||||||
if !re.Match(buf) {
|
require.NoError(t, err)
|
||||||
t.Errorf("expected %s to match\n", re.String())
|
defer prometheus.Unregister(collector)
|
||||||
|
|
||||||
|
server := httptest.NewServer(promhttp.Handler())
|
||||||
|
defer server.Close()
|
||||||
|
|
||||||
|
resp, err := http.Get(server.URL)
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer resp.Body.Close()
|
||||||
|
|
||||||
|
buf, err := ioutil.ReadAll(resp.Body)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
for _, re := range tt.reMatch {
|
||||||
|
if !re.Match(buf) {
|
||||||
|
t.Errorf("expected %s to match\n", re.String())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
},
|
||||||
},
|
)
|
||||||
)
|
}
|
||||||
|
|
||||||
t.Run(
|
|
||||||
"getCrashLs unit test",
|
|
||||||
func(t *testing.T) {
|
|
||||||
conn := &MockConn{}
|
|
||||||
conn.On("MonCommand", mock.Anything).Return(
|
|
||||||
[]byte(outputCephCrashLs), "", nil,
|
|
||||||
)
|
|
||||||
|
|
||||||
log := logrus.New()
|
|
||||||
log.Level = logrus.DebugLevel
|
|
||||||
collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: log, Version: Pacific})
|
|
||||||
|
|
||||||
expected := []crashEntry{
|
|
||||||
{"client.admin", false},
|
|
||||||
{"mgr.mgr-node-01", false},
|
|
||||||
{"client.admin", true},
|
|
||||||
{"osd.3", true},
|
|
||||||
{"osd.3", true},
|
|
||||||
}
|
|
||||||
crashes, _ := collector.getCrashLs()
|
|
||||||
|
|
||||||
if !reflect.DeepEqual(crashes, expected) {
|
|
||||||
t.Errorf("incorrect getCrashLs result: expected %v, got %v\n", expected, crashes)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
t.Run(
|
|
||||||
"getCrashLs empty crash list unit test",
|
|
||||||
func(t *testing.T) {
|
|
||||||
conn := &MockConn{}
|
|
||||||
conn.On("MonCommand", mock.Anything).Return(
|
|
||||||
[]byte(""), "", nil,
|
|
||||||
)
|
|
||||||
|
|
||||||
collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: Pacific})
|
|
||||||
|
|
||||||
crashes, _ := collector.getCrashLs()
|
|
||||||
if len(crashes) != 0 {
|
|
||||||
t.Errorf("expected empty result from getCrashLs, got %v\n", crashes)
|
|
||||||
}
|
|
||||||
},
|
|
||||||
)
|
|
||||||
|
|
||||||
t.Run(
|
|
||||||
"processCrashLs test",
|
|
||||||
func(t *testing.T) {
|
|
||||||
collector := NewCrashesCollector(&Exporter{Conn: nil, Cluster: "ceph", Logger: logrus.New(), Version: Pacific})
|
|
||||||
|
|
||||||
newCrash := crashEntry{"daemon", true}
|
|
||||||
archivedCrash := crashEntry{"daemon", false}
|
|
||||||
|
|
||||||
// New crash
|
|
||||||
crashMap := collector.processCrashLs([]crashEntry{newCrash})
|
|
||||||
expected := map[crashEntry]int{newCrash: 1, archivedCrash: 0}
|
|
||||||
if !reflect.DeepEqual(crashMap, expected) {
|
|
||||||
t.Errorf("incorrect processCrashLs result: expected %v, got %v\n", expected, crashMap)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Archived crash
|
|
||||||
crashMap = collector.processCrashLs([]crashEntry{archivedCrash})
|
|
||||||
expected = map[crashEntry]int{newCrash: 0, archivedCrash: 1}
|
|
||||||
if !reflect.DeepEqual(crashMap, expected) {
|
|
||||||
t.Errorf("incorrect processCrashLs result: expected %v, got %v\n", expected, crashMap)
|
|
||||||
}
|
|
||||||
|
|
||||||
// Crash was memorized, check that we reset count to zero
|
|
||||||
crashMap = collector.processCrashLs([]crashEntry{})
|
|
||||||
expected = map[crashEntry]int{newCrash: 0, archivedCrash: 0}
|
|
||||||
if !reflect.DeepEqual(crashMap, expected) {
|
|
||||||
t.Errorf("incorrect processCrashLs result: expected %v, got %v\n", expected, crashMap)
|
|
||||||
}
|
|
||||||
|
|
||||||
},
|
|
||||||
)
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue