Use ConstMetrics for ceph_crash_reports

Makes the code simpler since we're not tracking state anymore.
Also rewrote the tests to be more in line with the rest of the suite.
Author: Xavier Villaneau, 2022-06-14 17:43:53 -04:00 (committed by Xavier Villaneau)
Parent: 74c89af225
Commit: adf792c3e8
2 changed files with 114 additions and 167 deletions
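In other words, the collector now follows the standard client_golang pattern of keeping only an immutable *prometheus.Desc and emitting const metrics from Collect, rather than mutating a registered GaugeVec between scrapes. The sketch below is a minimal, self-contained illustration of that pattern, not code from this repository; the exampleCollector type, the hard-coded counts, and the :9128 listen address are made up for this note.

package main

import (
	"log"
	"net/http"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

// exampleCollector holds only a metric description; no per-series state is
// kept between scrapes, so label combinations that disappear from the source
// data simply stop being exported.
type exampleCollector struct {
	crashReportsDesc *prometheus.Desc
}

func newExampleCollector() *exampleCollector {
	return &exampleCollector{
		crashReportsDesc: prometheus.NewDesc(
			"ceph_crash_reports",
			"Count of crash reports per daemon",
			[]string{"daemon", "status"},         // variable labels
			prometheus.Labels{"cluster": "ceph"}, // const labels
		),
	}
}

func (c *exampleCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- c.crashReportsDesc
}

func (c *exampleCollector) Collect(ch chan<- prometheus.Metric) {
	// In the real collector these counts come from `ceph crash ls`;
	// they are hard-coded here purely for illustration.
	counts := map[[2]string]int{
		{"osd.0", "new"}:      2,
		{"osd.0", "archived"}: 1,
	}
	for labels, n := range counts {
		ch <- prometheus.MustNewConstMetric(
			c.crashReportsDesc,
			prometheus.GaugeValue,
			float64(n),
			labels[0], // daemon
			labels[1], // status
		)
	}
}

func main() {
	prometheus.MustRegister(newExampleCollector())
	http.Handle("/metrics", promhttp.Handler())
	log.Fatal(http.ListenAndServe(":9128", nil)) // port chosen arbitrarily
}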

File 1 of 2: crashes collector

@@ -18,6 +18,7 @@ import (
 	"bufio"
 	"bytes"
 	"encoding/json"
+	"fmt"
 	"regexp"
 
 	"github.com/prometheus/client_golang/prometheus"
@@ -39,11 +40,7 @@ type CrashesCollector struct {
 	logger  *logrus.Logger
 	version *Version
 
-	// We keep track of which daemons we've seen so that their error count
-	// can be reset to zero if the errors get purged.
-	knownEntities map[string]bool
-
-	CrashReports prometheus.GaugeVec
+	crashReportsDesc *prometheus.Desc
 }
 
 // NewCrashesCollector creates a new CrashesCollector instance
@@ -56,16 +53,11 @@ func NewCrashesCollector(exporter *Exporter) *CrashesCollector {
 		logger:  exporter.Logger,
 		version: exporter.Version,
 
-		knownEntities: map[string]bool{},
-
-		CrashReports: *prometheus.NewGaugeVec(
-			prometheus.GaugeOpts{
-				Namespace:   cephNamespace,
-				Name:        "crash_reports",
-				Help:        "Count of crashes reports per daemon, according to `ceph crash ls`",
-				ConstLabels: labels,
-			},
+		crashReportsDesc: prometheus.NewDesc(
+			fmt.Sprintf("%s_crash_reports", cephNamespace),
+			"Count of crashes reports per daemon, according to `ceph crash ls`",
 			[]string{"daemon", "status"},
+			labels,
 		),
 	}
@@ -78,8 +70,8 @@ type crashEntry struct {
 }
 
 // getCrashLs runs the 'crash ls' command and parses its results
-func (c *CrashesCollector) getCrashLs() ([]crashEntry, error) {
-	crashes := make([]crashEntry, 0)
+func (c *CrashesCollector) getCrashLs() (map[crashEntry]int, error) {
+	crashes := make(map[crashEntry]int)
 
 	// We parse the plain format because it is quite compact.
 	// The JSON output of this command is very verbose and might be too slow
@@ -101,39 +93,19 @@ func (c *CrashesCollector) getCrashLs() ([]crashEntry, error) {
 	for scanner.Scan() {
 		matched := crashLsLineRegex.FindStringSubmatch(scanner.Text())
 		if len(matched) == 3 {
-			crashes = append(crashes, crashEntry{matched[1], matched[2] == "*"})
+			crashes[crashEntry{matched[1], matched[2] == "*"}]++
 		} else if len(matched) == 2 {
 			// Just in case the line-end spaces were stripped
-			crashes = append(crashes, crashEntry{matched[1], false})
+			crashes[crashEntry{matched[1], false}]++
 		}
 	}
 
 	return crashes, nil
 }
 
-// processCrashLs takes the parsed results from getCrashLs and counts them
-// in a map. It also keeps track of which daemons we've see in the past, and
-// initializes all counts to zero where needed.
-func (c *CrashesCollector) processCrashLs(crashes []crashEntry) map[crashEntry]int {
-	crashMap := make(map[crashEntry]int)
-
-	for _, crash := range crashes {
-		c.knownEntities[crash.entity] = true
-	}
-	for entity := range c.knownEntities {
-		crashMap[crashEntry{entity, true}] = 0
-		crashMap[crashEntry{entity, false}] = 0
-	}
-	for _, crash := range crashes {
-		crashMap[crash]++
-	}
-
-	return crashMap
-}
-
 // Describe provides the metrics descriptions to Prometheus
 func (c *CrashesCollector) Describe(ch chan<- *prometheus.Desc) {
-	c.CrashReports.Describe(ch)
+	ch <- c.crashReportsDesc
 }
 
 // Collect sends all the collected metrics Prometheus.
@@ -142,11 +114,14 @@ func (c *CrashesCollector) Collect(ch chan<- prometheus.Metric) {
 	if err != nil {
 		c.logger.WithError(err).Error("failed to run 'ceph crash ls'")
 	}
-	crashMap := c.processCrashLs(crashes)
 
-	for crash, count := range crashMap {
-		c.CrashReports.WithLabelValues(crash.entity, statusNames[crash.isNew]).Set(float64(count))
+	for crash, count := range crashes {
+		ch <- prometheus.MustNewConstMetric(
+			c.crashReportsDesc,
+			prometheus.GaugeValue,
+			float64(count),
+			crash.entity,
+			statusNames[crash.isNew],
+		)
 	}
-
-	c.CrashReports.Collect(ch)
 }
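The reworked getCrashLs in the file above tallies crashes per (entity, new/archived) pair directly while scanning the plain `ceph crash ls` listing. As a rough, self-contained illustration of that counting approach, here is a standalone sketch; the parseCrashLs helper, its regex, and the sample listing are approximations written for this note, not the exporter's actual definitions (the real crashLsLineRegex is defined elsewhere in the file).

package main

import (
	"bufio"
	"fmt"
	"regexp"
	"strings"
)

// crashEntry mirrors the key used by the collector: which daemon crashed and
// whether the report is still marked NEW (i.e. not archived).
type crashEntry struct {
	entity string
	isNew  bool
}

// Illustrative approximation of a `ceph crash ls` line matcher: capture the
// ENTITY column and an optional trailing "*" in the NEW column.
var crashLsLineRegex = regexp.MustCompile(`^\S+\s+(\S+)\s*(\*)?\s*$`)

// parseCrashLs tallies crash reports per (entity, new/archived) pair, the
// same shape of result the reworked getCrashLs returns.
func parseCrashLs(output string) map[crashEntry]int {
	crashes := make(map[crashEntry]int)
	scanner := bufio.NewScanner(strings.NewReader(output))
	for scanner.Scan() {
		line := scanner.Text()
		if strings.TrimSpace(line) == "" || strings.HasPrefix(line, "ID") {
			continue // skip blank lines and the header row
		}
		if m := crashLsLineRegex.FindStringSubmatch(line); m != nil {
			crashes[crashEntry{m[1], m[2] == "*"}]++
		}
	}
	return crashes
}

func main() {
	out := `
ID                                                                ENTITY  NEW
2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9  osd.0   *
2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a  osd.0
`
	for crash, n := range parseCrashLs(out) {
		fmt.Printf("daemon=%s new=%v count=%d\n", crash.entity, crash.isNew, n)
	}
}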

File 2 of 2: crashes collector tests

@@ -18,7 +18,6 @@ import (
 	"io/ioutil"
 	"net/http"
 	"net/http/httptest"
-	"reflect"
 	"regexp"
 	"testing"
 
@@ -31,129 +30,102 @@
 func TestCrashesCollector(t *testing.T) {
-	const outputCephCrashLs string = `
-ID                                                                ENTITY           NEW
-2022-01-01_18:57:51.184156Z_02d9b659-69d1-4dd6-8495-ee2345208568  client.admin
-2022-01-01_19:02:01.401852Z_9100163b-4cd1-479f-b3a8-0dc2d288eaea  mgr.mgr-node-01
-2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9  client.admin     *
-2022-02-03_04:03:38.371403Z_bd756324-27c0-494e-adfb-9f5f6e3db000  osd.3            *
-2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a  osd.3            *
-`
-
-	t.Run(
-		"full test",
-		func(t *testing.T) {
-			conn := &MockConn{}
-			conn.On("MonCommand", mock.Anything).Return(
-				[]byte(outputCephCrashLs), "", nil,
-			)
-
-			collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: Pacific})
-			err := prometheus.Register(collector)
-			require.NoError(t, err)
-			defer prometheus.Unregister(collector)
-
-			server := httptest.NewServer(promhttp.Handler())
-			defer server.Close()
-
-			resp, err := http.Get(server.URL)
-			require.NoError(t, err)
-			defer resp.Body.Close()
-
-			buf, err := ioutil.ReadAll(resp.Body)
-			require.NoError(t, err)
-
-			reMatches := []*regexp.Regexp{
-				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="client.admin",status="new"} 1`),
-				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="client.admin",status="archived"} 1`),
-				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="new"} 0`),
-				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="archived"} 1`),
-				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.3",status="new"} 2`),
-				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.3",status="archived"} 0`),
-			}
-
-			// t.Log(string(buf))
-			for _, re := range reMatches {
-				if !re.Match(buf) {
-					t.Errorf("expected %s to match\n", re.String())
-				}
-			}
-		},
-	)
-
-	t.Run(
-		"getCrashLs unit test",
-		func(t *testing.T) {
-			conn := &MockConn{}
-			conn.On("MonCommand", mock.Anything).Return(
-				[]byte(outputCephCrashLs), "", nil,
-			)
-			log := logrus.New()
-			log.Level = logrus.DebugLevel
-			collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: log, Version: Pacific})
-
-			expected := []crashEntry{
-				{"client.admin", false},
-				{"mgr.mgr-node-01", false},
-				{"client.admin", true},
-				{"osd.3", true},
-				{"osd.3", true},
-			}
-
-			crashes, _ := collector.getCrashLs()
-			if !reflect.DeepEqual(crashes, expected) {
-				t.Errorf("incorrect getCrashLs result: expected %v, got %v\n", expected, crashes)
-			}
-		},
-	)
-
-	t.Run(
-		"getCrashLs empty crash list unit test",
-		func(t *testing.T) {
-			conn := &MockConn{}
-			conn.On("MonCommand", mock.Anything).Return(
-				[]byte(""), "", nil,
-			)
-			collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: Pacific})
-
-			crashes, _ := collector.getCrashLs()
-			if len(crashes) != 0 {
-				t.Errorf("expected empty result from getCrashLs, got %v\n", crashes)
-			}
-		},
-	)
-
-	t.Run(
-		"processCrashLs test",
-		func(t *testing.T) {
-			collector := NewCrashesCollector(&Exporter{Conn: nil, Cluster: "ceph", Logger: logrus.New(), Version: Pacific})
-			newCrash := crashEntry{"daemon", true}
-			archivedCrash := crashEntry{"daemon", false}
-
-			// New crash
-			crashMap := collector.processCrashLs([]crashEntry{newCrash})
-			expected := map[crashEntry]int{newCrash: 1, archivedCrash: 0}
-			if !reflect.DeepEqual(crashMap, expected) {
-				t.Errorf("incorrect processCrashLs result: expected %v, got %v\n", expected, crashMap)
-			}
-
-			// Archived crash
-			crashMap = collector.processCrashLs([]crashEntry{archivedCrash})
-			expected = map[crashEntry]int{newCrash: 0, archivedCrash: 1}
-			if !reflect.DeepEqual(crashMap, expected) {
-				t.Errorf("incorrect processCrashLs result: expected %v, got %v\n", expected, crashMap)
-			}
-
-			// Crash was memorized, check that we reset count to zero
-			crashMap = collector.processCrashLs([]crashEntry{})
-			expected = map[crashEntry]int{newCrash: 0, archivedCrash: 0}
-			if !reflect.DeepEqual(crashMap, expected) {
-				t.Errorf("incorrect processCrashLs result: expected %v, got %v\n", expected, crashMap)
-			}
-		},
-	)
+	for _, tt := range []struct {
+		name    string
+		input   string
+		reMatch []*regexp.Regexp
+	}{
+		{
+			name: "single new crash",
+			input: `
+ID                                                                ENTITY  NEW
+2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9  osd.0   *
+`,
+			reMatch: []*regexp.Regexp{
+				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 1`),
+			},
+		},
+		{
+			name: "single archived crash",
+			input: `
+ID                                                                ENTITY  NEW
+2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9  osd.0
+`,
+			reMatch: []*regexp.Regexp{
+				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="archived"} 1`),
+			},
+		},
+		{
+			name: "two new crashes same entity",
+			input: `
+ID                                                                ENTITY  NEW
+2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9  osd.0   *
+2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a  osd.0   *
+`,
+			reMatch: []*regexp.Regexp{
+				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 2`),
+			},
+		},
+		{
+			name: "mix of crashes same entity",
+			input: `
+ID                                                                ENTITY  NEW
+2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9  osd.0
+2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a  osd.0   *
+`,
+			reMatch: []*regexp.Regexp{
+				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 1`),
+				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="archived"} 1`),
+			},
+		},
+		{
+			name: "mix of crashes different entities",
+			input: `
+ID                                                                ENTITY           NEW
+2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9  mgr.mgr-node-01  *
+2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a  client.admin     *
+`,
+			reMatch: []*regexp.Regexp{
+				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="new"} 1`),
+				regexp.MustCompile(`crash_reports{cluster="ceph",daemon="client.admin",status="new"} 1`),
+			},
+		},
+		{
+			// At least code shouldn't panic
+			name:    "no crashes",
+			input:   ``,
+			reMatch: []*regexp.Regexp{},
+		},
+	} {
+		t.Run(
+			tt.name,
+			func(t *testing.T) {
+				conn := &MockConn{}
+				conn.On("MonCommand", mock.Anything).Return(
+					[]byte(tt.input), "", nil,
+				)
+
+				collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: Pacific})
+				err := prometheus.Register(collector)
+				require.NoError(t, err)
+				defer prometheus.Unregister(collector)
+
+				server := httptest.NewServer(promhttp.Handler())
+				defer server.Close()
+
+				resp, err := http.Get(server.URL)
+				require.NoError(t, err)
+				defer resp.Body.Close()
+
+				buf, err := ioutil.ReadAll(resp.Body)
+				require.NoError(t, err)
+
+				for _, re := range tt.reMatch {
+					if !re.Match(buf) {
+						t.Errorf("expected %s to match\n", re.String())
+					}
+				}
+			},
+		)
+	}
 }