From adf792c3e8677211f730ed8d49f8e28ee937c8c5 Mon Sep 17 00:00:00 2001 From: Xavier Villaneau Date: Tue, 14 Jun 2022 17:43:53 -0400 Subject: [PATCH] Use ConstMetrics for ceph_crash_reports Makes the code simpler since we're not tracking state anymore. Also rewrote the tests to be more in-line with the rest. --- ceph/crashes.go | 63 ++++--------- ceph/crashes_test.go | 218 +++++++++++++++++++------------------------ 2 files changed, 114 insertions(+), 167 deletions(-) diff --git a/ceph/crashes.go b/ceph/crashes.go index 3e67fb5..10ae575 100644 --- a/ceph/crashes.go +++ b/ceph/crashes.go @@ -18,6 +18,7 @@ import ( "bufio" "bytes" "encoding/json" + "fmt" "regexp" "github.com/prometheus/client_golang/prometheus" @@ -39,11 +40,7 @@ type CrashesCollector struct { logger *logrus.Logger version *Version - // We keep track of which daemons we've seen so that their error count - // can be reset to zero if the errors get purged. - knownEntities map[string]bool - - CrashReports prometheus.GaugeVec + crashReportsDesc *prometheus.Desc } // NewCrashesCollector creates a new CrashesCollector instance @@ -56,16 +53,11 @@ func NewCrashesCollector(exporter *Exporter) *CrashesCollector { logger: exporter.Logger, version: exporter.Version, - knownEntities: map[string]bool{}, - - CrashReports: *prometheus.NewGaugeVec( - prometheus.GaugeOpts{ - Namespace: cephNamespace, - Name: "crash_reports", - Help: "Count of crashes reports per daemon, according to `ceph crash ls`", - ConstLabels: labels, - }, + crashReportsDesc: prometheus.NewDesc( + fmt.Sprintf("%s_crash_reports", cephNamespace), + "Count of crashes reports per daemon, according to `ceph crash ls`", []string{"daemon", "status"}, + labels, ), } @@ -78,8 +70,8 @@ type crashEntry struct { } // getCrashLs runs the 'crash ls' command and parses its results -func (c *CrashesCollector) getCrashLs() ([]crashEntry, error) { - crashes := make([]crashEntry, 0) +func (c *CrashesCollector) getCrashLs() (map[crashEntry]int, error) { + crashes := make(map[crashEntry]int) // We parse the plain format because it is quite compact. // The JSON output of this command is very verbose and might be too slow @@ -101,39 +93,19 @@ func (c *CrashesCollector) getCrashLs() ([]crashEntry, error) { for scanner.Scan() { matched := crashLsLineRegex.FindStringSubmatch(scanner.Text()) if len(matched) == 3 { - crashes = append(crashes, crashEntry{matched[1], matched[2] == "*"}) + crashes[crashEntry{matched[1], matched[2] == "*"}]++ } else if len(matched) == 2 { // Just in case the line-end spaces were stripped - crashes = append(crashes, crashEntry{matched[1], false}) + crashes[crashEntry{matched[1], false}]++ } } return crashes, nil } -// processCrashLs takes the parsed results from getCrashLs and counts them -// in a map. It also keeps track of which daemons we've see in the past, and -// initializes all counts to zero where needed. -func (c *CrashesCollector) processCrashLs(crashes []crashEntry) map[crashEntry]int { - crashMap := make(map[crashEntry]int) - - for _, crash := range crashes { - c.knownEntities[crash.entity] = true - } - for entity := range c.knownEntities { - crashMap[crashEntry{entity, true}] = 0 - crashMap[crashEntry{entity, false}] = 0 - } - for _, crash := range crashes { - crashMap[crash]++ - } - - return crashMap -} - // Describe provides the metrics descriptions to Prometheus func (c *CrashesCollector) Describe(ch chan<- *prometheus.Desc) { - c.CrashReports.Describe(ch) + ch <- c.crashReportsDesc } // Collect sends all the collected metrics Prometheus. @@ -142,11 +114,14 @@ func (c *CrashesCollector) Collect(ch chan<- prometheus.Metric) { if err != nil { c.logger.WithError(err).Error("failed to run 'ceph crash ls'") } - crashMap := c.processCrashLs(crashes) - for crash, count := range crashMap { - c.CrashReports.WithLabelValues(crash.entity, statusNames[crash.isNew]).Set(float64(count)) + for crash, count := range crashes { + ch <- prometheus.MustNewConstMetric( + c.crashReportsDesc, + prometheus.GaugeValue, + float64(count), + crash.entity, + statusNames[crash.isNew], + ) } - - c.CrashReports.Collect(ch) } diff --git a/ceph/crashes_test.go b/ceph/crashes_test.go index fe5e458..5b665fe 100644 --- a/ceph/crashes_test.go +++ b/ceph/crashes_test.go @@ -18,7 +18,6 @@ import ( "io/ioutil" "net/http" "net/http/httptest" - "reflect" "regexp" "testing" @@ -31,129 +30,102 @@ import ( func TestCrashesCollector(t *testing.T) { - const outputCephCrashLs string = ` - ID ENTITY NEW - 2022-01-01_18:57:51.184156Z_02d9b659-69d1-4dd6-8495-ee2345208568 client.admin - 2022-01-01_19:02:01.401852Z_9100163b-4cd1-479f-b3a8-0dc2d288eaea mgr.mgr-node-01 - 2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 client.admin * - 2022-02-03_04:03:38.371403Z_bd756324-27c0-494e-adfb-9f5f6e3db000 osd.3 * - 2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a osd.3 * - ` - - t.Run( - "full test", - func(t *testing.T) { - conn := &MockConn{} - conn.On("MonCommand", mock.Anything).Return( - []byte(outputCephCrashLs), "", nil, - ) - - collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: Pacific}) - err := prometheus.Register(collector) - require.NoError(t, err) - defer prometheus.Unregister(collector) - - server := httptest.NewServer(promhttp.Handler()) - defer server.Close() - - resp, err := http.Get(server.URL) - require.NoError(t, err) - defer resp.Body.Close() - - buf, err := ioutil.ReadAll(resp.Body) - require.NoError(t, err) - - reMatches := []*regexp.Regexp{ + for _, tt := range []struct { + name string + input string + reMatch []*regexp.Regexp + }{ + { + name: "single new crash", + input: ` +ID ENTITY NEW +2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 * + `, + reMatch: []*regexp.Regexp{ + regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 1`), + }, + }, + { + name: "single archived crash", + input: ` +ID ENTITY NEW +2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 + `, + reMatch: []*regexp.Regexp{ + regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="archived"} 1`), + }, + }, + { + name: "two new crashes same entity", + input: ` +ID ENTITY NEW +2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 * +2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a osd.0 * +`, + reMatch: []*regexp.Regexp{ + regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 2`), + }, + }, + { + name: "mix of crashes same entity", + input: ` +ID ENTITY NEW +2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 osd.0 +2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a osd.0 * +`, + reMatch: []*regexp.Regexp{ + regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="new"} 1`), + regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.0",status="archived"} 1`), + }, + }, + { + name: "mix of crashes different entities", + input: ` +ID ENTITY NEW +2022-02-01_21:02:46.687015Z_0de8b741-b323-4f63-828a-e460294e28b9 mgr.mgr-node-01 * +2022-02-03_04:05:45.419226Z_11c639af-5eb2-4a29-91aa-20120218891a client.admin * +`, + reMatch: []*regexp.Regexp{ + regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="new"} 1`), regexp.MustCompile(`crash_reports{cluster="ceph",daemon="client.admin",status="new"} 1`), - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="client.admin",status="archived"} 1`), - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="new"} 0`), - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="mgr.mgr-node-01",status="archived"} 1`), - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.3",status="new"} 2`), - regexp.MustCompile(`crash_reports{cluster="ceph",daemon="osd.3",status="archived"} 0`), - } + }, + }, + { + // At least code shouldn't panic + name: "no crashes", + input: ``, + reMatch: []*regexp.Regexp{}, + }, + } { + t.Run( + tt.name, + func(t *testing.T) { + conn := &MockConn{} + conn.On("MonCommand", mock.Anything).Return( + []byte(tt.input), "", nil, + ) - // t.Log(string(buf)) - for _, re := range reMatches { - if !re.Match(buf) { - t.Errorf("expected %s to match\n", re.String()) + collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: Pacific}) + err := prometheus.Register(collector) + require.NoError(t, err) + defer prometheus.Unregister(collector) + + server := httptest.NewServer(promhttp.Handler()) + defer server.Close() + + resp, err := http.Get(server.URL) + require.NoError(t, err) + defer resp.Body.Close() + + buf, err := ioutil.ReadAll(resp.Body) + require.NoError(t, err) + + for _, re := range tt.reMatch { + if !re.Match(buf) { + t.Errorf("expected %s to match\n", re.String()) + } } - } - }, - ) - - t.Run( - "getCrashLs unit test", - func(t *testing.T) { - conn := &MockConn{} - conn.On("MonCommand", mock.Anything).Return( - []byte(outputCephCrashLs), "", nil, - ) - - log := logrus.New() - log.Level = logrus.DebugLevel - collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: log, Version: Pacific}) - - expected := []crashEntry{ - {"client.admin", false}, - {"mgr.mgr-node-01", false}, - {"client.admin", true}, - {"osd.3", true}, - {"osd.3", true}, - } - crashes, _ := collector.getCrashLs() - - if !reflect.DeepEqual(crashes, expected) { - t.Errorf("incorrect getCrashLs result: expected %v, got %v\n", expected, crashes) - } - }, - ) - - t.Run( - "getCrashLs empty crash list unit test", - func(t *testing.T) { - conn := &MockConn{} - conn.On("MonCommand", mock.Anything).Return( - []byte(""), "", nil, - ) - - collector := NewCrashesCollector(&Exporter{Conn: conn, Cluster: "ceph", Logger: logrus.New(), Version: Pacific}) - - crashes, _ := collector.getCrashLs() - if len(crashes) != 0 { - t.Errorf("expected empty result from getCrashLs, got %v\n", crashes) - } - }, - ) - - t.Run( - "processCrashLs test", - func(t *testing.T) { - collector := NewCrashesCollector(&Exporter{Conn: nil, Cluster: "ceph", Logger: logrus.New(), Version: Pacific}) - - newCrash := crashEntry{"daemon", true} - archivedCrash := crashEntry{"daemon", false} - - // New crash - crashMap := collector.processCrashLs([]crashEntry{newCrash}) - expected := map[crashEntry]int{newCrash: 1, archivedCrash: 0} - if !reflect.DeepEqual(crashMap, expected) { - t.Errorf("incorrect processCrashLs result: expected %v, got %v\n", expected, crashMap) - } - - // Archived crash - crashMap = collector.processCrashLs([]crashEntry{archivedCrash}) - expected = map[crashEntry]int{newCrash: 0, archivedCrash: 1} - if !reflect.DeepEqual(crashMap, expected) { - t.Errorf("incorrect processCrashLs result: expected %v, got %v\n", expected, crashMap) - } - - // Crash was memorized, check that we reset count to zero - crashMap = collector.processCrashLs([]crashEntry{}) - expected = map[crashEntry]int{newCrash: 0, archivedCrash: 0} - if !reflect.DeepEqual(crashMap, expected) { - t.Errorf("incorrect processCrashLs result: expected %v, got %v\n", expected, crashMap) - } - - }, - ) + }, + ) + } }