From 28925efbd8d70673c7f13b4b13942723d345abed Mon Sep 17 00:00:00 2001
From: gotjosh
Date: Wed, 8 Mar 2023 10:29:05 +0000
Subject: [PATCH] Metrics: Notification log maintenance success and failure
 (#3286)

* Metrics: Notification log maintenance success and failure

For various reasons, we've observed different kinds of errors in this
area, from read-only disks to silly code bugs. Errors during maintenance
are effectively data loss, and therefore we should encourage proper
monitoring of this area.

Similar to #3285

---------

Signed-off-by: gotjosh
---
 nflog/nflog.go      | 21 ++++++++++++++--
 nflog/nflog_test.go | 61 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/nflog/nflog.go b/nflog/nflog.go
index af39894b..c318cede 100644
--- a/nflog/nflog.go
+++ b/nflog/nflog.go
@@ -101,6 +101,8 @@ type metrics struct {
 	queryErrorsTotal        prometheus.Counter
 	queryDuration           prometheus.Histogram
 	propagatedMessagesTotal prometheus.Counter
+	maintenanceTotal        prometheus.Counter
+	maintenanceErrorsTotal  prometheus.Counter
 }
 
 func newMetrics(r prometheus.Registerer) *metrics {
@@ -120,6 +122,14 @@ func newMetrics(r prometheus.Registerer) *metrics {
 		Name: "alertmanager_nflog_snapshot_size_bytes",
 		Help: "Size of the last notification log snapshot in bytes.",
 	})
+	m.maintenanceTotal = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "alertmanager_nflog_maintenance_total",
+		Help: "How many maintenances were executed for the notification log.",
+	})
+	m.maintenanceErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "alertmanager_nflog_maintenance_errors_total",
+		Help: "How many maintenances were executed for the notification log that failed.",
+	})
 	m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_nflog_queries_total",
 		Help: "Number of notification log queries were received.",
@@ -146,6 +156,8 @@ func newMetrics(r prometheus.Registerer) *metrics {
 			m.queryErrorsTotal,
 			m.queryDuration,
 			m.propagatedMessagesTotal,
+			m.maintenanceTotal,
+			m.maintenanceErrorsTotal,
 		)
 	}
 	return m
@@ -317,12 +329,17 @@ func (l *Log) Maintenance(interval time.Duration, snapf string, stopc <-chan str
 	}
 
 	runMaintenance := func(do func() (int64, error)) error {
+		l.metrics.maintenanceTotal.Inc()
 		start := l.now().UTC()
 		level.Debug(l.logger).Log("msg", "Running maintenance")
 		size, err := do()
-		level.Debug(l.logger).Log("msg", "Maintenance done", "duration", l.now().Sub(start), "size", size)
 		l.metrics.snapshotSize.Set(float64(size))
-		return err
+		if err != nil {
+			l.metrics.maintenanceErrorsTotal.Inc()
+			return err
+		}
+		level.Debug(l.logger).Log("msg", "Maintenance done", "duration", l.now().Sub(start), "size", size)
+		return nil
 	}
 
 Loop:
diff --git a/nflog/nflog_test.go b/nflog/nflog_test.go
index d55ee761..be85b8df 100644
--- a/nflog/nflog_test.go
+++ b/nflog/nflog_test.go
@@ -18,7 +18,6 @@ import (
 	"io"
 	"os"
 	"path/filepath"
-	"runtime"
 	"sync"
 	"testing"
 	"time"
@@ -27,7 +26,9 @@ import (
 
 	"github.com/benbjohnson/clock"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/stretchr/testify/require"
+	"go.uber.org/atomic"
 )
 
 func TestLogGC(t *testing.T) {
@@ -133,35 +134,53 @@ func TestLogSnapshot(t *testing.T) {
 func TestWithMaintenance_SupportsCustomCallback(t *testing.T) {
 	f, err := os.CreateTemp("", "snapshot")
 	require.NoError(t, err, "creating temp file failed")
 	stopc := make(chan struct{})
-	var mtx sync.Mutex
-	var mc int
+	reg := prometheus.NewPedanticRegistry()
 	opts := Options{
-		Metrics:      prometheus.NewPedanticRegistry(),
+		Metrics:      reg,
 		SnapshotFile: f.Name(),
 	}
 
 	l, err := New(opts)
-	mockClock := clock.NewMock()
-	l.clock = mockClock
+	clock := clock.NewMock()
+	l.clock = clock
 	require.NoError(t, err)
-	go l.Maintenance(100*time.Millisecond, f.Name(), stopc, func() (int64, error) {
-		mtx.Lock()
-		mc++
-		mtx.Unlock()
+	var calls atomic.Int32
+	var wg sync.WaitGroup
 
-		return 0, nil
-	})
-	runtime.Gosched() // ensure that the ticker is running.
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		l.Maintenance(100*time.Millisecond, f.Name(), stopc, func() (int64, error) {
+			calls.Add(1)
+			return 0, nil
+		})
+	}()
+	gosched()
 
-	mockClock.Add(200 * time.Millisecond)
+	// Before the first tick, no maintenance executed.
+	clock.Add(99 * time.Millisecond)
+	require.EqualValues(t, 0, calls.Load())
+
+	// Tick once.
+	clock.Add(1 * time.Millisecond)
+	require.EqualValues(t, 1, calls.Load())
+
+	// Stop the maintenance loop. We should get exactly one more execution of the maintenance func.
 	close(stopc)
+	wg.Wait()
 
-	mtx.Lock()
-	defer mtx.Unlock()
-	require.Equal(t, 2, mc)
+	require.EqualValues(t, 2, calls.Load())
 
+	// Check the maintenance metrics.
+	require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
+# HELP alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed.
+# TYPE alertmanager_nflog_maintenance_errors_total counter
+alertmanager_nflog_maintenance_errors_total 0
+# HELP alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log.
+# TYPE alertmanager_nflog_maintenance_total counter
+alertmanager_nflog_maintenance_total 2
+`), "alertmanager_nflog_maintenance_total", "alertmanager_nflog_maintenance_errors_total"))
 }
 
 func TestReplaceFile(t *testing.T) {
@@ -356,3 +375,9 @@ func TestStateDecodingError(t *testing.T) {
 	_, err = decodeState(bytes.NewReader(msg))
 	require.Equal(t, ErrInvalidState, err)
 }
+
+// runtime.Gosched() does not "suspend" the current goroutine so there's no guarantee that the main goroutine won't
+// be able to continue. For more see https://pkg.go.dev/runtime#Gosched.
+func gosched() {
+	time.Sleep(1 * time.Millisecond)
+}
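
Editor's note: for readers skimming the change rather than applying it, the sketch below condenses the instrumentation pattern the patch introduces: count every maintenance run, and separately count runs whose callback returns an error, so failures become visible to monitoring. The metric names and help strings are copied from the patch; the maintenanceMetrics type, the runMaintenance wrapper, and the main function are illustrative stand-ins for the nflog internals, not Alertmanager code.

package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// maintenanceMetrics mirrors the two counters added by the patch: one for
// every maintenance run, one for runs that returned an error.
type maintenanceMetrics struct {
	total  prometheus.Counter
	errors prometheus.Counter
}

func newMaintenanceMetrics(r prometheus.Registerer) *maintenanceMetrics {
	m := &maintenanceMetrics{
		total: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "alertmanager_nflog_maintenance_total",
			Help: "How many maintenances were executed for the notification log.",
		}),
		errors: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "alertmanager_nflog_maintenance_errors_total",
			Help: "How many maintenances were executed for the notification log that failed.",
		}),
	}
	r.MustRegister(m.total, m.errors)
	return m
}

// runMaintenance wraps an arbitrary maintenance callback with the same counter
// handling as the patched runMaintenance closure: the total counter is
// incremented up front, the error counter only when the callback fails.
func runMaintenance(m *maintenanceMetrics, do func() (int64, error)) error {
	m.total.Inc()
	if _, err := do(); err != nil {
		m.errors.Inc()
		return err
	}
	return nil
}

func main() {
	reg := prometheus.NewPedanticRegistry()
	m := newMaintenanceMetrics(reg)

	// One successful run and one failing run (e.g. a read-only disk).
	_ = runMaintenance(m, func() (int64, error) { return 0, nil })
	_ = runMaintenance(m, func() (int64, error) { return 0, errors.New("read-only disk") })

	fmt.Println("maintenance total:", testutil.ToFloat64(m.total))   // 2
	fmt.Println("maintenance errors:", testutil.ToFloat64(m.errors)) // 1
}

In operation, the "proper monitoring" the commit message asks for would typically mean alerting when alertmanager_nflog_maintenance_errors_total increases over some window, since each failed run can mean notification log entries that were never persisted to the snapshot.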