From 28925efbd8d70673c7f13b4b13942723d345abed Mon Sep 17 00:00:00 2001
From: gotjosh
Date: Wed, 8 Mar 2023 10:29:05 +0000
Subject: [PATCH] Metrics: Notification log maintenance success and failure
 (#3286)

* Metrics: Notification log maintenance success and failure

For various reasons, we've observed different kinds of errors in this
area, from read-only disks to silly code bugs. Errors during maintenance
are effectively data loss, and therefore we should encourage proper
monitoring of this area.

Similar to #3285

---------

Signed-off-by: gotjosh
---
 nflog/nflog.go      | 21 ++++++++++++++--
 nflog/nflog_test.go | 61 ++++++++++++++++++++++++++++++++-------------
 2 files changed, 62 insertions(+), 20 deletions(-)

diff --git a/nflog/nflog.go b/nflog/nflog.go
index af39894b..c318cede 100644
--- a/nflog/nflog.go
+++ b/nflog/nflog.go
@@ -101,6 +101,8 @@ type metrics struct {
 	queryErrorsTotal        prometheus.Counter
 	queryDuration           prometheus.Histogram
 	propagatedMessagesTotal prometheus.Counter
+	maintenanceTotal        prometheus.Counter
+	maintenanceErrorsTotal  prometheus.Counter
 }
 
 func newMetrics(r prometheus.Registerer) *metrics {
@@ -120,6 +122,14 @@ func newMetrics(r prometheus.Registerer) *metrics {
 		Name: "alertmanager_nflog_snapshot_size_bytes",
 		Help: "Size of the last notification log snapshot in bytes.",
 	})
+	m.maintenanceTotal = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "alertmanager_nflog_maintenance_total",
+		Help: "How many maintenances were executed for the notification log.",
+	})
+	m.maintenanceErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "alertmanager_nflog_maintenance_errors_total",
+		Help: "How many maintenances were executed for the notification log that failed.",
+	})
 	m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "alertmanager_nflog_queries_total",
 		Help: "Number of notification log queries were received.",
@@ -146,6 +156,8 @@ func newMetrics(r prometheus.Registerer) *metrics {
 			m.queryErrorsTotal,
 			m.queryDuration,
 			m.propagatedMessagesTotal,
+			m.maintenanceTotal,
+			m.maintenanceErrorsTotal,
 		)
 	}
 	return m
@@ -317,12 +329,17 @@ func (l *Log) Maintenance(interval time.Duration, snapf string, stopc <-chan str
 	}
 
 	runMaintenance := func(do func() (int64, error)) error {
+		l.metrics.maintenanceTotal.Inc()
 		start := l.now().UTC()
 		level.Debug(l.logger).Log("msg", "Running maintenance")
 		size, err := do()
-		level.Debug(l.logger).Log("msg", "Maintenance done", "duration", l.now().Sub(start), "size", size)
 		l.metrics.snapshotSize.Set(float64(size))
-		return err
+		if err != nil {
+			l.metrics.maintenanceErrorsTotal.Inc()
+			return err
+		}
+		level.Debug(l.logger).Log("msg", "Maintenance done", "duration", l.now().Sub(start), "size", size)
+		return nil
 	}
 
 Loop:
diff --git a/nflog/nflog_test.go b/nflog/nflog_test.go
index d55ee761..be85b8df 100644
--- a/nflog/nflog_test.go
+++ b/nflog/nflog_test.go
@@ -18,7 +18,6 @@ import (
 	"io"
 	"os"
 	"path/filepath"
-	"runtime"
 	"sync"
 	"testing"
 	"time"
@@ -27,7 +26,9 @@ import (
 
 	"github.com/benbjohnson/clock"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/stretchr/testify/require"
+	"go.uber.org/atomic"
 )
 
 func TestLogGC(t *testing.T) {
@@ -133,35 +134,53 @@ func TestLogSnapshot(t *testing.T) {
 func TestWithMaintenance_SupportsCustomCallback(t *testing.T) {
 	f, err := os.CreateTemp("", "snapshot")
 	require.NoError(t, err, "creating temp file failed")
 	stopc := make(chan struct{})
-	var mtx sync.Mutex
-	var mc int
+	reg := prometheus.NewPedanticRegistry()
 	opts := Options{
-		Metrics:      prometheus.NewPedanticRegistry(),
+		Metrics:      reg,
 		SnapshotFile: f.Name(),
 	}
 
 	l, err := New(opts)
-	mockClock := clock.NewMock()
-	l.clock = mockClock
+	clock := clock.NewMock()
+	l.clock = clock
 	require.NoError(t, err)
-	go l.Maintenance(100*time.Millisecond, f.Name(), stopc, func() (int64, error) {
-		mtx.Lock()
-		mc++
-		mtx.Unlock()
+	var calls atomic.Int32
+	var wg sync.WaitGroup
 
-		return 0, nil
-	})
-	runtime.Gosched() // ensure that the ticker is running.
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		l.Maintenance(100*time.Millisecond, f.Name(), stopc, func() (int64, error) {
+			calls.Add(1)
+			return 0, nil
+		})
+	}()
+	gosched()
 
-	mockClock.Add(200 * time.Millisecond)
+	// Before the first tick, no maintenance executed.
+	clock.Add(99 * time.Millisecond)
+	require.EqualValues(t, 0, calls.Load())
+
+	// Tick once.
+	clock.Add(1 * time.Millisecond)
+	require.EqualValues(t, 1, calls.Load())
+
+	// Stop the maintenance loop. We should get exactly one more execution of the maintenance func.
 	close(stopc)
+	wg.Wait()
 
-	mtx.Lock()
-	defer mtx.Unlock()
-	require.Equal(t, 2, mc)
+	require.EqualValues(t, 2, calls.Load())
 
+	// Check the maintenance metrics.
+	require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
+# HELP alertmanager_nflog_maintenance_errors_total How many maintenances were executed for the notification log that failed.
+# TYPE alertmanager_nflog_maintenance_errors_total counter
+alertmanager_nflog_maintenance_errors_total 0
+# HELP alertmanager_nflog_maintenance_total How many maintenances were executed for the notification log.
+# TYPE alertmanager_nflog_maintenance_total counter
+alertmanager_nflog_maintenance_total 2
+`), "alertmanager_nflog_maintenance_total", "alertmanager_nflog_maintenance_errors_total"))
 }
 
 func TestReplaceFile(t *testing.T) {
@@ -356,3 +375,9 @@ func TestStateDecodingError(t *testing.T) {
 	_, err = decodeState(bytes.NewReader(msg))
 	require.Equal(t, ErrInvalidState, err)
 }
+
+// runtime.Gosched() does not "suspend" the current goroutine so there's no guarantee that the main goroutine won't
+// be able to continue. For more see https://pkg.go.dev/runtime#Gosched.
+func gosched() {
+	time.Sleep(1 * time.Millisecond)
+}
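
Editor's note: for readers skimming the change rather than applying it, the sketch below condenses the instrumentation pattern the patch introduces: count every maintenance run, and separately count runs whose callback returns an error, so failures become visible to monitoring. The metric names and help strings are copied from the patch; the maintenanceMetrics type, the runMaintenance wrapper, and the main function are illustrative stand-ins for the nflog internals, not Alertmanager code.

package main

import (
	"errors"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

// maintenanceMetrics mirrors the two counters added by the patch: one for
// every maintenance run, one for runs that returned an error.
type maintenanceMetrics struct {
	total  prometheus.Counter
	errors prometheus.Counter
}

func newMaintenanceMetrics(r prometheus.Registerer) *maintenanceMetrics {
	m := &maintenanceMetrics{
		total: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "alertmanager_nflog_maintenance_total",
			Help: "How many maintenances were executed for the notification log.",
		}),
		errors: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "alertmanager_nflog_maintenance_errors_total",
			Help: "How many maintenances were executed for the notification log that failed.",
		}),
	}
	r.MustRegister(m.total, m.errors)
	return m
}

// runMaintenance wraps an arbitrary maintenance callback with the same counter
// handling as the patched runMaintenance closure: the total counter is
// incremented up front, the error counter only when the callback fails.
func runMaintenance(m *maintenanceMetrics, do func() (int64, error)) error {
	m.total.Inc()
	if _, err := do(); err != nil {
		m.errors.Inc()
		return err
	}
	return nil
}

func main() {
	reg := prometheus.NewPedanticRegistry()
	m := newMaintenanceMetrics(reg)

	// One successful run and one failing run (e.g. a read-only disk).
	_ = runMaintenance(m, func() (int64, error) { return 0, nil })
	_ = runMaintenance(m, func() (int64, error) { return 0, errors.New("read-only disk") })

	fmt.Println("maintenance total:", testutil.ToFloat64(m.total))   // 2
	fmt.Println("maintenance errors:", testutil.ToFloat64(m.errors)) // 1
}

In operation, the "proper monitoring" the commit message asks for would typically mean alerting when alertmanager_nflog_maintenance_errors_total increases over some window, since each failed run can mean notification log entries that were never persisted to the snapshot.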