diff --git a/silence/silence.go b/silence/silence.go index 21be9438..68844d38 100644 --- a/silence/silence.go +++ b/silence/silence.go @@ -215,6 +215,8 @@ type metrics struct { silencesPending prometheus.GaugeFunc silencesExpired prometheus.GaugeFunc propagatedMessagesTotal prometheus.Counter + maintenanceTotal prometheus.Counter + maintenanceErrorsTotal prometheus.Counter } func newSilenceMetricByState(s *Silences, st types.SilenceState) prometheus.GaugeFunc { @@ -251,6 +253,14 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics { Name: "alertmanager_silences_snapshot_size_bytes", Help: "Size of the last silence snapshot in bytes.", }) + m.maintenanceTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_maintenance_total", + Help: "How many maintenances were executed for silences.", + }) + m.maintenanceErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "alertmanager_silences_maintenance_errors_total", + Help: "How many maintenances were executed for silences that failed.", + }) m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{ Name: "alertmanager_silences_queries_total", Help: "How many silence queries were received.", @@ -285,6 +295,8 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics { m.silencesPending, m.silencesExpired, m.propagatedMessagesTotal, + m.maintenanceTotal, + m.maintenanceErrorsTotal, ) } return m @@ -395,12 +407,17 @@ func (s *Silences) Maintenance(interval time.Duration, snapf string, stopc <-cha } runMaintenance := func(do MaintenanceFunc) error { - start := s.nowUTC() + s.metrics.maintenanceTotal.Inc() level.Debug(s.logger).Log("msg", "Running maintenance") + start := s.nowUTC() size, err := do() - level.Debug(s.logger).Log("msg", "Maintenance done", "duration", s.clock.Since(start), "size", size) s.metrics.snapshotSize.Set(float64(size)) - return err + if err != nil { + s.metrics.maintenanceErrorsTotal.Inc() + return err + } + level.Debug(s.logger).Log("msg", "Maintenance done", "duration", s.clock.Since(start), "size", size) + return nil } Loop: diff --git a/silence/silence_test.go b/silence/silence_test.go index 7a95d1c6..a9c6f634 100644 --- a/silence/silence_test.go +++ b/silence/silence_test.go @@ -27,6 +27,7 @@ import ( "github.com/go-kit/log" "github.com/matttproud/golang_protobuf_extensions/pbutil" "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" "github.com/prometheus/common/model" "github.com/stretchr/testify/require" "go.uber.org/atomic" @@ -208,7 +209,9 @@ func TestSilences_Maintenance_SupportsCustomCallback(t *testing.T) { f, err := os.CreateTemp("", "snapshot") require.NoError(t, err, "creating temp file failed") clock := clock.NewMock() - s := &Silences{st: state{}, logger: log.NewNopLogger(), clock: clock, metrics: newMetrics(nil, nil)} + reg := prometheus.NewRegistry() + s := &Silences{st: state{}, logger: log.NewNopLogger(), clock: clock} + s.metrics = newMetrics(reg, s) stopc := make(chan struct{}) var calls atomic.Int32 @@ -237,6 +240,16 @@ func TestSilences_Maintenance_SupportsCustomCallback(t *testing.T) { wg.Wait() require.EqualValues(t, 2, calls.Load()) + + // Check the maintenance metrics. + require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(` +# HELP alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed. +# TYPE alertmanager_silences_maintenance_errors_total counter +alertmanager_silences_maintenance_errors_total 0 +# HELP alertmanager_silences_maintenance_total How many maintenances were executed for silences. +# TYPE alertmanager_silences_maintenance_total counter +alertmanager_silences_maintenance_total 2 +`), "alertmanager_silences_maintenance_total", "alertmanager_silences_maintenance_errors_total")) } func TestSilencesSetSilence(t *testing.T) {