Metrics: Silence maintenance success and failure (#3285)
* Metrics: Silence maintenance success and failure. Due to various reasons, we've observed different kinds of errors in this area, from read-only disks to silly code bugs. Errors during maintenance are effectively a _data loss_ and therefore we should encourage proper monitoring of this area. This PR introduces a total and a failure metric for silence maintenance. If agreed, I'll do the same for the nflog and fix the flaky test like I did for silences while I'm there. Signed-off-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
parent
28925efbd8
commit
3ee2cd0f12
|
@ -215,6 +215,8 @@ type metrics struct {
|
|||
silencesPending prometheus.GaugeFunc
|
||||
silencesExpired prometheus.GaugeFunc
|
||||
propagatedMessagesTotal prometheus.Counter
|
||||
maintenanceTotal prometheus.Counter
|
||||
maintenanceErrorsTotal prometheus.Counter
|
||||
}
|
||||
|
||||
func newSilenceMetricByState(s *Silences, st types.SilenceState) prometheus.GaugeFunc {
|
||||
|
@ -251,6 +253,14 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
|
|||
Name: "alertmanager_silences_snapshot_size_bytes",
|
||||
Help: "Size of the last silence snapshot in bytes.",
|
||||
})
|
||||
m.maintenanceTotal = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: "alertmanager_silences_maintenance_total",
|
||||
Help: "How many maintenances were executed for silences.",
|
||||
})
|
||||
m.maintenanceErrorsTotal = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: "alertmanager_silences_maintenance_errors_total",
|
||||
Help: "How many maintenances were executed for silences that failed.",
|
||||
})
|
||||
m.queriesTotal = prometheus.NewCounter(prometheus.CounterOpts{
|
||||
Name: "alertmanager_silences_queries_total",
|
||||
Help: "How many silence queries were received.",
|
||||
|
@ -285,6 +295,8 @@ func newMetrics(r prometheus.Registerer, s *Silences) *metrics {
|
|||
m.silencesPending,
|
||||
m.silencesExpired,
|
||||
m.propagatedMessagesTotal,
|
||||
m.maintenanceTotal,
|
||||
m.maintenanceErrorsTotal,
|
||||
)
|
||||
}
|
||||
return m
|
||||
|
@ -395,12 +407,17 @@ func (s *Silences) Maintenance(interval time.Duration, snapf string, stopc <-cha
|
|||
}
|
||||
|
||||
runMaintenance := func(do MaintenanceFunc) error {
|
||||
start := s.nowUTC()
|
||||
s.metrics.maintenanceTotal.Inc()
|
||||
level.Debug(s.logger).Log("msg", "Running maintenance")
|
||||
start := s.nowUTC()
|
||||
size, err := do()
|
||||
level.Debug(s.logger).Log("msg", "Maintenance done", "duration", s.clock.Since(start), "size", size)
|
||||
s.metrics.snapshotSize.Set(float64(size))
|
||||
return err
|
||||
if err != nil {
|
||||
s.metrics.maintenanceErrorsTotal.Inc()
|
||||
return err
|
||||
}
|
||||
level.Debug(s.logger).Log("msg", "Maintenance done", "duration", s.clock.Since(start), "size", size)
|
||||
return nil
|
||||
}
|
||||
|
||||
Loop:
|
||||
|
|
|
@ -27,6 +27,7 @@ import (
|
|||
"github.com/go-kit/log"
|
||||
"github.com/matttproud/golang_protobuf_extensions/pbutil"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/testutil"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/atomic"
|
||||
|
@ -208,7 +209,9 @@ func TestSilences_Maintenance_SupportsCustomCallback(t *testing.T) {
|
|||
f, err := os.CreateTemp("", "snapshot")
|
||||
require.NoError(t, err, "creating temp file failed")
|
||||
clock := clock.NewMock()
|
||||
s := &Silences{st: state{}, logger: log.NewNopLogger(), clock: clock, metrics: newMetrics(nil, nil)}
|
||||
reg := prometheus.NewRegistry()
|
||||
s := &Silences{st: state{}, logger: log.NewNopLogger(), clock: clock}
|
||||
s.metrics = newMetrics(reg, s)
|
||||
stopc := make(chan struct{})
|
||||
|
||||
var calls atomic.Int32
|
||||
|
@ -237,6 +240,16 @@ func TestSilences_Maintenance_SupportsCustomCallback(t *testing.T) {
|
|||
wg.Wait()
|
||||
|
||||
require.EqualValues(t, 2, calls.Load())
|
||||
|
||||
// Check the maintenance metrics.
|
||||
require.NoError(t, testutil.GatherAndCompare(reg, bytes.NewBufferString(`
|
||||
# HELP alertmanager_silences_maintenance_errors_total How many maintenances were executed for silences that failed.
|
||||
# TYPE alertmanager_silences_maintenance_errors_total counter
|
||||
alertmanager_silences_maintenance_errors_total 0
|
||||
# HELP alertmanager_silences_maintenance_total How many maintenances were executed for silences.
|
||||
# TYPE alertmanager_silences_maintenance_total counter
|
||||
alertmanager_silences_maintenance_total 2
|
||||
`), "alertmanager_silences_maintenance_total", "alertmanager_silences_maintenance_errors_total"))
|
||||
}
|
||||
|
||||
func TestSilencesSetSilence(t *testing.T) {
|
||||
|
|
Loading…
Reference in New Issue