feat: add counter to track alerts dropped outside of time_intervals (#3565)

* feat: add counter to track alerts dropped outside of time_intervals

Addresses: #3512

This adds a new counter metric `alertmanager_notifications_suppressed_total`
that is incremented by `len(alerts)` whenever an alert is suppressed for
being outside of a time_interval, i.e. within one of its
mute_time_intervals or outside of its active_time_intervals.
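
For illustration, a minimal sketch of the counter pattern this change uses
(hypothetical names, not the actual Alertmanager code):

```go
package example

import "github.com/prometheus/client_golang/prometheus"

// suppressedTotal mirrors the shape of the new metric: a counter
// vector keyed by a "reason" label.
var suppressedTotal = prometheus.NewCounterVec(prometheus.CounterOpts{
	Namespace: "alertmanager",
	Name:      "notifications_suppressed_total",
	Help:      "The total number of notifications suppressed.",
}, []string{"reason"})

// recordSuppressed increments the counter by the number of alerts
// dropped in a single pipeline pass.
func recordSuppressed(reason string, numAlerts int) {
	suppressedTotal.WithLabelValues(reason).Add(float64(numAlerts))
}
```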

Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* test: add time interval suppression metric checks for notify

Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* test: fix failure message log values in notifier

Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* ref: address PR feedback for #3565

Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* fix: track suppressed notifications metric for inhibit/silence

Based on PR feedback:

https://github.com/prometheus/alertmanager/pull/3565/files#r1393068026

Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* fix: broken notifier tests

- Fixed the metric count check to compare the difference between the
  input and output notifications of the suppression stage against the
  suppression metric; the comparison was previously inverted.
- Stopped using `Reset()` to compare collection counts between the
  multiple stages executed in `TestMuteStageWithSilences()`. The intent
  was to compare a clean metric collection after each stage execution,
  but the final stage, in which all silences are lifted, creates no
  metric in the test, causing `prom_testutil.ToFloat64()` to panic.
  Changed to separate variables to check counts between stages, taking
  prior counts into account (see the sketch below).
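
A sketch of the resulting pattern, under assumed scaffolding (the counter
to read is the registered `numNotificationSuppressedTotal` vector from the
diff below; the helper name is illustrative):

```go
package example

import (
	"github.com/prometheus/client_golang/prometheus"
	prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
)

// cumulativeDelta reads the current value of a counter and returns the
// increase since the previous reading. Counters only ever go up, so
// tracking prior readings replaces Reset() between test stages.
func cumulativeDelta(c prometheus.Collector, prev float64) (cur, delta float64) {
	// Note: ToFloat64 panics unless the collector exposes exactly one
	// series, which is why a label-only CounterVec that has never been
	// incremented (or was just Reset) cannot be read here.
	cur = prom_testutil.ToFloat64(c)
	return cur, cur - prev
}
```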

Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* rename metric and add constants

Signed-off-by: gotjosh <josue.abreu@gmail.com>

---------

Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>
Signed-off-by: gotjosh <josue.abreu@gmail.com>
Co-authored-by: gotjosh <josue.abreu@gmail.com>
TJ Hoplock 2024-02-13 06:17:24 -05:00 committed by GitHub
parent f69a508665
commit f00025d037
2 changed files with 83 additions and 20 deletions

notify/notify.go

@@ -251,6 +251,7 @@ type Metrics struct {
 	numTotalFailedNotifications        *prometheus.CounterVec
 	numNotificationRequestsTotal      *prometheus.CounterVec
 	numNotificationRequestsFailedTotal *prometheus.CounterVec
+	numNotificationSuppressedTotal     *prometheus.CounterVec
 	notificationLatencySeconds         *prometheus.HistogramVec

 	ff featurecontrol.Flagger
@@ -284,6 +285,11 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
 			Name:      "notification_requests_failed_total",
 			Help:      "The total number of failed notification requests.",
 		}, labels),
+		numNotificationSuppressedTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: "alertmanager",
+			Name:      "notifications_suppressed_total",
+			Help:      "The total number of notifications suppressed for being outside of active time intervals or within muted time intervals.",
+		}, []string{"reason"}),
 		notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
 			Namespace: "alertmanager",
 			Name:      "notification_latency_seconds",
@@ -296,7 +302,7 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
 	r.MustRegister(
 		m.numNotifications, m.numTotalFailedNotifications,
 		m.numNotificationRequestsTotal, m.numNotificationRequestsFailedTotal,
-		m.notificationLatencySeconds,
+		m.numNotificationSuppressedTotal, m.notificationLatencySeconds,
 	)

 	return m
@@ -381,10 +387,10 @@ func (pb *PipelineBuilder) New(
 	rs := make(RoutingStage, len(receivers))

 	ms := NewGossipSettleStage(peer)
-	is := NewMuteStage(inhibitor)
-	tas := NewTimeActiveStage(intervener)
-	tms := NewTimeMuteStage(intervener)
-	ss := NewMuteStage(silencer)
+	is := NewMuteStage(inhibitor, pb.metrics)
+	tas := NewTimeActiveStage(intervener, pb.metrics)
+	tms := NewTimeMuteStage(intervener, pb.metrics)
+	ss := NewMuteStage(silencer, pb.metrics)

 	for name := range receivers {
 		st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
@@ -507,14 +513,22 @@ func (n *GossipSettleStage) Exec(ctx context.Context, _ log.Logger, alerts ...*t
 	return ctx, alerts, nil
 }

+const (
+	suppressedReasonSilence            = "silence"
+	suppressedReasonInhibition         = "inhibition"
+	suppressedReasonMuteTimeInterval   = "mute_time_interval"
+	suppressedReasonActiveTimeInterval = "active_time_interval"
+)
+
 // MuteStage filters alerts through a Muter.
 type MuteStage struct {
-	muter types.Muter
+	muter   types.Muter
+	metrics *Metrics
 }

 // NewMuteStage return a new MuteStage.
-func NewMuteStage(m types.Muter) *MuteStage {
-	return &MuteStage{muter: m}
+func NewMuteStage(m types.Muter, metrics *Metrics) *MuteStage {
+	return &MuteStage{muter: m, metrics: metrics}
 }

 // Exec implements the Stage interface.
@@ -535,7 +549,18 @@ func (n *MuteStage) Exec(ctx context.Context, logger log.Logger, alerts ...*type
 	}

 	if len(muted) > 0 {
 		level.Debug(logger).Log("msg", "Notifications will not be sent for muted alerts", "alerts", fmt.Sprintf("%v", muted))
+
+		var reason string
+		switch n.muter.(type) {
+		case *silence.Silencer:
+			reason = suppressedReasonSilence
+		case *inhibit.Inhibitor:
+			reason = suppressedReasonInhibition
+		default:
+		}
+		n.metrics.numNotificationSuppressedTotal.WithLabelValues(reason).Add(float64(len(muted)))
 	}

 	return ctx, filtered, nil
 }
@@ -894,13 +919,14 @@ func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*typ
 }

 type timeStage struct {
-	muter types.TimeMuter
+	muter   types.TimeMuter
+	metrics *Metrics
 }

 type TimeMuteStage timeStage

-func NewTimeMuteStage(m types.TimeMuter) *TimeMuteStage {
-	return &TimeMuteStage{m}
+func NewTimeMuteStage(m types.TimeMuter, metrics *Metrics) *TimeMuteStage {
+	return &TimeMuteStage{m, metrics}
 }

 // Exec implements the stage interface for TimeMuteStage.
@@ -927,7 +953,8 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type
 	// If the current time is inside a mute time, all alerts are removed from the pipeline.
 	if muted {
-		level.Debug(l).Log("msg", "Notifications not sent, route is within mute time")
+		tms.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonMuteTimeInterval).Add(float64(len(alerts)))
+		level.Debug(l).Log("msg", "Notifications not sent, route is within mute time", "alerts", len(alerts))
 		return ctx, nil, nil
 	}
 	return ctx, alerts, nil
@@ -935,8 +962,8 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type
 }

 type TimeActiveStage timeStage

-func NewTimeActiveStage(m types.TimeMuter) *TimeActiveStage {
-	return &TimeActiveStage{m}
+func NewTimeActiveStage(m types.TimeMuter, metrics *Metrics) *TimeActiveStage {
+	return &TimeActiveStage{m, metrics}
 }

 // Exec implements the stage interface for TimeActiveStage.
@@ -964,7 +991,8 @@ func (tas TimeActiveStage) Exec(ctx context.Context, l log.Logger, alerts ...*ty
 	// If the current time is not inside an active time, all alerts are removed from the pipeline
 	if !muted {
-		level.Debug(l).Log("msg", "Notifications not sent, route is not within active time")
+		tas.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonActiveTimeInterval).Add(float64(len(alerts)))
+		level.Debug(l).Log("msg", "Notifications not sent, route is not within active time", "alerts", len(alerts))
 		return ctx, nil, nil
 	}

notify/notify_test.go

@@ -666,7 +666,8 @@ func TestMuteStage(t *testing.T) {
 		return ok
 	})

-	stage := NewMuteStage(muter)
+	metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
+	stage := NewMuteStage(muter, metrics)

 	in := []model.LabelSet{
 		{},
@@ -705,6 +706,10 @@ func TestMuteStage(t *testing.T) {
 	if !reflect.DeepEqual(got, out) {
 		t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
 	}
+	suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(in) - len(got)) != suppressed {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressed)
+	}
 }

 func TestMuteStageWithSilences(t *testing.T) {
@@ -720,9 +725,11 @@ func TestMuteStageWithSilences(t *testing.T) {
 		t.Fatal(err)
 	}

-	marker := types.NewMarker(prometheus.NewRegistry())
+	reg := prometheus.NewRegistry()
+	marker := types.NewMarker(reg)
 	silencer := silence.NewSilencer(silences, marker, log.NewNopLogger())
-	stage := NewMuteStage(silencer)
+	metrics := NewMetrics(reg, featurecontrol.NoopFlags{})
+	stage := NewMuteStage(silencer, metrics)

 	in := []model.LabelSet{
 		{},
@@ -765,6 +772,10 @@ func TestMuteStageWithSilences(t *testing.T) {
 	if !reflect.DeepEqual(got, out) {
 		t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
 	}
+	suppressedRoundOne := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(in) - len(got)) != suppressedRoundOne {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundOne)
+	}

 	// Do it again to exercise the version tracking of silences.
 	_, alerts, err = stage.Exec(context.Background(), log.NewNopLogger(), inAlerts...)
@@ -781,6 +792,11 @@ func TestMuteStageWithSilences(t *testing.T) {
 		t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
 	}

+	suppressedRoundTwo := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(in) - len(got) + suppressedRoundOne) != suppressedRoundTwo {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got) + suppressedRoundOne), suppressedRoundTwo)
+	}
+
 	// Expire the silence and verify that no alerts are silenced now.
 	if err := silences.Expire(silID); err != nil {
 		t.Fatal(err)
@@ -798,6 +814,10 @@ func TestMuteStageWithSilences(t *testing.T) {
 	if !reflect.DeepEqual(got, in) {
 		t.Fatalf("Unmuting failed, expected: %v\ngot %v", in, got)
 	}
+	suppressedRoundThree := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(in) - len(got) + suppressedRoundTwo) != suppressedRoundThree {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got) + suppressedRoundTwo), suppressedRoundThree)
+	}
 }

 func TestTimeMuteStage(t *testing.T) {
@@ -874,7 +894,8 @@ func TestTimeMuteStage(t *testing.T) {
 	}
 	m := map[string][]timeinterval.TimeInterval{"test": intervals}
 	intervener := timeinterval.NewIntervener(m)
-	stage := NewTimeMuteStage(intervener)
+	metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
+	stage := NewTimeMuteStage(intervener, metrics)

 	outAlerts := []*types.Alert{}
 	nonMuteCount := 0
@@ -908,6 +929,10 @@ func TestTimeMuteStage(t *testing.T) {
 	if len(outAlerts) != nonMuteCount {
 		t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
 	}
+	suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(cases) - nonMuteCount) != suppressed {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
+	}
 }

 func TestTimeActiveStage(t *testing.T) {
@@ -933,6 +958,11 @@ func TestTimeActiveStage(t *testing.T) {
 			labels:     model.LabelSet{"mute": "me"},
 			shouldMute: true,
 		},
+		{
+			fireTime:   "02 Dec 20 16:59 +0000",
+			labels:     model.LabelSet{"mute": "me"},
+			shouldMute: true,
+		},
 		{
 			// Tuesday before 5pm
 			fireTime:   "01 Dec 20 16:59 +0000",
@@ -959,7 +989,8 @@ func TestTimeActiveStage(t *testing.T) {
 	}
 	m := map[string][]timeinterval.TimeInterval{"test": intervals}
 	intervener := timeinterval.NewIntervener(m)
-	stage := NewTimeActiveStage(intervener)
+	metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
+	stage := NewTimeActiveStage(intervener, metrics)

 	outAlerts := []*types.Alert{}
 	nonMuteCount := 0
@@ -993,6 +1024,10 @@ func TestTimeActiveStage(t *testing.T) {
 	if len(outAlerts) != nonMuteCount {
 		t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
 	}
+	suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(cases) - nonMuteCount) != suppressed {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
+	}
 }

 func BenchmarkHashAlert(b *testing.B) {