Mirror of https://github.com/prometheus/alertmanager
feat: add counter to track alerts dropped outside of time_intervals (#3565)
* feat: add counter to track alerts dropped outside of time_intervals

  Addresses: #3512

  This adds a new counter metric `alertmanager_alerts_supressed_total` that is
  incremented by `len(alerts)` when an alert is suppressed for being outside of
  a time_interval, i.e. inside of a mute_time_intervals or outside of an
  active_time_intervals.

  Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* test: add time interval suppression metric checks for notify

  Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* test: fix failure message log values in notifier

  Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* ref: address PR feedback for #3565

  Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* fix: track suppressed notifications metric for inhibit/silence

  Based on PR feedback:
  https://github.com/prometheus/alertmanager/pull/3565/files#r1393068026

  Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* fix: broken notifier tests

  - Fixed the metric count check to properly compare the difference between the
    input and output notifications of the suppression stage against the
    suppression metric; it was previously inverted, comparing against how many
    notifications it suppressed.
  - Stopped using `Reset()` to compare collection counts between the multiple
    stages executed in `TestMuteStageWithSilences()`. The intent was to compare
    a clean metric collection after each stage execution, but the final stage,
    where all silences are lifted, results in no metric being created in the
    test, causing `prom_testutil.ToFloat64()` to panic. Changed to separate
    vars to check counts between each stage, taking prior counts into account.

  Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>

* rename metric and add constants

  Signed-off-by: gotjosh <josue.abreu@gmail.com>

---------

Signed-off-by: TJ Hoplock <t.hoplock@gmail.com>
Signed-off-by: gotjosh <josue.abreu@gmail.com>
Co-authored-by: gotjosh <josue.abreu@gmail.com>
This commit is contained in:
parent f69a508665
commit f00025d037
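The core of the change is a Prometheus counter partitioned by a `reason` label, which each suppression point bumps by the number of alerts it dropped. As a point of reference, here is a minimal, self-contained sketch of that pattern with the Go client library; the package and variable names are illustrative and are not the Alertmanager code itself.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	// A counter vector keyed by a "reason" label, analogous to the new
	// alertmanager_notifications_suppressed_total metric.
	suppressed := prometheus.NewCounterVec(prometheus.CounterOpts{
		Namespace: "example",
		Name:      "notifications_suppressed_total",
		Help:      "The total number of notifications suppressed, partitioned by reason.",
	}, []string{"reason"})

	reg := prometheus.NewRegistry()
	reg.MustRegister(suppressed)

	// A suppression point adds the number of alerts it dropped.
	suppressed.WithLabelValues("mute_time_interval").Add(3)

	fmt.Println(testutil.ToFloat64(suppressed.WithLabelValues("mute_time_interval"))) // 3
}

In the diff below, the equivalent counter is created in NewMetrics and incremented from MuteStage, TimeMuteStage, and TimeActiveStage.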
notify/notify.go

@@ -251,6 +251,7 @@ type Metrics struct {
 	numTotalFailedNotifications        *prometheus.CounterVec
 	numNotificationRequestsTotal       *prometheus.CounterVec
 	numNotificationRequestsFailedTotal *prometheus.CounterVec
+	numNotificationSuppressedTotal     *prometheus.CounterVec
 	notificationLatencySeconds         *prometheus.HistogramVec

 	ff featurecontrol.Flagger
@@ -284,6 +285,11 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
 			Name:      "notification_requests_failed_total",
 			Help:      "The total number of failed notification requests.",
 		}, labels),
+		numNotificationSuppressedTotal: prometheus.NewCounterVec(prometheus.CounterOpts{
+			Namespace: "alertmanager",
+			Name:      "notifications_suppressed_total",
+			Help:      "The total number of notifications suppressed for being outside of active time intervals or within muted time intervals.",
+		}, []string{"reason"}),
 		notificationLatencySeconds: prometheus.NewHistogramVec(prometheus.HistogramOpts{
 			Namespace: "alertmanager",
 			Name:      "notification_latency_seconds",
@@ -296,7 +302,7 @@ func NewMetrics(r prometheus.Registerer, ff featurecontrol.Flagger) *Metrics {
 	r.MustRegister(
 		m.numNotifications, m.numTotalFailedNotifications,
 		m.numNotificationRequestsTotal, m.numNotificationRequestsFailedTotal,
-		m.notificationLatencySeconds,
+		m.numNotificationSuppressedTotal, m.notificationLatencySeconds,
 	)

 	return m
@@ -381,10 +387,10 @@ func (pb *PipelineBuilder) New(
 	rs := make(RoutingStage, len(receivers))

 	ms := NewGossipSettleStage(peer)
-	is := NewMuteStage(inhibitor)
-	tas := NewTimeActiveStage(intervener)
-	tms := NewTimeMuteStage(intervener)
-	ss := NewMuteStage(silencer)
+	is := NewMuteStage(inhibitor, pb.metrics)
+	tas := NewTimeActiveStage(intervener, pb.metrics)
+	tms := NewTimeMuteStage(intervener, pb.metrics)
+	ss := NewMuteStage(silencer, pb.metrics)

 	for name := range receivers {
 		st := createReceiverStage(name, receivers[name], wait, notificationLog, pb.metrics)
@@ -507,14 +513,22 @@ func (n *GossipSettleStage) Exec(ctx context.Context, _ log.Logger, alerts ...*t
 	return ctx, alerts, nil
 }

+const (
+	suppressedReasonSilence            = "silence"
+	suppressedReasonInhibition         = "inhibition"
+	suppressedReasonMuteTimeInterval   = "mute_time_interval"
+	suppressedReasonActiveTimeInterval = "active_time_interval"
+)
+
 // MuteStage filters alerts through a Muter.
 type MuteStage struct {
-	muter types.Muter
+	muter   types.Muter
+	metrics *Metrics
 }

 // NewMuteStage return a new MuteStage.
-func NewMuteStage(m types.Muter) *MuteStage {
-	return &MuteStage{muter: m}
+func NewMuteStage(m types.Muter, metrics *Metrics) *MuteStage {
+	return &MuteStage{muter: m, metrics: metrics}
 }

 // Exec implements the Stage interface.
@@ -535,7 +549,18 @@ func (n *MuteStage) Exec(ctx context.Context, logger log.Logger, alerts ...*type
 	}
 	if len(muted) > 0 {
 		level.Debug(logger).Log("msg", "Notifications will not be sent for muted alerts", "alerts", fmt.Sprintf("%v", muted))
+
+		var reason string
+		switch n.muter.(type) {
+		case *silence.Silencer:
+			reason = suppressedReasonSilence
+		case *inhibit.Inhibitor:
+			reason = suppressedReasonInhibition
+		default:
+		}
+		n.metrics.numNotificationSuppressedTotal.WithLabelValues(reason).Add(float64(len(muted)))
 	}

 	return ctx, filtered, nil
 }

@@ -894,13 +919,14 @@ func (n SetNotifiesStage) Exec(ctx context.Context, l log.Logger, alerts ...*typ
 }

 type timeStage struct {
-	muter types.TimeMuter
+	muter   types.TimeMuter
+	metrics *Metrics
 }

 type TimeMuteStage timeStage

-func NewTimeMuteStage(m types.TimeMuter) *TimeMuteStage {
-	return &TimeMuteStage{m}
+func NewTimeMuteStage(m types.TimeMuter, metrics *Metrics) *TimeMuteStage {
+	return &TimeMuteStage{m, metrics}
 }

 // Exec implements the stage interface for TimeMuteStage.
@@ -927,7 +953,8 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type

 	// If the current time is inside a mute time, all alerts are removed from the pipeline.
 	if muted {
-		level.Debug(l).Log("msg", "Notifications not sent, route is within mute time")
+		tms.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonMuteTimeInterval).Add(float64(len(alerts)))
+		level.Debug(l).Log("msg", "Notifications not sent, route is within mute time", "alerts", len(alerts))
 		return ctx, nil, nil
 	}
 	return ctx, alerts, nil
@@ -935,8 +962,8 @@ func (tms TimeMuteStage) Exec(ctx context.Context, l log.Logger, alerts ...*type

 type TimeActiveStage timeStage

-func NewTimeActiveStage(m types.TimeMuter) *TimeActiveStage {
-	return &TimeActiveStage{m}
+func NewTimeActiveStage(m types.TimeMuter, metrics *Metrics) *TimeActiveStage {
+	return &TimeActiveStage{m, metrics}
 }

 // Exec implements the stage interface for TimeActiveStage.
@@ -964,7 +991,8 @@ func (tas TimeActiveStage) Exec(ctx context.Context, l log.Logger, alerts ...*ty

 	// If the current time is not inside an active time, all alerts are removed from the pipeline
 	if !muted {
-		level.Debug(l).Log("msg", "Notifications not sent, route is not within active time")
+		tas.metrics.numNotificationSuppressedTotal.WithLabelValues(suppressedReasonActiveTimeInterval).Add(float64(len(alerts)))
+		level.Debug(l).Log("msg", "Notifications not sent, route is not within active time", "alerts", len(alerts))
 		return ctx, nil, nil
 	}

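The MuteStage changes above derive the `reason` label from the concrete Muter the stage wraps. Detached from the Alertmanager types, the pattern looks roughly like the sketch below; the interface and the two stand-in implementations are made up for illustration, only the label values match the constants added in the diff.

package main

import "fmt"

// Muter is a stand-in for the interface the stage wraps.
type Muter interface {
	Mutes(labels map[string]string) bool
}

type exampleSilencer struct{}

func (exampleSilencer) Mutes(map[string]string) bool { return true }

type exampleInhibitor struct{}

func (exampleInhibitor) Mutes(map[string]string) bool { return false }

// suppressionReason mirrors the type switch in MuteStage.Exec: the label
// value depends on which concrete muter suppressed the alerts.
func suppressionReason(m Muter) string {
	switch m.(type) {
	case exampleSilencer:
		return "silence"
	case exampleInhibitor:
		return "inhibition"
	default:
		return "" // unknown muters leave the reason label empty
	}
}

func main() {
	fmt.Println(suppressionReason(exampleSilencer{}))  // silence
	fmt.Println(suppressionReason(exampleInhibitor{})) // inhibition
}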
notify/notify_test.go

@@ -666,7 +666,8 @@ func TestMuteStage(t *testing.T) {
 		return ok
 	})

-	stage := NewMuteStage(muter)
+	metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
+	stage := NewMuteStage(muter, metrics)

 	in := []model.LabelSet{
 		{},
@@ -705,6 +706,10 @@ func TestMuteStage(t *testing.T) {
 	if !reflect.DeepEqual(got, out) {
 		t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
 	}
+	suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(in) - len(got)) != suppressed {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressed)
+	}
 }

 func TestMuteStageWithSilences(t *testing.T) {
@@ -720,9 +725,11 @@ func TestMuteStageWithSilences(t *testing.T) {
 		t.Fatal(err)
 	}

-	marker := types.NewMarker(prometheus.NewRegistry())
+	reg := prometheus.NewRegistry()
+	marker := types.NewMarker(reg)
 	silencer := silence.NewSilencer(silences, marker, log.NewNopLogger())
-	stage := NewMuteStage(silencer)
+	metrics := NewMetrics(reg, featurecontrol.NoopFlags{})
+	stage := NewMuteStage(silencer, metrics)

 	in := []model.LabelSet{
 		{},
@@ -765,6 +772,10 @@ func TestMuteStageWithSilences(t *testing.T) {
 	if !reflect.DeepEqual(got, out) {
 		t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
 	}
+	suppressedRoundOne := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(in) - len(got)) != suppressedRoundOne {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundOne)
+	}

 	// Do it again to exercise the version tracking of silences.
 	_, alerts, err = stage.Exec(context.Background(), log.NewNopLogger(), inAlerts...)
@@ -781,6 +792,11 @@ func TestMuteStageWithSilences(t *testing.T) {
 		t.Fatalf("Muting failed, expected: %v\ngot %v", out, got)
 	}

+	suppressedRoundTwo := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(in) - len(got) + suppressedRoundOne) != suppressedRoundTwo {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundTwo)
+	}
+
 	// Expire the silence and verify that no alerts are silenced now.
 	if err := silences.Expire(silID); err != nil {
 		t.Fatal(err)
@@ -798,6 +814,10 @@ func TestMuteStageWithSilences(t *testing.T) {
 	if !reflect.DeepEqual(got, in) {
 		t.Fatalf("Unmuting failed, expected: %v\ngot %v", in, got)
 	}
+	suppressedRoundThree := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(in) - len(got) + suppressedRoundTwo) != suppressedRoundThree {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(in) - len(got)), suppressedRoundThree)
+	}
 }

 func TestTimeMuteStage(t *testing.T) {
@@ -874,7 +894,8 @@ func TestTimeMuteStage(t *testing.T) {
 	}
 	m := map[string][]timeinterval.TimeInterval{"test": intervals}
 	intervener := timeinterval.NewIntervener(m)
-	stage := NewTimeMuteStage(intervener)
+	metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
+	stage := NewTimeMuteStage(intervener, metrics)

 	outAlerts := []*types.Alert{}
 	nonMuteCount := 0
@@ -908,6 +929,10 @@ func TestTimeMuteStage(t *testing.T) {
 	if len(outAlerts) != nonMuteCount {
 		t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
 	}
+	suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(cases) - nonMuteCount) != suppressed {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
+	}
 }

 func TestTimeActiveStage(t *testing.T) {
@@ -933,6 +958,11 @@ func TestTimeActiveStage(t *testing.T) {
 			labels:     model.LabelSet{"mute": "me"},
 			shouldMute: true,
 		},
+		{
+			fireTime:   "02 Dec 20 16:59 +0000",
+			labels:     model.LabelSet{"mute": "me"},
+			shouldMute: true,
+		},
 		{
 			// Tuesday before 5pm
 			fireTime: "01 Dec 20 16:59 +0000",
@@ -959,7 +989,8 @@ func TestTimeActiveStage(t *testing.T) {
 	}
 	m := map[string][]timeinterval.TimeInterval{"test": intervals}
 	intervener := timeinterval.NewIntervener(m)
-	stage := NewTimeActiveStage(intervener)
+	metrics := NewMetrics(prometheus.NewRegistry(), featurecontrol.NoopFlags{})
+	stage := NewTimeActiveStage(intervener, metrics)

 	outAlerts := []*types.Alert{}
 	nonMuteCount := 0
@@ -993,6 +1024,10 @@ func TestTimeActiveStage(t *testing.T) {
 	if len(outAlerts) != nonMuteCount {
 		t.Fatalf("Expected %d alerts after time mute stage but got %d", nonMuteCount, len(outAlerts))
 	}
+	suppressed := int(prom_testutil.ToFloat64(metrics.numNotificationSuppressedTotal))
+	if (len(cases) - nonMuteCount) != suppressed {
+		t.Fatalf("Expected %d alerts counted in suppressed metric but got %d", (len(cases) - nonMuteCount), suppressed)
+	}
 }

 func BenchmarkHashAlert(b *testing.B) {
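The tests above read the counter back with `prom_testutil.ToFloat64`, which only works while the collector carries exactly one metric; that is why the silence test tracks running totals across rounds instead of calling `Reset()` between stages, as noted in the commit message. A stripped-down version of that test pattern, outside the Alertmanager test suite and with illustrative names:

package example

import (
	"testing"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestSuppressedCounter(t *testing.T) {
	suppressed := prometheus.NewCounterVec(prometheus.CounterOpts{
		Name: "notifications_suppressed_total",
		Help: "The total number of notifications suppressed, partitioned by reason.",
	}, []string{"reason"})

	// First round: a stage suppresses two alerts.
	suppressed.WithLabelValues("silence").Add(2)
	roundOne := int(testutil.ToFloat64(suppressed))
	if roundOne != 2 {
		t.Fatalf("expected 2 suppressed notifications, got %d", roundOne)
	}

	// Second round: two more alerts are suppressed; the counter is cumulative,
	// so the check has to account for the previous round's total.
	suppressed.WithLabelValues("silence").Add(2)
	if roundTwo := int(testutil.ToFloat64(suppressed)); roundTwo != roundOne+2 {
		t.Fatalf("expected %d suppressed notifications, got %d", roundOne+2, roundTwo)
	}
}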