mirror of https://github.com/prometheus/alertmanager
Simplify and fix notification grouping.
This commit changes the notification grouping behavior to simply send all alerts of a group as soon as a single one of them needs updating. This fixes a critical bug which caused erroneous resolved notifications to be sent.
This commit is contained in: parent 3de6b062a2, commit 11fae2a719
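To make the change concrete before reading the diff: the deduper now asks one question per alert ("does this alert need a fresh notification?"), and if any alert in a group answers yes, the entire group is sent. The following standalone Go sketch condenses that per-alert decision from the hasUpdate logic in the patch; the struct types are simplified stand-ins for types.Alert and types.NotifyInfo, not the real Alertmanager types.

package main

import (
	"fmt"
	"time"
)

// Simplified stand-ins for types.Alert and types.NotifyInfo.
type alert struct{ resolved bool }

type notifyInfo struct {
	resolved  bool
	timestamp time.Time
}

// hasUpdate mirrors the decision added in this commit: given what was last
// sent about an alert, does it need a fresh notification?
func hasUpdate(a alert, last *notifyInfo, now time.Time, interval time.Duration) bool {
	if last != nil {
		if a.resolved {
			if last.resolved {
				return false // a resolved notification was already sent
			}
		} else if !last.resolved {
			// Still firing and already notified: only resend once the
			// repeat interval has passed.
			if !now.After(last.timestamp.Add(interval)) {
				return false
			}
		}
	} else if a.resolved {
		// Resolved but never notified as firing: nothing to do.
		return false
	}
	return true
}

func main() {
	now := time.Now()
	interval := 100 * time.Minute

	// A firing alert notified a minute ago: no update yet.
	fmt.Println(hasUpdate(alert{resolved: false}, &notifyInfo{timestamp: now.Add(-time.Minute)}, now, interval)) // false
	// The same alert, now resolved: a resolved notification is due.
	fmt.Println(hasUpdate(alert{resolved: true}, &notifyInfo{timestamp: now.Add(-time.Minute)}, now, interval)) // true
	// A resolved alert nobody was ever notified about: nothing to do.
	fmt.Println(hasUpdate(alert{resolved: true}, nil, now, interval)) // false
}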
@@ -213,6 +213,29 @@ func Dedup(notifies provider.Notifies, n Notifier) *DedupingNotifier {
	return &DedupingNotifier{notifies: notifies, notifier: n}
}

// hasUpdates checks an alert against the last notification that was made
// about it.
func (n *DedupingNotifier) hasUpdate(alert *types.Alert, last *types.NotifyInfo, now time.Time, interval time.Duration) bool {
	if last != nil {
		if alert.Resolved() {
			if last.Resolved {
				return false
			}
		} else if !last.Resolved {
			// Do not send again if last was delivered unless
			// the repeat interval has already passed.
			if !now.After(last.Timestamp.Add(interval)) {
				return false
			}
		}
	} else if alert.Resolved() {
		// If the alert is resolved but we never notified about it firing,
		// there is nothing to do.
		return false
	}
	return true
}

// Notify implements the Notifier interface.
func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error {
	name, ok := Receiver(ctx)
@@ -235,57 +258,27 @@ func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error {
		fps = append(fps, a.Fingerprint())
	}

	notifies, err := n.notifies.Get(name, fps...)
	notifyInfo, err := n.notifies.Get(name, fps...)
	if err != nil {
		return err
	}

	var (
		doResend    bool
		resendQueue []*types.Alert
		filtered    []*types.Alert
	)
	for i, a := range alerts {
		last := notifies[i]

		if last != nil {
			if a.Resolved() {
				if last.Resolved {
					continue
				}
			} else if !last.Resolved {
				// Do not send again if last was delivered unless
				// the repeat interval has already passed.
				if !now.After(last.Timestamp.Add(repeatInterval)) {
					// To not repeat initial batch fragmentation after the repeat interval
					// has passed, store them and send them anyway if on of the other
					// alerts has already passed the repeat interval.
					// This way we unify batches again.
					resendQueue = append(resendQueue, a)

					continue
				} else {
					doResend = true
				}
			}
		} else if a.Resolved() {
			// If the alert is resolved but we never notified about it firing,
			// there is nothing to do.
			continue
	// If we have to notify about any of the alerts, we send a notification
	// for the entire batch.
	var send bool
	for i, alert := range alerts {
		if n.hasUpdate(alert, notifyInfo[i], now, repeatInterval) {
			send = true
			break
		}

		filtered = append(filtered, a)
	}

	// As we are resending an alert anyway, resend all of them even if their
	// repeat interval has not yet passed.
	if doResend {
		filtered = append(filtered, resendQueue...)
	if !send {
		return nil
	}

	var newNotifies []*types.NotifyInfo

	for _, a := range filtered {
	for _, a := range alerts {
		newNotifies = append(newNotifies, &types.NotifyInfo{
			Alert:    a.Fingerprint(),
			Receiver: name,
@@ -294,7 +287,7 @@ func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error {
		})
	}

	if err := n.notifier.Notify(ctx, filtered...); err != nil {
	if err := n.notifier.Notify(ctx, alerts...); err != nil {
		return err
	}
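For orientation after the Notify changes above: DedupingNotifier wraps another Notifier, and with this commit the wrapped notifier now receives either the whole alert group or nothing at all. The snippet below is a self-contained sketch of that wrapping shape; the interface and types here are simplified stand-ins, not the real Notifier and provider.Notifies definitions from the repository.

package main

import (
	"context"
	"fmt"
)

// Simplified stand-in for the Notifier interface.
type notifier interface {
	notify(ctx context.Context, alerts ...string) error
}

// printNotifier plays the role of the wrapped (downstream) notifier.
type printNotifier struct{}

func (printNotifier) notify(ctx context.Context, alerts ...string) error {
	fmt.Println("sending group:", alerts)
	return nil
}

// dedupingNotifier forwards either the whole group or nothing, which is the
// behavior this commit introduces (no more per-alert filtering of the batch).
type dedupingNotifier struct {
	next      notifier
	hasUpdate func(alert string) bool // stand-in for the real hasUpdate check
}

func (d dedupingNotifier) notify(ctx context.Context, alerts ...string) error {
	for _, a := range alerts {
		if d.hasUpdate(a) {
			// One alert with an update is enough to send the entire group.
			return d.next.notify(ctx, alerts...)
		}
	}
	return nil // nothing changed: suppress the whole group
}

func main() {
	d := dedupingNotifier{
		next:      printNotifier{},
		hasUpdate: func(a string) bool { return a == "newly-resolved" },
	}
	_ = d.notify(context.Background(), "still-firing", "newly-resolved", "unchanged")
}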
@@ -43,6 +43,129 @@ func (n *failNotifier) Notify(ctx context.Context, as ...*types.Alert) error {
	return fmt.Errorf("some error")
}

func TestDedupingNotifierHasUpdate(t *testing.T) {
	var (
		n        = &DedupingNotifier{}
		now      = time.Now()
		interval = 100 * time.Minute
	)
	cases := []struct {
		inAlert      *types.Alert
		inNotifyInfo *types.NotifyInfo
		result       bool
	}{
		// A new alert about which there's no previous notification information.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
				},
			},
			inNotifyInfo: nil,
			result:       true,
		},
		// A new alert about which there's no previous notification information.
		// It is already resolved, so there's no use in sending a notification.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
					EndsAt:   now,
				},
			},
			inNotifyInfo: nil,
			result:       false,
		},
		// An alert that has been firing is now resolved for the first time.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
					EndsAt:   now,
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
				Resolved:  false,
				Timestamp: now.Add(-time.Minute),
			},
			result: true,
		},
		// A resolved alert for which we have already sent a resolved notification.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
					EndsAt:   now,
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
				Resolved:  true,
				Timestamp: now.Add(-time.Minute),
			},
			result: false,
		},
		// An alert that was resolved last time but is now firing again.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-3 * time.Minute),
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
				Resolved:  true,
				Timestamp: now.Add(-4 * time.Minute),
			},
			result: true,
		},
		// A firing alert about which we already notified. The last notification
		// is less than the repeat interval ago.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
				Resolved:  false,
				Timestamp: now.Add(-15 * time.Minute),
			},
			result: false,
		},
		// A firing alert about which we already notified. The last notification
		// is more than the repeat interval ago.
		{
			inAlert: &types.Alert{
				Alert: model.Alert{
					Labels:   model.LabelSet{"alertname": "a"},
					StartsAt: now.Add(-10 * time.Minute),
				},
			},
			inNotifyInfo: &types.NotifyInfo{
				Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
				Resolved:  false,
				Timestamp: now.Add(-115 * time.Minute),
			},
			result: true,
		},
	}

	for i, c := range cases {
		if n.hasUpdate(c.inAlert, c.inNotifyInfo, now, interval) != c.result {
			t.Errorf("unexpected hasUpdates result for case %d", i)
		}
	}
}

func TestDedupingNotifier(t *testing.T) {
	var (
		record = &recordNotifier{}
@@ -66,69 +189,19 @@ func TestDedupingNotifier(t *testing.T) {
				Labels: model.LabelSet{"alertname": "1"},
				EndsAt: now.Add(-5 * time.Minute),
			},
		}, {
			Alert: model.Alert{
				Labels: model.LabelSet{"alertname": "2"},
				EndsAt: now.Add(-9 * time.Minute),
			},
		}, {
			Alert: model.Alert{
				Labels: model.LabelSet{"alertname": "3"},
				EndsAt: now.Add(-10 * time.Minute),
			},
		}, {
			Alert: model.Alert{
				Labels: model.LabelSet{"alertname": "4"},
			},
		}, {
			Alert: model.Alert{
				Labels: model.LabelSet{"alertname": "5"},
			},
		},
	}

	var fps []model.Fingerprint
	for _, a := range alerts {
		fps = append(fps, a.Fingerprint())
	}

	// Set an initial NotifyInfo to ensure that on notification failure
	// nothing changes.
	nsBefore := []*types.NotifyInfo{
		// The first a new alert starting now.
		nil,
		// The second alert was not previously notified about and
		// is already resolved.
		nil,
		// The third alert is an attempt to resolve a previously
		// firing alert.
		{
			Alert:     fps[2],
			Alert:     alerts[1].Fingerprint(),
			Receiver:  "name",
			Resolved:  false,
			Timestamp: now.Add(-10 * time.Minute),
		},
		// The fourth alert is an attempt to resolve an alert again
		// even though the previous notification succeeded.
		{
			Alert:     fps[3],
			Receiver:  "name",
			Resolved:  true,
			Timestamp: now.Add(-10 * time.Minute),
		},
		// The fifth alert resends a previously successful notification
		// that was longer than ago than the repeat interval.
		{
			Alert:     fps[4],
			Receiver:  "name",
			Resolved:  false,
			Timestamp: now.Add(-110 * time.Minute),
		},
		// The sixth alert is a firing again after being resolved before.
		{
			Alert:     fps[5],
			Receiver:  "name",
			Resolved:  true,
			Timestamp: now.Add(3 * time.Minute),
		},
	}

	if err := notifies.Set(nsBefore...); err != nil {
@@ -140,7 +213,7 @@ func TestDedupingNotifier(t *testing.T) {
		t.Fatalf("Fail notifier did not fail")
	}
	// After a failing notify the notifies data must be unchanged.
	nsCur, err := notifies.Get("name", fps...)
	nsCur, err := notifies.Get("name", alerts[0].Fingerprint(), alerts[1].Fingerprint())
	if err != nil {
		t.Fatalf("Error getting notify info: %s", err)
	}
@@ -153,46 +226,29 @@ func TestDedupingNotifier(t *testing.T) {
		t.Fatalf("Notify failed: %s", err)
	}

	alertsExp := []*types.Alert{
		alerts[0],
		alerts[2],
		alerts[4],
		alerts[5],
	if !reflect.DeepEqual(record.alerts, alerts) {
		t.Fatalf("Expected alerts %v, got %v", alerts, record.alerts)
	}
	nsCur, err = notifies.Get("name", alerts[0].Fingerprint(), alerts[1].Fingerprint())
	if err != nil {
		t.Fatalf("Error getting notifies: %s", err)
	}

	nsAfter := []*types.NotifyInfo{
		{
			Alert:    fps[0],
			Receiver: "name",
			Resolved: false,
		},
		nil,
		{
			Alert:    fps[2],
			Receiver: "name",
			Resolved: true,
		},
		nsBefore[3],
		{
			Alert:    fps[4],
			Receiver: "name",
			Resolved: false,
			Alert:     alerts[0].Fingerprint(),
			Receiver:  "name",
			Resolved:  false,
			Timestamp: now,
		},
		{
			Alert:    fps[5],
			Receiver: "name",
			Resolved: false,
			Alert:     alerts[1].Fingerprint(),
			Receiver:  "name",
			Resolved:  true,
			Timestamp: now,
		},
	}

	if !reflect.DeepEqual(record.alerts, alertsExp) {
		t.Fatalf("Expected alerts %v, got %v", alertsExp, record.alerts)
	}
	nsCur, err = notifies.Get("name", fps...)
	if err != nil {
		t.Fatalf("Error getting notifies: %s", err)
	}

	for i, after := range nsAfter {
		cur := nsCur[i]
@@ -229,19 +229,22 @@ receivers:
	am := at.Alertmanager(fmt.Sprintf(conf, wh.Address()))

	am.Push(At(1.1), Alert("alertname", "test1").Active(1))
	am.Push(At(1.9), Alert("alertname", "test5").Active(1))
	am.Push(At(2.3),
		Alert("alertname", "test2").Active(1.5),
		Alert("alertname", "test3").Active(1.5),
		Alert("alertname", "test4").Active(1.6),
	)
	am.Push(At(1.7), Alert("alertname", "test5").Active(1))

	co.Want(Between(2.0, 2.5),
		Alert("alertname", "test1").Active(1),
		Alert("alertname", "test5").Active(1),
	)
	// Only expect the new ones with the next group interval.
	co.Want(Between(3, 3.5),

	am.Push(At(3.3),
		Alert("alertname", "test2").Active(1.5),
		Alert("alertname", "test3").Active(1.5),
		Alert("alertname", "test4").Active(1.6),
	)

	co.Want(Between(4.1, 4.5),
		Alert("alertname", "test1").Active(1),
		Alert("alertname", "test5").Active(1),
		Alert("alertname", "test2").Active(1.5),
		Alert("alertname", "test3").Active(1.5),
		Alert("alertname", "test4").Active(1.6),
@@ -250,10 +253,7 @@ receivers:
	// While no changes happen expect no additional notifications
	// until the 5s repeat interval has ended.

	// The last three notifications should sent with the first two even
	// though their repeat interval has not yet passed. This way fragmented
	// batches are unified and notification noise reduced.
	co.Want(Between(7, 7.5),
	co.Want(Between(9.1, 9.5),
		Alert("alertname", "test1").Active(1),
		Alert("alertname", "test5").Active(1),
		Alert("alertname", "test2").Active(1.5),