Simplify and fix notification grouping.

This commit changes the notification grouping behavior
to simply send all alerts of a group as soon as a single
one of them needs updating.

This fixes a critical bug which caused erroneous resolved
notifications to be sent.
Fabian Reinartz 2016-01-08 15:15:14 +01:00
parent 3de6b062a2
commit 11fae2a719
3 changed files with 188 additions and 139 deletions
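
In effect, the new rule is: check each alert of a group against the last notification recorded for it, and as soon as any single member needs an update, notify for the whole group and record fresh notification info for all of them. Below is a minimal standalone sketch of that rule; alert, notifyRecord, and the freestanding hasUpdate here are simplified illustrative stand-ins, not Alertmanager's types.Alert, types.NotifyInfo, or method.

package main

import (
    "fmt"
    "time"
)

// alert and notifyRecord are simplified stand-ins for Alertmanager's
// types.Alert and types.NotifyInfo.
type alert struct {
    name     string
    resolved bool
}

type notifyRecord struct {
    resolved  bool
    timestamp time.Time
}

// hasUpdate mirrors the rule this commit introduces: never notified and
// already resolved -> skip; resolved notification already delivered ->
// skip; firing and delivered recently -> skip until the repeat interval
// has passed; everything else -> send.
func hasUpdate(a alert, last *notifyRecord, now time.Time, interval time.Duration) bool {
    if last == nil {
        return !a.resolved
    }
    if a.resolved {
        return !last.resolved
    }
    if last.resolved {
        return true
    }
    return now.After(last.timestamp.Add(interval))
}

func main() {
    now := time.Now()
    interval := 100 * time.Minute

    group := []alert{
        {name: "a", resolved: false},
        {name: "b", resolved: true},
    }
    records := []*notifyRecord{
        {resolved: false, timestamp: now.Add(-15 * time.Minute)}, // sent recently
        {resolved: false, timestamp: now.Add(-time.Minute)},      // fired, now resolved
    }

    // The whole group is sent as soon as any single member has an update.
    send := false
    for i, a := range group {
        if hasUpdate(a, records[i], now, interval) {
            send = true
            break
        }
    }
    fmt.Println("send whole group:", send) // true: "b" was just resolved
}

The per-alert skip conditions are unchanged by the commit; what changes is that they now only decide whether the batch is sent at all, never which subset of it.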

@@ -213,6 +213,29 @@ func Dedup(notifies provider.Notifies, n Notifier) *DedupingNotifier {
     return &DedupingNotifier{notifies: notifies, notifier: n}
 }
 
+// hasUpdate checks an alert against the last notification that was made
+// about it.
+func (n *DedupingNotifier) hasUpdate(alert *types.Alert, last *types.NotifyInfo, now time.Time, interval time.Duration) bool {
+    if last != nil {
+        if alert.Resolved() {
+            if last.Resolved {
+                return false
+            }
+        } else if !last.Resolved {
+            // Do not send again if last was delivered unless
+            // the repeat interval has already passed.
+            if !now.After(last.Timestamp.Add(interval)) {
+                return false
+            }
+        }
+    } else if alert.Resolved() {
+        // If the alert is resolved but we never notified about it firing,
+        // there is nothing to do.
+        return false
+    }
+
+    return true
+}
+
 // Notify implements the Notifier interface.
 func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error {
     name, ok := Receiver(ctx)
@@ -235,57 +258,27 @@ func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error {
         fps = append(fps, a.Fingerprint())
     }
 
-    notifies, err := n.notifies.Get(name, fps...)
+    notifyInfo, err := n.notifies.Get(name, fps...)
     if err != nil {
         return err
     }
 
-    var (
-        doResend    bool
-        resendQueue []*types.Alert
-        filtered    []*types.Alert
-    )
-    for i, a := range alerts {
-        last := notifies[i]
-        if last != nil {
-            if a.Resolved() {
-                if last.Resolved {
-                    continue
-                }
-            } else if !last.Resolved {
-                // Do not send again if last was delivered unless
-                // the repeat interval has already passed.
-                if !now.After(last.Timestamp.Add(repeatInterval)) {
-                    // To not repeat initial batch fragmentation after the repeat interval
-                    // has passed, store them and send them anyway if one of the other
-                    // alerts has already passed the repeat interval.
-                    // This way we unify batches again.
-                    resendQueue = append(resendQueue, a)
-                    continue
-                } else {
-                    doResend = true
-                }
-            }
-        } else if a.Resolved() {
-            // If the alert is resolved but we never notified about it firing,
-            // there is nothing to do.
-            continue
+    // If we have to notify about any of the alerts, we send a notification
+    // for the entire batch.
+    var send bool
+    for i, alert := range alerts {
+        if n.hasUpdate(alert, notifyInfo[i], now, repeatInterval) {
+            send = true
+            break
         }
-        filtered = append(filtered, a)
     }
 
-    // As we are resending an alert anyway, resend all of them even if their
-    // repeat interval has not yet passed.
-    if doResend {
-        filtered = append(filtered, resendQueue...)
+    if !send {
+        return nil
     }
 
     var newNotifies []*types.NotifyInfo
 
-    for _, a := range filtered {
+    for _, a := range alerts {
         newNotifies = append(newNotifies, &types.NotifyInfo{
             Alert:    a.Fingerprint(),
             Receiver: name,
@@ -294,7 +287,7 @@ func (n *DedupingNotifier) Notify(ctx context.Context, alerts ...*types.Alert) error {
         })
     }
 
-    if err := n.notifier.Notify(ctx, filtered...); err != nil {
+    if err := n.notifier.Notify(ctx, alerts...); err != nil {
        return err
    }
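
One subtlety in the hasUpdate check above: time.Time.After is strict, so !now.After(last.Timestamp.Add(interval)) still suppresses a resend at the instant exactly one repeat interval has elapsed; the alert becomes due only strictly after that point. A tiny illustrative check of the boundary (not part of the commit):

package main

import (
    "fmt"
    "time"
)

func main() {
    last := time.Now()
    interval := 5 * time.Second
    deadline := last.Add(interval)

    // At exactly last+interval the alert is still deduplicated ...
    fmt.Println(deadline.After(deadline)) // false -> suppressed
    // ... and one tick later it is due again.
    fmt.Println(deadline.Add(time.Nanosecond).After(deadline)) // true -> resent
}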

@@ -43,6 +43,129 @@ func (n *failNotifier) Notify(ctx context.Context, as ...*types.Alert) error {
     return fmt.Errorf("some error")
 }
 
+func TestDedupingNotifierHasUpdate(t *testing.T) {
+    var (
+        n        = &DedupingNotifier{}
+        now      = time.Now()
+        interval = 100 * time.Minute
+    )
+    cases := []struct {
+        inAlert      *types.Alert
+        inNotifyInfo *types.NotifyInfo
+        result       bool
+    }{
+        // A new alert about which there's no previous notification information.
+        {
+            inAlert: &types.Alert{
+                Alert: model.Alert{
+                    Labels:   model.LabelSet{"alertname": "a"},
+                    StartsAt: now.Add(-10 * time.Minute),
+                },
+            },
+            inNotifyInfo: nil,
+            result:       true,
+        },
+        // A new alert about which there's no previous notification information.
+        // It is already resolved, so there's no use in sending a notification.
+        {
+            inAlert: &types.Alert{
+                Alert: model.Alert{
+                    Labels:   model.LabelSet{"alertname": "a"},
+                    StartsAt: now.Add(-10 * time.Minute),
+                    EndsAt:   now,
+                },
+            },
+            inNotifyInfo: nil,
+            result:       false,
+        },
+        // An alert that has been firing is now resolved for the first time.
+        {
+            inAlert: &types.Alert{
+                Alert: model.Alert{
+                    Labels:   model.LabelSet{"alertname": "a"},
+                    StartsAt: now.Add(-10 * time.Minute),
+                    EndsAt:   now,
+                },
+            },
+            inNotifyInfo: &types.NotifyInfo{
+                Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
+                Resolved:  false,
+                Timestamp: now.Add(-time.Minute),
+            },
+            result: true,
+        },
+        // A resolved alert for which we have already sent a resolved notification.
+        {
+            inAlert: &types.Alert{
+                Alert: model.Alert{
+                    Labels:   model.LabelSet{"alertname": "a"},
+                    StartsAt: now.Add(-10 * time.Minute),
+                    EndsAt:   now,
+                },
+            },
+            inNotifyInfo: &types.NotifyInfo{
+                Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
+                Resolved:  true,
+                Timestamp: now.Add(-time.Minute),
+            },
+            result: false,
+        },
+        // An alert that was resolved last time but is now firing again.
+        {
+            inAlert: &types.Alert{
+                Alert: model.Alert{
+                    Labels:   model.LabelSet{"alertname": "a"},
+                    StartsAt: now.Add(-3 * time.Minute),
+                },
+            },
+            inNotifyInfo: &types.NotifyInfo{
+                Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
+                Resolved:  true,
+                Timestamp: now.Add(-4 * time.Minute),
+            },
+            result: true,
+        },
+        // A firing alert about which we already notified. The last notification
+        // is less than the repeat interval ago.
+        {
+            inAlert: &types.Alert{
+                Alert: model.Alert{
+                    Labels:   model.LabelSet{"alertname": "a"},
+                    StartsAt: now.Add(-10 * time.Minute),
+                },
+            },
+            inNotifyInfo: &types.NotifyInfo{
+                Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
+                Resolved:  false,
+                Timestamp: now.Add(-15 * time.Minute),
+            },
+            result: false,
+        },
+        // A firing alert about which we already notified. The last notification
+        // is more than the repeat interval ago.
+        {
+            inAlert: &types.Alert{
+                Alert: model.Alert{
+                    Labels:   model.LabelSet{"alertname": "a"},
+                    StartsAt: now.Add(-10 * time.Minute),
+                },
+            },
+            inNotifyInfo: &types.NotifyInfo{
+                Alert:     model.LabelSet{"alertname": "a"}.Fingerprint(),
+                Resolved:  false,
+                Timestamp: now.Add(-115 * time.Minute),
+            },
+            result: true,
+        },
+    }
+
+    for i, c := range cases {
+        if n.hasUpdate(c.inAlert, c.inNotifyInfo, now, interval) != c.result {
+            t.Errorf("unexpected hasUpdate result for case %d", i)
+        }
+    }
+}
+
 func TestDedupingNotifier(t *testing.T) {
     var (
         record = &recordNotifier{}
@@ -66,69 +189,19 @@ func TestDedupingNotifier(t *testing.T) {
                 Labels: model.LabelSet{"alertname": "1"},
                 EndsAt: now.Add(-5 * time.Minute),
             },
-        }, {
-            Alert: model.Alert{
-                Labels: model.LabelSet{"alertname": "2"},
-                EndsAt: now.Add(-9 * time.Minute),
-            },
-        }, {
-            Alert: model.Alert{
-                Labels: model.LabelSet{"alertname": "3"},
-                EndsAt: now.Add(-10 * time.Minute),
-            },
-        }, {
-            Alert: model.Alert{
-                Labels: model.LabelSet{"alertname": "4"},
-            },
-        }, {
-            Alert: model.Alert{
-                Labels: model.LabelSet{"alertname": "5"},
-            },
         },
     }
 
-    var fps []model.Fingerprint
-    for _, a := range alerts {
-        fps = append(fps, a.Fingerprint())
-    }
-
     // Set an initial NotifyInfo to ensure that on notification failure
     // nothing changes.
     nsBefore := []*types.NotifyInfo{
-        // The first is a new alert starting now.
         nil,
-        // The second alert was not previously notified about and
-        // is already resolved.
-        nil,
-        // The third alert is an attempt to resolve a previously
-        // firing alert.
         {
-            Alert:     fps[2],
+            Alert:     alerts[1].Fingerprint(),
             Receiver:  "name",
             Resolved:  false,
             Timestamp: now.Add(-10 * time.Minute),
         },
-        // The fourth alert is an attempt to resolve an alert again
-        // even though the previous notification succeeded.
-        {
-            Alert:     fps[3],
-            Receiver:  "name",
-            Resolved:  true,
-            Timestamp: now.Add(-10 * time.Minute),
-        },
-        // The fifth alert resends a previously successful notification
-        // that was longer ago than the repeat interval.
-        {
-            Alert:     fps[4],
-            Receiver:  "name",
-            Resolved:  false,
-            Timestamp: now.Add(-110 * time.Minute),
-        },
-        // The sixth alert is firing again after being resolved before.
-        {
-            Alert:     fps[5],
-            Receiver:  "name",
-            Resolved:  true,
-            Timestamp: now.Add(3 * time.Minute),
-        },
     }
 
     if err := notifies.Set(nsBefore...); err != nil {
@@ -140,7 +213,7 @@ func TestDedupingNotifier(t *testing.T) {
         t.Fatalf("Fail notifier did not fail")
     }
 
     // After a failing notify the notifies data must be unchanged.
-    nsCur, err := notifies.Get("name", fps...)
+    nsCur, err := notifies.Get("name", alerts[0].Fingerprint(), alerts[1].Fingerprint())
     if err != nil {
         t.Fatalf("Error getting notify info: %s", err)
     }
@@ -153,46 +226,29 @@ func TestDedupingNotifier(t *testing.T) {
         t.Fatalf("Notify failed: %s", err)
     }
 
-    alertsExp := []*types.Alert{
-        alerts[0],
-        alerts[2],
-        alerts[4],
-        alerts[5],
-    }
+    if !reflect.DeepEqual(record.alerts, alerts) {
+        t.Fatalf("Expected alerts %v, got %v", alerts, record.alerts)
+    }
+
+    nsCur, err = notifies.Get("name", alerts[0].Fingerprint(), alerts[1].Fingerprint())
+    if err != nil {
+        t.Fatalf("Error getting notifies: %s", err)
+    }
 
     nsAfter := []*types.NotifyInfo{
         {
-            Alert:    fps[0],
-            Receiver: "name",
-            Resolved: false,
-        },
-        nil,
-        {
-            Alert:    fps[2],
-            Receiver: "name",
-            Resolved: true,
-        },
-        nsBefore[3],
-        {
-            Alert:    fps[4],
-            Receiver: "name",
-            Resolved: false,
+            Alert:     alerts[0].Fingerprint(),
+            Receiver:  "name",
+            Resolved:  false,
+            Timestamp: now,
         },
         {
-            Alert:    fps[5],
-            Receiver: "name",
-            Resolved: false,
+            Alert:     alerts[1].Fingerprint(),
+            Receiver:  "name",
+            Resolved:  true,
+            Timestamp: now,
         },
     }
 
-    if !reflect.DeepEqual(record.alerts, alertsExp) {
-        t.Fatalf("Expected alerts %v, got %v", alertsExp, record.alerts)
-    }
-
-    nsCur, err = notifies.Get("name", fps...)
-    if err != nil {
-        t.Fatalf("Error getting notifies: %s", err)
-    }
-
     for i, after := range nsAfter {
         cur := nsCur[i]
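
The comparison loop this hunk ends on walks nsAfter and nsCur in parallel, which relies on notifies.Get returning exactly one NotifyInfo per requested fingerprint, in argument order, with nil for fingerprints it has never stored. A sketch of a store honoring that contract, assuming the test file's model and types imports; memStore is a hypothetical stand-in, not the provider package's implementation:

// memStore is a hypothetical in-memory stand-in for provider.Notifies.
type memStore struct {
    m map[string]map[model.Fingerprint]*types.NotifyInfo
}

// Get returns one record per fingerprint, in argument order;
// nil entries mark fingerprints that were never notified about.
func (s *memStore) Get(recv string, fps ...model.Fingerprint) ([]*types.NotifyInfo, error) {
    res := make([]*types.NotifyInfo, 0, len(fps))
    for _, fp := range fps {
        res = append(res, s.m[recv][fp]) // nil if absent
    }
    return res, nil
}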

@@ -229,19 +229,22 @@ receivers:
     am := at.Alertmanager(fmt.Sprintf(conf, wh.Address()))
 
     am.Push(At(1.1), Alert("alertname", "test1").Active(1))
-    am.Push(At(1.9), Alert("alertname", "test5").Active(1))
-    am.Push(At(2.3),
-        Alert("alertname", "test2").Active(1.5),
-        Alert("alertname", "test3").Active(1.5),
-        Alert("alertname", "test4").Active(1.6),
-    )
+    am.Push(At(1.7), Alert("alertname", "test5").Active(1))
 
     co.Want(Between(2.0, 2.5),
         Alert("alertname", "test1").Active(1),
         Alert("alertname", "test5").Active(1),
     )
 
-    // Only expect the new ones with the next group interval.
-    co.Want(Between(3, 3.5),
+    am.Push(At(3.3),
         Alert("alertname", "test2").Active(1.5),
         Alert("alertname", "test3").Active(1.5),
         Alert("alertname", "test4").Active(1.6),
     )
+
+    co.Want(Between(4.1, 4.5),
+        Alert("alertname", "test1").Active(1),
+        Alert("alertname", "test5").Active(1),
+        Alert("alertname", "test2").Active(1.5),
+        Alert("alertname", "test3").Active(1.5),
+        Alert("alertname", "test4").Active(1.6),
@@ -250,10 +253,7 @@ receivers:
     )
 
     // While no changes happen expect no additional notifications
     // until the 5s repeat interval has ended.
-    // The last three notifications should be sent with the first two even
-    // though their repeat interval has not yet passed. This way fragmented
-    // batches are unified and notification noise reduced.
-    co.Want(Between(7, 7.5),
+    co.Want(Between(9.1, 9.5),
         Alert("alertname", "test1").Active(1),
         Alert("alertname", "test5").Active(1),
         Alert("alertname", "test2").Active(1.5),