// Copyright 2018 Prometheus Team
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package test

import (
	"fmt"
	"testing"
	"time"

	. "github.com/prometheus/alertmanager/test/with_api_v2"
)

// This file contains acceptance tests around the basic sending logic
// for notifications, which includes batching and ensuring that each
// notification is eventually sent at least once and ideally exactly
// once.
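//
// Each test follows the same pattern: build an AcceptanceTest with a timing
// Tolerance, attach a Collector and a mock Webhook, start an Alertmanager
// cluster with a config rendered against the webhook address, Push alerts at
// given times (in fractions of seconds), declare Want expectations for time
// windows, then Run the flow and Check the collected notifications.
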
func testMergeAlerts(t *testing.T, endsAt bool) {
	t.Parallel()

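	// When endsAt is true, this helper mimics Prometheus >= 2.4.0 by attaching
	// an EndsAt three seconds after StartsAt to every firing alert (see
	// TestMergeAlertsWithEndsAt below).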
	timerange := func(ts float64) []float64 {
		if !endsAt {
			return []float64{ts}
		}
		return []float64{ts, ts + 3.0}
	}

	conf := `
route:
  receiver: "default"
  group_by: [alertname]
  group_wait: 1s
  group_interval: 1s
  repeat_interval: 1ms

receivers:
- name: "default"
  webhook_configs:
  - url: 'http://%s'
    send_resolved: true
`

	at := NewAcceptanceTest(t, &AcceptanceOpts{
		Tolerance: 150 * time.Millisecond,
	})

	co := at.Collector("webhook")
	wh := NewWebhook(t, co)

	am := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)

	// Refresh an alert several times. The starting time must remain at the earliest
	// point in time.
	am.Push(At(1), Alert("alertname", "test").Active(timerange(1.1)...))
	// Another Prometheus server might be sending later but with an earlier start time.
	am.Push(At(1.2), Alert("alertname", "test").Active(1))

	co.Want(Between(2, 2.5), Alert("alertname", "test").Active(1))

	am.Push(At(2.1), Alert("alertname", "test").Annotate("ann", "v1").Active(timerange(2)...))

	co.Want(Between(3, 3.5), Alert("alertname", "test").Annotate("ann", "v1").Active(1))

	// Annotations are always overwritten by the alert that arrived most recently.
	am.Push(At(3.6), Alert("alertname", "test").Annotate("ann", "v2").Active(timerange(1.5)...))

	co.Want(Between(4, 4.5), Alert("alertname", "test").Annotate("ann", "v2").Active(1))

	// If an alert is marked resolved twice, the latest point in time must be
	// set as the eventual resolve time.
	am.Push(At(4.6), Alert("alertname", "test").Annotate("ann", "v2").Active(3, 4.5))
	am.Push(At(4.8), Alert("alertname", "test").Annotate("ann", "v3").Active(2.9, 4.8))
	am.Push(At(4.8), Alert("alertname", "test").Annotate("ann", "v3").Active(2.9, 4.1))

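	// The merged alert must keep the earliest start time (1) and the latest
	// end time (4.8).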
	co.Want(Between(5, 5.5), Alert("alertname", "test").Annotate("ann", "v3").Active(1, 4.8))

	// Reactivate an alert after a previous occurrence has been resolved.
	// No overlap, no merge must occur.
	am.Push(At(5.3), Alert("alertname", "test").Active(timerange(5)...))

	co.Want(Between(6, 6.5), Alert("alertname", "test").Active(5))

	at.Run()

	t.Log(co.Check())
}

func TestMergeAlerts(t *testing.T) {
	testMergeAlerts(t, false)
}

// This test is similar to TestMergeAlerts except that the firing alerts have
// the EndsAt field set to StartsAt + 3s. This is what Prometheus starting from
// version 2.4.0 sends to AlertManager.
func TestMergeAlertsWithEndsAt(t *testing.T) {
	testMergeAlerts(t, true)
}

func TestRepeat(t *testing.T) {
	t.Parallel()

	conf := `
route:
  receiver: "default"
  group_by: [alertname]
  group_wait: 1s
  group_interval: 1s
  repeat_interval: 1ms

receivers:
- name: "default"
  webhook_configs:
  - url: 'http://%s'
`

	// Create a new acceptance test that instantiates new Alertmanagers
	// with the given configuration and verifies times with the given
	// tolerance.
	at := NewAcceptanceTest(t, &AcceptanceOpts{
		Tolerance: 150 * time.Millisecond,
	})

	// Create a collector to which alerts can be written and verified
	// against a set of expected alert notifications.
	co := at.Collector("webhook")
	// Run something that satisfies the webhook interface to which the
	// Alertmanager pushes as defined by its configuration.
	wh := NewWebhook(t, co)

	// Create a new Alertmanager process listening to a random port.
	am := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)

	// Declare pushes to be made to the Alertmanager at the given time.
	// Times are provided in fractions of seconds.
	am.Push(At(1), Alert("alertname", "test").Active(1))

	// XXX(fabxc): disabled as long as alerts are not persisted.
	// at.Do(At(1.2), func() {
	// 	am.Terminate()
	// 	am.Start()
	// })
	am.Push(At(3.5), Alert("alertname", "test").Active(1, 3))

	// Declare which alerts are expected to arrive at the collector within
	// the defined time intervals.
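	// With repeat_interval far below group_interval, each 1s flush re-sends the
	// group; the final notification carries the resolved interval (1, 3) because
	// the webhook receiver sends resolved notifications by default.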
	co.Want(Between(2, 2.5), Alert("alertname", "test").Active(1))
	co.Want(Between(3, 3.5), Alert("alertname", "test").Active(1))
	co.Want(Between(4, 4.5), Alert("alertname", "test").Active(1, 3))

	// Start the flow as defined above and run the checks afterwards.
	at.Run()

	t.Log(co.Check())
}

func TestRetry(t *testing.T) {
	t.Parallel()

	// We create a notification config that fans out into two different
	// webhooks.
	// The succeeding one must still only receive the first successful
	// notifications. Sending to the failing one must eventually succeed.
	conf := `
route:
  receiver: "default"
  group_by: [alertname]
  group_wait: 1s
  group_interval: 2s
  repeat_interval: 3s

receivers:
- name: "default"
  webhook_configs:
  - url: 'http://%s'
  - url: 'http://%s'
`

	at := NewAcceptanceTest(t, &AcceptanceOpts{
		Tolerance: 150 * time.Millisecond,
	})

	co1 := at.Collector("webhook")
	wh1 := NewWebhook(t, co1)

	co2 := at.Collector("webhook_failing")
	wh2 := NewWebhook(t, co2)

	wh2.Func = func(ts float64) bool {
		// Fail the first interval period but eventually succeed in the third
		// interval after a few failed attempts.
		return ts < 4.5
	}

	am := at.AlertmanagerCluster(fmt.Sprintf(conf, wh1.Address(), wh2.Address()), 1)

	am.Push(At(1), Alert("alertname", "test1"))

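	// co1 receives the initial notification around t=2 and, once
	// repeat_interval (3s) has elapsed, a repeat at the t=6 flush. co2 only
	// sees the group after its webhook stops failing, expected in the same
	// 6-6.5 window.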
	co1.Want(Between(2, 2.5), Alert("alertname", "test1").Active(1))
	co1.Want(Between(6, 6.5), Alert("alertname", "test1").Active(1))

	co2.Want(Between(6, 6.5), Alert("alertname", "test1").Active(1))

	at.Run()

	for _, c := range []*Collector{co1, co2} {
		t.Log(c.Check())
	}
}

func TestBatching(t *testing.T) {
	t.Parallel()

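	// group_by: [] puts every alert into a single group, so each flush should
	// deliver one batched notification containing all active alerts.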
	conf := `
route:
  receiver: "default"
  group_by: []
  group_wait: 1s
  group_interval: 1s
  # use a value slightly below the 5s interval to avoid timing issues
  repeat_interval: 4900ms

receivers:
- name: "default"
  webhook_configs:
  - url: 'http://%s'
`

	at := NewAcceptanceTest(t, &AcceptanceOpts{
		Tolerance: 150 * time.Millisecond,
	})

	co := at.Collector("webhook")
	wh := NewWebhook(t, co)

	am := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)

	am.Push(At(1.1), Alert("alertname", "test1").Active(1))
	am.Push(At(1.7), Alert("alertname", "test5").Active(1))

	co.Want(Between(2.0, 2.5),
		Alert("alertname", "test1").Active(1),
		Alert("alertname", "test5").Active(1),
	)

	am.Push(At(3.3),
		Alert("alertname", "test2").Active(1.5),
		Alert("alertname", "test3").Active(1.5),
		Alert("alertname", "test4").Active(1.6),
	)

	co.Want(Between(4.1, 4.5),
		Alert("alertname", "test1").Active(1),
		Alert("alertname", "test5").Active(1),
		Alert("alertname", "test2").Active(1.5),
		Alert("alertname", "test3").Active(1.5),
		Alert("alertname", "test4").Active(1.6),
	)

	// While no changes happen expect no additional notifications
	// until the 5s repeat interval has ended.
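	// The full batch was last sent in the 4.1-4.5 window, so with
	// repeat_interval just under 5s the next re-send is expected around t=9.
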
	co.Want(Between(9.1, 9.5),
		Alert("alertname", "test1").Active(1),
		Alert("alertname", "test5").Active(1),
		Alert("alertname", "test2").Active(1.5),
		Alert("alertname", "test3").Active(1.5),
		Alert("alertname", "test4").Active(1.6),
	)

	at.Run()

	t.Log(co.Check())
}

func TestResolved(t *testing.T) {
	t.Parallel()

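	// Run the same scenario twice, each time against a fresh cluster: with
	// resolve_timeout: 10s and no further pushes, all three alerts should be
	// marked resolved around t=11, which the second Want below asserts.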
	for i := 0; i < 2; i++ {
		conf := `
global:
  resolve_timeout: 10s

route:
  receiver: "default"
  group_by: [alertname]
  group_wait: 1s
  group_interval: 5s

receivers:
- name: "default"
  webhook_configs:
  - url: 'http://%s'
`

		at := NewAcceptanceTest(t, &AcceptanceOpts{
			Tolerance: 150 * time.Millisecond,
		})

		co := at.Collector("webhook")
		wh := NewWebhook(t, co)

		am := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)

		am.Push(At(1),
			Alert("alertname", "test", "lbl", "v1"),
			Alert("alertname", "test", "lbl", "v2"),
			Alert("alertname", "test", "lbl", "v3"),
		)

		co.Want(Between(2, 2.5),
			Alert("alertname", "test", "lbl", "v1").Active(1),
			Alert("alertname", "test", "lbl", "v2").Active(1),
			Alert("alertname", "test", "lbl", "v3").Active(1),
		)
		co.Want(Between(12, 13),
			Alert("alertname", "test", "lbl", "v1").Active(1, 11),
			Alert("alertname", "test", "lbl", "v2").Active(1, 11),
			Alert("alertname", "test", "lbl", "v3").Active(1, 11),
		)

		at.Run()

		t.Log(co.Check())
	}
}

func TestResolvedFilter(t *testing.T) {
	t.Parallel()

	// This integration test ensures that even though resolved alerts may not be
	// notified about, they must be set as notified. Resolved alerts, even when
	// filtered, have to end up in the SetNotifiesStage, otherwise when an alert
	// fires again it is ambiguous whether it was resolved in between or not.

	conf := `
global:
  resolve_timeout: 10s

route:
  receiver: "default"
  group_by: [alertname]
  group_wait: 1s
  group_interval: 5s

receivers:
- name: "default"
  webhook_configs:
  - url: 'http://%s'
    send_resolved: true
  - url: 'http://%s'
    send_resolved: false
`

	at := NewAcceptanceTest(t, &AcceptanceOpts{
		Tolerance: 150 * time.Millisecond,
	})

	co1 := at.Collector("webhook1")
	wh1 := NewWebhook(t, co1)

	co2 := at.Collector("webhook2")
	wh2 := NewWebhook(t, co2)

	amc := at.AlertmanagerCluster(fmt.Sprintf(conf, wh1.Address(), wh2.Address()), 1)

	amc.Push(At(1),
		Alert("alertname", "test", "lbl", "v1"),
		Alert("alertname", "test", "lbl", "v2"),
	)
	amc.Push(At(3),
		Alert("alertname", "test", "lbl", "v1").Active(1, 4),
		Alert("alertname", "test", "lbl", "v3"),
	)
	amc.Push(At(8),
		Alert("alertname", "test", "lbl", "v3").Active(3),
	)

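	// v1 is explicitly resolved at t=4; v3 is refreshed at t=8 and stays firing,
	// while v2 is never refreshed and is resolved by the resolve_timeout around
	// t=11. Only co1 (send_resolved: true) is expected to see those resolutions.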
	co1.Want(Between(2, 2.5),
		Alert("alertname", "test", "lbl", "v1").Active(1),
		Alert("alertname", "test", "lbl", "v2").Active(1),
	)
	co1.Want(Between(7, 7.5),
		Alert("alertname", "test", "lbl", "v1").Active(1, 4),
		Alert("alertname", "test", "lbl", "v2").Active(1),
		Alert("alertname", "test", "lbl", "v3").Active(3),
	)
	// Notification should be sent because the v2 alert is resolved due to the time-out.
	co1.Want(Between(12, 12.5),
		Alert("alertname", "test", "lbl", "v2").Active(1, 11),
		Alert("alertname", "test", "lbl", "v3").Active(3),
	)

	co2.Want(Between(2, 2.5),
		Alert("alertname", "test", "lbl", "v1").Active(1),
		Alert("alertname", "test", "lbl", "v2").Active(1),
	)
	co2.Want(Between(7, 7.5),
		Alert("alertname", "test", "lbl", "v2").Active(1),
		Alert("alertname", "test", "lbl", "v3").Active(3),
	)
	// No notification should be sent after group_interval because no new alert has been fired.
	co2.Want(Between(12, 12.5))

	at.Run()

	for _, c := range []*Collector{co1, co2} {
		t.Log(c.Check())
	}
}

func TestReload(t *testing.T) {
	t.Parallel()

	// This integration test ensures that the first alert isn't notified twice
	// and repeat_interval applies after the AlertManager process has been
	// reloaded.
	conf := `
route:
  receiver: "default"
  group_by: []
  group_wait: 1s
  group_interval: 6s
  repeat_interval: 10m

receivers:
- name: "default"
  webhook_configs:
  - url: 'http://%s'
`

	at := NewAcceptanceTest(t, &AcceptanceOpts{
		Tolerance: 150 * time.Millisecond,
	})

	co := at.Collector("webhook")
	wh := NewWebhook(t, co)

	amc := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)

	amc.Push(At(1), Alert("alertname", "test1"))
	at.Do(At(3), amc.Reload)
	amc.Push(At(4), Alert("alertname", "test2"))

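	// With group_by: [] both alerts share one group. test1 is delivered once
	// after group_wait, and despite repeat_interval being 10m it reappears in
	// the post-reload flush because test2 joins the same group.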
	co.Want(Between(2, 2.5), Alert("alertname", "test1").Active(1))
	// Timers are reset on reload regardless, so we count the 6 second group
	// interval from 3 onwards.
	co.Want(Between(9, 9.5),
		Alert("alertname", "test1").Active(1),
		Alert("alertname", "test2").Active(4),
	)

	at.Run()

	t.Log(co.Check())
}