2015-10-11 15:24:49 +00:00
|
|
|
// Copyright 2015 Prometheus Team
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2015-09-30 14:13:00 +00:00
|
|
|
package test
|
|
|
|
|
|
|
|
import (
|
2015-10-02 12:10:04 +00:00
|
|
|
"fmt"
|
2015-09-30 14:13:00 +00:00
|
|
|
"testing"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
. "github.com/prometheus/alertmanager/test"
|
|
|
|
)
|
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
// This file contains acceptance tests around the basic sending logic
|
|
|
|
// for notifications, which includes batching and ensuring that each
|
|
|
|
// notification is eventually sent at least once and ideally exactly
|
|
|
|
// once.
|
|
|
|
|
2015-10-15 10:46:51 +00:00
|
|
|
func TestMergeAlerts(t *testing.T) {
|
|
|
|
t.Parallel()
|
|
|
|
|
|
|
|
conf := `
|
2015-10-19 14:52:54 +00:00
|
|
|
route:
|
2015-11-10 13:08:20 +00:00
|
|
|
receiver: "default"
|
2015-10-19 15:35:59 +00:00
|
|
|
group_by: []
|
2015-10-15 10:46:51 +00:00
|
|
|
group_wait: 1s
|
|
|
|
group_interval: 1s
|
|
|
|
repeat_interval: 1s
|
|
|
|
|
2015-11-10 13:08:20 +00:00
|
|
|
receivers:
|
2015-10-15 10:46:51 +00:00
|
|
|
- name: "default"
|
|
|
|
webhook_configs:
|
|
|
|
- url: 'http://%s'
|
|
|
|
`
|
|
|
|
|
|
|
|
at := NewAcceptanceTest(t, &AcceptanceOpts{
|
|
|
|
Tolerance: 150 * time.Millisecond,
|
|
|
|
})
|
|
|
|
|
|
|
|
co := at.Collector("webhook")
|
|
|
|
wh := NewWebhook(co)
|
|
|
|
|
|
|
|
am := at.Alertmanager(fmt.Sprintf(conf, wh.Address()))
|
|
|
|
|
2015-10-20 09:59:40 +00:00
|
|
|
// Refresh an alert several times. The starting time must remain at the earliest
|
|
|
|
// point in time.
|
|
|
|
am.Push(At(1), Alert("alertname", "test").Active(1.1))
|
|
|
|
// Another Prometheus server might be sending later but with an earlier start time.
|
|
|
|
am.Push(At(1.2), Alert("alertname", "test").Active(1))
|
2015-10-15 10:46:51 +00:00
|
|
|
|
|
|
|
co.Want(Between(2, 2.5), Alert("alertname", "test").Active(1))
|
|
|
|
|
|
|
|
am.Push(At(2.1), Alert("alertname", "test").Annotate("ann", "v1").Active(2))
|
|
|
|
|
|
|
|
co.Want(Between(3, 3.5), Alert("alertname", "test").Annotate("ann", "v1").Active(1))
|
|
|
|
|
2015-10-20 09:59:40 +00:00
|
|
|
// Annotations are always overwritten by the alert that arrived most recently.
|
2015-10-15 10:46:51 +00:00
|
|
|
am.Push(At(3.6), Alert("alertname", "test").Annotate("ann", "v2").Active(1.5))
|
|
|
|
|
|
|
|
co.Want(Between(4, 4.5), Alert("alertname", "test").Annotate("ann", "v2").Active(1))
|
|
|
|
|
2015-10-20 09:59:40 +00:00
|
|
|
// If an alert is marked resolved twice, the latest point in time must be
|
|
|
|
// set as the eventual resolve time.
|
|
|
|
am.Push(At(4.6), Alert("alertname", "test").Annotate("ann", "v2").Active(3, 4.5))
|
|
|
|
am.Push(At(4.8), Alert("alertname", "test").Annotate("ann", "v3").Active(2.9, 4.8))
|
|
|
|
am.Push(At(4.8), Alert("alertname", "test").Annotate("ann", "v3").Active(2.9, 4.1))
|
|
|
|
|
|
|
|
co.Want(Between(5, 5.5), Alert("alertname", "test").Annotate("ann", "v3").Active(1, 4.8))
|
|
|
|
|
|
|
|
// Reactivate an alert after a previous occurrence has been resolved.
|
|
|
|
// No overlap, no merge must occur.
|
|
|
|
am.Push(At(5.3), Alert("alertname", "test"))
|
|
|
|
|
|
|
|
co.Want(Between(6, 6.5), Alert("alertname", "test").Active(5.3))
|
|
|
|
|
|
|
|
// Test against a bug which ocurrec after a restart. The previous occurrence of
|
|
|
|
// the alert was sent rather than the most recent one.
|
|
|
|
at.Do(At(6.7), func() {
|
|
|
|
am.Terminate()
|
|
|
|
am.Start()
|
|
|
|
})
|
|
|
|
|
|
|
|
// On restart the alert is flushed right away as the group_wait has already passed.
|
|
|
|
// However, it must be caught in the deduplication stage.
|
|
|
|
// The next attempt will be 1s later and won't be filtered in deduping.
|
|
|
|
co.Want(Between(7.7, 8), Alert("alertname", "test").Active(5.3))
|
|
|
|
|
2015-10-15 10:46:51 +00:00
|
|
|
at.Run()
|
|
|
|
}
|
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
func TestRepeat(t *testing.T) {
|
2015-10-02 12:10:04 +00:00
|
|
|
t.Parallel()
|
|
|
|
|
|
|
|
conf := `
|
2015-10-19 14:52:54 +00:00
|
|
|
route:
|
2015-11-10 13:08:20 +00:00
|
|
|
receiver: "default"
|
2015-10-19 15:35:59 +00:00
|
|
|
group_by: []
|
2015-10-08 08:50:37 +00:00
|
|
|
group_wait: 1s
|
|
|
|
group_interval: 1s
|
|
|
|
repeat_interval: 1s
|
2015-09-30 14:13:00 +00:00
|
|
|
|
2015-11-10 13:08:20 +00:00
|
|
|
receivers:
|
2015-09-30 14:13:00 +00:00
|
|
|
- name: "default"
|
|
|
|
webhook_configs:
|
2015-10-02 12:10:04 +00:00
|
|
|
- url: 'http://%s'
|
2015-09-30 14:13:00 +00:00
|
|
|
`
|
|
|
|
|
2015-09-30 14:18:44 +00:00
|
|
|
// Create a new acceptance test that instantiates new Alertmanagers
|
|
|
|
// with the given configuration and verifies times with the given
|
|
|
|
// tollerance.
|
2015-09-30 14:13:00 +00:00
|
|
|
at := NewAcceptanceTest(t, &AcceptanceOpts{
|
|
|
|
Tolerance: 150 * time.Millisecond,
|
|
|
|
})
|
|
|
|
|
2015-09-30 14:18:44 +00:00
|
|
|
// Create a collector to which alerts can be written and verified
|
|
|
|
// against a set of expected alert notifications.
|
2015-09-30 14:13:00 +00:00
|
|
|
co := at.Collector("webhook")
|
2015-09-30 14:18:44 +00:00
|
|
|
// Run something that satisfies the webhook interface to which the
|
|
|
|
// Alertmanager pushes as defined by its configuration.
|
2015-10-02 12:10:04 +00:00
|
|
|
wh := NewWebhook(co)
|
|
|
|
|
|
|
|
// Create a new Alertmanager process listening to a random port
|
|
|
|
am := at.Alertmanager(fmt.Sprintf(conf, wh.Address()))
|
2015-09-30 14:13:00 +00:00
|
|
|
|
2015-09-30 14:18:44 +00:00
|
|
|
// Declare pushes to be made to the Alertmanager at the given time.
|
|
|
|
// Times are provided in fractions of seconds.
|
2015-09-30 14:13:00 +00:00
|
|
|
am.Push(At(1), Alert("alertname", "test").Active(1))
|
2015-10-07 14:18:55 +00:00
|
|
|
|
|
|
|
at.Do(At(1.2), func() {
|
|
|
|
am.Terminate()
|
|
|
|
am.Start()
|
|
|
|
})
|
2015-09-30 14:13:00 +00:00
|
|
|
am.Push(At(3.5), Alert("alertname", "test").Active(1, 3))
|
|
|
|
|
2015-09-30 14:18:44 +00:00
|
|
|
// Declare which alerts are expected to arrive at the collector within
|
|
|
|
// the defined time intervals.
|
2015-09-30 14:13:00 +00:00
|
|
|
co.Want(Between(2, 2.5), Alert("alertname", "test").Active(1))
|
|
|
|
co.Want(Between(3, 3.5), Alert("alertname", "test").Active(1))
|
2015-09-30 15:35:33 +00:00
|
|
|
co.Want(Between(4, 4.5), Alert("alertname", "test").Active(1, 3))
|
2015-09-30 14:13:00 +00:00
|
|
|
|
2015-09-30 14:18:44 +00:00
|
|
|
// Start the flow as defined above and run the checks afterwards.
|
2015-09-30 14:13:00 +00:00
|
|
|
at.Run()
|
|
|
|
}
|
2015-09-30 16:02:47 +00:00
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
func TestRetry(t *testing.T) {
|
2015-10-02 12:10:04 +00:00
|
|
|
t.Parallel()
|
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
// We create a notification config that fans out into two different
|
|
|
|
// webhooks.
|
|
|
|
// The succeeding one must still only receive the first successful
|
|
|
|
// notifications. Sending to the succeeding one must eventually succeed.
|
2015-10-02 12:10:04 +00:00
|
|
|
conf := `
|
2015-10-19 14:52:54 +00:00
|
|
|
route:
|
2015-11-10 13:08:20 +00:00
|
|
|
receiver: "default"
|
2015-10-19 15:35:59 +00:00
|
|
|
group_by: []
|
2015-10-08 08:50:37 +00:00
|
|
|
group_wait: 1s
|
|
|
|
group_interval: 1s
|
2015-10-12 05:35:22 +00:00
|
|
|
repeat_interval: 3s
|
2015-10-01 19:28:18 +00:00
|
|
|
|
2015-11-10 13:08:20 +00:00
|
|
|
receivers:
|
2015-10-01 19:28:18 +00:00
|
|
|
- name: "default"
|
|
|
|
webhook_configs:
|
2015-10-02 12:10:04 +00:00
|
|
|
- url: 'http://%s'
|
2015-10-12 05:35:22 +00:00
|
|
|
- url: 'http://%s'
|
2015-10-01 19:28:18 +00:00
|
|
|
`
|
|
|
|
|
|
|
|
at := NewAcceptanceTest(t, &AcceptanceOpts{
|
|
|
|
Tolerance: 150 * time.Millisecond,
|
|
|
|
})
|
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
co1 := at.Collector("webhook")
|
|
|
|
wh1 := NewWebhook(co1)
|
2015-10-01 19:28:18 +00:00
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
co2 := at.Collector("webhook_failing")
|
|
|
|
wh2 := NewWebhook(co2)
|
2015-10-01 19:28:18 +00:00
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
wh2.Func = func(ts float64) bool {
|
|
|
|
// Fail the first two interval periods but eventually
|
|
|
|
// succeed in the third interval after a few failed attempts.
|
|
|
|
return ts < 4.5
|
|
|
|
}
|
2015-10-01 19:28:18 +00:00
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
am := at.Alertmanager(fmt.Sprintf(conf, wh1.Address(), wh2.Address()))
|
2015-10-01 19:28:18 +00:00
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
am.Push(At(1), Alert("alertname", "test1"))
|
2015-10-01 19:28:18 +00:00
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
co1.Want(Between(2, 2.5), Alert("alertname", "test1").Active(1))
|
|
|
|
co1.Want(Between(5, 5.5), Alert("alertname", "test1").Active(1))
|
2015-10-01 19:28:18 +00:00
|
|
|
|
2015-10-12 05:35:22 +00:00
|
|
|
co2.Want(Between(4.5, 5), Alert("alertname", "test1").Active(1))
|
2015-10-01 19:28:18 +00:00
|
|
|
}
|
|
|
|
|
2015-10-02 12:10:04 +00:00
|
|
|
func TestBatching(t *testing.T) {
|
|
|
|
t.Parallel()
|
|
|
|
|
|
|
|
conf := `
|
2015-10-19 14:52:54 +00:00
|
|
|
route:
|
2015-11-10 13:08:20 +00:00
|
|
|
receiver: "default"
|
2015-10-19 15:35:59 +00:00
|
|
|
group_by: []
|
2015-10-08 08:50:37 +00:00
|
|
|
group_wait: 1s
|
|
|
|
group_interval: 1s
|
|
|
|
repeat_interval: 5s
|
2015-09-30 16:02:47 +00:00
|
|
|
|
2015-11-10 13:08:20 +00:00
|
|
|
receivers:
|
|
|
|
- name: "default"
|
2015-09-30 16:02:47 +00:00
|
|
|
webhook_configs:
|
2015-10-02 12:10:04 +00:00
|
|
|
- url: 'http://%s'
|
2015-09-30 16:02:47 +00:00
|
|
|
`
|
|
|
|
|
|
|
|
at := NewAcceptanceTest(t, &AcceptanceOpts{
|
|
|
|
Tolerance: 150 * time.Millisecond,
|
|
|
|
})
|
|
|
|
|
|
|
|
co := at.Collector("webhook")
|
2015-10-02 12:10:04 +00:00
|
|
|
wh := NewWebhook(co)
|
2015-09-30 16:02:47 +00:00
|
|
|
|
2015-10-02 12:10:04 +00:00
|
|
|
am := at.Alertmanager(fmt.Sprintf(conf, wh.Address()))
|
2015-09-30 16:02:47 +00:00
|
|
|
|
|
|
|
am.Push(At(1.1), Alert("alertname", "test1").Active(1))
|
|
|
|
am.Push(At(1.9), Alert("alertname", "test5").Active(1))
|
|
|
|
am.Push(At(2.3),
|
|
|
|
Alert("alertname", "test2").Active(1.5),
|
|
|
|
Alert("alertname", "test3").Active(1.5),
|
|
|
|
Alert("alertname", "test4").Active(1.6),
|
|
|
|
)
|
|
|
|
|
|
|
|
co.Want(Between(2.0, 2.5),
|
|
|
|
Alert("alertname", "test1").Active(1),
|
|
|
|
Alert("alertname", "test5").Active(1),
|
|
|
|
)
|
|
|
|
// Only expect the new ones with the next group interval.
|
|
|
|
co.Want(Between(3, 3.5),
|
|
|
|
Alert("alertname", "test2").Active(1.5),
|
|
|
|
Alert("alertname", "test3").Active(1.5),
|
|
|
|
Alert("alertname", "test4").Active(1.6),
|
|
|
|
)
|
|
|
|
|
|
|
|
// While no changes happen expect no additional notifications
|
|
|
|
// until the 5s repeat interval has ended.
|
2015-10-01 07:43:51 +00:00
|
|
|
|
|
|
|
// The last three notifications should sent with the first two even
|
|
|
|
// though their repeat interval has not yet passed. This way fragmented
|
2015-09-30 17:03:19 +00:00
|
|
|
// batches are unified and notification noise reduced.
|
|
|
|
co.Want(Between(7, 7.5),
|
2015-09-30 16:02:47 +00:00
|
|
|
Alert("alertname", "test1").Active(1),
|
2015-09-30 16:45:49 +00:00
|
|
|
Alert("alertname", "test5").Active(1),
|
2015-09-30 16:02:47 +00:00
|
|
|
Alert("alertname", "test2").Active(1.5),
|
|
|
|
Alert("alertname", "test3").Active(1.5),
|
|
|
|
Alert("alertname", "test4").Active(1.6),
|
|
|
|
)
|
|
|
|
|
|
|
|
at.Run()
|
|
|
|
}
|