dispatch: fix missing receivers in Groups() (#1964)
Signed-off-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
parent
f450720213
commit
ab537b5b2f
|
@ -135,16 +135,21 @@ func (d *Dispatcher) run(it provider.AlertIterator) {
|
||||||
|
|
||||||
// AlertGroup represents how alerts exist within an aggrGroup.
|
// AlertGroup represents how alerts exist within an aggrGroup.
|
||||||
type AlertGroup struct {
|
type AlertGroup struct {
|
||||||
Alerts []*types.Alert
|
Alerts types.AlertSlice
|
||||||
Labels model.LabelSet
|
Labels model.LabelSet
|
||||||
Receiver string
|
Receiver string
|
||||||
}
|
}
|
||||||
|
|
||||||
type AlertGroups []*AlertGroup
|
type AlertGroups []*AlertGroup
|
||||||
|
|
||||||
func (ag AlertGroups) Swap(i, j int) { ag[i], ag[j] = ag[j], ag[i] }
|
func (ag AlertGroups) Swap(i, j int) { ag[i], ag[j] = ag[j], ag[i] }
|
||||||
func (ag AlertGroups) Less(i, j int) bool { return ag[i].Labels.Before(ag[j].Labels) }
|
func (ag AlertGroups) Less(i, j int) bool {
|
||||||
func (ag AlertGroups) Len() int { return len(ag) }
|
if ag[i].Labels.Equal(ag[j].Labels) {
|
||||||
|
return ag[i].Receiver < ag[j].Receiver
|
||||||
|
}
|
||||||
|
return ag[i].Labels.Before(ag[j].Labels)
|
||||||
|
}
|
||||||
|
func (ag AlertGroups) Len() int { return len(ag) }
|
||||||
|
|
||||||
// Groups returns a slice of AlertGroups from the dispatcher's internal state.
|
// Groups returns a slice of AlertGroups from the dispatcher's internal state.
|
||||||
func (d *Dispatcher) Groups(routeFilter func(*Route) bool, alertFilter func(*types.Alert, time.Time) bool) (AlertGroups, map[model.Fingerprint][]string) {
|
func (d *Dispatcher) Groups(routeFilter func(*Route) bool, alertFilter func(*types.Alert, time.Time) bool) (AlertGroups, map[model.Fingerprint][]string) {
|
||||||
|
@ -153,13 +158,12 @@ func (d *Dispatcher) Groups(routeFilter func(*Route) bool, alertFilter func(*typ
|
||||||
d.mtx.RLock()
|
d.mtx.RLock()
|
||||||
defer d.mtx.RUnlock()
|
defer d.mtx.RUnlock()
|
||||||
|
|
||||||
seen := map[model.Fingerprint]*AlertGroup{}
|
|
||||||
|
|
||||||
// Keep a list of receivers for an alert to prevent checking each alert
|
// Keep a list of receivers for an alert to prevent checking each alert
|
||||||
// again against all routes. The alert has already matched against this
|
// again against all routes. The alert has already matched against this
|
||||||
// route on ingestion.
|
// route on ingestion.
|
||||||
receivers := map[model.Fingerprint][]string{}
|
receivers := map[model.Fingerprint][]string{}
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
for route, ags := range d.aggrGroups {
|
for route, ags := range d.aggrGroups {
|
||||||
if !routeFilter(route) {
|
if !routeFilter(route) {
|
||||||
continue
|
continue
|
||||||
|
@ -167,18 +171,11 @@ func (d *Dispatcher) Groups(routeFilter func(*Route) bool, alertFilter func(*typ
|
||||||
|
|
||||||
for _, ag := range ags {
|
for _, ag := range ags {
|
||||||
receiver := route.RouteOpts.Receiver
|
receiver := route.RouteOpts.Receiver
|
||||||
alertGroup, ok := seen[ag.fingerprint()]
|
alertGroup := &AlertGroup{
|
||||||
if !ok {
|
Labels: ag.labels,
|
||||||
alertGroup = &AlertGroup{
|
Receiver: receiver,
|
||||||
Labels: ag.labels,
|
|
||||||
Receiver: receiver,
|
|
||||||
}
|
|
||||||
|
|
||||||
seen[ag.fingerprint()] = alertGroup
|
|
||||||
}
|
}
|
||||||
|
|
||||||
now := time.Now()
|
|
||||||
|
|
||||||
alerts := ag.alerts.List()
|
alerts := ag.alerts.List()
|
||||||
filteredAlerts := make([]*types.Alert, 0, len(alerts))
|
filteredAlerts := make([]*types.Alert, 0, len(alerts))
|
||||||
for _, a := range alerts {
|
for _, a := range alerts {
|
||||||
|
@ -207,8 +204,13 @@ func (d *Dispatcher) Groups(routeFilter func(*Route) bool, alertFilter func(*typ
|
||||||
groups = append(groups, alertGroup)
|
groups = append(groups, alertGroup)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
sort.Sort(groups)
|
sort.Sort(groups)
|
||||||
|
for i := range groups {
|
||||||
|
sort.Sort(groups[i].Alerts)
|
||||||
|
}
|
||||||
|
for i := range receivers {
|
||||||
|
sort.Strings(receivers[i])
|
||||||
|
}
|
||||||
|
|
||||||
return groups, receivers
|
return groups, receivers
|
||||||
}
|
}
|
||||||
|
|
|
@ -329,11 +329,36 @@ func TestGroupByAllLabels(t *testing.T) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestGroups(t *testing.T) {
|
func TestGroups(t *testing.T) {
|
||||||
logger := log.NewNopLogger()
|
confData := `receivers:
|
||||||
conf, _, err := config.LoadFile("testdata/conf.yml")
|
- name: 'kafka'
|
||||||
|
- name: 'prod'
|
||||||
|
- name: 'testing'
|
||||||
|
|
||||||
|
route:
|
||||||
|
group_by: ['alertname']
|
||||||
|
group_wait: 10ms
|
||||||
|
group_interval: 10ms
|
||||||
|
receiver: 'prod'
|
||||||
|
routes:
|
||||||
|
- match:
|
||||||
|
env: 'testing'
|
||||||
|
receiver: 'testing'
|
||||||
|
group_by: ['alertname', 'service']
|
||||||
|
- match:
|
||||||
|
env: 'prod'
|
||||||
|
receiver: 'prod'
|
||||||
|
group_by: ['alertname', 'service', 'cluster']
|
||||||
|
continue: true
|
||||||
|
- match:
|
||||||
|
kafka: 'yes'
|
||||||
|
receiver: 'kafka'
|
||||||
|
group_by: ['alertname', 'service', 'cluster']`
|
||||||
|
conf, err := config.Load(confData)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
logger := log.NewNopLogger()
|
||||||
route := NewRoute(conf.Route, nil)
|
route := NewRoute(conf.Route, nil)
|
||||||
marker := types.NewMarker(prometheus.NewRegistry())
|
marker := types.NewMarker(prometheus.NewRegistry())
|
||||||
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, logger)
|
alerts, err := mem.NewAlerts(context.Background(), marker, time.Hour, logger)
|
||||||
|
@ -341,25 +366,34 @@ func TestGroups(t *testing.T) {
|
||||||
t.Fatal(err)
|
t.Fatal(err)
|
||||||
}
|
}
|
||||||
defer alerts.Close()
|
defer alerts.Close()
|
||||||
|
|
||||||
timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
|
timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
|
||||||
recorder := &recordStage{t: t, alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
|
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
|
||||||
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, logger)
|
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, logger)
|
||||||
go dispatcher.Run()
|
go dispatcher.Run()
|
||||||
defer dispatcher.Stop()
|
defer dispatcher.Stop()
|
||||||
|
|
||||||
// Create alerts. the dispatcher will automatically create the groups.
|
// Create alerts. the dispatcher will automatically create the groups.
|
||||||
alerts.Put(
|
inputAlerts := []*types.Alert{
|
||||||
newAlert(model.LabelSet{"env": "testing", "alertname": "TestingAlert", "service": "api"}),
|
// Matches the parent route.
|
||||||
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "aa", "service": "api"}),
|
newAlert(model.LabelSet{"alertname": "OtherAlert", "cluster": "cc", "service": "dd"}),
|
||||||
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "bb", "service": "api"}),
|
// Matches the first sub-route.
|
||||||
newAlert(model.LabelSet{"env": "prod", "alertname": "HighLatency", "cluster": "bb", "service": "db", "kafka": "yes"}),
|
newAlert(model.LabelSet{"env": "testing", "alertname": "TestingAlert", "service": "api", "instance": "inst1"}),
|
||||||
)
|
// Matches the second sub-route.
|
||||||
|
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst1"}),
|
||||||
|
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "aa", "service": "api", "instance": "inst2"}),
|
||||||
|
// Matches the second sub-route.
|
||||||
|
newAlert(model.LabelSet{"env": "prod", "alertname": "HighErrorRate", "cluster": "bb", "service": "api", "instance": "inst1"}),
|
||||||
|
// Matches the second and third sub-route.
|
||||||
|
newAlert(model.LabelSet{"env": "prod", "alertname": "HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}),
|
||||||
|
}
|
||||||
|
alerts.Put(inputAlerts...)
|
||||||
|
|
||||||
// Let alerts get processed.
|
// Let alerts get processed.
|
||||||
for i := 0; len(recorder.Alerts()) != 5 && i < 10; i++ {
|
for i := 0; len(recorder.Alerts()) != 7 && i < 10; i++ {
|
||||||
time.Sleep(100 * time.Millisecond)
|
time.Sleep(200 * time.Millisecond)
|
||||||
}
|
}
|
||||||
require.Equal(t, 5, len(recorder.Alerts()))
|
require.Equal(t, 7, len(recorder.Alerts()))
|
||||||
|
|
||||||
alertGroups, receivers := dispatcher.Groups(
|
alertGroups, receivers := dispatcher.Groups(
|
||||||
func(*Route) bool {
|
func(*Route) bool {
|
||||||
|
@ -369,39 +403,70 @@ func TestGroups(t *testing.T) {
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
// Verify that grouping works.
|
require.Equal(t, AlertGroups{
|
||||||
require.Equal(t, 5, len(alertGroups))
|
&AlertGroup{
|
||||||
for _, ag := range alertGroups {
|
Alerts: []*types.Alert{inputAlerts[0]},
|
||||||
if len(ag.Labels) == 2 {
|
Labels: model.LabelSet{
|
||||||
// testing receiver
|
model.LabelName("alertname"): model.LabelValue("OtherAlert"),
|
||||||
require.Equal(t, 2, len(ag.Labels))
|
},
|
||||||
require.Equal(t, model.LabelSet{"alertname": "TestingAlert", "service": "api"}, ag.Labels)
|
Receiver: "prod",
|
||||||
for _, alert := range ag.Alerts {
|
},
|
||||||
alertsReceivers, ok := receivers[alert.Fingerprint()]
|
&AlertGroup{
|
||||||
require.True(t, ok)
|
Alerts: []*types.Alert{inputAlerts[1]},
|
||||||
require.Equal(t, 1, len(alertsReceivers))
|
Labels: model.LabelSet{
|
||||||
require.Equal(t, "testing", alertsReceivers[0])
|
model.LabelName("alertname"): model.LabelValue("TestingAlert"),
|
||||||
}
|
model.LabelName("service"): model.LabelValue("api"),
|
||||||
continue
|
},
|
||||||
}
|
Receiver: "testing",
|
||||||
require.Equal(t, 3, len(ag.Labels))
|
},
|
||||||
for _, alert := range ag.Alerts {
|
&AlertGroup{
|
||||||
alertsReceivers, ok := receivers[alert.Fingerprint()]
|
Alerts: []*types.Alert{inputAlerts[2], inputAlerts[3]},
|
||||||
require.True(t, ok)
|
Labels: model.LabelSet{
|
||||||
sort.Strings(alertsReceivers)
|
model.LabelName("alertname"): model.LabelValue("HighErrorRate"),
|
||||||
if labelValue := ag.Labels["alertname"]; string(labelValue) == "HighLatency" {
|
model.LabelName("service"): model.LabelValue("api"),
|
||||||
// Matches both prod and kafka receivers
|
model.LabelName("cluster"): model.LabelValue("aa"),
|
||||||
require.Equal(t, []string{"kafka", "prod"}, alertsReceivers)
|
},
|
||||||
continue
|
Receiver: "prod",
|
||||||
}
|
},
|
||||||
require.Equal(t, 1, len(alertsReceivers))
|
&AlertGroup{
|
||||||
require.Equal(t, "prod", alertsReceivers[0])
|
Alerts: []*types.Alert{inputAlerts[4]},
|
||||||
}
|
Labels: model.LabelSet{
|
||||||
}
|
model.LabelName("alertname"): model.LabelValue("HighErrorRate"),
|
||||||
|
model.LabelName("service"): model.LabelValue("api"),
|
||||||
|
model.LabelName("cluster"): model.LabelValue("bb"),
|
||||||
|
},
|
||||||
|
Receiver: "prod",
|
||||||
|
},
|
||||||
|
&AlertGroup{
|
||||||
|
Alerts: []*types.Alert{inputAlerts[5]},
|
||||||
|
Labels: model.LabelSet{
|
||||||
|
model.LabelName("alertname"): model.LabelValue("HighLatency"),
|
||||||
|
model.LabelName("service"): model.LabelValue("db"),
|
||||||
|
model.LabelName("cluster"): model.LabelValue("bb"),
|
||||||
|
},
|
||||||
|
Receiver: "kafka",
|
||||||
|
},
|
||||||
|
&AlertGroup{
|
||||||
|
Alerts: []*types.Alert{inputAlerts[5]},
|
||||||
|
Labels: model.LabelSet{
|
||||||
|
model.LabelName("alertname"): model.LabelValue("HighLatency"),
|
||||||
|
model.LabelName("service"): model.LabelValue("db"),
|
||||||
|
model.LabelName("cluster"): model.LabelValue("bb"),
|
||||||
|
},
|
||||||
|
Receiver: "prod",
|
||||||
|
},
|
||||||
|
}, alertGroups)
|
||||||
|
require.Equal(t, map[model.Fingerprint][]string{
|
||||||
|
inputAlerts[0].Fingerprint(): []string{"prod"},
|
||||||
|
inputAlerts[1].Fingerprint(): []string{"testing"},
|
||||||
|
inputAlerts[2].Fingerprint(): []string{"prod"},
|
||||||
|
inputAlerts[3].Fingerprint(): []string{"prod"},
|
||||||
|
inputAlerts[4].Fingerprint(): []string{"prod"},
|
||||||
|
inputAlerts[5].Fingerprint(): []string{"kafka", "prod"},
|
||||||
|
}, receivers)
|
||||||
}
|
}
|
||||||
|
|
||||||
type recordStage struct {
|
type recordStage struct {
|
||||||
t *testing.T
|
|
||||||
mtx sync.RWMutex
|
mtx sync.RWMutex
|
||||||
alerts map[string]map[model.Fingerprint]*types.Alert
|
alerts map[string]map[model.Fingerprint]*types.Alert
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,34 +0,0 @@
|
||||||
global:
|
|
||||||
resolve_timeout: 5m
|
|
||||||
|
|
||||||
receivers:
|
|
||||||
- name: 'testing'
|
|
||||||
webhook_configs:
|
|
||||||
- url: 'http://127.0.0.1:5001/'
|
|
||||||
- name: 'prod'
|
|
||||||
webhook_configs:
|
|
||||||
- url: 'http://127.0.0.1:5001/'
|
|
||||||
- name: 'kafka'
|
|
||||||
webhook_configs:
|
|
||||||
- url: 'http://127.0.0.1:5001/'
|
|
||||||
|
|
||||||
route:
|
|
||||||
group_by: ['alertname']
|
|
||||||
group_wait: 10s
|
|
||||||
group_interval: 10s
|
|
||||||
repeat_interval: 1h
|
|
||||||
receiver: 'prod'
|
|
||||||
routes:
|
|
||||||
- match:
|
|
||||||
env: 'testing'
|
|
||||||
receiver: 'testing'
|
|
||||||
group_by: ['alertname', 'service']
|
|
||||||
- match:
|
|
||||||
env: 'prod'
|
|
||||||
receiver: 'prod'
|
|
||||||
group_by: ['alertname', 'service', 'cluster']
|
|
||||||
continue: true
|
|
||||||
- match:
|
|
||||||
kafka: 'yes'
|
|
||||||
receiver: 'kafka'
|
|
||||||
group_by: ['alertname', 'service', 'cluster']
|
|
Loading…
Reference in New Issue