Added group limit to dispatcher.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>
Peter Štibraný 2021-05-05 17:26:37 +02:00 committed by Peter Štibraný
parent f686ff3be2
commit 390474ffbe
3 changed files with 114 additions and 40 deletions

cmd/alertmanager/main.go

@@ -460,7 +460,7 @@ func run() int {
silencer.Mutes(labels)
})
disp = dispatch.NewDispatcher(alerts, routes, pipeline, marker, timeoutFunc, logger, dispMetrics)
disp = dispatch.NewDispatcher(alerts, routes, pipeline, marker, timeoutFunc, nil, logger, dispMetrics)
routes.Walk(func(r *dispatch.Route) {
if r.RouteOpts.RepeatInterval > *retention {
level.Warn(configLogger).Log(

dispatch/dispatch.go

@@ -33,8 +33,9 @@ import (
// DispatcherMetrics represents metrics associated to a dispatcher.
type DispatcherMetrics struct {
aggrGroups prometheus.Gauge
processingDuration prometheus.Summary
aggrGroups prometheus.Gauge
processingDuration prometheus.Summary
aggrGroupLimitReached prometheus.Counter
}
// NewDispatcherMetrics returns a new registered DispatchMetrics.
@@ -52,10 +53,16 @@ func NewDispatcherMetrics(r prometheus.Registerer) *DispatcherMetrics {
Help: "Summary of latencies for the processing of alerts.",
},
),
aggrGroupLimitReached: prometheus.NewCounter(
prometheus.CounterOpts{
Name: "alertmanager_dispatcher_aggregation_group_limit_reached_total",
Help: "Number of times when dispatcher failed to create new aggregation group due to limit.",
},
),
}
if r != nil {
r.MustRegister(m.aggrGroups, m.processingDuration)
r.MustRegister(m.aggrGroups, m.processingDuration, m.aggrGroupLimitReached)
}
return &m
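The counter is registered only when a non-nil Registerer is supplied. A minimal sketch of verifying the registration and the zero starting value, assuming the prometheus, testutil and require imports already used by the dispatcher tests below (the fragment belongs inside a test function with a *testing.T named t):

reg := prometheus.NewRegistry()
m := NewDispatcherMetrics(reg)

// Exactly one counter is collected, and it stays at zero until the dispatcher
// refuses to create a new aggregation group.
require.Equal(t, 1, testutil.CollectAndCount(m.aggrGroupLimitReached))
require.Equal(t, 0.0, testutil.ToFloat64(m.aggrGroupLimitReached))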
@@ -68,12 +75,14 @@ type Dispatcher struct {
alerts provider.Alerts
stage notify.Stage
metrics *DispatcherMetrics
limits Limits
marker types.Marker
timeout func(time.Duration) time.Duration
aggrGroups map[*Route]map[model.Fingerprint]*aggrGroup
mtx sync.RWMutex
mtx sync.RWMutex
aggrGroupsPerRoute map[*Route]map[model.Fingerprint]*aggrGroup
aggrGroupsNum int
done chan struct{}
ctx context.Context
@@ -82,6 +91,14 @@ type Dispatcher struct {
logger log.Logger
}
// Limits describes limits used by Dispatcher.
type Limits interface {
// MaxNumberOfAggregationGroups returns max number of aggregation groups that dispatcher can have.
// 0 or negative value = unlimited.
// If dispatcher hits this limit, it will not create additional groups, but will log an error instead.
MaxNumberOfAggregationGroups() int
}
// NewDispatcher returns a new Dispatcher.
func NewDispatcher(
ap provider.Alerts,
@@ -89,9 +106,14 @@ func NewDispatcher(
s notify.Stage,
mk types.Marker,
to func(time.Duration) time.Duration,
lim Limits,
l log.Logger,
m *DispatcherMetrics,
) *Dispatcher {
if lim == nil {
lim = nilLimits{}
}
disp := &Dispatcher{
alerts: ap,
stage: s,
@@ -100,6 +122,7 @@ func NewDispatcher(
timeout: to,
logger: log.With(l, "component", "dispatcher"),
metrics: m,
limits: lim,
}
return disp
}
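Any type with the single MaxNumberOfAggregationGroups method satisfies Limits. A minimal sketch of a fixed cap, and of how a caller such as the main.go change above could pass it instead of nil (the fixedLimits name and the value 1000 are illustrative, not part of this commit):

// fixedLimits caps the number of aggregation groups at a constant value.
type fixedLimits struct{ maxGroups int }

func (l fixedLimits) MaxNumberOfAggregationGroups() int { return l.maxGroups }

// Passed in place of the nil argument:
//
//    disp = dispatch.NewDispatcher(alerts, routes, pipeline, marker, timeoutFunc,
//        fixedLimits{maxGroups: 1000}, logger, dispMetrics)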
@@ -109,7 +132,8 @@ func (d *Dispatcher) Run() {
d.done = make(chan struct{})
d.mtx.Lock()
d.aggrGroups = map[*Route]map[model.Fingerprint]*aggrGroup{}
d.aggrGroupsPerRoute = map[*Route]map[model.Fingerprint]*aggrGroup{}
d.aggrGroupsNum = 0
d.metrics.aggrGroups.Set(0)
d.ctx, d.cancel = context.WithCancel(context.Background())
d.mtx.Unlock()
@@ -152,11 +176,12 @@ func (d *Dispatcher) run(it provider.AlertIterator) {
case <-cleanup.C:
d.mtx.Lock()
for _, groups := range d.aggrGroups {
for _, groups := range d.aggrGroupsPerRoute {
for _, ag := range groups {
if ag.empty() {
ag.stop()
delete(groups, ag.fingerprint())
d.aggrGroupsNum--
d.metrics.aggrGroups.Dec()
}
}
@@ -201,7 +226,7 @@ func (d *Dispatcher) Groups(routeFilter func(*Route) bool, alertFilter func(*typ
receivers := map[model.Fingerprint][]string{}
now := time.Now()
for route, ags := range d.aggrGroups {
for route, ags := range d.aggrGroupsPerRoute {
if !routeFilter(route) {
continue
}
@@ -284,21 +309,28 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
d.mtx.Lock()
defer d.mtx.Unlock()
group, ok := d.aggrGroups[route]
routeGroups, ok := d.aggrGroupsPerRoute[route]
if !ok {
group = map[model.Fingerprint]*aggrGroup{}
d.aggrGroups[route] = group
routeGroups = map[model.Fingerprint]*aggrGroup{}
d.aggrGroupsPerRoute[route] = routeGroups
}
ag, ok := group[fp]
ag, ok := routeGroups[fp]
if ok {
ag.insert(alert)
return
}
// If the group does not exist, create it.
// If the group does not exist, create it. But check the limit first.
if limit := d.limits.MaxNumberOfAggregationGroups(); limit > 0 && d.aggrGroupsNum >= limit {
d.metrics.aggrGroupLimitReached.Inc()
level.Error(d.logger).Log("msg", "Too many aggregation groups, cannot create new group for alert", "groups", d.aggrGroupsNum, "limit", limit, "alert", alert.Name())
return
}
ag = newAggrGroup(d.ctx, groupLabels, route, d.timeout, d.logger)
group[fp] = ag
routeGroups[fp] = ag
d.aggrGroupsNum++
d.metrics.aggrGroups.Inc()
// Insert the 1st alert in the group before starting the group's run()
@@ -499,3 +531,7 @@ func (ag *aggrGroup) flush(notify func(...*types.Alert) bool) {
}
}
}
type nilLimits struct{}
func (n nilLimits) MaxNumberOfAggregationGroups() int { return 0 }
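MaxNumberOfAggregationGroups is consulted every time the dispatcher is about to create a new aggregation group, so the limit does not have to be a constant. A sketch of an implementation whose cap can be changed at runtime, for example from a configuration reload handler (the reloadableLimits type is illustrative, not part of this commit, and assumes the sync/atomic import):

// reloadableLimits allows the cap to be adjusted while the dispatcher is running.
type reloadableLimits struct {
    max int64 // read and written atomically; 0 or negative = unlimited
}

func (l *reloadableLimits) MaxNumberOfAggregationGroups() int {
    return int(atomic.LoadInt64(&l.max))
}

// SetMax updates the limit; the next group creation attempt observes the new value.
func (l *reloadableLimits) SetMax(n int) {
    atomic.StoreInt64(&l.max, int64(n))
}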

dispatch/dispatch_test.go

@@ -24,6 +24,7 @@ import (
"github.com/go-kit/kit/log"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
@@ -373,7 +374,9 @@ route:
timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, logger, NewDispatcherMetrics(prometheus.NewRegistry()))
lim := limits{groups: 6}
m := NewDispatcherMetrics(prometheus.NewRegistry())
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, lim, logger, m)
go dispatcher.Run()
defer dispatcher.Stop()
@@ -391,7 +394,10 @@ route:
// Matches the second and third sub-route.
newAlert(model.LabelSet{"env": "prod", "alertname": "HighLatency", "cluster": "bb", "service": "db", "kafka": "yes", "instance": "inst3"}),
}
alerts.Put(inputAlerts...)
err = alerts.Put(inputAlerts...)
if err != nil {
t.Fatal(err)
}
// Let alerts get processed.
for i := 0; len(recorder.Alerts()) != 7 && i < 10; i++ {
@@ -411,63 +417,87 @@ route:
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[0]},
Labels: model.LabelSet{
model.LabelName("alertname"): model.LabelValue("OtherAlert"),
"alertname": "OtherAlert",
},
Receiver: "prod",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[1]},
Labels: model.LabelSet{
model.LabelName("alertname"): model.LabelValue("TestingAlert"),
model.LabelName("service"): model.LabelValue("api"),
"alertname": "TestingAlert",
"service": "api",
},
Receiver: "testing",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[2], inputAlerts[3]},
Labels: model.LabelSet{
model.LabelName("alertname"): model.LabelValue("HighErrorRate"),
model.LabelName("service"): model.LabelValue("api"),
model.LabelName("cluster"): model.LabelValue("aa"),
"alertname": "HighErrorRate",
"service": "api",
"cluster": "aa",
},
Receiver: "prod",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[4]},
Labels: model.LabelSet{
model.LabelName("alertname"): model.LabelValue("HighErrorRate"),
model.LabelName("service"): model.LabelValue("api"),
model.LabelName("cluster"): model.LabelValue("bb"),
"alertname": "HighErrorRate",
"service": "api",
"cluster": "bb",
},
Receiver: "prod",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[5]},
Labels: model.LabelSet{
model.LabelName("alertname"): model.LabelValue("HighLatency"),
model.LabelName("service"): model.LabelValue("db"),
model.LabelName("cluster"): model.LabelValue("bb"),
"alertname": "HighLatency",
"service": "db",
"cluster": "bb",
},
Receiver: "kafka",
},
&AlertGroup{
Alerts: []*types.Alert{inputAlerts[5]},
Labels: model.LabelSet{
model.LabelName("alertname"): model.LabelValue("HighLatency"),
model.LabelName("service"): model.LabelValue("db"),
model.LabelName("cluster"): model.LabelValue("bb"),
"alertname": "HighLatency",
"service": "db",
"cluster": "bb",
},
Receiver: "prod",
},
}, alertGroups)
require.Equal(t, map[model.Fingerprint][]string{
inputAlerts[0].Fingerprint(): []string{"prod"},
inputAlerts[1].Fingerprint(): []string{"testing"},
inputAlerts[2].Fingerprint(): []string{"prod"},
inputAlerts[3].Fingerprint(): []string{"prod"},
inputAlerts[4].Fingerprint(): []string{"prod"},
inputAlerts[5].Fingerprint(): []string{"kafka", "prod"},
inputAlerts[0].Fingerprint(): {"prod"},
inputAlerts[1].Fingerprint(): {"testing"},
inputAlerts[2].Fingerprint(): {"prod"},
inputAlerts[3].Fingerprint(): {"prod"},
inputAlerts[4].Fingerprint(): {"prod"},
inputAlerts[5].Fingerprint(): {"kafka", "prod"},
}, receivers)
require.Equal(t, 0.0, testutil.ToFloat64(m.aggrGroupLimitReached))
// Try to store new alert. This time, we will hit limit for number of groups.
err = alerts.Put(newAlert(model.LabelSet{"env": "prod", "alertname": "NewAlert", "cluster": "new-cluster", "service": "db"}))
if err != nil {
t.Fatal(err)
}
// Let alert get processed.
for i := 0; testutil.ToFloat64(m.aggrGroupLimitReached) == 0 && i < 10; i++ {
time.Sleep(200 * time.Millisecond)
}
require.Equal(t, 1.0, testutil.ToFloat64(m.aggrGroupLimitReached))
// Verify there are still only 6 groups.
alertGroups, _ = dispatcher.Groups(
func(*Route) bool {
return true
}, func(*types.Alert, time.Time) bool {
return true
},
)
require.Len(t, alertGroups, 6)
}
type recordStage struct {
@@ -534,7 +564,7 @@ func TestDispatcherRace(t *testing.T) {
defer alerts.Close()
timeout := func(d time.Duration) time.Duration { return time.Duration(0) }
dispatcher := NewDispatcher(alerts, nil, nil, marker, timeout, logger, NewDispatcherMetrics(prometheus.NewRegistry()))
dispatcher := NewDispatcher(alerts, nil, nil, marker, timeout, nil, logger, NewDispatcherMetrics(prometheus.NewRegistry()))
go dispatcher.Run()
dispatcher.Stop()
}
@@ -562,7 +592,7 @@ func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T)
timeout := func(d time.Duration) time.Duration { return d }
recorder := &recordStage{alerts: make(map[string]map[model.Fingerprint]*types.Alert)}
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, logger, NewDispatcherMetrics(prometheus.NewRegistry()))
dispatcher := NewDispatcher(alerts, route, recorder, marker, timeout, nil, logger, NewDispatcherMetrics(prometheus.NewRegistry()))
go dispatcher.Run()
defer dispatcher.Stop()
@@ -585,3 +615,11 @@ func TestDispatcherRaceOnFirstAlertNotDeliveredWhenGroupWaitIsZero(t *testing.T)
// We expect all alerts to be notified immediately, since they all belong to different groups.
require.Equal(t, numAlerts, len(recorder.Alerts()))
}
type limits struct {
groups int
}
func (l limits) MaxNumberOfAggregationGroups() int {
return l.groups
}