From 32bb2899061bb9ff140ccffcb4215727a0f64b51 Mon Sep 17 00:00:00 2001 From: kirillsablin Date: Thu, 29 Nov 2018 12:31:14 +0100 Subject: [PATCH] dispatch: Add group_by_all support (#1588) To aggregate by all possible labels use '...' as the sole label name. This effectively disables aggregation entirely, passing through all alerts as-is. This is unlikely to be what you want, unless you have a very low alert volume or your upstream notification system performs its own grouping. Example: group_by: [...] Signed-off-by: Kyryl Sablin --- README.md | 6 +++ config/config.go | 22 ++++++++- config/config_test.go | 58 ++++++++++++++++++++++++ config/testdata/conf.group-by-all.yml | 10 +++++ dispatch/dispatch.go | 19 +++++--- dispatch/dispatch_test.go | 64 +++++++++++++++++++++++++++ dispatch/route.go | 12 ++++- dispatch/route_test.go | 11 ++++- doc/examples/simple.yml | 6 +++ 9 files changed, 197 insertions(+), 11 deletions(-) create mode 100644 config/testdata/conf.group-by-all.yml diff --git a/README.md b/README.md index 677f5bdf..281fa446 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,12 @@ route: # The labels by which incoming alerts are grouped together. For example, # multiple alerts coming in for cluster=A and alertname=LatencyHigh would # be batched into a single group. + # + # To aggregate by all possible labels use '...' as the sole label name. + # This effectively disables aggregation entirely, passing through all + # alerts as-is. This is unlikely to be what you want, unless you have + # a very low alert volume or your upstream notification system performs + # its own grouping. Example: group_by: [...] group_by: ['alertname', 'cluster'] # When a new group of alerts is created by an incoming alert, wait at diff --git a/config/config.go b/config/config.go index e4322e47..549d9648 100644 --- a/config/config.go +++ b/config/config.go @@ -494,8 +494,11 @@ func (c *GlobalConfig) UnmarshalYAML(unmarshal func(interface{}) error) error { // A Route is a node that contains definitions of how to handle alerts. type Route struct { - Receiver string `yaml:"receiver,omitempty" json:"receiver,omitempty"` - GroupBy []model.LabelName `yaml:"group_by,omitempty" json:"group_by,omitempty"` + Receiver string `yaml:"receiver,omitempty" json:"receiver,omitempty"` + + GroupByStr []string `yaml:"group_by,omitempty" json:"group_by,omitempty"` + GroupBy []model.LabelName + GroupByAll bool Match map[string]string `yaml:"match,omitempty" json:"match,omitempty"` MatchRE map[string]Regexp `yaml:"match_re,omitempty" json:"match_re,omitempty"` @@ -525,6 +528,21 @@ func (r *Route) UnmarshalYAML(unmarshal func(interface{}) error) error { return fmt.Errorf("invalid label name %q", k) } } + for _, l := range r.GroupByStr { + if l == "..." { + r.GroupByAll = true + } else { + labelName := model.LabelName(l) + if !labelName.IsValid() { + return fmt.Errorf("invalid label name %q in group_by list", l) + } + r.GroupBy = append(r.GroupBy, labelName) + } + } + + if len(r.GroupBy) > 0 && r.GroupByAll { + return fmt.Errorf("cannot have wildcard group_by (`...`) and other other labels at the same time") + } groupBy := map[model.LabelName]struct{}{} diff --git a/config/config_test.go b/config/config_test.go index a1b94e21..dd73f37d 100644 --- a/config/config_test.go +++ b/config/config_test.go @@ -144,6 +144,47 @@ receivers: } +func TestWildcardGroupByWithOtherGroupByLabels(t *testing.T) { + in := ` +route: + group_by: ['alertname', 'cluster', '...'] + receiver: team-X-mails +receivers: +- name: 'team-X-mails' +` + _, err := Load(in) + + expected := "cannot have wildcard group_by (`...`) and other other labels at the same time" + + if err == nil { + t.Fatalf("no error returned, expected:\n%q", expected) + } + if err.Error() != expected { + t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error()) + } +} + +func TestGroupByInvalidLabel(t *testing.T) { + in := ` +route: + group_by: ['-invalid-'] + receiver: team-X-mails +receivers: +- name: 'team-X-mails' +` + _, err := Load(in) + + expected := "invalid label name \"-invalid-\" in group_by list" + + if err == nil { + t.Fatalf("no error returned, expected:\n%q", expected) + } + if err.Error() != expected { + t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error()) + } + +} + func TestRootRouteExists(t *testing.T) { in := ` receivers: @@ -448,6 +489,12 @@ func TestEmptyFieldsAndRegex(t *testing.T) { "cluster", "service", }, + GroupByStr: []string{ + "alertname", + "cluster", + "service", + }, + GroupByAll: false, Routes: []*Route{ { Receiver: "team-X-mails", @@ -506,6 +553,17 @@ func TestSMTPHello(t *testing.T) { } } +func TestGroupByAll(t *testing.T) { + c, _, err := LoadFile("testdata/conf.group-by-all.yml") + if err != nil { + t.Errorf("Error parsing %s: %s", "testdata/conf.group-by-all.yml", err) + } + + if !c.Route.GroupByAll { + t.Errorf("Invalid group by all param: expected to by true") + } +} + func TestVictorOpsDefaultAPIKey(t *testing.T) { conf, _, err := LoadFile("testdata/conf.victorops-default-apikey.yml") if err != nil { diff --git a/config/testdata/conf.group-by-all.yml b/config/testdata/conf.group-by-all.yml new file mode 100644 index 00000000..12c8f3b9 --- /dev/null +++ b/config/testdata/conf.group-by-all.yml @@ -0,0 +1,10 @@ +route: + group_by: [...] + group_wait: 30s + group_interval: 5m + repeat_interval: 3h + receiver: team-X + +receivers: +- name: 'team-X' + diff --git a/dispatch/dispatch.go b/dispatch/dispatch.go index 01182d5b..89080994 100644 --- a/dispatch/dispatch.go +++ b/dispatch/dispatch.go @@ -152,13 +152,7 @@ type notifyFunc func(context.Context, ...*types.Alert) bool // processAlert determines in which aggregation group the alert falls // and inserts it. func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) { - groupLabels := model.LabelSet{} - - for ln, lv := range alert.Labels { - if _, ok := route.RouteOpts.GroupBy[ln]; ok { - groupLabels[ln] = lv - } - } + groupLabels := getGroupLabels(alert, route) fp := groupLabels.Fingerprint() @@ -189,6 +183,17 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) { ag.insert(alert) } +func getGroupLabels(alert *types.Alert, route *Route) model.LabelSet { + groupLabels := model.LabelSet{} + for ln, lv := range alert.Labels { + if _, ok := route.RouteOpts.GroupBy[ln]; ok || route.RouteOpts.GroupByAll { + groupLabels[ln] = lv + } + } + + return groupLabels +} + // aggrGroup aggregates alert fingerprints into groups to which a // common set of routing options applies. // It emits notifications in the specified intervals. diff --git a/dispatch/dispatch_test.go b/dispatch/dispatch_test.go index f68745b2..4ec2ab84 100644 --- a/dispatch/dispatch_test.go +++ b/dispatch/dispatch_test.go @@ -240,3 +240,67 @@ func TestAggrGroup(t *testing.T) { ag.stop() } + +func TestGroupLabels(t *testing.T) { + var a = &types.Alert{ + Alert: model.Alert{ + Labels: model.LabelSet{ + "a": "v1", + "b": "v2", + "c": "v3", + }, + }, + } + + route := &Route{ + RouteOpts: RouteOpts{ + GroupBy: map[model.LabelName]struct{}{ + "a": struct{}{}, + "b": struct{}{}, + }, + GroupByAll: false, + }, + } + + expLs := model.LabelSet{ + "a": "v1", + "b": "v2", + } + + ls := getGroupLabels(a, route) + + if !reflect.DeepEqual(ls, expLs) { + t.Fatalf("expected labels are %v, but got %v", expLs, ls) + } +} + +func TestGroupByAllLabels(t *testing.T) { + var a = &types.Alert{ + Alert: model.Alert{ + Labels: model.LabelSet{ + "a": "v1", + "b": "v2", + "c": "v3", + }, + }, + } + + route := &Route{ + RouteOpts: RouteOpts{ + GroupBy: map[model.LabelName]struct{}{}, + GroupByAll: true, + }, + } + + expLs := model.LabelSet{ + "a": "v1", + "b": "v2", + "c": "v3", + } + + ls := getGroupLabels(a, route) + + if !reflect.DeepEqual(ls, expLs) { + t.Fatalf("expected labels are %v, but got %v", expLs, ls) + } +} diff --git a/dispatch/route.go b/dispatch/route.go index add2afb6..928d641a 100644 --- a/dispatch/route.go +++ b/dispatch/route.go @@ -32,6 +32,7 @@ var DefaultRouteOpts = RouteOpts{ GroupInterval: 5 * time.Minute, RepeatInterval: 4 * time.Hour, GroupBy: map[model.LabelName]struct{}{}, + GroupByAll: false, } // A Route is a node that contains definitions of how to handle alerts. @@ -69,6 +70,9 @@ func NewRoute(cr *config.Route, parent *Route) *Route { opts.GroupBy[ln] = struct{}{} } } + + opts.GroupByAll = cr.GroupByAll + if cr.GroupWait != nil { opts.GroupWait = time.Duration(*cr.GroupWait) } @@ -158,6 +162,9 @@ type RouteOpts struct { // What labels to group alerts by for notifications. GroupBy map[model.LabelName]struct{} + // Use all alert labels to group. + GroupByAll bool + // How long to wait to group matching alerts before sending // a notification. GroupWait time.Duration @@ -170,7 +177,8 @@ func (ro *RouteOpts) String() string { for ln := range ro.GroupBy { labels = append(labels, ln) } - return fmt.Sprintf("", ro.Receiver, labels, ro.GroupWait, ro.GroupInterval) + return fmt.Sprintf("", + ro.Receiver, labels, ro.GroupByAll, ro.GroupWait, ro.GroupInterval) } // MarshalJSON returns a JSON representation of the routing options. @@ -178,11 +186,13 @@ func (ro *RouteOpts) MarshalJSON() ([]byte, error) { v := struct { Receiver string `json:"receiver"` GroupBy model.LabelNames `json:"groupBy"` + GroupByAll bool `json:"groupByAll"` GroupWait time.Duration `json:"groupWait"` GroupInterval time.Duration `json:"groupInterval"` RepeatInterval time.Duration `json:"repeatInterval"` }{ Receiver: ro.Receiver, + GroupByAll: ro.GroupByAll, GroupWait: ro.GroupWait, GroupInterval: ro.GroupInterval, RepeatInterval: ro.RepeatInterval, diff --git a/dispatch/route_test.go b/dispatch/route_test.go index 396c41af..e291b3f2 100644 --- a/dispatch/route_test.go +++ b/dispatch/route_test.go @@ -39,7 +39,7 @@ routes: env: 'testing' receiver: 'notify-testing' - group_by: [] + group_by: [...] - match: env: "production" @@ -110,6 +110,7 @@ routes: { Receiver: "notify-A", GroupBy: def.GroupBy, + GroupByAll: false, GroupWait: def.GroupWait, GroupInterval: def.GroupInterval, RepeatInterval: def.RepeatInterval, @@ -126,6 +127,7 @@ routes: { Receiver: "notify-A", GroupBy: def.GroupBy, + GroupByAll: false, GroupWait: def.GroupWait, GroupInterval: def.GroupInterval, RepeatInterval: def.RepeatInterval, @@ -141,6 +143,7 @@ routes: { Receiver: "notify-BC", GroupBy: lset("foo", "bar"), + GroupByAll: false, GroupWait: 2 * time.Minute, GroupInterval: def.GroupInterval, RepeatInterval: def.RepeatInterval, @@ -157,6 +160,7 @@ routes: { Receiver: "notify-testing", GroupBy: lset(), + GroupByAll: true, GroupWait: def.GroupWait, GroupInterval: def.GroupInterval, RepeatInterval: def.RepeatInterval, @@ -173,6 +177,7 @@ routes: { Receiver: "notify-productionA", GroupBy: def.GroupBy, + GroupByAll: false, GroupWait: 1 * time.Minute, GroupInterval: def.GroupInterval, RepeatInterval: def.RepeatInterval, @@ -180,6 +185,7 @@ routes: { Receiver: "notify-productionB", GroupBy: lset("job"), + GroupByAll: false, GroupWait: 30 * time.Second, GroupInterval: 5 * time.Minute, RepeatInterval: 1 * time.Hour, @@ -198,6 +204,7 @@ routes: { Receiver: "notify-def", GroupBy: lset("role"), + GroupByAll: false, GroupWait: def.GroupWait, GroupInterval: def.GroupInterval, RepeatInterval: def.RepeatInterval, @@ -214,6 +221,7 @@ routes: { Receiver: "notify-testing", GroupBy: lset("role"), + GroupByAll: false, GroupWait: def.GroupWait, GroupInterval: def.GroupInterval, RepeatInterval: def.RepeatInterval, @@ -231,6 +239,7 @@ routes: { Receiver: "notify-testing", GroupBy: lset("role"), + GroupByAll: false, GroupWait: 2 * time.Minute, GroupInterval: def.GroupInterval, RepeatInterval: def.RepeatInterval, diff --git a/doc/examples/simple.yml b/doc/examples/simple.yml index 3a2165ca..921e718e 100644 --- a/doc/examples/simple.yml +++ b/doc/examples/simple.yml @@ -18,6 +18,12 @@ route: # The labels by which incoming alerts are grouped together. For example, # multiple alerts coming in for cluster=A and alertname=LatencyHigh would # be batched into a single group. + # + # To aggregate by all possible labels use '...' as the sole label name. + # This effectively disables aggregation entirely, passing through all + # alerts as-is. This is unlikely to be what you want, unless you have + # a very low alert volume or your upstream notification system performs + # its own grouping. Example: group_by: [...] group_by: ['alertname', 'cluster', 'service'] # When a new group of alerts is created by an incoming alert, wait at