dispatch: Add group_by_all support (#1588)

To aggregate by all possible labels use '...' as the sole label name. 
This effectively disables aggregation entirely, passing through all 
alerts as-is. This is unlikely to be what you want, unless you have 
a very low alert volume or your upstream notification system performs 
its own grouping. Example: group_by: [...]

Signed-off-by: Kyryl Sablin <kyryl.sablin@schibsted.com>
This commit is contained in:
kirillsablin 2018-11-29 12:31:14 +01:00 committed by Max Inden
parent 758b5e3bb1
commit 32bb289906
9 changed files with 197 additions and 11 deletions

View File

@ -70,6 +70,12 @@ route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
#
# To aggregate by all possible labels use '...' as the sole label name.
# This effectively disables aggregation entirely, passing through all
# alerts as-is. This is unlikely to be what you want, unless you have
# a very low alert volume or your upstream notification system performs
# its own grouping. Example: group_by: [...]
group_by: ['alertname', 'cluster']
# When a new group of alerts is created by an incoming alert, wait at

View File

@ -495,7 +495,10 @@ func (c *GlobalConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
// A Route is a node that contains definitions of how to handle alerts.
type Route struct {
Receiver string `yaml:"receiver,omitempty" json:"receiver,omitempty"`
GroupBy []model.LabelName `yaml:"group_by,omitempty" json:"group_by,omitempty"`
GroupByStr []string `yaml:"group_by,omitempty" json:"group_by,omitempty"`
GroupBy []model.LabelName
GroupByAll bool
Match map[string]string `yaml:"match,omitempty" json:"match,omitempty"`
MatchRE map[string]Regexp `yaml:"match_re,omitempty" json:"match_re,omitempty"`
@ -525,6 +528,21 @@ func (r *Route) UnmarshalYAML(unmarshal func(interface{}) error) error {
return fmt.Errorf("invalid label name %q", k)
}
}
for _, l := range r.GroupByStr {
if l == "..." {
r.GroupByAll = true
} else {
labelName := model.LabelName(l)
if !labelName.IsValid() {
return fmt.Errorf("invalid label name %q in group_by list", l)
}
r.GroupBy = append(r.GroupBy, labelName)
}
}
if len(r.GroupBy) > 0 && r.GroupByAll {
return fmt.Errorf("cannot have wildcard group_by (`...`) and other other labels at the same time")
}
groupBy := map[model.LabelName]struct{}{}

View File

@ -144,6 +144,47 @@ receivers:
}
func TestWildcardGroupByWithOtherGroupByLabels(t *testing.T) {
in := `
route:
group_by: ['alertname', 'cluster', '...']
receiver: team-X-mails
receivers:
- name: 'team-X-mails'
`
_, err := Load(in)
expected := "cannot have wildcard group_by (`...`) and other other labels at the same time"
if err == nil {
t.Fatalf("no error returned, expected:\n%q", expected)
}
if err.Error() != expected {
t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error())
}
}
func TestGroupByInvalidLabel(t *testing.T) {
in := `
route:
group_by: ['-invalid-']
receiver: team-X-mails
receivers:
- name: 'team-X-mails'
`
_, err := Load(in)
expected := "invalid label name \"-invalid-\" in group_by list"
if err == nil {
t.Fatalf("no error returned, expected:\n%q", expected)
}
if err.Error() != expected {
t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error())
}
}
func TestRootRouteExists(t *testing.T) {
in := `
receivers:
@ -448,6 +489,12 @@ func TestEmptyFieldsAndRegex(t *testing.T) {
"cluster",
"service",
},
GroupByStr: []string{
"alertname",
"cluster",
"service",
},
GroupByAll: false,
Routes: []*Route{
{
Receiver: "team-X-mails",
@ -506,6 +553,17 @@ func TestSMTPHello(t *testing.T) {
}
}
func TestGroupByAll(t *testing.T) {
c, _, err := LoadFile("testdata/conf.group-by-all.yml")
if err != nil {
t.Errorf("Error parsing %s: %s", "testdata/conf.group-by-all.yml", err)
}
if !c.Route.GroupByAll {
t.Errorf("Invalid group by all param: expected to by true")
}
}
func TestVictorOpsDefaultAPIKey(t *testing.T) {
conf, _, err := LoadFile("testdata/conf.victorops-default-apikey.yml")
if err != nil {

10
config/testdata/conf.group-by-all.yml vendored Normal file
View File

@ -0,0 +1,10 @@
route:
group_by: [...]
group_wait: 30s
group_interval: 5m
repeat_interval: 3h
receiver: team-X
receivers:
- name: 'team-X'

View File

@ -152,13 +152,7 @@ type notifyFunc func(context.Context, ...*types.Alert) bool
// processAlert determines in which aggregation group the alert falls
// and inserts it.
func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
groupLabels := model.LabelSet{}
for ln, lv := range alert.Labels {
if _, ok := route.RouteOpts.GroupBy[ln]; ok {
groupLabels[ln] = lv
}
}
groupLabels := getGroupLabels(alert, route)
fp := groupLabels.Fingerprint()
@ -189,6 +183,17 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
ag.insert(alert)
}
func getGroupLabels(alert *types.Alert, route *Route) model.LabelSet {
groupLabels := model.LabelSet{}
for ln, lv := range alert.Labels {
if _, ok := route.RouteOpts.GroupBy[ln]; ok || route.RouteOpts.GroupByAll {
groupLabels[ln] = lv
}
}
return groupLabels
}
// aggrGroup aggregates alert fingerprints into groups to which a
// common set of routing options applies.
// It emits notifications in the specified intervals.

View File

@ -240,3 +240,67 @@ func TestAggrGroup(t *testing.T) {
ag.stop()
}
func TestGroupLabels(t *testing.T) {
var a = &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"a": "v1",
"b": "v2",
"c": "v3",
},
},
}
route := &Route{
RouteOpts: RouteOpts{
GroupBy: map[model.LabelName]struct{}{
"a": struct{}{},
"b": struct{}{},
},
GroupByAll: false,
},
}
expLs := model.LabelSet{
"a": "v1",
"b": "v2",
}
ls := getGroupLabels(a, route)
if !reflect.DeepEqual(ls, expLs) {
t.Fatalf("expected labels are %v, but got %v", expLs, ls)
}
}
func TestGroupByAllLabels(t *testing.T) {
var a = &types.Alert{
Alert: model.Alert{
Labels: model.LabelSet{
"a": "v1",
"b": "v2",
"c": "v3",
},
},
}
route := &Route{
RouteOpts: RouteOpts{
GroupBy: map[model.LabelName]struct{}{},
GroupByAll: true,
},
}
expLs := model.LabelSet{
"a": "v1",
"b": "v2",
"c": "v3",
}
ls := getGroupLabels(a, route)
if !reflect.DeepEqual(ls, expLs) {
t.Fatalf("expected labels are %v, but got %v", expLs, ls)
}
}

View File

@ -32,6 +32,7 @@ var DefaultRouteOpts = RouteOpts{
GroupInterval: 5 * time.Minute,
RepeatInterval: 4 * time.Hour,
GroupBy: map[model.LabelName]struct{}{},
GroupByAll: false,
}
// A Route is a node that contains definitions of how to handle alerts.
@ -69,6 +70,9 @@ func NewRoute(cr *config.Route, parent *Route) *Route {
opts.GroupBy[ln] = struct{}{}
}
}
opts.GroupByAll = cr.GroupByAll
if cr.GroupWait != nil {
opts.GroupWait = time.Duration(*cr.GroupWait)
}
@ -158,6 +162,9 @@ type RouteOpts struct {
// What labels to group alerts by for notifications.
GroupBy map[model.LabelName]struct{}
// Use all alert labels to group.
GroupByAll bool
// How long to wait to group matching alerts before sending
// a notification.
GroupWait time.Duration
@ -170,7 +177,8 @@ func (ro *RouteOpts) String() string {
for ln := range ro.GroupBy {
labels = append(labels, ln)
}
return fmt.Sprintf("<RouteOpts send_to:%q group_by:%q timers:%q|%q>", ro.Receiver, labels, ro.GroupWait, ro.GroupInterval)
return fmt.Sprintf("<RouteOpts send_to:%q group_by:%q group_by_all:%t timers:%q|%q>",
ro.Receiver, labels, ro.GroupByAll, ro.GroupWait, ro.GroupInterval)
}
// MarshalJSON returns a JSON representation of the routing options.
@ -178,11 +186,13 @@ func (ro *RouteOpts) MarshalJSON() ([]byte, error) {
v := struct {
Receiver string `json:"receiver"`
GroupBy model.LabelNames `json:"groupBy"`
GroupByAll bool `json:"groupByAll"`
GroupWait time.Duration `json:"groupWait"`
GroupInterval time.Duration `json:"groupInterval"`
RepeatInterval time.Duration `json:"repeatInterval"`
}{
Receiver: ro.Receiver,
GroupByAll: ro.GroupByAll,
GroupWait: ro.GroupWait,
GroupInterval: ro.GroupInterval,
RepeatInterval: ro.RepeatInterval,

View File

@ -39,7 +39,7 @@ routes:
env: 'testing'
receiver: 'notify-testing'
group_by: []
group_by: [...]
- match:
env: "production"
@ -110,6 +110,7 @@ routes:
{
Receiver: "notify-A",
GroupBy: def.GroupBy,
GroupByAll: false,
GroupWait: def.GroupWait,
GroupInterval: def.GroupInterval,
RepeatInterval: def.RepeatInterval,
@ -126,6 +127,7 @@ routes:
{
Receiver: "notify-A",
GroupBy: def.GroupBy,
GroupByAll: false,
GroupWait: def.GroupWait,
GroupInterval: def.GroupInterval,
RepeatInterval: def.RepeatInterval,
@ -141,6 +143,7 @@ routes:
{
Receiver: "notify-BC",
GroupBy: lset("foo", "bar"),
GroupByAll: false,
GroupWait: 2 * time.Minute,
GroupInterval: def.GroupInterval,
RepeatInterval: def.RepeatInterval,
@ -157,6 +160,7 @@ routes:
{
Receiver: "notify-testing",
GroupBy: lset(),
GroupByAll: true,
GroupWait: def.GroupWait,
GroupInterval: def.GroupInterval,
RepeatInterval: def.RepeatInterval,
@ -173,6 +177,7 @@ routes:
{
Receiver: "notify-productionA",
GroupBy: def.GroupBy,
GroupByAll: false,
GroupWait: 1 * time.Minute,
GroupInterval: def.GroupInterval,
RepeatInterval: def.RepeatInterval,
@ -180,6 +185,7 @@ routes:
{
Receiver: "notify-productionB",
GroupBy: lset("job"),
GroupByAll: false,
GroupWait: 30 * time.Second,
GroupInterval: 5 * time.Minute,
RepeatInterval: 1 * time.Hour,
@ -198,6 +204,7 @@ routes:
{
Receiver: "notify-def",
GroupBy: lset("role"),
GroupByAll: false,
GroupWait: def.GroupWait,
GroupInterval: def.GroupInterval,
RepeatInterval: def.RepeatInterval,
@ -214,6 +221,7 @@ routes:
{
Receiver: "notify-testing",
GroupBy: lset("role"),
GroupByAll: false,
GroupWait: def.GroupWait,
GroupInterval: def.GroupInterval,
RepeatInterval: def.RepeatInterval,
@ -231,6 +239,7 @@ routes:
{
Receiver: "notify-testing",
GroupBy: lset("role"),
GroupByAll: false,
GroupWait: 2 * time.Minute,
GroupInterval: def.GroupInterval,
RepeatInterval: def.RepeatInterval,

View File

@ -18,6 +18,12 @@ route:
# The labels by which incoming alerts are grouped together. For example,
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
# be batched into a single group.
#
# To aggregate by all possible labels use '...' as the sole label name.
# This effectively disables aggregation entirely, passing through all
# alerts as-is. This is unlikely to be what you want, unless you have
# a very low alert volume or your upstream notification system performs
# its own grouping. Example: group_by: [...]
group_by: ['alertname', 'cluster', 'service']
# When a new group of alerts is created by an incoming alert, wait at