dispatch: Add group_by_all support (#1588)
To aggregate by all possible labels use '...' as the sole label name. This effectively disables aggregation entirely, passing through all alerts as-is. This is unlikely to be what you want, unless you have a very low alert volume or your upstream notification system performs its own grouping. Example: group_by: [...] Signed-off-by: Kyryl Sablin <kyryl.sablin@schibsted.com>
This commit is contained in:
parent
758b5e3bb1
commit
32bb289906
|
@ -70,6 +70,12 @@ route:
|
||||||
# The labels by which incoming alerts are grouped together. For example,
|
# The labels by which incoming alerts are grouped together. For example,
|
||||||
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
|
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
|
||||||
# be batched into a single group.
|
# be batched into a single group.
|
||||||
|
#
|
||||||
|
# To aggregate by all possible labels use '...' as the sole label name.
|
||||||
|
# This effectively disables aggregation entirely, passing through all
|
||||||
|
# alerts as-is. This is unlikely to be what you want, unless you have
|
||||||
|
# a very low alert volume or your upstream notification system performs
|
||||||
|
# its own grouping. Example: group_by: [...]
|
||||||
group_by: ['alertname', 'cluster']
|
group_by: ['alertname', 'cluster']
|
||||||
|
|
||||||
# When a new group of alerts is created by an incoming alert, wait at
|
# When a new group of alerts is created by an incoming alert, wait at
|
||||||
|
|
|
@ -495,7 +495,10 @@ func (c *GlobalConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||||
// A Route is a node that contains definitions of how to handle alerts.
|
// A Route is a node that contains definitions of how to handle alerts.
|
||||||
type Route struct {
|
type Route struct {
|
||||||
Receiver string `yaml:"receiver,omitempty" json:"receiver,omitempty"`
|
Receiver string `yaml:"receiver,omitempty" json:"receiver,omitempty"`
|
||||||
GroupBy []model.LabelName `yaml:"group_by,omitempty" json:"group_by,omitempty"`
|
|
||||||
|
GroupByStr []string `yaml:"group_by,omitempty" json:"group_by,omitempty"`
|
||||||
|
GroupBy []model.LabelName
|
||||||
|
GroupByAll bool
|
||||||
|
|
||||||
Match map[string]string `yaml:"match,omitempty" json:"match,omitempty"`
|
Match map[string]string `yaml:"match,omitempty" json:"match,omitempty"`
|
||||||
MatchRE map[string]Regexp `yaml:"match_re,omitempty" json:"match_re,omitempty"`
|
MatchRE map[string]Regexp `yaml:"match_re,omitempty" json:"match_re,omitempty"`
|
||||||
|
@ -525,6 +528,21 @@ func (r *Route) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
||||||
return fmt.Errorf("invalid label name %q", k)
|
return fmt.Errorf("invalid label name %q", k)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
for _, l := range r.GroupByStr {
|
||||||
|
if l == "..." {
|
||||||
|
r.GroupByAll = true
|
||||||
|
} else {
|
||||||
|
labelName := model.LabelName(l)
|
||||||
|
if !labelName.IsValid() {
|
||||||
|
return fmt.Errorf("invalid label name %q in group_by list", l)
|
||||||
|
}
|
||||||
|
r.GroupBy = append(r.GroupBy, labelName)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(r.GroupBy) > 0 && r.GroupByAll {
|
||||||
|
return fmt.Errorf("cannot have wildcard group_by (`...`) and other other labels at the same time")
|
||||||
|
}
|
||||||
|
|
||||||
groupBy := map[model.LabelName]struct{}{}
|
groupBy := map[model.LabelName]struct{}{}
|
||||||
|
|
||||||
|
|
|
@ -144,6 +144,47 @@ receivers:
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TestWildcardGroupByWithOtherGroupByLabels passes Load a configuration whose
// route lists the wildcard '...' together with ordinary label names in
// group_by, and asserts that Load fails with exactly the expected message.
func TestWildcardGroupByWithOtherGroupByLabels(t *testing.T) {
|
||||||
|
in := `
|
||||||
|
route:
|
||||||
|
group_by: ['alertname', 'cluster', '...']
|
||||||
|
receiver: team-X-mails
|
||||||
|
receivers:
|
||||||
|
- name: 'team-X-mails'
|
||||||
|
`
|
||||||
|
_, err := Load(in)
|
||||||
|
|
||||||
|
expected := "cannot have wildcard group_by (`...`) and other other labels at the same time"
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
t.Fatalf("no error returned, expected:\n%q", expected)
|
||||||
|
}
|
||||||
|
if err.Error() != expected {
|
||||||
|
t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGroupByInvalidLabel passes Load a configuration whose route group_by
// list contains the label name '-invalid-', and asserts that Load fails with
// exactly the expected "invalid label name" message.
func TestGroupByInvalidLabel(t *testing.T) {
|
||||||
|
in := `
|
||||||
|
route:
|
||||||
|
group_by: ['-invalid-']
|
||||||
|
receiver: team-X-mails
|
||||||
|
receivers:
|
||||||
|
- name: 'team-X-mails'
|
||||||
|
`
|
||||||
|
_, err := Load(in)
|
||||||
|
|
||||||
|
expected := "invalid label name \"-invalid-\" in group_by list"
|
||||||
|
|
||||||
|
if err == nil {
|
||||||
|
t.Fatalf("no error returned, expected:\n%q", expected)
|
||||||
|
}
|
||||||
|
if err.Error() != expected {
|
||||||
|
t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error())
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
func TestRootRouteExists(t *testing.T) {
|
func TestRootRouteExists(t *testing.T) {
|
||||||
in := `
|
in := `
|
||||||
receivers:
|
receivers:
|
||||||
|
@ -448,6 +489,12 @@ func TestEmptyFieldsAndRegex(t *testing.T) {
|
||||||
"cluster",
|
"cluster",
|
||||||
"service",
|
"service",
|
||||||
},
|
},
|
||||||
|
GroupByStr: []string{
|
||||||
|
"alertname",
|
||||||
|
"cluster",
|
||||||
|
"service",
|
||||||
|
},
|
||||||
|
GroupByAll: false,
|
||||||
Routes: []*Route{
|
Routes: []*Route{
|
||||||
{
|
{
|
||||||
Receiver: "team-X-mails",
|
Receiver: "team-X-mails",
|
||||||
|
@ -506,6 +553,17 @@ func TestSMTPHello(t *testing.T) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGroupByAll(t *testing.T) {
|
||||||
|
c, _, err := LoadFile("testdata/conf.group-by-all.yml")
|
||||||
|
if err != nil {
|
||||||
|
t.Errorf("Error parsing %s: %s", "testdata/conf.group-by-all.yml", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if !c.Route.GroupByAll {
|
||||||
|
t.Errorf("Invalid group by all param: expected to by true")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func TestVictorOpsDefaultAPIKey(t *testing.T) {
|
func TestVictorOpsDefaultAPIKey(t *testing.T) {
|
||||||
conf, _, err := LoadFile("testdata/conf.victorops-default-apikey.yml")
|
conf, _, err := LoadFile("testdata/conf.victorops-default-apikey.yml")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
|
@ -0,0 +1,10 @@
|
||||||
|
route:
|
||||||
|
group_by: [...]
|
||||||
|
group_wait: 30s
|
||||||
|
group_interval: 5m
|
||||||
|
repeat_interval: 3h
|
||||||
|
receiver: team-X
|
||||||
|
|
||||||
|
receivers:
|
||||||
|
- name: 'team-X'
|
||||||
|
|
|
@ -152,13 +152,7 @@ type notifyFunc func(context.Context, ...*types.Alert) bool
|
||||||
// processAlert determines in which aggregation group the alert falls
|
// processAlert determines in which aggregation group the alert falls
|
||||||
// and inserts it.
|
// and inserts it.
|
||||||
func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
|
func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
|
||||||
groupLabels := model.LabelSet{}
|
groupLabels := getGroupLabels(alert, route)
|
||||||
|
|
||||||
for ln, lv := range alert.Labels {
|
|
||||||
if _, ok := route.RouteOpts.GroupBy[ln]; ok {
|
|
||||||
groupLabels[ln] = lv
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
fp := groupLabels.Fingerprint()
|
fp := groupLabels.Fingerprint()
|
||||||
|
|
||||||
|
@ -189,6 +183,17 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
|
||||||
ag.insert(alert)
|
ag.insert(alert)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func getGroupLabels(alert *types.Alert, route *Route) model.LabelSet {
|
||||||
|
groupLabels := model.LabelSet{}
|
||||||
|
for ln, lv := range alert.Labels {
|
||||||
|
if _, ok := route.RouteOpts.GroupBy[ln]; ok || route.RouteOpts.GroupByAll {
|
||||||
|
groupLabels[ln] = lv
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return groupLabels
|
||||||
|
}
|
||||||
|
|
||||||
// aggrGroup aggregates alert fingerprints into groups to which a
|
// aggrGroup aggregates alert fingerprints into groups to which a
|
||||||
// common set of routing options applies.
|
// common set of routing options applies.
|
||||||
// It emits notifications in the specified intervals.
|
// It emits notifications in the specified intervals.
|
||||||
|
|
|
@ -240,3 +240,67 @@ func TestAggrGroup(t *testing.T) {
|
||||||
|
|
||||||
ag.stop()
|
ag.stop()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestGroupLabels(t *testing.T) {
|
||||||
|
var a = &types.Alert{
|
||||||
|
Alert: model.Alert{
|
||||||
|
Labels: model.LabelSet{
|
||||||
|
"a": "v1",
|
||||||
|
"b": "v2",
|
||||||
|
"c": "v3",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
route := &Route{
|
||||||
|
RouteOpts: RouteOpts{
|
||||||
|
GroupBy: map[model.LabelName]struct{}{
|
||||||
|
"a": struct{}{},
|
||||||
|
"b": struct{}{},
|
||||||
|
},
|
||||||
|
GroupByAll: false,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
expLs := model.LabelSet{
|
||||||
|
"a": "v1",
|
||||||
|
"b": "v2",
|
||||||
|
}
|
||||||
|
|
||||||
|
ls := getGroupLabels(a, route)
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(ls, expLs) {
|
||||||
|
t.Fatalf("expected labels are %v, but got %v", expLs, ls)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestGroupByAllLabels(t *testing.T) {
|
||||||
|
var a = &types.Alert{
|
||||||
|
Alert: model.Alert{
|
||||||
|
Labels: model.LabelSet{
|
||||||
|
"a": "v1",
|
||||||
|
"b": "v2",
|
||||||
|
"c": "v3",
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
route := &Route{
|
||||||
|
RouteOpts: RouteOpts{
|
||||||
|
GroupBy: map[model.LabelName]struct{}{},
|
||||||
|
GroupByAll: true,
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
expLs := model.LabelSet{
|
||||||
|
"a": "v1",
|
||||||
|
"b": "v2",
|
||||||
|
"c": "v3",
|
||||||
|
}
|
||||||
|
|
||||||
|
ls := getGroupLabels(a, route)
|
||||||
|
|
||||||
|
if !reflect.DeepEqual(ls, expLs) {
|
||||||
|
t.Fatalf("expected labels are %v, but got %v", expLs, ls)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -32,6 +32,7 @@ var DefaultRouteOpts = RouteOpts{
|
||||||
GroupInterval: 5 * time.Minute,
|
GroupInterval: 5 * time.Minute,
|
||||||
RepeatInterval: 4 * time.Hour,
|
RepeatInterval: 4 * time.Hour,
|
||||||
GroupBy: map[model.LabelName]struct{}{},
|
GroupBy: map[model.LabelName]struct{}{},
|
||||||
|
GroupByAll: false,
|
||||||
}
|
}
|
||||||
|
|
||||||
// A Route is a node that contains definitions of how to handle alerts.
|
// A Route is a node that contains definitions of how to handle alerts.
|
||||||
|
@ -69,6 +70,9 @@ func NewRoute(cr *config.Route, parent *Route) *Route {
|
||||||
opts.GroupBy[ln] = struct{}{}
|
opts.GroupBy[ln] = struct{}{}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
opts.GroupByAll = cr.GroupByAll
|
||||||
|
|
||||||
if cr.GroupWait != nil {
|
if cr.GroupWait != nil {
|
||||||
opts.GroupWait = time.Duration(*cr.GroupWait)
|
opts.GroupWait = time.Duration(*cr.GroupWait)
|
||||||
}
|
}
|
||||||
|
@ -158,6 +162,9 @@ type RouteOpts struct {
|
||||||
// What labels to group alerts by for notifications.
|
// What labels to group alerts by for notifications.
|
||||||
GroupBy map[model.LabelName]struct{}
|
GroupBy map[model.LabelName]struct{}
|
||||||
|
|
||||||
|
// Use all alert labels to group.
|
||||||
|
GroupByAll bool
|
||||||
|
|
||||||
// How long to wait to group matching alerts before sending
|
// How long to wait to group matching alerts before sending
|
||||||
// a notification.
|
// a notification.
|
||||||
GroupWait time.Duration
|
GroupWait time.Duration
|
||||||
|
@ -170,7 +177,8 @@ func (ro *RouteOpts) String() string {
|
||||||
for ln := range ro.GroupBy {
|
for ln := range ro.GroupBy {
|
||||||
labels = append(labels, ln)
|
labels = append(labels, ln)
|
||||||
}
|
}
|
||||||
return fmt.Sprintf("<RouteOpts send_to:%q group_by:%q timers:%q|%q>", ro.Receiver, labels, ro.GroupWait, ro.GroupInterval)
|
return fmt.Sprintf("<RouteOpts send_to:%q group_by:%q group_by_all:%t timers:%q|%q>",
|
||||||
|
ro.Receiver, labels, ro.GroupByAll, ro.GroupWait, ro.GroupInterval)
|
||||||
}
|
}
|
||||||
|
|
||||||
// MarshalJSON returns a JSON representation of the routing options.
|
// MarshalJSON returns a JSON representation of the routing options.
|
||||||
|
@ -178,11 +186,13 @@ func (ro *RouteOpts) MarshalJSON() ([]byte, error) {
|
||||||
v := struct {
|
v := struct {
|
||||||
Receiver string `json:"receiver"`
|
Receiver string `json:"receiver"`
|
||||||
GroupBy model.LabelNames `json:"groupBy"`
|
GroupBy model.LabelNames `json:"groupBy"`
|
||||||
|
GroupByAll bool `json:"groupByAll"`
|
||||||
GroupWait time.Duration `json:"groupWait"`
|
GroupWait time.Duration `json:"groupWait"`
|
||||||
GroupInterval time.Duration `json:"groupInterval"`
|
GroupInterval time.Duration `json:"groupInterval"`
|
||||||
RepeatInterval time.Duration `json:"repeatInterval"`
|
RepeatInterval time.Duration `json:"repeatInterval"`
|
||||||
}{
|
}{
|
||||||
Receiver: ro.Receiver,
|
Receiver: ro.Receiver,
|
||||||
|
GroupByAll: ro.GroupByAll,
|
||||||
GroupWait: ro.GroupWait,
|
GroupWait: ro.GroupWait,
|
||||||
GroupInterval: ro.GroupInterval,
|
GroupInterval: ro.GroupInterval,
|
||||||
RepeatInterval: ro.RepeatInterval,
|
RepeatInterval: ro.RepeatInterval,
|
||||||
|
|
|
@ -39,7 +39,7 @@ routes:
|
||||||
env: 'testing'
|
env: 'testing'
|
||||||
|
|
||||||
receiver: 'notify-testing'
|
receiver: 'notify-testing'
|
||||||
group_by: []
|
group_by: [...]
|
||||||
|
|
||||||
- match:
|
- match:
|
||||||
env: "production"
|
env: "production"
|
||||||
|
@ -110,6 +110,7 @@ routes:
|
||||||
{
|
{
|
||||||
Receiver: "notify-A",
|
Receiver: "notify-A",
|
||||||
GroupBy: def.GroupBy,
|
GroupBy: def.GroupBy,
|
||||||
|
GroupByAll: false,
|
||||||
GroupWait: def.GroupWait,
|
GroupWait: def.GroupWait,
|
||||||
GroupInterval: def.GroupInterval,
|
GroupInterval: def.GroupInterval,
|
||||||
RepeatInterval: def.RepeatInterval,
|
RepeatInterval: def.RepeatInterval,
|
||||||
|
@ -126,6 +127,7 @@ routes:
|
||||||
{
|
{
|
||||||
Receiver: "notify-A",
|
Receiver: "notify-A",
|
||||||
GroupBy: def.GroupBy,
|
GroupBy: def.GroupBy,
|
||||||
|
GroupByAll: false,
|
||||||
GroupWait: def.GroupWait,
|
GroupWait: def.GroupWait,
|
||||||
GroupInterval: def.GroupInterval,
|
GroupInterval: def.GroupInterval,
|
||||||
RepeatInterval: def.RepeatInterval,
|
RepeatInterval: def.RepeatInterval,
|
||||||
|
@ -141,6 +143,7 @@ routes:
|
||||||
{
|
{
|
||||||
Receiver: "notify-BC",
|
Receiver: "notify-BC",
|
||||||
GroupBy: lset("foo", "bar"),
|
GroupBy: lset("foo", "bar"),
|
||||||
|
GroupByAll: false,
|
||||||
GroupWait: 2 * time.Minute,
|
GroupWait: 2 * time.Minute,
|
||||||
GroupInterval: def.GroupInterval,
|
GroupInterval: def.GroupInterval,
|
||||||
RepeatInterval: def.RepeatInterval,
|
RepeatInterval: def.RepeatInterval,
|
||||||
|
@ -157,6 +160,7 @@ routes:
|
||||||
{
|
{
|
||||||
Receiver: "notify-testing",
|
Receiver: "notify-testing",
|
||||||
GroupBy: lset(),
|
GroupBy: lset(),
|
||||||
|
GroupByAll: true,
|
||||||
GroupWait: def.GroupWait,
|
GroupWait: def.GroupWait,
|
||||||
GroupInterval: def.GroupInterval,
|
GroupInterval: def.GroupInterval,
|
||||||
RepeatInterval: def.RepeatInterval,
|
RepeatInterval: def.RepeatInterval,
|
||||||
|
@ -173,6 +177,7 @@ routes:
|
||||||
{
|
{
|
||||||
Receiver: "notify-productionA",
|
Receiver: "notify-productionA",
|
||||||
GroupBy: def.GroupBy,
|
GroupBy: def.GroupBy,
|
||||||
|
GroupByAll: false,
|
||||||
GroupWait: 1 * time.Minute,
|
GroupWait: 1 * time.Minute,
|
||||||
GroupInterval: def.GroupInterval,
|
GroupInterval: def.GroupInterval,
|
||||||
RepeatInterval: def.RepeatInterval,
|
RepeatInterval: def.RepeatInterval,
|
||||||
|
@ -180,6 +185,7 @@ routes:
|
||||||
{
|
{
|
||||||
Receiver: "notify-productionB",
|
Receiver: "notify-productionB",
|
||||||
GroupBy: lset("job"),
|
GroupBy: lset("job"),
|
||||||
|
GroupByAll: false,
|
||||||
GroupWait: 30 * time.Second,
|
GroupWait: 30 * time.Second,
|
||||||
GroupInterval: 5 * time.Minute,
|
GroupInterval: 5 * time.Minute,
|
||||||
RepeatInterval: 1 * time.Hour,
|
RepeatInterval: 1 * time.Hour,
|
||||||
|
@ -198,6 +204,7 @@ routes:
|
||||||
{
|
{
|
||||||
Receiver: "notify-def",
|
Receiver: "notify-def",
|
||||||
GroupBy: lset("role"),
|
GroupBy: lset("role"),
|
||||||
|
GroupByAll: false,
|
||||||
GroupWait: def.GroupWait,
|
GroupWait: def.GroupWait,
|
||||||
GroupInterval: def.GroupInterval,
|
GroupInterval: def.GroupInterval,
|
||||||
RepeatInterval: def.RepeatInterval,
|
RepeatInterval: def.RepeatInterval,
|
||||||
|
@ -214,6 +221,7 @@ routes:
|
||||||
{
|
{
|
||||||
Receiver: "notify-testing",
|
Receiver: "notify-testing",
|
||||||
GroupBy: lset("role"),
|
GroupBy: lset("role"),
|
||||||
|
GroupByAll: false,
|
||||||
GroupWait: def.GroupWait,
|
GroupWait: def.GroupWait,
|
||||||
GroupInterval: def.GroupInterval,
|
GroupInterval: def.GroupInterval,
|
||||||
RepeatInterval: def.RepeatInterval,
|
RepeatInterval: def.RepeatInterval,
|
||||||
|
@ -231,6 +239,7 @@ routes:
|
||||||
{
|
{
|
||||||
Receiver: "notify-testing",
|
Receiver: "notify-testing",
|
||||||
GroupBy: lset("role"),
|
GroupBy: lset("role"),
|
||||||
|
GroupByAll: false,
|
||||||
GroupWait: 2 * time.Minute,
|
GroupWait: 2 * time.Minute,
|
||||||
GroupInterval: def.GroupInterval,
|
GroupInterval: def.GroupInterval,
|
||||||
RepeatInterval: def.RepeatInterval,
|
RepeatInterval: def.RepeatInterval,
|
||||||
|
|
|
@ -18,6 +18,12 @@ route:
|
||||||
# The labels by which incoming alerts are grouped together. For example,
|
# The labels by which incoming alerts are grouped together. For example,
|
||||||
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
|
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
|
||||||
# be batched into a single group.
|
# be batched into a single group.
|
||||||
|
#
|
||||||
|
# To aggregate by all possible labels use '...' as the sole label name.
|
||||||
|
# This effectively disables aggregation entirely, passing through all
|
||||||
|
# alerts as-is. This is unlikely to be what you want, unless you have
|
||||||
|
# a very low alert volume or your upstream notification system performs
|
||||||
|
# its own grouping. Example: group_by: [...]
|
||||||
group_by: ['alertname', 'cluster', 'service']
|
group_by: ['alertname', 'cluster', 'service']
|
||||||
|
|
||||||
# When a new group of alerts is created by an incoming alert, wait at
|
# When a new group of alerts is created by an incoming alert, wait at
|
||||||
|
|
Loading…
Reference in New Issue