dispatch: Add group_by_all support (#1588)
To aggregate by all possible labels use '...' as the sole label name. This effectively disables aggregation entirely, passing through all alerts as-is. This is unlikely to be what you want, unless you have a very low alert volume or your upstream notification system performs its own grouping. Example: group_by: [...] Signed-off-by: Kyryl Sablin <kyryl.sablin@schibsted.com>
This commit is contained in:
parent
758b5e3bb1
commit
32bb289906
|
@ -70,6 +70,12 @@ route:
|
|||
# The labels by which incoming alerts are grouped together. For example,
|
||||
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
|
||||
# be batched into a single group.
|
||||
#
|
||||
# To aggregate by all possible labels use '...' as the sole label name.
|
||||
# This effectively disables aggregation entirely, passing through all
|
||||
# alerts as-is. This is unlikely to be what you want, unless you have
|
||||
# a very low alert volume or your upstream notification system performs
|
||||
# its own grouping. Example: group_by: [...]
|
||||
group_by: ['alertname', 'cluster']
|
||||
|
||||
# When a new group of alerts is created by an incoming alert, wait at
|
||||
|
|
|
@ -494,8 +494,11 @@ func (c *GlobalConfig) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
|||
|
||||
// A Route is a node that contains definitions of how to handle alerts.
|
||||
type Route struct {
|
||||
Receiver string `yaml:"receiver,omitempty" json:"receiver,omitempty"`
|
||||
GroupBy []model.LabelName `yaml:"group_by,omitempty" json:"group_by,omitempty"`
|
||||
Receiver string `yaml:"receiver,omitempty" json:"receiver,omitempty"`
|
||||
|
||||
GroupByStr []string `yaml:"group_by,omitempty" json:"group_by,omitempty"`
|
||||
GroupBy []model.LabelName
|
||||
GroupByAll bool
|
||||
|
||||
Match map[string]string `yaml:"match,omitempty" json:"match,omitempty"`
|
||||
MatchRE map[string]Regexp `yaml:"match_re,omitempty" json:"match_re,omitempty"`
|
||||
|
@ -525,6 +528,21 @@ func (r *Route) UnmarshalYAML(unmarshal func(interface{}) error) error {
|
|||
return fmt.Errorf("invalid label name %q", k)
|
||||
}
|
||||
}
|
||||
for _, l := range r.GroupByStr {
|
||||
if l == "..." {
|
||||
r.GroupByAll = true
|
||||
} else {
|
||||
labelName := model.LabelName(l)
|
||||
if !labelName.IsValid() {
|
||||
return fmt.Errorf("invalid label name %q in group_by list", l)
|
||||
}
|
||||
r.GroupBy = append(r.GroupBy, labelName)
|
||||
}
|
||||
}
|
||||
|
||||
if len(r.GroupBy) > 0 && r.GroupByAll {
|
||||
return fmt.Errorf("cannot have wildcard group_by (`...`) and other other labels at the same time")
|
||||
}
|
||||
|
||||
groupBy := map[model.LabelName]struct{}{}
|
||||
|
||||
|
|
|
@ -144,6 +144,47 @@ receivers:
|
|||
|
||||
}
|
||||
|
||||
func TestWildcardGroupByWithOtherGroupByLabels(t *testing.T) {
|
||||
in := `
|
||||
route:
|
||||
group_by: ['alertname', 'cluster', '...']
|
||||
receiver: team-X-mails
|
||||
receivers:
|
||||
- name: 'team-X-mails'
|
||||
`
|
||||
_, err := Load(in)
|
||||
|
||||
expected := "cannot have wildcard group_by (`...`) and other other labels at the same time"
|
||||
|
||||
if err == nil {
|
||||
t.Fatalf("no error returned, expected:\n%q", expected)
|
||||
}
|
||||
if err.Error() != expected {
|
||||
t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error())
|
||||
}
|
||||
}
|
||||
|
||||
func TestGroupByInvalidLabel(t *testing.T) {
|
||||
in := `
|
||||
route:
|
||||
group_by: ['-invalid-']
|
||||
receiver: team-X-mails
|
||||
receivers:
|
||||
- name: 'team-X-mails'
|
||||
`
|
||||
_, err := Load(in)
|
||||
|
||||
expected := "invalid label name \"-invalid-\" in group_by list"
|
||||
|
||||
if err == nil {
|
||||
t.Fatalf("no error returned, expected:\n%q", expected)
|
||||
}
|
||||
if err.Error() != expected {
|
||||
t.Errorf("\nexpected:\n%q\ngot:\n%q", expected, err.Error())
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
func TestRootRouteExists(t *testing.T) {
|
||||
in := `
|
||||
receivers:
|
||||
|
@ -448,6 +489,12 @@ func TestEmptyFieldsAndRegex(t *testing.T) {
|
|||
"cluster",
|
||||
"service",
|
||||
},
|
||||
GroupByStr: []string{
|
||||
"alertname",
|
||||
"cluster",
|
||||
"service",
|
||||
},
|
||||
GroupByAll: false,
|
||||
Routes: []*Route{
|
||||
{
|
||||
Receiver: "team-X-mails",
|
||||
|
@ -506,6 +553,17 @@ func TestSMTPHello(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestGroupByAll(t *testing.T) {
|
||||
c, _, err := LoadFile("testdata/conf.group-by-all.yml")
|
||||
if err != nil {
|
||||
t.Errorf("Error parsing %s: %s", "testdata/conf.group-by-all.yml", err)
|
||||
}
|
||||
|
||||
if !c.Route.GroupByAll {
|
||||
t.Errorf("Invalid group by all param: expected to by true")
|
||||
}
|
||||
}
|
||||
|
||||
func TestVictorOpsDefaultAPIKey(t *testing.T) {
|
||||
conf, _, err := LoadFile("testdata/conf.victorops-default-apikey.yml")
|
||||
if err != nil {
|
||||
|
|
|
@ -0,0 +1,10 @@
|
|||
route:
|
||||
group_by: [...]
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 3h
|
||||
receiver: team-X
|
||||
|
||||
receivers:
|
||||
- name: 'team-X'
|
||||
|
|
@ -152,13 +152,7 @@ type notifyFunc func(context.Context, ...*types.Alert) bool
|
|||
// processAlert determines in which aggregation group the alert falls
|
||||
// and inserts it.
|
||||
func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
|
||||
groupLabels := model.LabelSet{}
|
||||
|
||||
for ln, lv := range alert.Labels {
|
||||
if _, ok := route.RouteOpts.GroupBy[ln]; ok {
|
||||
groupLabels[ln] = lv
|
||||
}
|
||||
}
|
||||
groupLabels := getGroupLabels(alert, route)
|
||||
|
||||
fp := groupLabels.Fingerprint()
|
||||
|
||||
|
@ -189,6 +183,17 @@ func (d *Dispatcher) processAlert(alert *types.Alert, route *Route) {
|
|||
ag.insert(alert)
|
||||
}
|
||||
|
||||
func getGroupLabels(alert *types.Alert, route *Route) model.LabelSet {
|
||||
groupLabels := model.LabelSet{}
|
||||
for ln, lv := range alert.Labels {
|
||||
if _, ok := route.RouteOpts.GroupBy[ln]; ok || route.RouteOpts.GroupByAll {
|
||||
groupLabels[ln] = lv
|
||||
}
|
||||
}
|
||||
|
||||
return groupLabels
|
||||
}
|
||||
|
||||
// aggrGroup aggregates alert fingerprints into groups to which a
|
||||
// common set of routing options applies.
|
||||
// It emits notifications in the specified intervals.
|
||||
|
|
|
@ -240,3 +240,67 @@ func TestAggrGroup(t *testing.T) {
|
|||
|
||||
ag.stop()
|
||||
}
|
||||
|
||||
func TestGroupLabels(t *testing.T) {
|
||||
var a = &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{
|
||||
"a": "v1",
|
||||
"b": "v2",
|
||||
"c": "v3",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
route := &Route{
|
||||
RouteOpts: RouteOpts{
|
||||
GroupBy: map[model.LabelName]struct{}{
|
||||
"a": struct{}{},
|
||||
"b": struct{}{},
|
||||
},
|
||||
GroupByAll: false,
|
||||
},
|
||||
}
|
||||
|
||||
expLs := model.LabelSet{
|
||||
"a": "v1",
|
||||
"b": "v2",
|
||||
}
|
||||
|
||||
ls := getGroupLabels(a, route)
|
||||
|
||||
if !reflect.DeepEqual(ls, expLs) {
|
||||
t.Fatalf("expected labels are %v, but got %v", expLs, ls)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGroupByAllLabels(t *testing.T) {
|
||||
var a = &types.Alert{
|
||||
Alert: model.Alert{
|
||||
Labels: model.LabelSet{
|
||||
"a": "v1",
|
||||
"b": "v2",
|
||||
"c": "v3",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
route := &Route{
|
||||
RouteOpts: RouteOpts{
|
||||
GroupBy: map[model.LabelName]struct{}{},
|
||||
GroupByAll: true,
|
||||
},
|
||||
}
|
||||
|
||||
expLs := model.LabelSet{
|
||||
"a": "v1",
|
||||
"b": "v2",
|
||||
"c": "v3",
|
||||
}
|
||||
|
||||
ls := getGroupLabels(a, route)
|
||||
|
||||
if !reflect.DeepEqual(ls, expLs) {
|
||||
t.Fatalf("expected labels are %v, but got %v", expLs, ls)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -32,6 +32,7 @@ var DefaultRouteOpts = RouteOpts{
|
|||
GroupInterval: 5 * time.Minute,
|
||||
RepeatInterval: 4 * time.Hour,
|
||||
GroupBy: map[model.LabelName]struct{}{},
|
||||
GroupByAll: false,
|
||||
}
|
||||
|
||||
// A Route is a node that contains definitions of how to handle alerts.
|
||||
|
@ -69,6 +70,9 @@ func NewRoute(cr *config.Route, parent *Route) *Route {
|
|||
opts.GroupBy[ln] = struct{}{}
|
||||
}
|
||||
}
|
||||
|
||||
opts.GroupByAll = cr.GroupByAll
|
||||
|
||||
if cr.GroupWait != nil {
|
||||
opts.GroupWait = time.Duration(*cr.GroupWait)
|
||||
}
|
||||
|
@ -158,6 +162,9 @@ type RouteOpts struct {
|
|||
// What labels to group alerts by for notifications.
|
||||
GroupBy map[model.LabelName]struct{}
|
||||
|
||||
// Use all alert labels to group.
|
||||
GroupByAll bool
|
||||
|
||||
// How long to wait to group matching alerts before sending
|
||||
// a notification.
|
||||
GroupWait time.Duration
|
||||
|
@ -170,7 +177,8 @@ func (ro *RouteOpts) String() string {
|
|||
for ln := range ro.GroupBy {
|
||||
labels = append(labels, ln)
|
||||
}
|
||||
return fmt.Sprintf("<RouteOpts send_to:%q group_by:%q timers:%q|%q>", ro.Receiver, labels, ro.GroupWait, ro.GroupInterval)
|
||||
return fmt.Sprintf("<RouteOpts send_to:%q group_by:%q group_by_all:%t timers:%q|%q>",
|
||||
ro.Receiver, labels, ro.GroupByAll, ro.GroupWait, ro.GroupInterval)
|
||||
}
|
||||
|
||||
// MarshalJSON returns a JSON representation of the routing options.
|
||||
|
@ -178,11 +186,13 @@ func (ro *RouteOpts) MarshalJSON() ([]byte, error) {
|
|||
v := struct {
|
||||
Receiver string `json:"receiver"`
|
||||
GroupBy model.LabelNames `json:"groupBy"`
|
||||
GroupByAll bool `json:"groupByAll"`
|
||||
GroupWait time.Duration `json:"groupWait"`
|
||||
GroupInterval time.Duration `json:"groupInterval"`
|
||||
RepeatInterval time.Duration `json:"repeatInterval"`
|
||||
}{
|
||||
Receiver: ro.Receiver,
|
||||
GroupByAll: ro.GroupByAll,
|
||||
GroupWait: ro.GroupWait,
|
||||
GroupInterval: ro.GroupInterval,
|
||||
RepeatInterval: ro.RepeatInterval,
|
||||
|
|
|
@ -39,7 +39,7 @@ routes:
|
|||
env: 'testing'
|
||||
|
||||
receiver: 'notify-testing'
|
||||
group_by: []
|
||||
group_by: [...]
|
||||
|
||||
- match:
|
||||
env: "production"
|
||||
|
@ -110,6 +110,7 @@ routes:
|
|||
{
|
||||
Receiver: "notify-A",
|
||||
GroupBy: def.GroupBy,
|
||||
GroupByAll: false,
|
||||
GroupWait: def.GroupWait,
|
||||
GroupInterval: def.GroupInterval,
|
||||
RepeatInterval: def.RepeatInterval,
|
||||
|
@ -126,6 +127,7 @@ routes:
|
|||
{
|
||||
Receiver: "notify-A",
|
||||
GroupBy: def.GroupBy,
|
||||
GroupByAll: false,
|
||||
GroupWait: def.GroupWait,
|
||||
GroupInterval: def.GroupInterval,
|
||||
RepeatInterval: def.RepeatInterval,
|
||||
|
@ -141,6 +143,7 @@ routes:
|
|||
{
|
||||
Receiver: "notify-BC",
|
||||
GroupBy: lset("foo", "bar"),
|
||||
GroupByAll: false,
|
||||
GroupWait: 2 * time.Minute,
|
||||
GroupInterval: def.GroupInterval,
|
||||
RepeatInterval: def.RepeatInterval,
|
||||
|
@ -157,6 +160,7 @@ routes:
|
|||
{
|
||||
Receiver: "notify-testing",
|
||||
GroupBy: lset(),
|
||||
GroupByAll: true,
|
||||
GroupWait: def.GroupWait,
|
||||
GroupInterval: def.GroupInterval,
|
||||
RepeatInterval: def.RepeatInterval,
|
||||
|
@ -173,6 +177,7 @@ routes:
|
|||
{
|
||||
Receiver: "notify-productionA",
|
||||
GroupBy: def.GroupBy,
|
||||
GroupByAll: false,
|
||||
GroupWait: 1 * time.Minute,
|
||||
GroupInterval: def.GroupInterval,
|
||||
RepeatInterval: def.RepeatInterval,
|
||||
|
@ -180,6 +185,7 @@ routes:
|
|||
{
|
||||
Receiver: "notify-productionB",
|
||||
GroupBy: lset("job"),
|
||||
GroupByAll: false,
|
||||
GroupWait: 30 * time.Second,
|
||||
GroupInterval: 5 * time.Minute,
|
||||
RepeatInterval: 1 * time.Hour,
|
||||
|
@ -198,6 +204,7 @@ routes:
|
|||
{
|
||||
Receiver: "notify-def",
|
||||
GroupBy: lset("role"),
|
||||
GroupByAll: false,
|
||||
GroupWait: def.GroupWait,
|
||||
GroupInterval: def.GroupInterval,
|
||||
RepeatInterval: def.RepeatInterval,
|
||||
|
@ -214,6 +221,7 @@ routes:
|
|||
{
|
||||
Receiver: "notify-testing",
|
||||
GroupBy: lset("role"),
|
||||
GroupByAll: false,
|
||||
GroupWait: def.GroupWait,
|
||||
GroupInterval: def.GroupInterval,
|
||||
RepeatInterval: def.RepeatInterval,
|
||||
|
@ -231,6 +239,7 @@ routes:
|
|||
{
|
||||
Receiver: "notify-testing",
|
||||
GroupBy: lset("role"),
|
||||
GroupByAll: false,
|
||||
GroupWait: 2 * time.Minute,
|
||||
GroupInterval: def.GroupInterval,
|
||||
RepeatInterval: def.RepeatInterval,
|
||||
|
|
|
@ -18,6 +18,12 @@ route:
|
|||
# The labels by which incoming alerts are grouped together. For example,
|
||||
# multiple alerts coming in for cluster=A and alertname=LatencyHigh would
|
||||
# be batched into a single group.
|
||||
#
|
||||
# To aggregate by all possible labels use '...' as the sole label name.
|
||||
# This effectively disables aggregation entirely, passing through all
|
||||
# alerts as-is. This is unlikely to be what you want, unless you have
|
||||
# a very low alert volume or your upstream notification system performs
|
||||
# its own grouping. Example: group_by: [...]
|
||||
group_by: ['alertname', 'cluster', 'service']
|
||||
|
||||
# When a new group of alerts is created by an incoming alert, wait at
|
||||
|
|
Loading…
Reference in New Issue