2015-10-11 15:24:49 +00:00
|
|
|
// Copyright 2015 Prometheus Team
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
//
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
//
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
|
2016-08-09 09:04:01 +00:00
|
|
|
package inhibit
|
2015-09-27 11:20:54 +00:00
|
|
|
|
|
|
|
import (
|
2017-10-07 11:01:37 +00:00
|
|
|
"context"
|
2024-11-06 09:09:57 +00:00
|
|
|
"log/slog"
|
2015-09-27 11:20:54 +00:00
|
|
|
"sync"
|
2016-06-13 13:14:51 +00:00
|
|
|
"time"
|
2015-09-27 11:20:54 +00:00
|
|
|
|
2019-04-22 13:11:40 +00:00
|
|
|
"github.com/oklog/run"
|
2015-09-27 11:20:54 +00:00
|
|
|
"github.com/prometheus/common/model"
|
|
|
|
|
|
|
|
"github.com/prometheus/alertmanager/config"
|
2021-01-22 14:54:11 +00:00
|
|
|
"github.com/prometheus/alertmanager/pkg/labels"
|
2015-09-27 11:20:54 +00:00
|
|
|
"github.com/prometheus/alertmanager/provider"
|
2018-09-03 12:52:53 +00:00
|
|
|
"github.com/prometheus/alertmanager/store"
|
2015-09-27 11:20:54 +00:00
|
|
|
"github.com/prometheus/alertmanager/types"
|
|
|
|
)
|
|
|
|
|
2019-02-25 16:11:43 +00:00
|
|
|
// An Inhibitor determines whether a given label set is muted based on the
|
|
|
|
// currently active alerts and a set of inhibition rules. It implements the
|
|
|
|
// Muter interface.
|
2015-09-27 11:20:54 +00:00
|
|
|
type Inhibitor struct {
|
|
|
|
alerts provider.Alerts
|
|
|
|
rules []*InhibitRule
|
2024-04-30 14:26:04 +00:00
|
|
|
marker types.AlertMarker
|
2024-11-06 09:09:57 +00:00
|
|
|
logger *slog.Logger
|
2015-09-27 11:20:54 +00:00
|
|
|
|
2017-10-07 11:01:37 +00:00
|
|
|
mtx sync.RWMutex
|
|
|
|
cancel func()
|
2015-09-27 11:20:54 +00:00
|
|
|
}
|
|
|
|
|
2015-11-05 09:49:32 +00:00
|
|
|
// NewInhibitor returns a new Inhibitor.
|
2024-11-06 09:09:57 +00:00
|
|
|
func NewInhibitor(ap provider.Alerts, rs []config.InhibitRule, mk types.AlertMarker, logger *slog.Logger) *Inhibitor {
|
2015-10-11 14:54:39 +00:00
|
|
|
ih := &Inhibitor{
|
|
|
|
alerts: ap,
|
2015-11-09 13:34:57 +00:00
|
|
|
marker: mk,
|
2017-10-22 05:59:33 +00:00
|
|
|
logger: logger,
|
2015-10-11 14:54:39 +00:00
|
|
|
}
|
|
|
|
for _, cr := range rs {
|
2016-06-13 13:14:51 +00:00
|
|
|
r := NewInhibitRule(cr)
|
|
|
|
ih.rules = append(ih.rules, r)
|
2015-10-11 14:54:39 +00:00
|
|
|
}
|
|
|
|
return ih
|
|
|
|
}
|
2015-09-27 11:20:54 +00:00
|
|
|
|
2017-10-07 11:01:37 +00:00
|
|
|
func (ih *Inhibitor) run(ctx context.Context) {
|
2016-06-13 13:14:51 +00:00
|
|
|
it := ih.alerts.Subscribe()
|
|
|
|
defer it.Close()
|
|
|
|
|
|
|
|
for {
|
|
|
|
select {
|
2017-10-07 11:01:37 +00:00
|
|
|
case <-ctx.Done():
|
2016-06-13 13:14:51 +00:00
|
|
|
return
|
|
|
|
case a := <-it.Next():
|
|
|
|
if err := it.Err(); err != nil {
|
2024-11-06 09:09:57 +00:00
|
|
|
ih.logger.Error("Error iterating alerts", "err", err)
|
2016-06-13 13:14:51 +00:00
|
|
|
continue
|
|
|
|
}
|
2018-04-18 14:26:04 +00:00
|
|
|
// Update the inhibition rules' cache.
|
2016-06-13 13:14:51 +00:00
|
|
|
for _, r := range ih.rules {
|
2021-01-22 14:54:11 +00:00
|
|
|
if r.SourceMatchers.Matches(a.Labels) {
|
2018-09-03 12:52:53 +00:00
|
|
|
if err := r.scache.Set(a); err != nil {
|
2024-11-06 09:09:57 +00:00
|
|
|
ih.logger.Error("error on set alert", "err", err)
|
2018-09-03 12:52:53 +00:00
|
|
|
}
|
2016-06-13 13:14:51 +00:00
|
|
|
}
|
2015-09-27 11:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
2016-06-13 13:14:51 +00:00
|
|
|
}
|
|
|
|
|
2018-09-03 12:52:53 +00:00
|
|
|
// Run the Inhibitor's background processing.
|
2017-10-07 11:01:37 +00:00
|
|
|
func (ih *Inhibitor) Run() {
|
|
|
|
var (
|
2019-04-22 13:11:40 +00:00
|
|
|
g run.Group
|
2017-10-07 11:01:37 +00:00
|
|
|
ctx context.Context
|
|
|
|
)
|
|
|
|
|
2017-11-23 10:45:00 +00:00
|
|
|
ih.mtx.Lock()
|
2017-10-07 11:01:37 +00:00
|
|
|
ctx, ih.cancel = context.WithCancel(context.Background())
|
2017-11-23 10:45:00 +00:00
|
|
|
ih.mtx.Unlock()
|
2017-10-07 11:01:37 +00:00
|
|
|
runCtx, runCancel := context.WithCancel(ctx)
|
|
|
|
|
2018-09-03 12:52:53 +00:00
|
|
|
for _, rule := range ih.rules {
|
2019-10-22 07:25:31 +00:00
|
|
|
go rule.scache.Run(runCtx, 15*time.Minute)
|
2018-09-03 12:52:53 +00:00
|
|
|
}
|
|
|
|
|
2017-10-07 11:01:37 +00:00
|
|
|
g.Add(func() error {
|
|
|
|
ih.run(runCtx)
|
|
|
|
return nil
|
|
|
|
}, func(err error) {
|
|
|
|
runCancel()
|
|
|
|
})
|
|
|
|
|
2018-08-05 13:38:25 +00:00
|
|
|
if err := g.Run(); err != nil {
|
2024-11-06 09:09:57 +00:00
|
|
|
ih.logger.Warn("error running inhibitor", "err", err)
|
2018-08-05 13:38:25 +00:00
|
|
|
}
|
2017-10-07 11:01:37 +00:00
|
|
|
}
|
|
|
|
|
2016-06-13 13:14:51 +00:00
|
|
|
// Stop the Inhibitor's background processing.
|
|
|
|
func (ih *Inhibitor) Stop() {
|
|
|
|
if ih == nil {
|
|
|
|
return
|
2015-11-20 14:10:38 +00:00
|
|
|
}
|
2015-11-09 13:34:57 +00:00
|
|
|
|
2017-11-23 10:45:00 +00:00
|
|
|
ih.mtx.RLock()
|
|
|
|
defer ih.mtx.RUnlock()
|
2017-10-07 11:01:37 +00:00
|
|
|
if ih.cancel != nil {
|
|
|
|
ih.cancel()
|
2016-06-13 13:14:51 +00:00
|
|
|
}
|
|
|
|
}
|
2015-11-09 13:34:57 +00:00
|
|
|
|
2019-02-25 16:11:43 +00:00
|
|
|
// Mutes returns true iff the given label set is muted. It implements the Muter
|
|
|
|
// interface.
|
2016-06-13 13:14:51 +00:00
|
|
|
func (ih *Inhibitor) Mutes(lset model.LabelSet) bool {
|
|
|
|
fp := lset.Fingerprint()
|
|
|
|
|
|
|
|
for _, r := range ih.rules {
|
2021-01-22 14:54:11 +00:00
|
|
|
if !r.TargetMatchers.Matches(lset) {
|
Modify the self-inhibition prevention semantics
This has been discussed in #666 (issue of hell...).
As concluded there, the cleanest semantics is most likely the
following: "An alert that matches both target and source side cannot
inhibit alerts for which the same is true." The two open questions
were:
1. How difficult is the implementation?
2. Is it needed?
This relatively simple commit proves that the answer to (1) is: Not
very difficult. (This also includes a performance-improving
simplification, which would have been possible without a change of
semantics.)
The answer to (2) is twofold:
For one, the original use case in #666 wasn't solved by our interim
solution. What we solved is the case where the self-inhibition is
triggered by a wide target match, i.e. I have a specific alert that
should inhibit a whole group of target alerts without inhibiting
itself. What we did _not_ solve is the inverted case: Self-inhibition
by a wide source match, i.e. an alert that should only fire if none of
a whole group of source alert fires. I mean, we "fixed" it as in, the
target alert will never be inhibited, but @lmb in #666 wanted the
alert to be inhibited _sometimes_ (just not _always_).
The other part is that I think that the asymmetry in our interim
solution will at some point haunt us. Thus, I really would like to get
this change in before we do a 1.0 release.
In practice, I expect this to be only relevant in very rare cases. But
those cases will be most difficult to reason with, and I claim that
the solution in this commit is matching what humans intuitively
expect.
Signed-off-by: beorn7 <beorn@soundcloud.com>
2019-02-22 18:57:27 +00:00
|
|
|
// If target side of rule doesn't match, we don't need to look any further.
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
// If we are here, the target side matches. If the source side matches, too, we
|
|
|
|
// need to exclude inhibiting alerts for which the same is true.
|
2021-01-22 14:54:11 +00:00
|
|
|
if inhibitedByFP, eq := r.hasEqual(lset, r.SourceMatchers.Matches(lset)); eq {
|
2018-04-18 14:26:04 +00:00
|
|
|
ih.marker.SetInhibited(fp, inhibitedByFP.String())
|
2016-06-13 13:14:51 +00:00
|
|
|
return true
|
|
|
|
}
|
|
|
|
}
|
2017-04-27 12:18:52 +00:00
|
|
|
ih.marker.SetInhibited(fp)
|
2016-06-13 13:14:51 +00:00
|
|
|
|
2017-04-27 12:18:52 +00:00
|
|
|
return false
|
2015-09-27 11:20:54 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// An InhibitRule specifies that a class of (source) alerts should inhibit
|
|
|
|
// notifications for another class of (target) alerts if all specified matching
|
|
|
|
// labels are equal between the two alerts. This may be used to inhibit alerts
|
|
|
|
// from sending notifications if their meaning is logically a subset of a
|
|
|
|
// higher-level alert.
|
|
|
|
type InhibitRule struct {
|
|
|
|
// The set of Filters which define the group of source alerts (which inhibit
|
|
|
|
// the target alerts).
|
2021-01-22 14:54:11 +00:00
|
|
|
SourceMatchers labels.Matchers
|
2015-09-27 11:20:54 +00:00
|
|
|
// The set of Filters which define the group of target alerts (which are
|
|
|
|
// inhibited by the source alerts).
|
2021-01-22 14:54:11 +00:00
|
|
|
TargetMatchers labels.Matchers
|
2015-09-27 11:20:54 +00:00
|
|
|
// A set of label names whose label values need to be identical in source and
|
|
|
|
// target alerts in order for the inhibition to take effect.
|
|
|
|
Equal map[model.LabelName]struct{}
|
2016-06-13 13:14:51 +00:00
|
|
|
|
|
|
|
// Cache of alerts matching source labels.
|
2018-09-03 12:52:53 +00:00
|
|
|
scache *store.Alerts
|
2015-09-27 11:20:54 +00:00
|
|
|
}
|
|
|
|
|
2018-09-03 12:52:53 +00:00
|
|
|
// NewInhibitRule returns a new InhibitRule based on a configuration definition.
|
2023-01-09 21:12:12 +00:00
|
|
|
func NewInhibitRule(cr config.InhibitRule) *InhibitRule {
|
2015-09-27 11:20:54 +00:00
|
|
|
var (
|
2021-01-22 14:54:11 +00:00
|
|
|
sourcem labels.Matchers
|
|
|
|
targetm labels.Matchers
|
2015-09-27 11:20:54 +00:00
|
|
|
)
|
2021-01-22 14:54:11 +00:00
|
|
|
// cr.SourceMatch will be deprecated. This for loop appends regex matchers.
|
2015-09-27 11:20:54 +00:00
|
|
|
for ln, lv := range cr.SourceMatch {
|
2021-01-22 14:54:11 +00:00
|
|
|
matcher, err := labels.NewMatcher(labels.MatchEqual, ln, lv)
|
|
|
|
if err != nil {
|
|
|
|
// This error must not happen because the config already validates the yaml.
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
sourcem = append(sourcem, matcher)
|
2015-09-27 11:20:54 +00:00
|
|
|
}
|
2021-01-22 14:54:11 +00:00
|
|
|
// cr.SourceMatchRE will be deprecated. This for loop appends regex matchers.
|
2015-09-27 11:20:54 +00:00
|
|
|
for ln, lv := range cr.SourceMatchRE {
|
2021-01-22 14:54:11 +00:00
|
|
|
matcher, err := labels.NewMatcher(labels.MatchRegexp, ln, lv.String())
|
|
|
|
if err != nil {
|
|
|
|
// This error must not happen because the config already validates the yaml.
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
sourcem = append(sourcem, matcher)
|
2015-09-27 11:20:54 +00:00
|
|
|
}
|
2021-01-22 14:54:11 +00:00
|
|
|
// We append the new-style matchers. This can be simplified once the deprecated matcher syntax is removed.
|
|
|
|
sourcem = append(sourcem, cr.SourceMatchers...)
|
2015-09-27 11:20:54 +00:00
|
|
|
|
2021-01-22 14:54:11 +00:00
|
|
|
// cr.TargetMatch will be deprecated. This for loop appends regex matchers.
|
2015-09-27 11:20:54 +00:00
|
|
|
for ln, lv := range cr.TargetMatch {
|
2021-01-22 14:54:11 +00:00
|
|
|
matcher, err := labels.NewMatcher(labels.MatchEqual, ln, lv)
|
|
|
|
if err != nil {
|
|
|
|
// This error must not happen because the config already validates the yaml.
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
targetm = append(targetm, matcher)
|
2015-09-27 11:20:54 +00:00
|
|
|
}
|
2021-01-22 14:54:11 +00:00
|
|
|
// cr.TargetMatchRE will be deprecated. This for loop appends regex matchers.
|
2015-09-27 11:20:54 +00:00
|
|
|
for ln, lv := range cr.TargetMatchRE {
|
2021-01-22 14:54:11 +00:00
|
|
|
matcher, err := labels.NewMatcher(labels.MatchRegexp, ln, lv.String())
|
|
|
|
if err != nil {
|
|
|
|
// This error must not happen because the config already validates the yaml.
|
|
|
|
panic(err)
|
|
|
|
}
|
|
|
|
targetm = append(targetm, matcher)
|
2015-09-27 11:20:54 +00:00
|
|
|
}
|
2021-01-22 14:54:11 +00:00
|
|
|
// We append the new-style matchers. This can be simplified once the deprecated matcher syntax is removed.
|
|
|
|
targetm = append(targetm, cr.TargetMatchers...)
|
2015-09-27 11:20:54 +00:00
|
|
|
|
2015-10-05 11:23:40 +00:00
|
|
|
equal := map[model.LabelName]struct{}{}
|
2015-09-27 11:20:54 +00:00
|
|
|
for _, ln := range cr.Equal {
|
|
|
|
equal[ln] = struct{}{}
|
|
|
|
}
|
|
|
|
|
|
|
|
return &InhibitRule{
|
|
|
|
SourceMatchers: sourcem,
|
|
|
|
TargetMatchers: targetm,
|
|
|
|
Equal: equal,
|
2019-09-18 07:29:34 +00:00
|
|
|
scache: store.NewAlerts(),
|
2015-09-27 11:20:54 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
Modify the self-inhibition prevention semantics
This has been discussed in #666 (issue of hell...).
As concluded there, the cleanest semantics is most likely the
following: "An alert that matches both target and source side cannot
inhibit alerts for which the same is true." The two open questions
were:
1. How difficult is the implementation?
2. Is it needed?
This relatively simple commit proves that the answer to (1) is: Not
very difficult. (This also includes a performance-improving
simplification, which would have been possible without a change of
semantics.)
The answer to (2) is twofold:
For one, the original use case in #666 wasn't solved by our interim
solution. What we solved is the case where the self-inhibition is
triggered by a wide target match, i.e. I have a specific alert that
should inhibit a whole group of target alerts without inhibiting
itself. What we did _not_ solve is the inverted case: Self-inhibition
by a wide source match, i.e. an alert that should only fire if none of
a whole group of source alert fires. I mean, we "fixed" it as in, the
target alert will never be inhibited, but @lmb in #666 wanted the
alert to be inhibited _sometimes_ (just not _always_).
The other part is that I think that the asymmetry in our interim
solution will at some point haunt us. Thus, I really would like to get
this change in before we do a 1.0 release.
In practice, I expect this to be only relevant in very rare cases. But
those cases will be most difficult to reason with, and I claim that
the solution in this commit is matching what humans intuitively
expect.
Signed-off-by: beorn7 <beorn@soundcloud.com>
2019-02-22 18:57:27 +00:00
|
|
|
// hasEqual checks whether the source cache contains alerts matching the equal
|
2019-02-25 16:11:43 +00:00
|
|
|
// labels for the given label set. If so, the fingerprint of one of those alerts
|
|
|
|
// is returned. If excludeTwoSidedMatch is true, alerts that match both the
|
|
|
|
// source and the target side of the rule are disregarded.
|
Modify the self-inhibition prevention semantics
This has been discussed in #666 (issue of hell...).
As concluded there, the cleanest semantics is most likely the
following: "An alert that matches both target and source side cannot
inhibit alerts for which the same is true." The two open questions
were:
1. How difficult is the implementation?
2. Is it needed?
This relatively simple commit proves that the answer to (1) is: Not
very difficult. (This also includes a performance-improving
simplification, which would have been possible without a change of
semantics.)
The answer to (2) is twofold:
For one, the original use case in #666 wasn't solved by our interim
solution. What we solved is the case where the self-inhibition is
triggered by a wide target match, i.e. I have a specific alert that
should inhibit a whole group of target alerts without inhibiting
itself. What we did _not_ solve is the inverted case: Self-inhibition
by a wide source match, i.e. an alert that should only fire if none of
a whole group of source alert fires. I mean, we "fixed" it as in, the
target alert will never be inhibited, but @lmb in #666 wanted the
alert to be inhibited _sometimes_ (just not _always_).
The other part is that I think that the asymmetry in our interim
solution will at some point haunt us. Thus, I really would like to get
this change in before we do a 1.0 release.
In practice, I expect this to be only relevant in very rare cases. But
those cases will be most difficult to reason with, and I claim that
the solution in this commit is matching what humans intuitively
expect.
Signed-off-by: beorn7 <beorn@soundcloud.com>
2019-02-22 18:57:27 +00:00
|
|
|
func (r *InhibitRule) hasEqual(lset model.LabelSet, excludeTwoSidedMatch bool) (model.Fingerprint, bool) {
|
2016-06-13 13:14:51 +00:00
|
|
|
Outer:
|
2019-04-19 12:01:41 +00:00
|
|
|
for _, a := range r.scache.List() {
|
2016-06-13 13:14:51 +00:00
|
|
|
// The cache might be stale and contain resolved alerts.
|
|
|
|
if a.Resolved() {
|
|
|
|
continue
|
2015-09-28 20:08:47 +00:00
|
|
|
}
|
2016-06-13 13:14:51 +00:00
|
|
|
for n := range r.Equal {
|
|
|
|
if a.Labels[n] != lset[n] {
|
|
|
|
continue Outer
|
|
|
|
}
|
|
|
|
}
|
2021-01-22 14:54:11 +00:00
|
|
|
if excludeTwoSidedMatch && r.TargetMatchers.Matches(a.Labels) {
|
Modify the self-inhibition prevention semantics
This has been discussed in #666 (issue of hell...).
As concluded there, the cleanest semantics is most likely the
following: "An alert that matches both target and source side cannot
inhibit alerts for which the same is true." The two open questions
were:
1. How difficult is the implementation?
2. Is it needed?
This relatively simple commit proves that the answer to (1) is: Not
very difficult. (This also includes a performance-improving
simplification, which would have been possible without a change of
semantics.)
The answer to (2) is twofold:
For one, the original use case in #666 wasn't solved by our interim
solution. What we solved is the case where the self-inhibition is
triggered by a wide target match, i.e. I have a specific alert that
should inhibit a whole group of target alerts without inhibiting
itself. What we did _not_ solve is the inverted case: Self-inhibition
by a wide source match, i.e. an alert that should only fire if none of
a whole group of source alert fires. I mean, we "fixed" it as in, the
target alert will never be inhibited, but @lmb in #666 wanted the
alert to be inhibited _sometimes_ (just not _always_).
The other part is that I think that the asymmetry in our interim
solution will at some point haunt us. Thus, I really would like to get
this change in before we do a 1.0 release.
In practice, I expect this to be only relevant in very rare cases. But
those cases will be most difficult to reason with, and I claim that
the solution in this commit is matching what humans intuitively
expect.
Signed-off-by: beorn7 <beorn@soundcloud.com>
2019-02-22 18:57:27 +00:00
|
|
|
continue Outer
|
|
|
|
}
|
2018-09-03 12:52:53 +00:00
|
|
|
return a.Fingerprint(), true
|
2015-09-28 20:08:47 +00:00
|
|
|
}
|
2017-04-27 12:18:52 +00:00
|
|
|
return model.Fingerprint(0), false
|
2016-06-13 13:14:51 +00:00
|
|
|
}
|