mirror of
https://github.com/prometheus/alertmanager
synced 2025-02-20 20:57:05 +00:00
Add limits for silences (#3852)
* Add limits for silences This commit adds limits for silences including the maximum number of active and pending silences, and the maximum size per silence (in bytes). Signed-off-by: George Robinson <george.robinson@grafana.com> * Remove default limits Signed-off-by: George Robinson <george.robinson@grafana.com> * Allow expiration of silences that exceed max size --------- Signed-off-by: George Robinson <george.robinson@grafana.com>
This commit is contained in:
parent
06b389bfec
commit
b67bde8cf9
@ -146,6 +146,8 @@ func run() int {
|
|||||||
dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
|
dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
|
||||||
retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
|
retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
|
||||||
maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
|
maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
|
||||||
|
maxSilences = kingpin.Flag("silences.max-silences", "Maximum number of active and pending silences, excluding expired silences. If negative or zero, no limit is set.").Default("0").Int()
|
||||||
|
maxPerSilenceBytes = kingpin.Flag("silences.max-per-silence-bytes", "Maximum per silence size in bytes. If negative or zero, no limit is set.").Default("0").Int()
|
||||||
alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()
|
alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()
|
||||||
|
|
||||||
webConfig = webflag.AddFlags(kingpin.CommandLine, ":9093")
|
webConfig = webflag.AddFlags(kingpin.CommandLine, ":9093")
|
||||||
@ -259,8 +261,12 @@ func run() int {
|
|||||||
silenceOpts := silence.Options{
|
silenceOpts := silence.Options{
|
||||||
SnapshotFile: filepath.Join(*dataDir, "silences"),
|
SnapshotFile: filepath.Join(*dataDir, "silences"),
|
||||||
Retention: *retention,
|
Retention: *retention,
|
||||||
Logger: log.With(logger, "component", "silences"),
|
Limits: silence.Limits{
|
||||||
Metrics: prometheus.DefaultRegisterer,
|
MaxSilences: *maxSilences,
|
||||||
|
MaxPerSilenceBytes: *maxPerSilenceBytes,
|
||||||
|
},
|
||||||
|
Logger: log.With(logger, "component", "silences"),
|
||||||
|
Metrics: prometheus.DefaultRegisterer,
|
||||||
}
|
}
|
||||||
|
|
||||||
silences, err := silence.New(silenceOpts)
|
silences, err := silence.New(silenceOpts)
|
||||||
|
@ -22,6 +22,17 @@ is not well-formed, the changes will not be applied and an error is logged.
|
|||||||
A configuration reload is triggered by sending a `SIGHUP` to the process or
|
A configuration reload is triggered by sending a `SIGHUP` to the process or
|
||||||
sending an HTTP POST request to the `/-/reload` endpoint.
|
sending an HTTP POST request to the `/-/reload` endpoint.
|
||||||
|
|
||||||
|
## Limits
|
||||||
|
|
||||||
|
Alertmanager supports a number of configurable limits via command-line flags.
|
||||||
|
|
||||||
|
To limit the maximum number of active and pending silences, excluding expired ones,
|
||||||
|
use the `--silences.max-silences` flag.
|
||||||
|
You can limit the maximum size of individual silences with `--silences.max-per-silence-bytes`,
|
||||||
|
where the unit is in bytes.
|
||||||
|
|
||||||
|
Both limits are disabled by default. Setting either flag to zero or a negative value means no limit is set.
|
||||||
|
|
||||||
## Configuration file introduction
|
## Configuration file introduction
|
||||||
|
|
||||||
To specify which configuration file to load, use the `--config.file` flag.
|
To specify which configuration file to load, use the `--config.file` flag.
|
||||||
|
@ -193,6 +193,7 @@ type Silences struct {
|
|||||||
logger log.Logger
|
logger log.Logger
|
||||||
metrics *metrics
|
metrics *metrics
|
||||||
retention time.Duration
|
retention time.Duration
|
||||||
|
limits Limits
|
||||||
|
|
||||||
mtx sync.RWMutex
|
mtx sync.RWMutex
|
||||||
st state
|
st state
|
||||||
@ -201,6 +202,16 @@ type Silences struct {
|
|||||||
mc matcherCache
|
mc matcherCache
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Limits contains the limits for silences.
|
||||||
|
type Limits struct {
|
||||||
|
// MaxSilences limits the maximum number of active and pending silences.
|
||||||
|
// It does not include expired silences.
|
||||||
|
MaxSilences int
|
||||||
|
// MaxPerSilenceBytes is the maximum size of an individual silence as
|
||||||
|
// stored on disk.
|
||||||
|
MaxPerSilenceBytes int
|
||||||
|
}
|
||||||
|
|
||||||
// MaintenanceFunc represents the function to run as part of the periodic maintenance for silences.
|
// MaintenanceFunc represents the function to run as part of the periodic maintenance for silences.
|
||||||
// It returns the size of the snapshot taken or an error if it failed.
|
// It returns the size of the snapshot taken or an error if it failed.
|
||||||
type MaintenanceFunc func() (int64, error)
|
type MaintenanceFunc func() (int64, error)
|
||||||
@ -318,6 +329,7 @@ type Options struct {
|
|||||||
// Retention time for newly created Silences. Silences may be
|
// Retention time for newly created Silences. Silences may be
|
||||||
// garbage collected after the given duration after they ended.
|
// garbage collected after the given duration after they ended.
|
||||||
Retention time.Duration
|
Retention time.Duration
|
||||||
|
Limits Limits
|
||||||
|
|
||||||
// A logger used by background processing.
|
// A logger used by background processing.
|
||||||
Logger log.Logger
|
Logger log.Logger
|
||||||
@ -342,6 +354,7 @@ func New(o Options) (*Silences, error) {
|
|||||||
mc: matcherCache{},
|
mc: matcherCache{},
|
||||||
logger: log.NewNopLogger(),
|
logger: log.NewNopLogger(),
|
||||||
retention: o.Retention,
|
retention: o.Retention,
|
||||||
|
limits: o.Limits,
|
||||||
broadcast: func([]byte) {},
|
broadcast: func([]byte) {},
|
||||||
st: state{},
|
st: state{},
|
||||||
}
|
}
|
||||||
@ -569,6 +582,13 @@ func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool)
|
|||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Check the limit unless the silence has been expired. This is to avoid
|
||||||
|
// situations where silences cannot be expired after the limit has been
|
||||||
|
// reduced.
|
||||||
|
if n := msil.Size(); s.limits.MaxPerSilenceBytes > 0 && n > s.limits.MaxPerSilenceBytes && sil.EndsAt.After(now) {
|
||||||
|
return fmt.Errorf("silence exceeded maximum size: %d bytes (limit: %d bytes)", n, s.limits.MaxPerSilenceBytes)
|
||||||
|
}
|
||||||
|
|
||||||
if s.st.merge(msil, now) {
|
if s.st.merge(msil, now) {
|
||||||
s.version++
|
s.version++
|
||||||
}
|
}
|
||||||
@ -585,10 +605,10 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) {
|
|||||||
|
|
||||||
now := s.nowUTC()
|
now := s.nowUTC()
|
||||||
prev, ok := s.getSilence(sil.Id)
|
prev, ok := s.getSilence(sil.Id)
|
||||||
|
|
||||||
if sil.Id != "" && !ok {
|
if sil.Id != "" && !ok {
|
||||||
return "", ErrNotFound
|
return "", ErrNotFound
|
||||||
}
|
}
|
||||||
|
|
||||||
if ok {
|
if ok {
|
||||||
if canUpdate(prev, sil, now) {
|
if canUpdate(prev, sil, now) {
|
||||||
return sil.Id, s.setSilence(sil, now, false)
|
return sil.Id, s.setSilence(sil, now, false)
|
||||||
@ -600,7 +620,24 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// If we got here it's either a new silence or a replacing one.
|
// If we got here it's either a new silence or a replacing one.
|
||||||
|
if s.limits.MaxSilences > 0 {
|
||||||
|
// Get the number of active and pending silences to enforce limits.
|
||||||
|
q := &query{}
|
||||||
|
err := QState(types.SilenceStateActive, types.SilenceStatePending)(q)
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
|
||||||
|
}
|
||||||
|
sils, _, err := s.query(q, s.nowUTC())
|
||||||
|
if err != nil {
|
||||||
|
return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
|
||||||
|
}
|
||||||
|
if len(sils)+1 > s.limits.MaxSilences {
|
||||||
|
return "", fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(sils), s.limits.MaxSilences)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
uid, err := uuid.NewV4()
|
uid, err := uuid.NewV4()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return "", fmt.Errorf("generate uuid: %w", err)
|
return "", fmt.Errorf("generate uuid: %w", err)
|
||||||
@ -611,7 +648,11 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) {
|
|||||||
sil.StartsAt = now
|
sil.StartsAt = now
|
||||||
}
|
}
|
||||||
|
|
||||||
return sil.Id, s.setSilence(sil, now, false)
|
if err = s.setSilence(sil, now, false); err != nil {
|
||||||
|
return "", err
|
||||||
|
}
|
||||||
|
|
||||||
|
return sil.Id, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// canUpdate returns true if silence a can be updated to b without
|
// canUpdate returns true if silence a can be updated to b without
|
||||||
@ -755,6 +796,9 @@ func (s *Silences) QueryOne(params ...QueryParam) (*pb.Silence, error) {
|
|||||||
// Query for silences based on the given query parameters. It returns the
|
// Query for silences based on the given query parameters. It returns the
|
||||||
// resulting silences and the state version the result is based on.
|
// resulting silences and the state version the result is based on.
|
||||||
func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, int, error) {
|
func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, int, error) {
|
||||||
|
s.mtx.Lock()
|
||||||
|
defer s.mtx.Unlock()
|
||||||
|
|
||||||
s.metrics.queriesTotal.Inc()
|
s.metrics.queriesTotal.Inc()
|
||||||
defer prometheus.NewTimer(s.metrics.queryDuration).ObserveDuration()
|
defer prometheus.NewTimer(s.metrics.queryDuration).ObserveDuration()
|
||||||
|
|
||||||
@ -794,9 +838,6 @@ func (s *Silences) query(q *query, now time.Time) ([]*pb.Silence, int, error) {
|
|||||||
// the use of post-filter functions is the trivial solution for now.
|
// the use of post-filter functions is the trivial solution for now.
|
||||||
var res []*pb.Silence
|
var res []*pb.Silence
|
||||||
|
|
||||||
s.mtx.Lock()
|
|
||||||
defer s.mtx.Unlock()
|
|
||||||
|
|
||||||
if q.ids != nil {
|
if q.ids != nil {
|
||||||
for _, id := range q.ids {
|
for _, id := range q.ids {
|
||||||
if s, ok := s.st[id]; ok {
|
if s, ok := s.st[id]; ok {
|
||||||
|
@ -18,6 +18,7 @@ import (
|
|||||||
"os"
|
"os"
|
||||||
"runtime"
|
"runtime"
|
||||||
"sort"
|
"sort"
|
||||||
|
"strings"
|
||||||
"sync"
|
"sync"
|
||||||
"testing"
|
"testing"
|
||||||
"time"
|
"time"
|
||||||
@ -458,6 +459,74 @@ func TestSilenceSet(t *testing.T) {
|
|||||||
require.Equal(t, want, s.st, "unexpected state after silence creation")
|
require.Equal(t, want, s.st, "unexpected state after silence creation")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func TestSilenceLimits(t *testing.T) {
|
||||||
|
s, err := New(Options{
|
||||||
|
Limits: Limits{
|
||||||
|
MaxSilences: 1,
|
||||||
|
MaxPerSilenceBytes: 2 << 11, // 4KB
|
||||||
|
},
|
||||||
|
})
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
// Inserting sil1 should succeed without error.
|
||||||
|
sil1 := &pb.Silence{
|
||||||
|
Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}},
|
||||||
|
StartsAt: time.Now(),
|
||||||
|
EndsAt: time.Now().Add(5 * time.Minute),
|
||||||
|
}
|
||||||
|
id1, err := s.Set(sil1)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotEqual(t, "", id1)
|
||||||
|
|
||||||
|
// Inserting sil2 should fail because the maximum number of silences
|
||||||
|
// has been exceeded.
|
||||||
|
sil2 := &pb.Silence{
|
||||||
|
Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}},
|
||||||
|
StartsAt: time.Now(),
|
||||||
|
EndsAt: time.Now().Add(5 * time.Minute),
|
||||||
|
}
|
||||||
|
id2, err := s.Set(sil2)
|
||||||
|
require.EqualError(t, err, "exceeded maximum number of silences: 1 (limit: 1)")
|
||||||
|
require.Equal(t, "", id2)
|
||||||
|
|
||||||
|
// Expire sil1. This should allow sil2 to be inserted.
|
||||||
|
require.NoError(t, s.Expire(id1))
|
||||||
|
id2, err = s.Set(sil2)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.NotEqual(t, "", id2)
|
||||||
|
|
||||||
|
// Should be able to update sil2 without hitting the limit.
|
||||||
|
_, err = s.Set(sil2)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
// Expire sil2.
|
||||||
|
require.NoError(t, s.Expire(id2))
|
||||||
|
|
||||||
|
// Inserting sil3 should fail because it exceeds the maximum size.
|
||||||
|
sil3 := &pb.Silence{
|
||||||
|
Matchers: []*pb.Matcher{
|
||||||
|
{
|
||||||
|
Name: strings.Repeat("a", 2<<9),
|
||||||
|
Pattern: strings.Repeat("b", 2<<9),
|
||||||
|
},
|
||||||
|
{
|
||||||
|
Name: strings.Repeat("c", 2<<9),
|
||||||
|
Pattern: strings.Repeat("d", 2<<9),
|
||||||
|
},
|
||||||
|
},
|
||||||
|
CreatedBy: strings.Repeat("e", 2<<9),
|
||||||
|
Comment: strings.Repeat("f", 2<<9),
|
||||||
|
StartsAt: time.Now(),
|
||||||
|
EndsAt: time.Now().Add(5 * time.Minute),
|
||||||
|
}
|
||||||
|
id3, err := s.Set(sil3)
|
||||||
|
require.Error(t, err)
|
||||||
|
// Do not check the exact size as it can change between consecutive runs
|
||||||
|
// due to padding.
|
||||||
|
require.Contains(t, err.Error(), "silence exceeded maximum size")
|
||||||
|
require.Equal(t, "", id3)
|
||||||
|
}
|
||||||
|
|
||||||
func TestSetActiveSilence(t *testing.T) {
|
func TestSetActiveSilence(t *testing.T) {
|
||||||
s, err := New(Options{
|
s, err := New(Options{
|
||||||
Retention: time.Hour,
|
Retention: time.Hour,
|
||||||
|
Loading…
Reference in New Issue
Block a user