mirror of
https://github.com/prometheus/alertmanager
synced 2025-02-18 19:46:54 +00:00
Add limits for silences (#3852)
* Add limits for silences This commit adds limits for silences including the maximum number of active and pending silences, and the maximum size per silence (in bytes). Signed-off-by: George Robinson <george.robinson@grafana.com> * Remove default limits Signed-off-by: George Robinson <george.robinson@grafana.com> * Allow expiration of silences that exceed max size --------- Signed-off-by: George Robinson <george.robinson@grafana.com>
This commit is contained in:
parent
06b389bfec
commit
b67bde8cf9
@ -146,6 +146,8 @@ func run() int {
|
||||
dataDir = kingpin.Flag("storage.path", "Base path for data storage.").Default("data/").String()
|
||||
retention = kingpin.Flag("data.retention", "How long to keep data for.").Default("120h").Duration()
|
||||
maintenanceInterval = kingpin.Flag("data.maintenance-interval", "Interval between garbage collection and snapshotting to disk of the silences and the notification logs.").Default("15m").Duration()
|
||||
maxSilences = kingpin.Flag("silences.max-silences", "Maximum number of active and pending silences, excluding expired silences. If negative or zero, no limit is set.").Default("0").Int()
|
||||
maxPerSilenceBytes = kingpin.Flag("silences.max-per-silence-bytes", "Maximum per silence size in bytes. If negative or zero, no limit is set.").Default("0").Int()
|
||||
alertGCInterval = kingpin.Flag("alerts.gc-interval", "Interval between alert GC.").Default("30m").Duration()
|
||||
|
||||
webConfig = webflag.AddFlags(kingpin.CommandLine, ":9093")
|
||||
@ -259,8 +261,12 @@ func run() int {
|
||||
silenceOpts := silence.Options{
|
||||
SnapshotFile: filepath.Join(*dataDir, "silences"),
|
||||
Retention: *retention,
|
||||
Logger: log.With(logger, "component", "silences"),
|
||||
Metrics: prometheus.DefaultRegisterer,
|
||||
Limits: silence.Limits{
|
||||
MaxSilences: *maxSilences,
|
||||
MaxPerSilenceBytes: *maxPerSilenceBytes,
|
||||
},
|
||||
Logger: log.With(logger, "component", "silences"),
|
||||
Metrics: prometheus.DefaultRegisterer,
|
||||
}
|
||||
|
||||
silences, err := silence.New(silenceOpts)
|
||||
|
@ -22,6 +22,17 @@ is not well-formed, the changes will not be applied and an error is logged.
|
||||
A configuration reload is triggered by sending a `SIGHUP` to the process or
|
||||
sending an HTTP POST request to the `/-/reload` endpoint.
|
||||
|
||||
## Limits
|
||||
|
||||
Alertmanager supports a number of configurable limits via command-line flags.
|
||||
|
||||
To limit the maximum number of active and pending silences, excluding expired ones,
|
||||
use the `--silences.max-silences` flag.
|
||||
You can limit the maximum size of individual silences with `--silences.max-per-silence-bytes`,
|
||||
where the size is measured in bytes.
|
||||
|
||||
Both limits are disabled by default.
|
||||
|
||||
## Configuration file introduction
|
||||
|
||||
To specify which configuration file to load, use the `--config.file` flag.
|
||||
|
@ -193,6 +193,7 @@ type Silences struct {
|
||||
logger log.Logger
|
||||
metrics *metrics
|
||||
retention time.Duration
|
||||
limits Limits
|
||||
|
||||
mtx sync.RWMutex
|
||||
st state
|
||||
@ -201,6 +202,16 @@ type Silences struct {
|
||||
mc matcherCache
|
||||
}
|
||||
|
||||
// Limits contains the limits for silences.
// A limit that is zero or negative is not enforced.
|
||||
type Limits struct {
|
||||
// MaxSilences limits the maximum number of active and pending silences.
|
||||
// It does not include expired silences.
|
||||
MaxSilences int
|
||||
// MaxPerSilenceBytes is the maximum size of an individual silence as
|
||||
// stored on disk.
// The size check is skipped for silences whose end time is not in the
// future, so that oversized silences can still be expired.
|
||||
MaxPerSilenceBytes int
|
||||
}
|
||||
|
||||
// MaintenanceFunc represents the function to run as part of the periodic maintenance for silences.
|
||||
// It returns the size of the snapshot taken or an error if it failed.
|
||||
type MaintenanceFunc func() (int64, error)
|
||||
@ -318,6 +329,7 @@ type Options struct {
|
||||
// Retention time for newly created Silences. Silences may be
|
||||
// garbage collected after the given duration after they ended.
|
||||
Retention time.Duration
|
||||
Limits Limits
|
||||
|
||||
// A logger used by background processing.
|
||||
Logger log.Logger
|
||||
@ -342,6 +354,7 @@ func New(o Options) (*Silences, error) {
|
||||
mc: matcherCache{},
|
||||
logger: log.NewNopLogger(),
|
||||
retention: o.Retention,
|
||||
limits: o.Limits,
|
||||
broadcast: func([]byte) {},
|
||||
st: state{},
|
||||
}
|
||||
@ -569,6 +582,13 @@ func (s *Silences) setSilence(sil *pb.Silence, now time.Time, skipValidate bool)
|
||||
return err
|
||||
}
|
||||
|
||||
// Check the limit unless the silence has been expired. This is to avoid
|
||||
// situations where silences cannot be expired after the limit has been
|
||||
// reduced.
|
||||
if n := msil.Size(); s.limits.MaxPerSilenceBytes > 0 && n > s.limits.MaxPerSilenceBytes && sil.EndsAt.After(now) {
|
||||
return fmt.Errorf("silence exceeded maximum size: %d bytes (limit: %d bytes)", n, s.limits.MaxPerSilenceBytes)
|
||||
}
|
||||
|
||||
if s.st.merge(msil, now) {
|
||||
s.version++
|
||||
}
|
||||
@ -585,10 +605,10 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) {
|
||||
|
||||
now := s.nowUTC()
|
||||
prev, ok := s.getSilence(sil.Id)
|
||||
|
||||
if sil.Id != "" && !ok {
|
||||
return "", ErrNotFound
|
||||
}
|
||||
|
||||
if ok {
|
||||
if canUpdate(prev, sil, now) {
|
||||
return sil.Id, s.setSilence(sil, now, false)
|
||||
@ -600,7 +620,24 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// If we got here, it's either a new silence or one that replaces an existing silence.
|
||||
if s.limits.MaxSilences > 0 {
|
||||
// Get the number of active and pending silences to enforce limits.
|
||||
q := &query{}
|
||||
err := QState(types.SilenceStateActive, types.SilenceStatePending)(q)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
|
||||
}
|
||||
sils, _, err := s.query(q, s.nowUTC())
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("unable to query silences while checking limits: %w", err)
|
||||
}
|
||||
if len(sils)+1 > s.limits.MaxSilences {
|
||||
return "", fmt.Errorf("exceeded maximum number of silences: %d (limit: %d)", len(sils), s.limits.MaxSilences)
|
||||
}
|
||||
}
|
||||
|
||||
uid, err := uuid.NewV4()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("generate uuid: %w", err)
|
||||
@ -611,7 +648,11 @@ func (s *Silences) Set(sil *pb.Silence) (string, error) {
|
||||
sil.StartsAt = now
|
||||
}
|
||||
|
||||
return sil.Id, s.setSilence(sil, now, false)
|
||||
if err = s.setSilence(sil, now, false); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return sil.Id, nil
|
||||
}
|
||||
|
||||
// canUpdate returns true if silence a can be updated to b without
|
||||
@ -755,6 +796,9 @@ func (s *Silences) QueryOne(params ...QueryParam) (*pb.Silence, error) {
|
||||
// Query for silences based on the given query parameters. It returns the
|
||||
// resulting silences and the state version the result is based on.
|
||||
func (s *Silences) Query(params ...QueryParam) ([]*pb.Silence, int, error) {
|
||||
s.mtx.Lock()
|
||||
defer s.mtx.Unlock()
|
||||
|
||||
s.metrics.queriesTotal.Inc()
|
||||
defer prometheus.NewTimer(s.metrics.queryDuration).ObserveDuration()
|
||||
|
||||
@ -794,9 +838,6 @@ func (s *Silences) query(q *query, now time.Time) ([]*pb.Silence, int, error) {
|
||||
// the use of post-filter functions is the trivial solution for now.
|
||||
var res []*pb.Silence
|
||||
|
||||
s.mtx.Lock()
|
||||
defer s.mtx.Unlock()
|
||||
|
||||
if q.ids != nil {
|
||||
for _, id := range q.ids {
|
||||
if s, ok := s.st[id]; ok {
|
||||
|
@ -18,6 +18,7 @@ import (
|
||||
"os"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strings"
|
||||
"sync"
|
||||
"testing"
|
||||
"time"
|
||||
@ -458,6 +459,74 @@ func TestSilenceSet(t *testing.T) {
|
||||
require.Equal(t, want, s.st, "unexpected state after silence creation")
|
||||
}
|
||||
|
||||
// TestSilenceLimits checks that Set enforces both configured limits: the
// maximum number of active and pending silences (MaxSilences) and the
// maximum size of a single silence (MaxPerSilenceBytes).
func TestSilenceLimits(t *testing.T) {
|
||||
s, err := New(Options{
|
||||
Limits: Limits{
|
||||
MaxSilences: 1,
|
||||
MaxPerSilenceBytes: 2 << 11, // 4096 bytes (4 KiB)
|
||||
},
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
// Inserting sil1 should succeed without error: no silences exist yet
// and MaxSilences is 1.
|
||||
sil1 := &pb.Silence{
|
||||
Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}},
|
||||
StartsAt: time.Now(),
|
||||
EndsAt: time.Now().Add(5 * time.Minute),
|
||||
}
|
||||
id1, err := s.Set(sil1)
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, "", id1)
|
||||
|
||||
// Inserting sil2 should fail because the maximum number of silences
|
||||
// would be exceeded: sil1 already counts towards the limit of 1.
|
||||
sil2 := &pb.Silence{
|
||||
Matchers: []*pb.Matcher{{Name: "a", Pattern: "b"}},
|
||||
StartsAt: time.Now(),
|
||||
EndsAt: time.Now().Add(5 * time.Minute),
|
||||
}
|
||||
id2, err := s.Set(sil2)
|
||||
require.EqualError(t, err, "exceeded maximum number of silences: 1 (limit: 1)")
|
||||
require.Equal(t, "", id2)
|
||||
|
||||
// Expire sil1. This should allow sil2 to be inserted, since expired
// silences are not counted towards MaxSilences.
|
||||
require.NoError(t, s.Expire(id1))
|
||||
id2, err = s.Set(sil2)
|
||||
require.NoError(t, err)
|
||||
require.NotEqual(t, "", id2)
|
||||
|
||||
// Should be able to update sil2 without hitting the limit.
|
||||
_, err = s.Set(sil2)
|
||||
require.NoError(t, err)
|
||||
|
||||
// Expire sil2 so that the count limit cannot be the reason the next
// insert fails.
|
||||
require.NoError(t, s.Expire(id2))
|
||||
|
||||
// Inserting sil3 should fail because it exceeds the maximum size: it
// carries six strings of 2<<9 = 1024 bytes each, more than the
// 4096-byte limit.
|
||||
sil3 := &pb.Silence{
|
||||
Matchers: []*pb.Matcher{
|
||||
{
|
||||
Name: strings.Repeat("a", 2<<9),
|
||||
Pattern: strings.Repeat("b", 2<<9),
|
||||
},
|
||||
{
|
||||
Name: strings.Repeat("c", 2<<9),
|
||||
Pattern: strings.Repeat("d", 2<<9),
|
||||
},
|
||||
},
|
||||
CreatedBy: strings.Repeat("e", 2<<9),
|
||||
Comment: strings.Repeat("f", 2<<9),
|
||||
StartsAt: time.Now(),
|
||||
EndsAt: time.Now().Add(5 * time.Minute),
|
||||
}
|
||||
id3, err := s.Set(sil3)
|
||||
require.Error(t, err)
|
||||
// Do not check the exact size as it can change between consecutive runs
|
||||
// due to padding.
|
||||
require.Contains(t, err.Error(), "silence exceeded maximum size")
|
||||
require.Equal(t, "", id3)
|
||||
}
|
||||
|
||||
func TestSetActiveSilence(t *testing.T) {
|
||||
s, err := New(Options{
|
||||
Retention: time.Hour,
|
||||
|
Loading…
Reference in New Issue
Block a user