From 3dc623d30bad51df6af6958c3f2c54d7497a987c Mon Sep 17 00:00:00 2001 From: machine424 Date: Thu, 26 Sep 2024 14:53:48 +0200 Subject: [PATCH 1/2] test(notifier): add reproducer Signed-off-by: machine424 Co-authored-by: tommy0 --- notifier/notifier_test.go | 46 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/notifier/notifier_test.go b/notifier/notifier_test.go index 68dd445811..b900580498 100644 --- a/notifier/notifier_test.go +++ b/notifier/notifier_test.go @@ -38,6 +38,7 @@ import ( "github.com/prometheus/prometheus/discovery" "github.com/prometheus/prometheus/config" + _ "github.com/prometheus/prometheus/discovery/file" "github.com/prometheus/prometheus/discovery/targetgroup" "github.com/prometheus/prometheus/model/labels" "github.com/prometheus/prometheus/model/relabel" @@ -1017,3 +1018,48 @@ func TestStop_DrainingEnabled(t *testing.T) { require.Equal(t, int64(2), alertsReceived.Load()) } + +func TestAlertmanagersNotDroppedDuringApplyConfig(t *testing.T) { + targetURL := "alertmanager:9093" + targetGroup := &targetgroup.Group{ + Targets: []model.LabelSet{ + { + "__address__": model.LabelValue(targetURL), + }, + }, + } + alertmanagerURL := fmt.Sprintf("http://%s/api/v2/alerts", targetURL) + + n := NewManager(&Options{}, nil) + cfg := &config.Config{} + s := ` +alerting: + alertmanagers: + - file_sd_configs: + - files: + - foo.json +` + // TODO: add order change test + // TODO: add entry removed with DS manager + + err := yaml.UnmarshalStrict([]byte(s), cfg) + require.NoError(t, err) + require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1) + + yaml.Marshal(cfg.AlertingConfig.AlertmanagerConfigs) + + // First apply config and reload. + err = n.ApplyConfig(cfg) + require.NoError(t, err) + tgs := map[string][]*targetgroup.Group{"config-0": {targetGroup}} + n.reload(tgs) + require.Len(t, n.Alertmanagers(), 1) + require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String()) + + // Reapply the config. + err = n.ApplyConfig(cfg) + require.NoError(t, err) + // The already known alertmanagers shouldn't get dropped. + require.Len(t, n.Alertmanagers(), 1) + require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String()) +} From 83ee57343a0a7a44394a705063a8158a3f410ba2 Mon Sep 17 00:00:00 2001 From: machine424 Date: Thu, 26 Sep 2024 14:53:17 +0200 Subject: [PATCH 2/2] fix(notifier): stop dropping known alertmanagers on each ApplyConfig and waiting on SD to update them. Signed-off-by: machine424 --- notifier/notifier.go | 32 ++++++++++++++ notifier/notifier_test.go | 87 ++++++++++++++++++++++++++++++++------- 2 files changed, 105 insertions(+), 14 deletions(-) diff --git a/notifier/notifier.go b/notifier/notifier.go index 5374e73d62..1aa7977724 100644 --- a/notifier/notifier.go +++ b/notifier/notifier.go @@ -16,6 +16,8 @@ package notifier import ( "bytes" "context" + "crypto/md5" + "encoding/hex" "encoding/json" "fmt" "io" @@ -35,6 +37,7 @@ import ( "github.com/prometheus/common/sigv4" "github.com/prometheus/common/version" "go.uber.org/atomic" + "gopkg.in/yaml.v2" "github.com/prometheus/prometheus/config" "github.com/prometheus/prometheus/discovery/targetgroup" @@ -257,6 +260,16 @@ func (n *Manager) ApplyConfig(conf *config.Config) error { n.opts.RelabelConfigs = conf.AlertingConfig.AlertRelabelConfigs amSets := make(map[string]*alertmanagerSet) + // configToAlertmanagers maps alertmanager sets for each unique AlertmanagerConfig, + // helping to avoid dropping known alertmanagers and re-use them without waiting for SD updates when applying the config. + configToAlertmanagers := make(map[string]*alertmanagerSet, len(n.alertmanagers)) + for _, oldAmSet := range n.alertmanagers { + hash, err := oldAmSet.configHash() + if err != nil { + return err + } + configToAlertmanagers[hash] = oldAmSet + } for k, cfg := range conf.AlertingConfig.AlertmanagerConfigs.ToMap() { ams, err := newAlertmanagerSet(cfg, n.logger, n.metrics) @@ -264,6 +277,16 @@ func (n *Manager) ApplyConfig(conf *config.Config) error { return err } + hash, err := ams.configHash() + if err != nil { + return err + } + + if oldAmSet, ok := configToAlertmanagers[hash]; ok { + ams.ams = oldAmSet.ams + ams.droppedAms = oldAmSet.droppedAms + } + amSets[k] = ams } @@ -803,6 +826,15 @@ func (s *alertmanagerSet) sync(tgs []*targetgroup.Group) { } } +func (s *alertmanagerSet) configHash() (string, error) { + b, err := yaml.Marshal(s.cfg) + if err != nil { + return "", err + } + hash := md5.Sum(b) + return hex.EncodeToString(hash[:]), nil +} + func postPath(pre string, v config.AlertmanagerAPIVersion) string { alertPushEndpoint := fmt.Sprintf("/api/%v/alerts", string(v)) return path.Join("/", pre, alertPushEndpoint) diff --git a/notifier/notifier_test.go b/notifier/notifier_test.go index b900580498..148a129991 100644 --- a/notifier/notifier_test.go +++ b/notifier/notifier_test.go @@ -1019,7 +1019,7 @@ func TestStop_DrainingEnabled(t *testing.T) { require.Equal(t, int64(2), alertsReceived.Load()) } -func TestAlertmanagersNotDroppedDuringApplyConfig(t *testing.T) { +func TestApplyConfig(t *testing.T) { targetURL := "alertmanager:9093" targetGroup := &targetgroup.Group{ Targets: []model.LabelSet{ @@ -1039,27 +1039,86 @@ alerting: - files: - foo.json ` - // TODO: add order change test - // TODO: add entry removed with DS manager - - err := yaml.UnmarshalStrict([]byte(s), cfg) - require.NoError(t, err) + // 1. Ensure known alertmanagers are not dropped during ApplyConfig. + require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg)) require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 1) - yaml.Marshal(cfg.AlertingConfig.AlertmanagerConfigs) - - // First apply config and reload. - err = n.ApplyConfig(cfg) - require.NoError(t, err) + // First, apply the config and reload. + require.NoError(t, n.ApplyConfig(cfg)) tgs := map[string][]*targetgroup.Group{"config-0": {targetGroup}} n.reload(tgs) require.Len(t, n.Alertmanagers(), 1) require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String()) // Reapply the config. - err = n.ApplyConfig(cfg) - require.NoError(t, err) - // The already known alertmanagers shouldn't get dropped. + require.NoError(t, n.ApplyConfig(cfg)) + // Ensure the known alertmanagers are not dropped. require.Len(t, n.Alertmanagers(), 1) require.Equal(t, alertmanagerURL, n.Alertmanagers()[0].String()) + + // 2. Ensure known alertmanagers are not dropped during ApplyConfig even when + // the config order changes. + s = ` +alerting: + alertmanagers: + - static_configs: + - file_sd_configs: + - files: + - foo.json +` + require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg)) + require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2) + + require.NoError(t, n.ApplyConfig(cfg)) + require.Len(t, n.Alertmanagers(), 1) + // Ensure no unnecessary alertmanagers are injected. + require.Empty(t, n.alertmanagers["config-0"].ams) + // Ensure the config order is taken into account. + ams := n.alertmanagers["config-1"].ams + require.Len(t, ams, 1) + require.Equal(t, alertmanagerURL, ams[0].url().String()) + + // 3. Ensure known alertmanagers are reused for new config with identical AlertmanagerConfig. + s = ` +alerting: + alertmanagers: + - file_sd_configs: + - files: + - foo.json + - file_sd_configs: + - files: + - foo.json +` + require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg)) + require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2) + + require.NoError(t, n.ApplyConfig(cfg)) + require.Len(t, n.Alertmanagers(), 2) + for cfgIdx := range 2 { + ams := n.alertmanagers[fmt.Sprintf("config-%d", cfgIdx)].ams + require.Len(t, ams, 1) + require.Equal(t, alertmanagerURL, ams[0].url().String()) + } + + // 4. Ensure known alertmanagers are reused only for identical AlertmanagerConfig. + s = ` +alerting: + alertmanagers: + - file_sd_configs: + - files: + - foo.json + path_prefix: /bar + - file_sd_configs: + - files: + - foo.json + relabel_configs: + - source_labels: ['__address__'] + regex: 'doesntmatter:1234' + action: drop +` + require.NoError(t, yaml.UnmarshalStrict([]byte(s), cfg)) + require.Len(t, cfg.AlertingConfig.AlertmanagerConfigs, 2) + + require.NoError(t, n.ApplyConfig(cfg)) + require.Empty(t, n.Alertmanagers()) }