From 51eebbef85a46f4dfeae0ca9aa4a04be171720fe Mon Sep 17 00:00:00 2001 From: stuart nelson Date: Mon, 18 Feb 2019 17:06:51 +0100 Subject: [PATCH] Stn/correctly mark api silences (#1733) * Update alert status on every GET to alerts Signed-off-by: stuart nelson --- api/api.go | 10 +- api/v2/api.go | 22 +++- cmd/alertmanager/main.go | 37 +++++- test/with_api_v2/acceptance.go | 9 ++ test/with_api_v2/acceptance/api_test.go | 167 ++++++++++++++++++++++++ 5 files changed, 232 insertions(+), 13 deletions(-) create mode 100644 test/with_api_v2/acceptance/api_test.go diff --git a/api/api.go b/api/api.go index bdebb177..46c5caa3 100644 --- a/api/api.go +++ b/api/api.go @@ -84,7 +84,8 @@ func (o Options) validate() error { return nil } -// New creates a new API object combining all API versions. +// New creates a new API object combining all API versions. Note that an Update +// call is also needed to get the APIs into an operational state. func New(opts Options) (*API, error) { if err := opts.validate(); err != nil { return nil, fmt.Errorf("invalid API options: %s", err) @@ -183,13 +184,14 @@ func (api *API) Register(r *route.Router, routePrefix string) *http.ServeMux { return mux } -// Update config and resolve timeout of each API. -func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error { +// Update config and resolve timeout of each API. APIv2 also needs +// setAlertStatus to be updated. +func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus func(model.LabelSet) error) error { if err := api.v1.Update(cfg, resolveTimeout); err != nil { return err } - return api.v2.Update(cfg, resolveTimeout) + return api.v2.Update(cfg, resolveTimeout, setAlertStatus) } func (api *API) limitHandler(h http.Handler) http.Handler { diff --git a/api/v2/api.go b/api/v2/api.go index ae633342..86639208 100644 --- a/api/v2/api.go +++ b/api/v2/api.go @@ -56,13 +56,14 @@ type API struct { getAlertStatus getAlertStatusFn uptime time.Time - // mtx protects resolveTimeout, alertmanagerConfig and route. + // mtx protects resolveTimeout, alertmanagerConfig, setAlertStatus and route. mtx sync.RWMutex // resolveTimeout represents the default resolve timeout that an alert is // assigned if no end time is specified. resolveTimeout time.Duration alertmanagerConfig *config.Config route *dispatch.Route + setAlertStatus setAlertStatusFn logger log.Logger @@ -70,9 +71,16 @@ type API struct { } type getAlertStatusFn func(prometheus_model.Fingerprint) types.AlertStatus +type setAlertStatusFn func(prometheus_model.LabelSet) error // NewAPI returns a new Alertmanager API v2 -func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silences, peer *cluster.Peer, l log.Logger) (*API, error) { +func NewAPI( + alerts provider.Alerts, + sf getAlertStatusFn, + silences *silence.Silences, + peer *cluster.Peer, + l log.Logger, +) (*API, error) { api := API{ alerts: alerts, getAlertStatus: sf, @@ -114,14 +122,15 @@ func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silen return &api, nil } -// Update sets the configuration string to a new value. -func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error { +// Update sets the API struct members that may change between reloads of alertmanager. +func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus setAlertStatusFn) error { api.mtx.Lock() defer api.mtx.Unlock() api.resolveTimeout = resolveTimeout api.alertmanagerConfig = cfg api.route = dispatch.NewRoute(cfg.Route, nil) + api.setAlertStatus = setAlertStatus return nil } @@ -252,6 +261,11 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re continue } + // Set alert's current status based on its label set. + if err := api.setAlertStatus(a.Labels); err != nil { + level.Error(api.logger).Log("msg", "set alert status failed", "err", err) + } + // Get alert's current status after seeing if it is suppressed. status := api.getAlertStatus(a.Fingerprint()) if !*params.Active && status.State == types.AlertStateActive { diff --git a/cmd/alertmanager/main.go b/cmd/alertmanager/main.go index b799edd7..b59962ad 100644 --- a/cmd/alertmanager/main.go +++ b/cmd/alertmanager/main.go @@ -34,6 +34,7 @@ import ( "github.com/go-kit/kit/log/level" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" + "github.com/prometheus/common/model" "github.com/prometheus/common/promlog" promlogflag "github.com/prometheus/common/promlog/flag" "github.com/prometheus/common/route" @@ -324,11 +325,6 @@ func run() int { hash = md5HashAsMetricValue(plainCfg) - err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout)) - if err != nil { - return err - } - tmpl, err = template.FromGlobs(conf.Templates...) if err != nil { return err @@ -350,6 +346,12 @@ func run() int { peer, logger, ) + + err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout), setAlertStatus(inhibitor, marker, silences)) + if err != nil { + return err + } + disp = dispatch.NewDispatcher(alerts, dispatch.NewRoute(conf.Route, nil), pipeline, marker, timeoutFunc, logger) go disp.Run() @@ -476,3 +478,28 @@ func md5HashAsMetricValue(data []byte) float64 { copy(bytes, smallSum) return float64(binary.LittleEndian.Uint64(bytes)) } + +func setAlertStatus(inhibitor *inhibit.Inhibitor, marker types.Marker, silences *silence.Silences) func(model.LabelSet) error { + return func(labels model.LabelSet) error { + inhibitor.Mutes(labels) + // TODO(beorn7): The following code is almost exactly replicated in notify/notify.go. + sils, err := silences.Query( + silence.QState(types.SilenceStateActive), + silence.QMatches(labels), + ) + if err != nil { + return fmt.Errorf("failed to query silences: %v", err) + } + + if len(sils) > 0 { + ids := make([]string, len(sils)) + for i, s := range sils { + ids[i] = s.Id + } + marker.SetSilenced(labels.Fingerprint(), ids...) + } else { + marker.SetSilenced(labels.Fingerprint()) + } + return nil + } +} diff --git a/test/with_api_v2/acceptance.go b/test/with_api_v2/acceptance.go index 2725d52f..5895c113 100644 --- a/test/with_api_v2/acceptance.go +++ b/test/with_api_v2/acceptance.go @@ -289,6 +289,11 @@ func (amc *AlertmanagerCluster) Start() error { return nil } +// Members returns the underlying slice of cluster members. +func (amc *AlertmanagerCluster) Members() []*Alertmanager { + return amc.ams +} + // Start the alertmanager and wait until it is ready to receive. func (am *Alertmanager) Start(additionalArg []string) error { am.t.Helper() @@ -555,6 +560,10 @@ func (am *Alertmanager) GenericAPIV2Call(at float64, f func()) { am.t.Do(at, f) } +func (am *Alertmanager) Client() *apiclient.Alertmanager { + return am.clientV2 +} + func (am *Alertmanager) getURL(path string) string { return fmt.Sprintf("http://%s%s%s", am.apiAddr, am.opts.RoutePrefix, path) } diff --git a/test/with_api_v2/acceptance/api_test.go b/test/with_api_v2/acceptance/api_test.go new file mode 100644 index 00000000..a7ec4b93 --- /dev/null +++ b/test/with_api_v2/acceptance/api_test.go @@ -0,0 +1,167 @@ +// Copyright 2018 Prometheus Team +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +import ( + "fmt" + "testing" + "time" + + "github.com/go-openapi/strfmt" + a "github.com/prometheus/alertmanager/test/with_api_v2" + "github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/alert" + "github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/silence" + "github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/models" + "github.com/prometheus/common/model" + "github.com/stretchr/testify/require" +) + +// TestAlertGetReturnsCurrentStatus checks that querying the API returns the +// current status of each alert, i.e. if it is silenced or inhibited. +func TestAlertGetReturnsCurrentAlertStatus(t *testing.T) { + t.Parallel() + + conf := ` +route: + receiver: "default" + group_by: [] + group_wait: 1s + group_interval: 10m + repeat_interval: 1h + +inhibit_rules: + - source_match: + severity: 'critical' + target_match: + severity: 'warning' + equal: ['alertname'] + +receivers: +- name: "default" + webhook_configs: + - url: 'http://%s' +` + + at := a.NewAcceptanceTest(t, &a.AcceptanceOpts{ + Tolerance: 1 * time.Second, + }) + co := at.Collector("webhook") + wh := a.NewWebhook(co) + + amc := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1) + require.NoError(t, amc.Start()) + defer amc.Terminate() + + am := amc.Members()[0] + + labelName := "alertname" + labelValue := "test1" + + now := time.Now() + startsAt := strfmt.DateTime(now) + endsAt := strfmt.DateTime(now.Add(5 * time.Minute)) + + labels := models.LabelSet(map[string]string{labelName: labelValue, "severity": "warning"}) + fp := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "warning"}.Fingerprint() + pa := &models.PostableAlert{ + StartsAt: startsAt, + EndsAt: endsAt, + Alert: models.Alert{Labels: labels}, + } + alertParams := alert.NewPostAlertsParams() + alertParams.Alerts = models.PostableAlerts{pa} + _, err := am.Client().Alert.PostAlerts(alertParams) + require.NoError(t, err) + + resp, err := am.Client().Alert.GetAlerts(nil) + require.NoError(t, err) + // No silence has been created or inhibiting alert sent, alert should + // be active. + for _, alert := range resp.Payload { + require.Equal(t, models.AlertStatusStateActive, *alert.Status.State) + } + + // Wait for group_wait, so that we are in the group_interval period, + // when the pipeline won't update the alert's status. + time.Sleep(2 * time.Second) + + // Create silence and verify that the alert is immediately marked + // silenced via the API. + silenceParams := silence.NewPostSilencesParams() + + cm := "a" + isRegex := false + ps := &models.PostableSilence{ + Silence: models.Silence{ + StartsAt: &startsAt, + EndsAt: &endsAt, + Comment: &cm, + CreatedBy: &cm, + Matchers: models.Matchers{ + &models.Matcher{Name: &labelName, Value: &labelValue, IsRegex: &isRegex}, + }, + }, + } + silenceParams.Silence = ps + silenceResp, err := am.Client().Silence.PostSilences(silenceParams) + require.NoError(t, err) + silenceID := silenceResp.Payload.SilenceID + + resp, err = am.Client().Alert.GetAlerts(nil) + require.NoError(t, err) + for _, alert := range resp.Payload { + require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State) + require.Equal(t, fp.String(), *alert.Fingerprint) + require.Equal(t, 1, len(alert.Status.SilencedBy)) + require.Equal(t, silenceID, alert.Status.SilencedBy[0]) + } + + // Create inhibiting alert and verify that original alert is + // immediately marked as inhibited. + labels["severity"] = "critical" + _, err = am.Client().Alert.PostAlerts(alertParams) + require.NoError(t, err) + + inhibitingFP := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "critical"}.Fingerprint() + + resp, err = am.Client().Alert.GetAlerts(nil) + require.NoError(t, err) + for _, alert := range resp.Payload { + require.Equal(t, 1, len(alert.Status.SilencedBy)) + require.Equal(t, silenceID, alert.Status.SilencedBy[0]) + if fp.String() == *alert.Fingerprint { + require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State) + require.Equal(t, fp.String(), *alert.Fingerprint) + require.Equal(t, 1, len(alert.Status.InhibitedBy)) + require.Equal(t, inhibitingFP.String(), alert.Status.InhibitedBy[0]) + } + } + + deleteParams := silence.NewDeleteSilenceParams().WithSilenceID(strfmt.UUID(silenceID)) + _, err = am.Client().Silence.DeleteSilence(deleteParams) + require.NoError(t, err) + + resp, err = am.Client().Alert.GetAlerts(nil) + require.NoError(t, err) + // Silence has been deleted, inhibiting alert should be active. + // Original alert should still be inhibited. + for _, alert := range resp.Payload { + require.Equal(t, 0, len(alert.Status.SilencedBy)) + if inhibitingFP.String() == *alert.Fingerprint { + require.Equal(t, models.AlertStatusStateActive, *alert.Status.State) + } else { + require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State) + } + } +}