Stn/correctly mark api silences (#1733)

* Update alert status on every GET to alerts

Signed-off-by: stuart nelson <stuartnelson3@gmail.com>
This commit is contained in:
stuart nelson 2019-02-18 17:06:51 +01:00 committed by GitHub
parent da6e2a88dd
commit 51eebbef85
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 232 additions and 13 deletions

View File

@ -84,7 +84,8 @@ func (o Options) validate() error {
return nil
}
// New creates a new API object combining all API versions.
// New creates a new API object combining all API versions. Note that an Update
// call is also needed to get the APIs into an operational state.
func New(opts Options) (*API, error) {
if err := opts.validate(); err != nil {
return nil, fmt.Errorf("invalid API options: %s", err)
@ -183,13 +184,14 @@ func (api *API) Register(r *route.Router, routePrefix string) *http.ServeMux {
return mux
}
// Update config and resolve timeout of each API.
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error {
// Update config and resolve timeout of each API. APIv2 also needs
// setAlertStatus to be updated.
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus func(model.LabelSet) error) error {
if err := api.v1.Update(cfg, resolveTimeout); err != nil {
return err
}
return api.v2.Update(cfg, resolveTimeout)
return api.v2.Update(cfg, resolveTimeout, setAlertStatus)
}
func (api *API) limitHandler(h http.Handler) http.Handler {

View File

@ -56,13 +56,14 @@ type API struct {
getAlertStatus getAlertStatusFn
uptime time.Time
// mtx protects resolveTimeout, alertmanagerConfig and route.
// mtx protects resolveTimeout, alertmanagerConfig, setAlertStatus and route.
mtx sync.RWMutex
// resolveTimeout represents the default resolve timeout that an alert is
// assigned if no end time is specified.
resolveTimeout time.Duration
alertmanagerConfig *config.Config
route *dispatch.Route
setAlertStatus setAlertStatusFn
logger log.Logger
@ -70,9 +71,16 @@ type API struct {
}
type getAlertStatusFn func(prometheus_model.Fingerprint) types.AlertStatus
type setAlertStatusFn func(prometheus_model.LabelSet) error
// NewAPI returns a new Alertmanager API v2
func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silences, peer *cluster.Peer, l log.Logger) (*API, error) {
func NewAPI(
alerts provider.Alerts,
sf getAlertStatusFn,
silences *silence.Silences,
peer *cluster.Peer,
l log.Logger,
) (*API, error) {
api := API{
alerts: alerts,
getAlertStatus: sf,
@ -114,14 +122,15 @@ func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silen
return &api, nil
}
// Update sets the configuration string to a new value.
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error {
// Update sets the API struct members that may change between reloads of alertmanager.
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus setAlertStatusFn) error {
api.mtx.Lock()
defer api.mtx.Unlock()
api.resolveTimeout = resolveTimeout
api.alertmanagerConfig = cfg
api.route = dispatch.NewRoute(cfg.Route, nil)
api.setAlertStatus = setAlertStatus
return nil
}
@ -252,6 +261,11 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
continue
}
// Set alert's current status based on its label set.
if err := api.setAlertStatus(a.Labels); err != nil {
level.Error(api.logger).Log("msg", "set alert status failed", "err", err)
}
// Get alert's current status after seeing if it is suppressed.
status := api.getAlertStatus(a.Fingerprint())
if !*params.Active && status.State == types.AlertStateActive {

View File

@ -34,6 +34,7 @@ import (
"github.com/go-kit/kit/log/level"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"github.com/prometheus/common/model"
"github.com/prometheus/common/promlog"
promlogflag "github.com/prometheus/common/promlog/flag"
"github.com/prometheus/common/route"
@ -324,11 +325,6 @@ func run() int {
hash = md5HashAsMetricValue(plainCfg)
err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout))
if err != nil {
return err
}
tmpl, err = template.FromGlobs(conf.Templates...)
if err != nil {
return err
@ -350,6 +346,12 @@ func run() int {
peer,
logger,
)
err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout), setAlertStatus(inhibitor, marker, silences))
if err != nil {
return err
}
disp = dispatch.NewDispatcher(alerts, dispatch.NewRoute(conf.Route, nil), pipeline, marker, timeoutFunc, logger)
go disp.Run()
@ -476,3 +478,28 @@ func md5HashAsMetricValue(data []byte) float64 {
copy(bytes, smallSum)
return float64(binary.LittleEndian.Uint64(bytes))
}
// setAlertStatus returns a closure that refreshes the marker state for the
// alert identified by the given label set: it runs the inhibitor against the
// labels (return value intentionally ignored; NOTE(review): presumably Mutes
// records the inhibition verdict in the marker as a side effect — confirm)
// and records which active silences, if any, match the labels.
func setAlertStatus(inhibitor *inhibit.Inhibitor, marker types.Marker, silences *silence.Silences) func(model.LabelSet) error {
	return func(labels model.LabelSet) error {
		inhibitor.Mutes(labels)

		// TODO(beorn7): The following code is almost exactly replicated in notify/notify.go.
		activeSils, err := silences.Query(
			silence.QState(types.SilenceStateActive),
			silence.QMatches(labels),
		)
		if err != nil {
			return fmt.Errorf("failed to query silences: %v", err)
		}

		fp := labels.Fingerprint()
		if len(activeSils) == 0 {
			// No matching silence: clear any previously recorded IDs.
			marker.SetSilenced(fp)
			return nil
		}
		ids := make([]string, 0, len(activeSils))
		for _, sil := range activeSils {
			ids = append(ids, sil.Id)
		}
		marker.SetSilenced(fp, ids...)
		return nil
	}
}

View File

@ -289,6 +289,11 @@ func (amc *AlertmanagerCluster) Start() error {
return nil
}
// Members returns the underlying slice of cluster members.
// The slice is returned directly (not copied), so callers share the
// receiver's backing array.
func (amc *AlertmanagerCluster) Members() []*Alertmanager {
	return amc.ams
}
// Start the alertmanager and wait until it is ready to receive.
func (am *Alertmanager) Start(additionalArg []string) error {
am.t.Helper()
@ -555,6 +560,10 @@ func (am *Alertmanager) GenericAPIV2Call(at float64, f func()) {
am.t.Do(at, f)
}
// Client returns the API v2 client for this Alertmanager instance.
func (am *Alertmanager) Client() *apiclient.Alertmanager {
	return am.clientV2
}
func (am *Alertmanager) getURL(path string) string {
return fmt.Sprintf("http://%s%s%s", am.apiAddr, am.opts.RoutePrefix, path)
}

View File

@ -0,0 +1,167 @@
// Copyright 2018 Prometheus Team
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package test
import (
"fmt"
"testing"
"time"
"github.com/go-openapi/strfmt"
a "github.com/prometheus/alertmanager/test/with_api_v2"
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/alert"
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/silence"
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/models"
"github.com/prometheus/common/model"
"github.com/stretchr/testify/require"
)
// TestAlertGetReturnsCurrentAlertStatus checks that querying the API returns the
// current status of each alert, i.e. if it is silenced or inhibited.
func TestAlertGetReturnsCurrentAlertStatus(t *testing.T) {
	t.Parallel()

	// Single-node cluster; the config inhibits 'warning' alerts whenever a
	// 'critical' alert with the same alertname is firing.
	conf := `
route:
  receiver: "default"
  group_by: []
  group_wait: 1s
  group_interval: 10m
  repeat_interval: 1h

inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname']

receivers:
  - name: "default"
    webhook_configs:
      - url: 'http://%s'
`

	at := a.NewAcceptanceTest(t, &a.AcceptanceOpts{
		Tolerance: 1 * time.Second,
	})
	co := at.Collector("webhook")
	wh := a.NewWebhook(co)
	amc := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)
	require.NoError(t, amc.Start())
	defer amc.Terminate()
	am := amc.Members()[0]

	labelName := "alertname"
	labelValue := "test1"

	now := time.Now()
	startsAt := strfmt.DateTime(now)
	endsAt := strfmt.DateTime(now.Add(5 * time.Minute))

	// The alert under test; fp is its fingerprint, used later to tell it
	// apart from the inhibiting alert in GET responses.
	labels := models.LabelSet(map[string]string{labelName: labelValue, "severity": "warning"})
	fp := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "warning"}.Fingerprint()
	pa := &models.PostableAlert{
		StartsAt: startsAt,
		EndsAt:   endsAt,
		Alert:    models.Alert{Labels: labels},
	}
	alertParams := alert.NewPostAlertsParams()
	alertParams.Alerts = models.PostableAlerts{pa}
	_, err := am.Client().Alert.PostAlerts(alertParams)
	require.NoError(t, err)

	resp, err := am.Client().Alert.GetAlerts(nil)
	require.NoError(t, err)
	// No silence has been created or inhibiting alert sent, alert should
	// be active.
	for _, alert := range resp.Payload {
		require.Equal(t, models.AlertStatusStateActive, *alert.Status.State)
	}

	// Wait for group_wait, so that we are in the group_interval period,
	// when the pipeline won't update the alert's status.
	time.Sleep(2 * time.Second)

	// Create silence and verify that the alert is immediately marked
	// silenced via the API.
	silenceParams := silence.NewPostSilencesParams()
	cm := "a"
	isRegex := false
	ps := &models.PostableSilence{
		Silence: models.Silence{
			StartsAt:  &startsAt,
			EndsAt:    &endsAt,
			Comment:   &cm,
			CreatedBy: &cm,
			Matchers: models.Matchers{
				&models.Matcher{Name: &labelName, Value: &labelValue, IsRegex: &isRegex},
			},
		},
	}
	silenceParams.Silence = ps
	silenceResp, err := am.Client().Silence.PostSilences(silenceParams)
	require.NoError(t, err)
	silenceID := silenceResp.Payload.SilenceID

	resp, err = am.Client().Alert.GetAlerts(nil)
	require.NoError(t, err)
	for _, alert := range resp.Payload {
		require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
		require.Equal(t, fp.String(), *alert.Fingerprint)
		require.Equal(t, 1, len(alert.Status.SilencedBy))
		require.Equal(t, silenceID, alert.Status.SilencedBy[0])
	}

	// Create inhibiting alert and verify that original alert is
	// immediately marked as inhibited.
	// NOTE: mutating labels also mutates pa.Alert.Labels (same underlying
	// map), so re-posting alertParams sends the 'critical' variant.
	labels["severity"] = "critical"
	_, err = am.Client().Alert.PostAlerts(alertParams)
	require.NoError(t, err)
	inhibitingFP := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "critical"}.Fingerprint()

	resp, err = am.Client().Alert.GetAlerts(nil)
	require.NoError(t, err)
	// The alertname matcher of the silence matches both alerts, so both are
	// silenced; the 'warning' alert is additionally inhibited.
	for _, alert := range resp.Payload {
		require.Equal(t, 1, len(alert.Status.SilencedBy))
		require.Equal(t, silenceID, alert.Status.SilencedBy[0])
		if fp.String() == *alert.Fingerprint {
			require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
			require.Equal(t, fp.String(), *alert.Fingerprint)
			require.Equal(t, 1, len(alert.Status.InhibitedBy))
			require.Equal(t, inhibitingFP.String(), alert.Status.InhibitedBy[0])
		}
	}

	deleteParams := silence.NewDeleteSilenceParams().WithSilenceID(strfmt.UUID(silenceID))
	_, err = am.Client().Silence.DeleteSilence(deleteParams)
	require.NoError(t, err)

	resp, err = am.Client().Alert.GetAlerts(nil)
	require.NoError(t, err)
	// Silence has been deleted, inhibiting alert should be active.
	// Original alert should still be inhibited.
	for _, alert := range resp.Payload {
		require.Equal(t, 0, len(alert.Status.SilencedBy))
		if inhibitingFP.String() == *alert.Fingerprint {
			require.Equal(t, models.AlertStatusStateActive, *alert.Status.State)
		} else {
			require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
		}
	}
}