Stn/correctly mark api silences (#1733)
* Update alert status on every GET to alerts Signed-off-by: stuart nelson <stuartnelson3@gmail.com>
This commit is contained in:
parent
da6e2a88dd
commit
51eebbef85
10
api/api.go
10
api/api.go
|
@ -84,7 +84,8 @@ func (o Options) validate() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// New creates a new API object combining all API versions.
|
||||
// New creates a new API object combining all API versions. Note that an Update
|
||||
// call is also needed to get the APIs into an operational state.
|
||||
func New(opts Options) (*API, error) {
|
||||
if err := opts.validate(); err != nil {
|
||||
return nil, fmt.Errorf("invalid API options: %s", err)
|
||||
|
@ -183,13 +184,14 @@ func (api *API) Register(r *route.Router, routePrefix string) *http.ServeMux {
|
|||
return mux
|
||||
}
|
||||
|
||||
// Update config and resolve timeout of each API.
|
||||
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error {
|
||||
// Update config and resolve timeout of each API. APIv2 also needs
|
||||
// setAlertStatus to be updated.
|
||||
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus func(model.LabelSet) error) error {
|
||||
if err := api.v1.Update(cfg, resolveTimeout); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
return api.v2.Update(cfg, resolveTimeout)
|
||||
return api.v2.Update(cfg, resolveTimeout, setAlertStatus)
|
||||
}
|
||||
|
||||
func (api *API) limitHandler(h http.Handler) http.Handler {
|
||||
|
|
|
@ -56,13 +56,14 @@ type API struct {
|
|||
getAlertStatus getAlertStatusFn
|
||||
uptime time.Time
|
||||
|
||||
// mtx protects resolveTimeout, alertmanagerConfig and route.
|
||||
// mtx protects resolveTimeout, alertmanagerConfig, setAlertStatus and route.
|
||||
mtx sync.RWMutex
|
||||
// resolveTimeout represents the default resolve timeout that an alert is
|
||||
// assigned if no end time is specified.
|
||||
resolveTimeout time.Duration
|
||||
alertmanagerConfig *config.Config
|
||||
route *dispatch.Route
|
||||
setAlertStatus setAlertStatusFn
|
||||
|
||||
logger log.Logger
|
||||
|
||||
|
@ -70,9 +71,16 @@ type API struct {
|
|||
}
|
||||
|
||||
type getAlertStatusFn func(prometheus_model.Fingerprint) types.AlertStatus
|
||||
type setAlertStatusFn func(prometheus_model.LabelSet) error
|
||||
|
||||
// NewAPI returns a new Alertmanager API v2
|
||||
func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silences, peer *cluster.Peer, l log.Logger) (*API, error) {
|
||||
func NewAPI(
|
||||
alerts provider.Alerts,
|
||||
sf getAlertStatusFn,
|
||||
silences *silence.Silences,
|
||||
peer *cluster.Peer,
|
||||
l log.Logger,
|
||||
) (*API, error) {
|
||||
api := API{
|
||||
alerts: alerts,
|
||||
getAlertStatus: sf,
|
||||
|
@ -114,14 +122,15 @@ func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silen
|
|||
return &api, nil
|
||||
}
|
||||
|
||||
// Update sets the configuration string to a new value.
|
||||
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error {
|
||||
// Update sets the API struct members that may change between reloads of alertmanager.
|
||||
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus setAlertStatusFn) error {
|
||||
api.mtx.Lock()
|
||||
defer api.mtx.Unlock()
|
||||
|
||||
api.resolveTimeout = resolveTimeout
|
||||
api.alertmanagerConfig = cfg
|
||||
api.route = dispatch.NewRoute(cfg.Route, nil)
|
||||
api.setAlertStatus = setAlertStatus
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -252,6 +261,11 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
|
|||
continue
|
||||
}
|
||||
|
||||
// Set alert's current status based on its label set.
|
||||
if err := api.setAlertStatus(a.Labels); err != nil {
|
||||
level.Error(api.logger).Log("msg", "set alert status failed", "err", err)
|
||||
}
|
||||
// Get alert's current status after seeing if it is suppressed.
|
||||
status := api.getAlertStatus(a.Fingerprint())
|
||||
|
||||
if !*params.Active && status.State == types.AlertStateActive {
|
||||
|
|
|
@ -34,6 +34,7 @@ import (
|
|||
"github.com/go-kit/kit/log/level"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/prometheus/common/promlog"
|
||||
promlogflag "github.com/prometheus/common/promlog/flag"
|
||||
"github.com/prometheus/common/route"
|
||||
|
@ -324,11 +325,6 @@ func run() int {
|
|||
|
||||
hash = md5HashAsMetricValue(plainCfg)
|
||||
|
||||
err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
tmpl, err = template.FromGlobs(conf.Templates...)
|
||||
if err != nil {
|
||||
return err
|
||||
|
@ -350,6 +346,12 @@ func run() int {
|
|||
peer,
|
||||
logger,
|
||||
)
|
||||
|
||||
err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout), setAlertStatus(inhibitor, marker, silences))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
disp = dispatch.NewDispatcher(alerts, dispatch.NewRoute(conf.Route, nil), pipeline, marker, timeoutFunc, logger)
|
||||
|
||||
go disp.Run()
|
||||
|
@ -476,3 +478,28 @@ func md5HashAsMetricValue(data []byte) float64 {
|
|||
copy(bytes, smallSum)
|
||||
return float64(binary.LittleEndian.Uint64(bytes))
|
||||
}
|
||||
|
||||
func setAlertStatus(inhibitor *inhibit.Inhibitor, marker types.Marker, silences *silence.Silences) func(model.LabelSet) error {
|
||||
return func(labels model.LabelSet) error {
|
||||
inhibitor.Mutes(labels)
|
||||
// TODO(beorn7): The following code is almost exactly replicated in notify/notify.go.
|
||||
sils, err := silences.Query(
|
||||
silence.QState(types.SilenceStateActive),
|
||||
silence.QMatches(labels),
|
||||
)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to query silences: %v", err)
|
||||
}
|
||||
|
||||
if len(sils) > 0 {
|
||||
ids := make([]string, len(sils))
|
||||
for i, s := range sils {
|
||||
ids[i] = s.Id
|
||||
}
|
||||
marker.SetSilenced(labels.Fingerprint(), ids...)
|
||||
} else {
|
||||
marker.SetSilenced(labels.Fingerprint())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
|
|
@ -289,6 +289,11 @@ func (amc *AlertmanagerCluster) Start() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// Members returns the underlying slice of cluster members.
|
||||
func (amc *AlertmanagerCluster) Members() []*Alertmanager {
|
||||
return amc.ams
|
||||
}
|
||||
|
||||
// Start the alertmanager and wait until it is ready to receive.
|
||||
func (am *Alertmanager) Start(additionalArg []string) error {
|
||||
am.t.Helper()
|
||||
|
@ -555,6 +560,10 @@ func (am *Alertmanager) GenericAPIV2Call(at float64, f func()) {
|
|||
am.t.Do(at, f)
|
||||
}
|
||||
|
||||
func (am *Alertmanager) Client() *apiclient.Alertmanager {
|
||||
return am.clientV2
|
||||
}
|
||||
|
||||
func (am *Alertmanager) getURL(path string) string {
|
||||
return fmt.Sprintf("http://%s%s%s", am.apiAddr, am.opts.RoutePrefix, path)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,167 @@
|
|||
// Copyright 2018 Prometheus Team
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/go-openapi/strfmt"
|
||||
a "github.com/prometheus/alertmanager/test/with_api_v2"
|
||||
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/alert"
|
||||
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/silence"
|
||||
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/models"
|
||||
"github.com/prometheus/common/model"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
// TestAlertGetReturnsCurrentStatus checks that querying the API returns the
|
||||
// current status of each alert, i.e. if it is silenced or inhibited.
|
||||
func TestAlertGetReturnsCurrentAlertStatus(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
conf := `
|
||||
route:
|
||||
receiver: "default"
|
||||
group_by: []
|
||||
group_wait: 1s
|
||||
group_interval: 10m
|
||||
repeat_interval: 1h
|
||||
|
||||
inhibit_rules:
|
||||
- source_match:
|
||||
severity: 'critical'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['alertname']
|
||||
|
||||
receivers:
|
||||
- name: "default"
|
||||
webhook_configs:
|
||||
- url: 'http://%s'
|
||||
`
|
||||
|
||||
at := a.NewAcceptanceTest(t, &a.AcceptanceOpts{
|
||||
Tolerance: 1 * time.Second,
|
||||
})
|
||||
co := at.Collector("webhook")
|
||||
wh := a.NewWebhook(co)
|
||||
|
||||
amc := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)
|
||||
require.NoError(t, amc.Start())
|
||||
defer amc.Terminate()
|
||||
|
||||
am := amc.Members()[0]
|
||||
|
||||
labelName := "alertname"
|
||||
labelValue := "test1"
|
||||
|
||||
now := time.Now()
|
||||
startsAt := strfmt.DateTime(now)
|
||||
endsAt := strfmt.DateTime(now.Add(5 * time.Minute))
|
||||
|
||||
labels := models.LabelSet(map[string]string{labelName: labelValue, "severity": "warning"})
|
||||
fp := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "warning"}.Fingerprint()
|
||||
pa := &models.PostableAlert{
|
||||
StartsAt: startsAt,
|
||||
EndsAt: endsAt,
|
||||
Alert: models.Alert{Labels: labels},
|
||||
}
|
||||
alertParams := alert.NewPostAlertsParams()
|
||||
alertParams.Alerts = models.PostableAlerts{pa}
|
||||
_, err := am.Client().Alert.PostAlerts(alertParams)
|
||||
require.NoError(t, err)
|
||||
|
||||
resp, err := am.Client().Alert.GetAlerts(nil)
|
||||
require.NoError(t, err)
|
||||
// No silence has been created or inhibiting alert sent, alert should
|
||||
// be active.
|
||||
for _, alert := range resp.Payload {
|
||||
require.Equal(t, models.AlertStatusStateActive, *alert.Status.State)
|
||||
}
|
||||
|
||||
// Wait for group_wait, so that we are in the group_interval period,
|
||||
// when the pipeline won't update the alert's status.
|
||||
time.Sleep(2 * time.Second)
|
||||
|
||||
// Create silence and verify that the alert is immediately marked
|
||||
// silenced via the API.
|
||||
silenceParams := silence.NewPostSilencesParams()
|
||||
|
||||
cm := "a"
|
||||
isRegex := false
|
||||
ps := &models.PostableSilence{
|
||||
Silence: models.Silence{
|
||||
StartsAt: &startsAt,
|
||||
EndsAt: &endsAt,
|
||||
Comment: &cm,
|
||||
CreatedBy: &cm,
|
||||
Matchers: models.Matchers{
|
||||
&models.Matcher{Name: &labelName, Value: &labelValue, IsRegex: &isRegex},
|
||||
},
|
||||
},
|
||||
}
|
||||
silenceParams.Silence = ps
|
||||
silenceResp, err := am.Client().Silence.PostSilences(silenceParams)
|
||||
require.NoError(t, err)
|
||||
silenceID := silenceResp.Payload.SilenceID
|
||||
|
||||
resp, err = am.Client().Alert.GetAlerts(nil)
|
||||
require.NoError(t, err)
|
||||
for _, alert := range resp.Payload {
|
||||
require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
|
||||
require.Equal(t, fp.String(), *alert.Fingerprint)
|
||||
require.Equal(t, 1, len(alert.Status.SilencedBy))
|
||||
require.Equal(t, silenceID, alert.Status.SilencedBy[0])
|
||||
}
|
||||
|
||||
// Create inhibiting alert and verify that original alert is
|
||||
// immediately marked as inhibited.
|
||||
labels["severity"] = "critical"
|
||||
_, err = am.Client().Alert.PostAlerts(alertParams)
|
||||
require.NoError(t, err)
|
||||
|
||||
inhibitingFP := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "critical"}.Fingerprint()
|
||||
|
||||
resp, err = am.Client().Alert.GetAlerts(nil)
|
||||
require.NoError(t, err)
|
||||
for _, alert := range resp.Payload {
|
||||
require.Equal(t, 1, len(alert.Status.SilencedBy))
|
||||
require.Equal(t, silenceID, alert.Status.SilencedBy[0])
|
||||
if fp.String() == *alert.Fingerprint {
|
||||
require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
|
||||
require.Equal(t, fp.String(), *alert.Fingerprint)
|
||||
require.Equal(t, 1, len(alert.Status.InhibitedBy))
|
||||
require.Equal(t, inhibitingFP.String(), alert.Status.InhibitedBy[0])
|
||||
}
|
||||
}
|
||||
|
||||
deleteParams := silence.NewDeleteSilenceParams().WithSilenceID(strfmt.UUID(silenceID))
|
||||
_, err = am.Client().Silence.DeleteSilence(deleteParams)
|
||||
require.NoError(t, err)
|
||||
|
||||
resp, err = am.Client().Alert.GetAlerts(nil)
|
||||
require.NoError(t, err)
|
||||
// Silence has been deleted, inhibiting alert should be active.
|
||||
// Original alert should still be inhibited.
|
||||
for _, alert := range resp.Payload {
|
||||
require.Equal(t, 0, len(alert.Status.SilencedBy))
|
||||
if inhibitingFP.String() == *alert.Fingerprint {
|
||||
require.Equal(t, models.AlertStatusStateActive, *alert.Status.State)
|
||||
} else {
|
||||
require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue