Stn/correctly mark api silences (#1733)
* Update alert status on every GET to alerts Signed-off-by: stuart nelson <stuartnelson3@gmail.com>
This commit is contained in:
parent
da6e2a88dd
commit
51eebbef85
10
api/api.go
10
api/api.go
|
@ -84,7 +84,8 @@ func (o Options) validate() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// New creates a new API object combining all API versions.
|
// New creates a new API object combining all API versions. Note that an Update
|
||||||
|
// call is also needed to get the APIs into an operational state.
|
||||||
func New(opts Options) (*API, error) {
|
func New(opts Options) (*API, error) {
|
||||||
if err := opts.validate(); err != nil {
|
if err := opts.validate(); err != nil {
|
||||||
return nil, fmt.Errorf("invalid API options: %s", err)
|
return nil, fmt.Errorf("invalid API options: %s", err)
|
||||||
|
@ -183,13 +184,14 @@ func (api *API) Register(r *route.Router, routePrefix string) *http.ServeMux {
|
||||||
return mux
|
return mux
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update config and resolve timeout of each API.
|
// Update config and resolve timeout of each API. APIv2 also needs
|
||||||
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error {
|
// setAlertStatus to be updated.
|
||||||
|
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus func(model.LabelSet) error) error {
|
||||||
if err := api.v1.Update(cfg, resolveTimeout); err != nil {
|
if err := api.v1.Update(cfg, resolveTimeout); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
return api.v2.Update(cfg, resolveTimeout)
|
return api.v2.Update(cfg, resolveTimeout, setAlertStatus)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (api *API) limitHandler(h http.Handler) http.Handler {
|
func (api *API) limitHandler(h http.Handler) http.Handler {
|
||||||
|
|
|
@ -56,13 +56,14 @@ type API struct {
|
||||||
getAlertStatus getAlertStatusFn
|
getAlertStatus getAlertStatusFn
|
||||||
uptime time.Time
|
uptime time.Time
|
||||||
|
|
||||||
// mtx protects resolveTimeout, alertmanagerConfig and route.
|
// mtx protects resolveTimeout, alertmanagerConfig, setAlertStatus and route.
|
||||||
mtx sync.RWMutex
|
mtx sync.RWMutex
|
||||||
// resolveTimeout represents the default resolve timeout that an alert is
|
// resolveTimeout represents the default resolve timeout that an alert is
|
||||||
// assigned if no end time is specified.
|
// assigned if no end time is specified.
|
||||||
resolveTimeout time.Duration
|
resolveTimeout time.Duration
|
||||||
alertmanagerConfig *config.Config
|
alertmanagerConfig *config.Config
|
||||||
route *dispatch.Route
|
route *dispatch.Route
|
||||||
|
setAlertStatus setAlertStatusFn
|
||||||
|
|
||||||
logger log.Logger
|
logger log.Logger
|
||||||
|
|
||||||
|
@ -70,9 +71,16 @@ type API struct {
|
||||||
}
|
}
|
||||||
|
|
||||||
type getAlertStatusFn func(prometheus_model.Fingerprint) types.AlertStatus
|
type getAlertStatusFn func(prometheus_model.Fingerprint) types.AlertStatus
|
||||||
|
type setAlertStatusFn func(prometheus_model.LabelSet) error
|
||||||
|
|
||||||
// NewAPI returns a new Alertmanager API v2
|
// NewAPI returns a new Alertmanager API v2
|
||||||
func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silences, peer *cluster.Peer, l log.Logger) (*API, error) {
|
func NewAPI(
|
||||||
|
alerts provider.Alerts,
|
||||||
|
sf getAlertStatusFn,
|
||||||
|
silences *silence.Silences,
|
||||||
|
peer *cluster.Peer,
|
||||||
|
l log.Logger,
|
||||||
|
) (*API, error) {
|
||||||
api := API{
|
api := API{
|
||||||
alerts: alerts,
|
alerts: alerts,
|
||||||
getAlertStatus: sf,
|
getAlertStatus: sf,
|
||||||
|
@ -114,14 +122,15 @@ func NewAPI(alerts provider.Alerts, sf getAlertStatusFn, silences *silence.Silen
|
||||||
return &api, nil
|
return &api, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update sets the configuration string to a new value.
|
// Update sets the API struct members that may change between reloads of alertmanager.
|
||||||
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration) error {
|
func (api *API) Update(cfg *config.Config, resolveTimeout time.Duration, setAlertStatus setAlertStatusFn) error {
|
||||||
api.mtx.Lock()
|
api.mtx.Lock()
|
||||||
defer api.mtx.Unlock()
|
defer api.mtx.Unlock()
|
||||||
|
|
||||||
api.resolveTimeout = resolveTimeout
|
api.resolveTimeout = resolveTimeout
|
||||||
api.alertmanagerConfig = cfg
|
api.alertmanagerConfig = cfg
|
||||||
api.route = dispatch.NewRoute(cfg.Route, nil)
|
api.route = dispatch.NewRoute(cfg.Route, nil)
|
||||||
|
api.setAlertStatus = setAlertStatus
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -252,6 +261,11 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Set alert's current status based on its label set.
|
||||||
|
if err := api.setAlertStatus(a.Labels); err != nil {
|
||||||
|
level.Error(api.logger).Log("msg", "set alert status failed", "err", err)
|
||||||
|
}
|
||||||
|
// Get alert's current status after seeing if it is suppressed.
|
||||||
status := api.getAlertStatus(a.Fingerprint())
|
status := api.getAlertStatus(a.Fingerprint())
|
||||||
|
|
||||||
if !*params.Active && status.State == types.AlertStateActive {
|
if !*params.Active && status.State == types.AlertStateActive {
|
||||||
|
|
|
@ -34,6 +34,7 @@ import (
|
||||||
"github.com/go-kit/kit/log/level"
|
"github.com/go-kit/kit/log/level"
|
||||||
"github.com/prometheus/client_golang/prometheus"
|
"github.com/prometheus/client_golang/prometheus"
|
||||||
"github.com/prometheus/client_golang/prometheus/promhttp"
|
"github.com/prometheus/client_golang/prometheus/promhttp"
|
||||||
|
"github.com/prometheus/common/model"
|
||||||
"github.com/prometheus/common/promlog"
|
"github.com/prometheus/common/promlog"
|
||||||
promlogflag "github.com/prometheus/common/promlog/flag"
|
promlogflag "github.com/prometheus/common/promlog/flag"
|
||||||
"github.com/prometheus/common/route"
|
"github.com/prometheus/common/route"
|
||||||
|
@ -324,11 +325,6 @@ func run() int {
|
||||||
|
|
||||||
hash = md5HashAsMetricValue(plainCfg)
|
hash = md5HashAsMetricValue(plainCfg)
|
||||||
|
|
||||||
err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout))
|
|
||||||
if err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
|
|
||||||
tmpl, err = template.FromGlobs(conf.Templates...)
|
tmpl, err = template.FromGlobs(conf.Templates...)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
|
@ -350,6 +346,12 @@ func run() int {
|
||||||
peer,
|
peer,
|
||||||
logger,
|
logger,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
err = api.Update(conf, time.Duration(conf.Global.ResolveTimeout), setAlertStatus(inhibitor, marker, silences))
|
||||||
|
if err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
disp = dispatch.NewDispatcher(alerts, dispatch.NewRoute(conf.Route, nil), pipeline, marker, timeoutFunc, logger)
|
disp = dispatch.NewDispatcher(alerts, dispatch.NewRoute(conf.Route, nil), pipeline, marker, timeoutFunc, logger)
|
||||||
|
|
||||||
go disp.Run()
|
go disp.Run()
|
||||||
|
@ -476,3 +478,28 @@ func md5HashAsMetricValue(data []byte) float64 {
|
||||||
copy(bytes, smallSum)
|
copy(bytes, smallSum)
|
||||||
return float64(binary.LittleEndian.Uint64(bytes))
|
return float64(binary.LittleEndian.Uint64(bytes))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func setAlertStatus(inhibitor *inhibit.Inhibitor, marker types.Marker, silences *silence.Silences) func(model.LabelSet) error {
|
||||||
|
return func(labels model.LabelSet) error {
|
||||||
|
inhibitor.Mutes(labels)
|
||||||
|
// TODO(beorn7): The following code is almost exactly replicated in notify/notify.go.
|
||||||
|
sils, err := silences.Query(
|
||||||
|
silence.QState(types.SilenceStateActive),
|
||||||
|
silence.QMatches(labels),
|
||||||
|
)
|
||||||
|
if err != nil {
|
||||||
|
return fmt.Errorf("failed to query silences: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(sils) > 0 {
|
||||||
|
ids := make([]string, len(sils))
|
||||||
|
for i, s := range sils {
|
||||||
|
ids[i] = s.Id
|
||||||
|
}
|
||||||
|
marker.SetSilenced(labels.Fingerprint(), ids...)
|
||||||
|
} else {
|
||||||
|
marker.SetSilenced(labels.Fingerprint())
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -289,6 +289,11 @@ func (amc *AlertmanagerCluster) Start() error {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Members returns the underlying slice of cluster members.
// The internal slice (amc.ams) is returned directly, not copied, so the
// caller shares its backing storage with the cluster.
|
||||||
|
func (amc *AlertmanagerCluster) Members() []*Alertmanager {
|
||||||
|
return amc.ams
|
||||||
|
}
|
||||||
|
|
||||||
// Start the alertmanager and wait until it is ready to receive.
|
// Start the alertmanager and wait until it is ready to receive.
|
||||||
func (am *Alertmanager) Start(additionalArg []string) error {
|
func (am *Alertmanager) Start(additionalArg []string) error {
|
||||||
am.t.Helper()
|
am.t.Helper()
|
||||||
|
@ -555,6 +560,10 @@ func (am *Alertmanager) GenericAPIV2Call(at float64, f func()) {
|
||||||
am.t.Do(at, f)
|
am.t.Do(at, f)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Client returns the API v2 client (am.clientV2) held by this Alertmanager
// test instance, allowing tests to issue API calls against it directly.
func (am *Alertmanager) Client() *apiclient.Alertmanager {
|
||||||
|
return am.clientV2
|
||||||
|
}
|
||||||
|
|
||||||
func (am *Alertmanager) getURL(path string) string {
|
func (am *Alertmanager) getURL(path string) string {
|
||||||
return fmt.Sprintf("http://%s%s%s", am.apiAddr, am.opts.RoutePrefix, path)
|
return fmt.Sprintf("http://%s%s%s", am.apiAddr, am.opts.RoutePrefix, path)
|
||||||
}
|
}
|
||||||
|
|
|
@ -0,0 +1,167 @@
|
||||||
|
// Copyright 2018 Prometheus Team
|
||||||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
// you may not use this file except in compliance with the License.
|
||||||
|
// You may obtain a copy of the License at
|
||||||
|
//
|
||||||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
//
|
||||||
|
// Unless required by applicable law or agreed to in writing, software
|
||||||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
// See the License for the specific language governing permissions and
|
||||||
|
// limitations under the License.
|
||||||
|
|
||||||
|
package test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"testing"
|
||||||
|
"time"
|
||||||
|
|
||||||
|
"github.com/go-openapi/strfmt"
|
||||||
|
a "github.com/prometheus/alertmanager/test/with_api_v2"
|
||||||
|
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/alert"
|
||||||
|
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/client/silence"
|
||||||
|
"github.com/prometheus/alertmanager/test/with_api_v2/api_v2_client/models"
|
||||||
|
"github.com/prometheus/common/model"
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestAlertGetReturnsCurrentStatus checks that querying the API returns the
|
||||||
|
// current status of each alert, i.e. if it is silenced or inhibited.
|
||||||
|
func TestAlertGetReturnsCurrentAlertStatus(t *testing.T) {
|
||||||
|
t.Parallel()
|
||||||
|
|
||||||
|
conf := `
|
||||||
|
route:
|
||||||
|
receiver: "default"
|
||||||
|
group_by: []
|
||||||
|
group_wait: 1s
|
||||||
|
group_interval: 10m
|
||||||
|
repeat_interval: 1h
|
||||||
|
|
||||||
|
inhibit_rules:
|
||||||
|
- source_match:
|
||||||
|
severity: 'critical'
|
||||||
|
target_match:
|
||||||
|
severity: 'warning'
|
||||||
|
equal: ['alertname']
|
||||||
|
|
||||||
|
receivers:
|
||||||
|
- name: "default"
|
||||||
|
webhook_configs:
|
||||||
|
- url: 'http://%s'
|
||||||
|
`
|
||||||
|
|
||||||
|
at := a.NewAcceptanceTest(t, &a.AcceptanceOpts{
|
||||||
|
Tolerance: 1 * time.Second,
|
||||||
|
})
|
||||||
|
co := at.Collector("webhook")
|
||||||
|
wh := a.NewWebhook(co)
|
||||||
|
|
||||||
|
amc := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)
|
||||||
|
require.NoError(t, amc.Start())
|
||||||
|
defer amc.Terminate()
|
||||||
|
|
||||||
|
am := amc.Members()[0]
|
||||||
|
|
||||||
|
labelName := "alertname"
|
||||||
|
labelValue := "test1"
|
||||||
|
|
||||||
|
now := time.Now()
|
||||||
|
startsAt := strfmt.DateTime(now)
|
||||||
|
endsAt := strfmt.DateTime(now.Add(5 * time.Minute))
|
||||||
|
|
||||||
|
labels := models.LabelSet(map[string]string{labelName: labelValue, "severity": "warning"})
|
||||||
|
fp := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "warning"}.Fingerprint()
|
||||||
|
pa := &models.PostableAlert{
|
||||||
|
StartsAt: startsAt,
|
||||||
|
EndsAt: endsAt,
|
||||||
|
Alert: models.Alert{Labels: labels},
|
||||||
|
}
|
||||||
|
alertParams := alert.NewPostAlertsParams()
|
||||||
|
alertParams.Alerts = models.PostableAlerts{pa}
|
||||||
|
_, err := am.Client().Alert.PostAlerts(alertParams)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
resp, err := am.Client().Alert.GetAlerts(nil)
|
||||||
|
require.NoError(t, err)
|
||||||
|
// No silence has been created or inhibiting alert sent, alert should
|
||||||
|
// be active.
|
||||||
|
for _, alert := range resp.Payload {
|
||||||
|
require.Equal(t, models.AlertStatusStateActive, *alert.Status.State)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Wait for group_wait, so that we are in the group_interval period,
|
||||||
|
// when the pipeline won't update the alert's status.
|
||||||
|
time.Sleep(2 * time.Second)
|
||||||
|
|
||||||
|
// Create silence and verify that the alert is immediately marked
|
||||||
|
// silenced via the API.
|
||||||
|
silenceParams := silence.NewPostSilencesParams()
|
||||||
|
|
||||||
|
cm := "a"
|
||||||
|
isRegex := false
|
||||||
|
ps := &models.PostableSilence{
|
||||||
|
Silence: models.Silence{
|
||||||
|
StartsAt: &startsAt,
|
||||||
|
EndsAt: &endsAt,
|
||||||
|
Comment: &cm,
|
||||||
|
CreatedBy: &cm,
|
||||||
|
Matchers: models.Matchers{
|
||||||
|
&models.Matcher{Name: &labelName, Value: &labelValue, IsRegex: &isRegex},
|
||||||
|
},
|
||||||
|
},
|
||||||
|
}
|
||||||
|
silenceParams.Silence = ps
|
||||||
|
silenceResp, err := am.Client().Silence.PostSilences(silenceParams)
|
||||||
|
require.NoError(t, err)
|
||||||
|
silenceID := silenceResp.Payload.SilenceID
|
||||||
|
|
||||||
|
resp, err = am.Client().Alert.GetAlerts(nil)
|
||||||
|
require.NoError(t, err)
|
||||||
|
for _, alert := range resp.Payload {
|
||||||
|
require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
|
||||||
|
require.Equal(t, fp.String(), *alert.Fingerprint)
|
||||||
|
require.Equal(t, 1, len(alert.Status.SilencedBy))
|
||||||
|
require.Equal(t, silenceID, alert.Status.SilencedBy[0])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Create inhibiting alert and verify that original alert is
|
||||||
|
// immediately marked as inhibited.
|
||||||
|
labels["severity"] = "critical"
|
||||||
|
_, err = am.Client().Alert.PostAlerts(alertParams)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
inhibitingFP := model.LabelSet{model.LabelName(labelName): model.LabelValue(labelValue), "severity": "critical"}.Fingerprint()
|
||||||
|
|
||||||
|
resp, err = am.Client().Alert.GetAlerts(nil)
|
||||||
|
require.NoError(t, err)
|
||||||
|
for _, alert := range resp.Payload {
|
||||||
|
require.Equal(t, 1, len(alert.Status.SilencedBy))
|
||||||
|
require.Equal(t, silenceID, alert.Status.SilencedBy[0])
|
||||||
|
if fp.String() == *alert.Fingerprint {
|
||||||
|
require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
|
||||||
|
require.Equal(t, fp.String(), *alert.Fingerprint)
|
||||||
|
require.Equal(t, 1, len(alert.Status.InhibitedBy))
|
||||||
|
require.Equal(t, inhibitingFP.String(), alert.Status.InhibitedBy[0])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
deleteParams := silence.NewDeleteSilenceParams().WithSilenceID(strfmt.UUID(silenceID))
|
||||||
|
_, err = am.Client().Silence.DeleteSilence(deleteParams)
|
||||||
|
require.NoError(t, err)
|
||||||
|
|
||||||
|
resp, err = am.Client().Alert.GetAlerts(nil)
|
||||||
|
require.NoError(t, err)
|
||||||
|
// Silence has been deleted, inhibiting alert should be active.
|
||||||
|
// Original alert should still be inhibited.
|
||||||
|
for _, alert := range resp.Payload {
|
||||||
|
require.Equal(t, 0, len(alert.Status.SilencedBy))
|
||||||
|
if inhibitingFP.String() == *alert.Fingerprint {
|
||||||
|
require.Equal(t, models.AlertStatusStateActive, *alert.Status.State)
|
||||||
|
} else {
|
||||||
|
require.Equal(t, models.AlertStatusStateSuppressed, *alert.Status.State)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Reference in New Issue