Merge pull request #863 from prometheus/fabxc/alertstate
Preserve alert state, cleanup rules/
This commit is contained in:
commit
049a106821
|
@ -21,6 +21,8 @@ import (
|
|||
"sync"
|
||||
"time"
|
||||
|
||||
html_template "html/template"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/log"
|
||||
|
||||
|
@ -73,6 +75,20 @@ func init() {
|
|||
prometheus.MustRegister(evalDuration)
|
||||
}
|
||||
|
||||
// A Rule encapsulates a vector expression which is evaluated at a specified
|
||||
// interval and acted upon (currently either recorded or used for alerting).
|
||||
type Rule interface {
|
||||
// Name returns the name of the rule.
|
||||
Name() string
|
||||
// eval evaluates the rule, including any associated recording or alerting actions.
|
||||
eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
|
||||
// String returns a human-readable string representation of the rule.
|
||||
String() string
|
||||
// HTMLSnippet returns a human-readable string representation of the rule,
|
||||
// decorated with HTML elements for use in the web frontend.
|
||||
HTMLSnippet(pathPrefix string) html_template.HTML
|
||||
}
|
||||
|
||||
// The Manager manages recording and alerting rules.
|
||||
type Manager struct {
|
||||
// Protects the rules list.
|
||||
|
@ -271,12 +287,39 @@ func (m *Manager) runIteration() {
|
|||
wg.Wait()
|
||||
}
|
||||
|
||||
// transferAlertState makes a copy of the state of alerting rules and returns a function
|
||||
// that restores them in the current state.
|
||||
func (m *Manager) transferAlertState() func() {
|
||||
|
||||
alertingRules := map[string]*AlertingRule{}
|
||||
for _, r := range m.rules {
|
||||
if ar, ok := r.(*AlertingRule); ok {
|
||||
alertingRules[ar.name] = ar
|
||||
}
|
||||
}
|
||||
|
||||
return func() {
|
||||
// Restore alerting rule state.
|
||||
for _, r := range m.rules {
|
||||
ar, ok := r.(*AlertingRule)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if old, ok := alertingRules[ar.name]; ok {
|
||||
ar.activeAlerts = old.activeAlerts
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ApplyConfig updates the rule manager's state as the config requires. If
|
||||
// loading the new rules failed, the old rule set is restored. Returns true on success.
|
||||
func (m *Manager) ApplyConfig(conf *config.Config) bool {
|
||||
m.Lock()
|
||||
defer m.Unlock()
|
||||
|
||||
defer m.transferAlertState()()
|
||||
|
||||
success := true
|
||||
m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval)
|
||||
|
||||
|
@ -300,6 +343,7 @@ func (m *Manager) ApplyConfig(conf *config.Config) bool {
|
|||
log.Errorf("Error loading rules, previous rule set restored: %s", err)
|
||||
success = false
|
||||
}
|
||||
|
||||
return success
|
||||
}
|
||||
|
||||
|
|
|
@ -0,0 +1,183 @@
|
|||
// Copyright 2013 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package rules
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"reflect"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
clientmodel "github.com/prometheus/client_golang/model"
|
||||
|
||||
"github.com/prometheus/prometheus/promql"
|
||||
)
|
||||
|
||||
func TestAlertingRule(t *testing.T) {
|
||||
suite, err := promql.NewTest(t, `
|
||||
load 5m
|
||||
http_requests{job="api-server", instance="0", group="production"} 0+10x10
|
||||
http_requests{job="api-server", instance="1", group="production"} 0+20x10
|
||||
http_requests{job="api-server", instance="0", group="canary"} 0+30x10
|
||||
http_requests{job="api-server", instance="1", group="canary"} 0+40x10
|
||||
http_requests{job="app-server", instance="0", group="production"} 0+50x10
|
||||
http_requests{job="app-server", instance="1", group="production"} 0+60x10
|
||||
http_requests{job="app-server", instance="0", group="canary"} 0+70x10
|
||||
http_requests{job="app-server", instance="1", group="canary"} 0+80x10
|
||||
`)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer suite.Close()
|
||||
|
||||
if err := suite.Run(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to parse alert expression: %s", err)
|
||||
}
|
||||
|
||||
rule := NewAlertingRule(
|
||||
"HTTPRequestRateLow",
|
||||
expr,
|
||||
time.Minute,
|
||||
clientmodel.LabelSet{"severity": "critical"},
|
||||
"summary", "description", "runbook",
|
||||
)
|
||||
|
||||
var tests = []struct {
|
||||
time time.Duration
|
||||
result []string
|
||||
}{
|
||||
{
|
||||
time: 0,
|
||||
result: []string{
|
||||
`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
|
||||
`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
|
||||
},
|
||||
}, {
|
||||
time: 5 * time.Minute,
|
||||
result: []string{
|
||||
`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
|
||||
`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
|
||||
`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
|
||||
`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
|
||||
},
|
||||
}, {
|
||||
time: 10 * time.Minute,
|
||||
result: []string{
|
||||
`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
|
||||
`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
|
||||
},
|
||||
},
|
||||
{
|
||||
time: 15 * time.Minute,
|
||||
result: nil,
|
||||
},
|
||||
{
|
||||
time: 20 * time.Minute,
|
||||
result: nil,
|
||||
},
|
||||
}
|
||||
|
||||
for i, test := range tests {
|
||||
evalTime := clientmodel.Timestamp(0).Add(test.time)
|
||||
|
||||
res, err := rule.eval(evalTime, suite.QueryEngine())
|
||||
if err != nil {
|
||||
t.Fatalf("Error during alerting rule evaluation: %s", err)
|
||||
}
|
||||
|
||||
actual := strings.Split(res.String(), "\n")
|
||||
expected := annotateWithTime(test.result, evalTime)
|
||||
if actual[0] == "" {
|
||||
actual = []string{}
|
||||
}
|
||||
|
||||
if len(actual) != len(expected) {
|
||||
t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expected), len(actual))
|
||||
}
|
||||
|
||||
for j, expectedSample := range expected {
|
||||
found := false
|
||||
for _, actualSample := range actual {
|
||||
if actualSample == expectedSample {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample)
|
||||
}
|
||||
}
|
||||
|
||||
if t.Failed() {
|
||||
t.Errorf("%d. Expected and actual outputs don't match:", i)
|
||||
t.Fatalf("Expected:\n%v\n----\nActual:\n%v", strings.Join(expected, "\n"), strings.Join(actual, "\n"))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string {
|
||||
annotatedLines := []string{}
|
||||
for _, line := range lines {
|
||||
annotatedLines = append(annotatedLines, fmt.Sprintf(line, timestamp))
|
||||
}
|
||||
return annotatedLines
|
||||
}
|
||||
|
||||
func TestTransferAlertState(t *testing.T) {
|
||||
m := NewManager(&ManagerOptions{})
|
||||
|
||||
alert := &Alert{
|
||||
Name: "testalert",
|
||||
State: StateFiring,
|
||||
}
|
||||
|
||||
arule := AlertingRule{
|
||||
name: "test",
|
||||
activeAlerts: map[clientmodel.Fingerprint]*Alert{},
|
||||
}
|
||||
aruleCopy := arule
|
||||
|
||||
m.rules = append(m.rules, &arule)
|
||||
|
||||
// Set an alert.
|
||||
arule.activeAlerts[0] = alert
|
||||
|
||||
// Save state and get the restore function.
|
||||
restore := m.transferAlertState()
|
||||
|
||||
// Remove arule from the rule list and add an unrelated rule and the
|
||||
// stateless copy of arule.
|
||||
m.rules = []Rule{
|
||||
&AlertingRule{
|
||||
name: "test_other",
|
||||
activeAlerts: map[clientmodel.Fingerprint]*Alert{},
|
||||
},
|
||||
&aruleCopy,
|
||||
}
|
||||
|
||||
// Apply the restore function.
|
||||
restore()
|
||||
|
||||
if ar := m.rules[0].(*AlertingRule); len(ar.activeAlerts) != 0 {
|
||||
t.Fatalf("unexpected alert for unrelated alerting rule")
|
||||
}
|
||||
if ar := m.rules[1].(*AlertingRule); !reflect.DeepEqual(ar.activeAlerts[0], alert) {
|
||||
t.Fatalf("alert state was not restored")
|
||||
}
|
||||
}
|
|
@ -1,36 +0,0 @@
|
|||
// Copyright 2013 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package rules
|
||||
|
||||
import (
|
||||
"html/template"
|
||||
|
||||
clientmodel "github.com/prometheus/client_golang/model"
|
||||
|
||||
"github.com/prometheus/prometheus/promql"
|
||||
)
|
||||
|
||||
// A Rule encapsulates a vector expression which is evaluated at a specified
|
||||
// interval and acted upon (currently either recorded or used for alerting).
|
||||
type Rule interface {
|
||||
// Name returns the name of the rule.
|
||||
Name() string
|
||||
// eval evaluates the rule, including any associated recording or alerting actions.
|
||||
eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
|
||||
// String returns a human-readable string representation of the rule.
|
||||
String() string
|
||||
// HTMLSnippet returns a human-readable string representation of the rule,
|
||||
// decorated with HTML elements for use in the web frontend.
|
||||
HTMLSnippet(pathPrefix string) template.HTML
|
||||
}
|
|
@ -1,223 +0,0 @@
|
|||
// Copyright 2013 The Prometheus Authors
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
//
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
package rules
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
clientmodel "github.com/prometheus/client_golang/model"
|
||||
|
||||
"github.com/prometheus/prometheus/promql"
|
||||
"github.com/prometheus/prometheus/storage/local"
|
||||
"github.com/prometheus/prometheus/storage/metric"
|
||||
)
|
||||
|
||||
var (
	// testSampleInterval is the spacing between consecutive samples of the
	// generated test series and between consecutive rule evaluations.
	testSampleInterval = time.Duration(5) * time.Minute
	// testStartTime is the timestamp of the first generated sample and of
	// the first rule evaluation.
	testStartTime = clientmodel.Timestamp(0)
)
|
||||
|
||||
func getTestValueStream(startVal clientmodel.SampleValue, endVal clientmodel.SampleValue, stepVal clientmodel.SampleValue, startTime clientmodel.Timestamp) (resultValues metric.Values) {
|
||||
currentTime := startTime
|
||||
for currentVal := startVal; currentVal <= endVal; currentVal += stepVal {
|
||||
sample := metric.SamplePair{
|
||||
Value: currentVal,
|
||||
Timestamp: currentTime,
|
||||
}
|
||||
resultValues = append(resultValues, sample)
|
||||
currentTime = currentTime.Add(testSampleInterval)
|
||||
}
|
||||
return resultValues
|
||||
}
|
||||
|
||||
func getTestVectorFromTestMatrix(matrix promql.Matrix) promql.Vector {
|
||||
vector := promql.Vector{}
|
||||
for _, sampleStream := range matrix {
|
||||
lastSample := sampleStream.Values[len(sampleStream.Values)-1]
|
||||
vector = append(vector, &promql.Sample{
|
||||
Metric: sampleStream.Metric,
|
||||
Value: lastSample.Value,
|
||||
Timestamp: lastSample.Timestamp,
|
||||
})
|
||||
}
|
||||
return vector
|
||||
}
|
||||
|
||||
func storeMatrix(storage local.Storage, matrix promql.Matrix) {
|
||||
pendingSamples := clientmodel.Samples{}
|
||||
for _, sampleStream := range matrix {
|
||||
for _, sample := range sampleStream.Values {
|
||||
pendingSamples = append(pendingSamples, &clientmodel.Sample{
|
||||
Metric: sampleStream.Metric.Metric,
|
||||
Value: sample.Value,
|
||||
Timestamp: sample.Timestamp,
|
||||
})
|
||||
}
|
||||
}
|
||||
for _, s := range pendingSamples {
|
||||
storage.Append(s)
|
||||
}
|
||||
storage.WaitForIndexing()
|
||||
}
|
||||
|
||||
// vectorComparisonString renders the expected and the actual sample lines as
// two labelled sections for test failure messages. Within a section the lines
// are joined by newlines and framed above and below by a dashed rule.
func vectorComparisonString(expected []string, actual []string) string {
	const rule = "\n--------------\n"

	wantBlock := strings.Join(expected, "\n")
	gotBlock := strings.Join(actual, "\n")

	return fmt.Sprintf("Expected:%v%v%v\nActual:%v%v%v ",
		rule, wantBlock, rule,
		rule, gotBlock, rule,
	)
}
|
||||
|
||||
func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string {
|
||||
annotatedLines := []string{}
|
||||
for _, line := range lines {
|
||||
annotatedLines = append(annotatedLines, fmt.Sprintf(line, timestamp))
|
||||
}
|
||||
return annotatedLines
|
||||
}
|
||||
|
||||
// testMatrix holds four http_requests series of the "canary" group: two
// api-server and two app-server instances. Every series starts at value 0 at
// testStartTime and rises by a fixed, per-series step up to its end value.
var testMatrix = promql.Matrix{
	{
		// api-server, instance 0: 0 to 300 in steps of 30.
		Metric: clientmodel.COWMetric{
			Metric: clientmodel.Metric{
				clientmodel.MetricNameLabel: "http_requests",
				clientmodel.JobLabel:        "api-server",
				"instance":                  "0",
				"group":                     "canary",
			},
		},
		Values: getTestValueStream(0, 300, 30, testStartTime),
	},
	{
		// api-server, instance 1: 0 to 400 in steps of 40.
		Metric: clientmodel.COWMetric{
			Metric: clientmodel.Metric{
				clientmodel.MetricNameLabel: "http_requests",
				clientmodel.JobLabel:        "api-server",
				"instance":                  "1",
				"group":                     "canary",
			},
		},
		Values: getTestValueStream(0, 400, 40, testStartTime),
	},
	{
		// app-server, instance 0: 0 to 700 in steps of 70.
		Metric: clientmodel.COWMetric{
			Metric: clientmodel.Metric{
				clientmodel.MetricNameLabel: "http_requests",
				clientmodel.JobLabel:        "app-server",
				"instance":                  "0",
				"group":                     "canary",
			},
		},
		Values: getTestValueStream(0, 700, 70, testStartTime),
	},
	{
		// app-server, instance 1: 0 to 800 in steps of 80.
		Metric: clientmodel.COWMetric{
			Metric: clientmodel.Metric{
				clientmodel.MetricNameLabel: "http_requests",
				clientmodel.JobLabel:        "app-server",
				"instance":                  "1",
				"group":                     "canary",
			},
		},
		Values: getTestValueStream(0, 800, 80, testStartTime),
	},
}
|
||||
|
||||
func TestAlertingRule(t *testing.T) {
|
||||
// Labels in expected output need to be alphabetically sorted.
|
||||
var evalOutputs = [][]string{
|
||||
{
|
||||
`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
|
||||
`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
|
||||
},
|
||||
{
|
||||
`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
|
||||
`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
|
||||
`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
|
||||
`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
|
||||
},
|
||||
{
|
||||
`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
|
||||
`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
|
||||
},
|
||||
{
|
||||
/* empty */
|
||||
},
|
||||
{
|
||||
/* empty */
|
||||
},
|
||||
}
|
||||
|
||||
storage, closer := local.NewTestStorage(t, 1)
|
||||
defer closer.Close()
|
||||
|
||||
storeMatrix(storage, testMatrix)
|
||||
|
||||
engine := promql.NewEngine(storage, nil)
|
||||
defer engine.Stop()
|
||||
|
||||
expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`)
|
||||
if err != nil {
|
||||
t.Fatalf("Unable to parse alert expression: %s", err)
|
||||
}
|
||||
|
||||
alertLabels := clientmodel.LabelSet{
|
||||
"severity": "critical",
|
||||
}
|
||||
rule := NewAlertingRule("HttpRequestRateLow", expr, time.Minute, alertLabels, "summary", "description", "runbook")
|
||||
|
||||
for i, expectedLines := range evalOutputs {
|
||||
evalTime := testStartTime.Add(testSampleInterval * time.Duration(i))
|
||||
|
||||
res, err := rule.eval(evalTime, engine)
|
||||
if err != nil {
|
||||
t.Fatalf("Error during alerting rule evaluation: %s", err)
|
||||
}
|
||||
|
||||
actualLines := strings.Split(res.String(), "\n")
|
||||
expectedLines := annotateWithTime(expectedLines, evalTime)
|
||||
if actualLines[0] == "" {
|
||||
actualLines = []string{}
|
||||
}
|
||||
|
||||
failed := false
|
||||
if len(actualLines) != len(expectedLines) {
|
||||
t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expectedLines), len(actualLines))
|
||||
failed = true
|
||||
}
|
||||
|
||||
for j, expectedSample := range expectedLines {
|
||||
found := false
|
||||
for _, actualSample := range actualLines {
|
||||
if actualSample == expectedSample {
|
||||
found = true
|
||||
}
|
||||
}
|
||||
if !found {
|
||||
t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample)
|
||||
failed = true
|
||||
}
|
||||
}
|
||||
|
||||
if failed {
|
||||
t.Fatalf("%d. Expected and actual outputs don't match:\n%v", i, vectorComparisonString(expectedLines, actualLines))
|
||||
}
|
||||
}
|
||||
}
|
Loading…
Reference in New Issue