Merge pull request #863 from prometheus/fabxc/alertstate

Preserve alert state, cleanup rules/
This commit is contained in:
Fabian Reinartz 2015-06-30 18:53:07 +02:00
commit 049a106821
4 changed files with 227 additions and 259 deletions

View File

@ -21,6 +21,8 @@ import (
"sync"
"time"
html_template "html/template"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/log"
@ -73,6 +75,20 @@ func init() {
prometheus.MustRegister(evalDuration)
}
// A Rule encapsulates a vector expression which is evaluated at a specified
// interval and acted upon (currently either recorded or used for alerting).
type Rule interface {
// Name returns the name of the rule.
Name() string
// Eval evaluates the rule, including any associated recording or alerting actions.
eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
// String returns a human-readable string representation of the rule.
String() string
// HTMLSnippet returns a human-readable string representation of the rule,
// decorated with HTML elements for use the web frontend.
HTMLSnippet(pathPrefix string) html_template.HTML
}
// The Manager manages recording and alerting rules.
type Manager struct {
// Protects the rules list.
@ -271,12 +287,39 @@ func (m *Manager) runIteration() {
wg.Wait()
}
// transferAlertState makes a copy of the state of alerting rules and returns a function
// that restores them in the current state.
func (m *Manager) transferAlertState() func() {
alertingRules := map[string]*AlertingRule{}
for _, r := range m.rules {
if ar, ok := r.(*AlertingRule); ok {
alertingRules[ar.name] = ar
}
}
return func() {
// Restore alerting rule state.
for _, r := range m.rules {
ar, ok := r.(*AlertingRule)
if !ok {
continue
}
if old, ok := alertingRules[ar.name]; ok {
ar.activeAlerts = old.activeAlerts
}
}
}
}
// ApplyConfig updates the rule manager's state as the config requires. If
// loading the new rules failed the old rule set is restored. Returns true on success.
func (m *Manager) ApplyConfig(conf *config.Config) bool {
m.Lock()
defer m.Unlock()
defer m.transferAlertState()()
success := true
m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval)
@ -300,6 +343,7 @@ func (m *Manager) ApplyConfig(conf *config.Config) bool {
log.Errorf("Error loading rules, previous rule set restored: %s", err)
success = false
}
return success
}

183
rules/manager_test.go Normal file
View File

@ -0,0 +1,183 @@
// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rules
import (
"fmt"
"reflect"
"strings"
"testing"
"time"
clientmodel "github.com/prometheus/client_golang/model"
"github.com/prometheus/prometheus/promql"
)
func TestAlertingRule(t *testing.T) {
suite, err := promql.NewTest(t, `
load 5m
http_requests{job="api-server", instance="0", group="production"} 0+10x10
http_requests{job="api-server", instance="1", group="production"} 0+20x10
http_requests{job="api-server", instance="0", group="canary"} 0+30x10
http_requests{job="api-server", instance="1", group="canary"} 0+40x10
http_requests{job="app-server", instance="0", group="production"} 0+50x10
http_requests{job="app-server", instance="1", group="production"} 0+60x10
http_requests{job="app-server", instance="0", group="canary"} 0+70x10
http_requests{job="app-server", instance="1", group="canary"} 0+80x10
`)
if err != nil {
t.Fatal(err)
}
defer suite.Close()
if err := suite.Run(); err != nil {
t.Fatal(err)
}
expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`)
if err != nil {
t.Fatalf("Unable to parse alert expression: %s", err)
}
rule := NewAlertingRule(
"HTTPRequestRateLow",
expr,
time.Minute,
clientmodel.LabelSet{"severity": "critical"},
"summary", "description", "runbook",
)
var tests = []struct {
time time.Duration
result []string
}{
{
time: 0,
result: []string{
`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
},
}, {
time: 5 * time.Minute,
result: []string{
`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
`ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
},
}, {
time: 10 * time.Minute,
result: []string{
`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
`ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
},
},
{
time: 15 * time.Minute,
result: nil,
},
{
time: 20 * time.Minute,
result: nil,
},
}
for i, test := range tests {
evalTime := clientmodel.Timestamp(0).Add(test.time)
res, err := rule.eval(evalTime, suite.QueryEngine())
if err != nil {
t.Fatalf("Error during alerting rule evaluation: %s", err)
}
actual := strings.Split(res.String(), "\n")
expected := annotateWithTime(test.result, evalTime)
if actual[0] == "" {
actual = []string{}
}
if len(actual) != len(expected) {
t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expected), len(actual))
}
for j, expectedSample := range expected {
found := false
for _, actualSample := range actual {
if actualSample == expectedSample {
found = true
}
}
if !found {
t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample)
}
}
if t.Failed() {
t.Errorf("%d. Expected and actual outputs don't match:", i)
t.Fatalf("Expected:\n%v\n----\nActual:\n%v", strings.Join(expected, "\n"), strings.Join(actual, "\n"))
}
}
}
func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string {
annotatedLines := []string{}
for _, line := range lines {
annotatedLines = append(annotatedLines, fmt.Sprintf(line, timestamp))
}
return annotatedLines
}
func TestTransferAlertState(t *testing.T) {
m := NewManager(&ManagerOptions{})
alert := &Alert{
Name: "testalert",
State: StateFiring,
}
arule := AlertingRule{
name: "test",
activeAlerts: map[clientmodel.Fingerprint]*Alert{},
}
aruleCopy := arule
m.rules = append(m.rules, &arule)
// Set an alert.
arule.activeAlerts[0] = alert
// Save state and get the restore function.
restore := m.transferAlertState()
// Remove arule from the rule list and add an unrelated rule and the
// stateless copy of arule.
m.rules = []Rule{
&AlertingRule{
name: "test_other",
activeAlerts: map[clientmodel.Fingerprint]*Alert{},
},
&aruleCopy,
}
// Apply the restore function.
restore()
if ar := m.rules[0].(*AlertingRule); len(ar.activeAlerts) != 0 {
t.Fatalf("unexpected alert for unrelated alerting rule")
}
if ar := m.rules[1].(*AlertingRule); !reflect.DeepEqual(ar.activeAlerts[0], alert) {
t.Fatalf("alert state was not restored")
}
}

View File

@ -1,36 +0,0 @@
// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rules
import (
"html/template"
clientmodel "github.com/prometheus/client_golang/model"
"github.com/prometheus/prometheus/promql"
)
// A Rule encapsulates a vector expression which is evaluated at a specified
// interval and acted upon (currently either recorded or used for alerting).
type Rule interface {
// Name returns the name of the rule.
Name() string
// Eval evaluates the rule, including any associated recording or alerting actions.
eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
// String returns a human-readable string representation of the rule.
String() string
// HTMLSnippet returns a human-readable string representation of the rule,
// decorated with HTML elements for use the web frontend.
HTMLSnippet(pathPrefix string) template.HTML
}

View File

@ -1,223 +0,0 @@
// Copyright 2013 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package rules
import (
"fmt"
"strings"
"testing"
"time"
clientmodel "github.com/prometheus/client_golang/model"
"github.com/prometheus/prometheus/promql"
"github.com/prometheus/prometheus/storage/local"
"github.com/prometheus/prometheus/storage/metric"
)
var (
testSampleInterval = time.Duration(5) * time.Minute
testStartTime = clientmodel.Timestamp(0)
)
func getTestValueStream(startVal clientmodel.SampleValue, endVal clientmodel.SampleValue, stepVal clientmodel.SampleValue, startTime clientmodel.Timestamp) (resultValues metric.Values) {
currentTime := startTime
for currentVal := startVal; currentVal <= endVal; currentVal += stepVal {
sample := metric.SamplePair{
Value: currentVal,
Timestamp: currentTime,
}
resultValues = append(resultValues, sample)
currentTime = currentTime.Add(testSampleInterval)
}
return resultValues
}
func getTestVectorFromTestMatrix(matrix promql.Matrix) promql.Vector {
vector := promql.Vector{}
for _, sampleStream := range matrix {
lastSample := sampleStream.Values[len(sampleStream.Values)-1]
vector = append(vector, &promql.Sample{
Metric: sampleStream.Metric,
Value: lastSample.Value,
Timestamp: lastSample.Timestamp,
})
}
return vector
}
func storeMatrix(storage local.Storage, matrix promql.Matrix) {
pendingSamples := clientmodel.Samples{}
for _, sampleStream := range matrix {
for _, sample := range sampleStream.Values {
pendingSamples = append(pendingSamples, &clientmodel.Sample{
Metric: sampleStream.Metric.Metric,
Value: sample.Value,
Timestamp: sample.Timestamp,
})
}
}
for _, s := range pendingSamples {
storage.Append(s)
}
storage.WaitForIndexing()
}
func vectorComparisonString(expected []string, actual []string) string {
separator := "\n--------------\n"
return fmt.Sprintf("Expected:%v%v%v\nActual:%v%v%v ",
separator,
strings.Join(expected, "\n"),
separator,
separator,
strings.Join(actual, "\n"),
separator)
}
func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string {
annotatedLines := []string{}
for _, line := range lines {
annotatedLines = append(annotatedLines, fmt.Sprintf(line, timestamp))
}
return annotatedLines
}
var testMatrix = promql.Matrix{
{
Metric: clientmodel.COWMetric{
Metric: clientmodel.Metric{
clientmodel.MetricNameLabel: "http_requests",
clientmodel.JobLabel: "api-server",
"instance": "0",
"group": "canary",
},
},
Values: getTestValueStream(0, 300, 30, testStartTime),
},
{
Metric: clientmodel.COWMetric{
Metric: clientmodel.Metric{
clientmodel.MetricNameLabel: "http_requests",
clientmodel.JobLabel: "api-server",
"instance": "1",
"group": "canary",
},
},
Values: getTestValueStream(0, 400, 40, testStartTime),
},
{
Metric: clientmodel.COWMetric{
Metric: clientmodel.Metric{
clientmodel.MetricNameLabel: "http_requests",
clientmodel.JobLabel: "app-server",
"instance": "0",
"group": "canary",
},
},
Values: getTestValueStream(0, 700, 70, testStartTime),
},
{
Metric: clientmodel.COWMetric{
Metric: clientmodel.Metric{
clientmodel.MetricNameLabel: "http_requests",
clientmodel.JobLabel: "app-server",
"instance": "1",
"group": "canary",
},
},
Values: getTestValueStream(0, 800, 80, testStartTime),
},
}
func TestAlertingRule(t *testing.T) {
// Labels in expected output need to be alphabetically sorted.
var evalOutputs = [][]string{
{
`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
},
{
`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
`ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
},
{
`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
`ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
},
{
/* empty */
},
{
/* empty */
},
}
storage, closer := local.NewTestStorage(t, 1)
defer closer.Close()
storeMatrix(storage, testMatrix)
engine := promql.NewEngine(storage, nil)
defer engine.Stop()
expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`)
if err != nil {
t.Fatalf("Unable to parse alert expression: %s", err)
}
alertLabels := clientmodel.LabelSet{
"severity": "critical",
}
rule := NewAlertingRule("HttpRequestRateLow", expr, time.Minute, alertLabels, "summary", "description", "runbook")
for i, expectedLines := range evalOutputs {
evalTime := testStartTime.Add(testSampleInterval * time.Duration(i))
res, err := rule.eval(evalTime, engine)
if err != nil {
t.Fatalf("Error during alerting rule evaluation: %s", err)
}
actualLines := strings.Split(res.String(), "\n")
expectedLines := annotateWithTime(expectedLines, evalTime)
if actualLines[0] == "" {
actualLines = []string{}
}
failed := false
if len(actualLines) != len(expectedLines) {
t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expectedLines), len(actualLines))
failed = true
}
for j, expectedSample := range expectedLines {
found := false
for _, actualSample := range actualLines {
if actualSample == expectedSample {
found = true
}
}
if !found {
t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample)
failed = true
}
}
if failed {
t.Fatalf("%d. Expected and actual outputs don't match:\n%v", i, vectorComparisonString(expectedLines, actualLines))
}
}
}