diff --git a/rules/manager.go b/rules/manager.go
index 59f9573b9..1e85cf5c5 100644
--- a/rules/manager.go
+++ b/rules/manager.go
@@ -21,6 +21,8 @@ import (
     "sync"
     "time"

+    html_template "html/template"
+
     "github.com/prometheus/client_golang/prometheus"
     "github.com/prometheus/log"

@@ -84,7 +86,7 @@ type Rule interface {
     String() string
     // HTMLSnippet returns a human-readable string representation of the rule,
     // decorated with HTML elements for use the web frontend.
-    HTMLSnippet(pathPrefix string) template.HTML
+    HTMLSnippet(pathPrefix string) html_template.HTML
 }

 // The Manager manages recording and alerting rules.
@@ -285,14 +287,9 @@ func (m *Manager) runIteration() {
     wg.Wait()
 }

-// ApplyConfig updates the rule manager's state as the config requires. If
-// loading the new rules failed the old rule set is restored. Returns true on success.
-func (m *Manager) ApplyConfig(conf *config.Config) bool {
-    m.Lock()
-    defer m.Unlock()
-
-    success := true
-    m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval)
+// transferAlertState makes a copy of the state of alerting rules and returns a function
+// that restores them in the current state.
+func (m *Manager) transferAlertState() func() {

     alertingRules := map[string]*AlertingRule{}
     for _, r := range m.rules {
@@ -301,6 +298,31 @@ func (m *Manager) ApplyConfig(conf *config.Config) bool {
         }
     }

+    return func() {
+        // Restore alerting rule state.
+        for _, r := range m.rules {
+            ar, ok := r.(*AlertingRule)
+            if !ok {
+                continue
+            }
+            if old, ok := alertingRules[ar.name]; ok {
+                ar.activeAlerts = old.activeAlerts
+            }
+        }
+    }
+}
+
+// ApplyConfig updates the rule manager's state as the config requires. If
+// loading the new rules failed the old rule set is restored. Returns true on success.
+func (m *Manager) ApplyConfig(conf *config.Config) bool {
+    m.Lock()
+    defer m.Unlock()
+
+    defer m.transferAlertState()()
+
+    success := true
+    m.interval = time.Duration(conf.GlobalConfig.EvaluationInterval)
+
     rulesSnapshot := make([]Rule, len(m.rules))
     copy(rulesSnapshot, m.rules)
     m.rules = m.rules[:0]
@@ -321,16 +343,6 @@ func (m *Manager) ApplyConfig(conf *config.Config) bool {
         log.Errorf("Error loading rules, previous rule set restored: %s", err)
         success = false
     }
-    // Restore alerting rule state.
-    for _, r := range m.rules {
-        ar, ok := r.(*AlertingRule)
-        if !ok {
-            continue
-        }
-        if old, ok := alertingRules[ar.name]; ok {
-            ar.activeAlerts = old.activeAlerts
-        }
-    }

     return success
 }
diff --git a/rules/manager_test.go b/rules/manager_test.go
index d29cc6088..f51899f71 100644
--- a/rules/manager_test.go
+++ b/rules/manager_test.go
@@ -15,6 +15,7 @@ package rules

 import (
     "fmt"
+    "reflect"
     "strings"
     "testing"
     "time"
@@ -22,67 +23,112 @@ import (
     clientmodel "github.com/prometheus/client_golang/model"

     "github.com/prometheus/prometheus/promql"
-    "github.com/prometheus/prometheus/storage/local"
-    "github.com/prometheus/prometheus/storage/metric"
 )

-var (
-    testSampleInterval = time.Duration(5) * time.Minute
-    testStartTime      = clientmodel.Timestamp(0)
-)
+func TestAlertingRule(t *testing.T) {
+    suite, err := promql.NewTest(t, `
+        load 5m
+            http_requests{job="api-server", instance="0", group="production"}    0+10x10
+            http_requests{job="api-server", instance="1", group="production"}    0+20x10
+            http_requests{job="api-server", instance="0", group="canary"}        0+30x10
+            http_requests{job="api-server", instance="1", group="canary"}        0+40x10
+            http_requests{job="app-server", instance="0", group="production"}    0+50x10
+            http_requests{job="app-server", instance="1", group="production"}    0+60x10
+            http_requests{job="app-server", instance="0", group="canary"}        0+70x10
+            http_requests{job="app-server", instance="1", group="canary"}        0+80x10
+    `)
+    if err != nil {
+        t.Fatal(err)
+    }
+    defer suite.Close()

-func getTestValueStream(startVal clientmodel.SampleValue, endVal clientmodel.SampleValue, stepVal clientmodel.SampleValue, startTime clientmodel.Timestamp) (resultValues metric.Values) {
-    currentTime := startTime
-    for currentVal := startVal; currentVal <= endVal; currentVal += stepVal {
-        sample := metric.SamplePair{
-            Value:     currentVal,
-            Timestamp: currentTime,
+    if err := suite.Run(); err != nil {
+        t.Fatal(err)
+    }
+
+    expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`)
+    if err != nil {
+        t.Fatalf("Unable to parse alert expression: %s", err)
+    }
+
+    rule := NewAlertingRule(
+        "HTTPRequestRateLow",
+        expr,
+        time.Minute,
+        clientmodel.LabelSet{"severity": "critical"},
+        "summary", "description", "runbook",
+    )
+
+    var tests = []struct {
+        time   time.Duration
+        result []string
+    }{
+        {
+            time: 0,
+            result: []string{
+                `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
+                `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
+            },
+        }, {
+            time: 5 * time.Minute,
+            result: []string{
+                `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
+                `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
+                `ALERTS{alertname="HTTPRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
+                `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
+            },
+        }, {
+            time: 10 * time.Minute,
+            result: []string{
+                `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
+                `ALERTS{alertname="HTTPRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
+            },
+        },
+        {
+            time:   15 * time.Minute,
+            result: nil,
+        },
+        {
+            time:   20 * time.Minute,
+            result: nil,
+        },
+    }
+
+    for i, test := range tests {
+        evalTime := clientmodel.Timestamp(0).Add(test.time)
+
+        res, err := rule.eval(evalTime, suite.QueryEngine())
+        if err != nil {
+            t.Fatalf("Error during alerting rule evaluation: %s", err)
         }
-        resultValues = append(resultValues, sample)
-        currentTime = currentTime.Add(testSampleInterval)
-    }
-    return resultValues
-}

-func getTestVectorFromTestMatrix(matrix promql.Matrix) promql.Vector {
-    vector := promql.Vector{}
-    for _, sampleStream := range matrix {
-        lastSample := sampleStream.Values[len(sampleStream.Values)-1]
-        vector = append(vector, &promql.Sample{
-            Metric:    sampleStream.Metric,
-            Value:     lastSample.Value,
-            Timestamp: lastSample.Timestamp,
-        })
-    }
-    return vector
-}
+        actual := strings.Split(res.String(), "\n")
+        expected := annotateWithTime(test.result, evalTime)
+        if actual[0] == "" {
+            actual = []string{}
+        }

-func storeMatrix(storage local.Storage, matrix promql.Matrix) {
-    pendingSamples := clientmodel.Samples{}
-    for _, sampleStream := range matrix {
-        for _, sample := range sampleStream.Values {
-            pendingSamples = append(pendingSamples, &clientmodel.Sample{
-                Metric:    sampleStream.Metric.Metric,
-                Value:     sample.Value,
-                Timestamp: sample.Timestamp,
-            })
+        if len(actual) != len(expected) {
+            t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expected), len(actual))
+        }
+
+        for j, expectedSample := range expected {
+            found := false
+            for _, actualSample := range actual {
+                if actualSample == expectedSample {
+                    found = true
+                }
+            }
+            if !found {
+                t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample)
+            }
+        }
+
+        if t.Failed() {
+            t.Errorf("%d. Expected and actual outputs don't match:", i)
+            t.Fatalf("Expected:\n%v\n----\nActual:\n%v", strings.Join(expected, "\n"), strings.Join(actual, "\n"))
         }
     }
-    for _, s := range pendingSamples {
-        storage.Append(s)
-    }
-    storage.WaitForIndexing()
-}
-
-func vectorComparisonString(expected []string, actual []string) string {
-    separator := "\n--------------\n"
-    return fmt.Sprintf("Expected:%v%v%v\nActual:%v%v%v ",
-        separator,
-        strings.Join(expected, "\n"),
-        separator,
-        separator,
-        strings.Join(actual, "\n"),
-        separator)
 }

 func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string {
@@ -93,131 +139,45 @@ func annotateWithTime(lines []string, timestamp clientmodel.Timestamp) []string
     return annotatedLines
 }

-var testMatrix = promql.Matrix{
-    {
-        Metric: clientmodel.COWMetric{
-            Metric: clientmodel.Metric{
-                clientmodel.MetricNameLabel: "http_requests",
-                clientmodel.JobLabel:        "api-server",
-                "instance":                  "0",
-                "group":                     "canary",
-            },
-        },
-        Values: getTestValueStream(0, 300, 30, testStartTime),
-    },
-    {
-        Metric: clientmodel.COWMetric{
-            Metric: clientmodel.Metric{
-                clientmodel.MetricNameLabel: "http_requests",
-                clientmodel.JobLabel:        "api-server",
-                "instance":                  "1",
-                "group":                     "canary",
-            },
-        },
-        Values: getTestValueStream(0, 400, 40, testStartTime),
-    },
-    {
-        Metric: clientmodel.COWMetric{
-            Metric: clientmodel.Metric{
-                clientmodel.MetricNameLabel: "http_requests",
-                clientmodel.JobLabel:        "app-server",
-                "instance":                  "0",
-                "group":                     "canary",
-            },
-        },
-        Values: getTestValueStream(0, 700, 70, testStartTime),
-    },
-    {
-        Metric: clientmodel.COWMetric{
-            Metric: clientmodel.Metric{
-                clientmodel.MetricNameLabel: "http_requests",
-                clientmodel.JobLabel:        "app-server",
-                "instance":                  "1",
-                "group":                     "canary",
-            },
-        },
-        Values: getTestValueStream(0, 800, 80, testStartTime),
-    },
-}
+func TestTransferAlertState(t *testing.T) {
+    m := NewManager(&ManagerOptions{})

-func TestAlertingRule(t *testing.T) {
-    // Labels in expected output need to be alphabetically sorted.
-    var evalOutputs = [][]string{
-        {
-            `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
-            `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
-        },
-        {
-            `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
-            `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 1 @[%v]`,
-            `ALERTS{alertname="HttpRequestRateLow", alertstate="pending", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
-            `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 1 @[%v]`,
-        },
-        {
-            `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="1", job="app-server", severity="critical"} => 0 @[%v]`,
-            `ALERTS{alertname="HttpRequestRateLow", alertstate="firing", group="canary", instance="0", job="app-server", severity="critical"} => 0 @[%v]`,
-        },
-        {
-            /* empty */
-        },
-        {
-            /* empty */
-        },
+    alert := &Alert{
+        Name:  "testalert",
+        State: StateFiring,
     }

-    storage, closer := local.NewTestStorage(t, 1)
-    defer closer.Close()
+    arule := AlertingRule{
+        name:         "test",
+        activeAlerts: map[clientmodel.Fingerprint]*Alert{},
+    }
+    aruleCopy := arule

-    storeMatrix(storage, testMatrix)
+    m.rules = append(m.rules, &arule)

-    engine := promql.NewEngine(storage, nil)
-    defer engine.Stop()
+    // Set an alert.
+    arule.activeAlerts[0] = alert

-    expr, err := promql.ParseExpr(`http_requests{group="canary", job="app-server"} < 100`)
-    if err != nil {
-        t.Fatalf("Unable to parse alert expression: %s", err)
+    // Save state and get the restore function.
+    restore := m.transferAlertState()
+
+    // Remove arule from the rule list and add an unrelated rule and the
+    // stateless copy of arule.
+    m.rules = []Rule{
+        &AlertingRule{
+            name:         "test_other",
+            activeAlerts: map[clientmodel.Fingerprint]*Alert{},
+        },
+        &aruleCopy,
     }

-    alertLabels := clientmodel.LabelSet{
-        "severity": "critical",
+    // Apply the restore function.
+    restore()
+
+    if ar := m.rules[0].(*AlertingRule); len(ar.activeAlerts) != 0 {
+        t.Fatalf("unexpected alert for unrelated alerting rule")
     }
-    rule := NewAlertingRule("HttpRequestRateLow", expr, time.Minute, alertLabels, "summary", "description", "runbook")
-
-    for i, expectedLines := range evalOutputs {
-        evalTime := testStartTime.Add(testSampleInterval * time.Duration(i))
-
-        res, err := rule.eval(evalTime, engine)
-        if err != nil {
-            t.Fatalf("Error during alerting rule evaluation: %s", err)
-        }
-
-        actualLines := strings.Split(res.String(), "\n")
-        expectedLines := annotateWithTime(expectedLines, evalTime)
-        if actualLines[0] == "" {
-            actualLines = []string{}
-        }
-
-        failed := false
-        if len(actualLines) != len(expectedLines) {
-            t.Errorf("%d. Number of samples in expected and actual output don't match (%d vs. %d)", i, len(expectedLines), len(actualLines))
-            failed = true
-        }
-
-        for j, expectedSample := range expectedLines {
-            found := false
-            for _, actualSample := range actualLines {
-                if actualSample == expectedSample {
-                    found = true
-                }
-            }
-            if !found {
-                t.Errorf("%d.%d. Couldn't find expected sample in output: '%v'", i, j, expectedSample)
-                failed = true
-            }
-        }
-
-        if failed {
-            t.Fatalf("%d. Expected and actual outputs don't match:\n%v", i, vectorComparisonString(expectedLines, actualLines))
-        }
+    if ar := m.rules[1].(*AlertingRule); !reflect.DeepEqual(ar.activeAlerts[0], alert) {
+        t.Fatalf("alert state was not restored")
     }
 }
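
The core of this patch is the save-and-restore pattern in rules/manager.go: transferAlertState snapshots the active alerts of the current alerting rules, keyed by rule name, and returns a closure; ApplyConfig defers that closure, so after the rule set has been rebuilt from the new configuration, rules that kept their name get their alert state back. What follows is a minimal, self-contained sketch of that pattern with simplified stand-in types (alert, alertingRule, manager are hypothetical, not the real Prometheus structs), assuming only that per-rule alert state lives in a map:

// Sketch only: simplified stand-ins illustrating the capture/restore closure
// used by transferAlertState above; not the actual Prometheus types.
package main

import "fmt"

type alert struct{ state string }

type alertingRule struct {
	name   string
	active map[int]*alert
}

type manager struct {
	rules []*alertingRule
}

// transferAlertState captures the active alerts of the current rules, keyed by
// rule name, and returns a closure that copies them onto whichever rules with
// the same names exist when the closure runs.
func (m *manager) transferAlertState() func() {
	saved := map[string]map[int]*alert{}
	for _, r := range m.rules {
		saved[r.name] = r.active
	}
	return func() {
		for _, r := range m.rules {
			if old, ok := saved[r.name]; ok {
				r.active = old
			}
		}
	}
}

func main() {
	m := &manager{rules: []*alertingRule{
		{name: "HTTPRequestRateLow", active: map[int]*alert{0: {state: "firing"}}},
	}}

	// Capture the state now; restore it after the "reload" below.
	restore := m.transferAlertState()

	// Simulate a config reload: the rule set is rebuilt from scratch, so the
	// freshly loaded rule starts with no active alerts.
	m.rules = []*alertingRule{
		{name: "HTTPRequestRateLow", active: map[int]*alert{}},
	}

	restore()

	fmt.Println(len(m.rules[0].active)) // 1: the firing alert survived the reload
}

The double call in defer m.transferAlertState()() is what makes this work in a single line: the inner call runs immediately and captures the state before the reload, while only the returned restore closure is deferred until ApplyConfig returns.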