From 0226d1ac7a9b624ac140543d47cf5295b0c90437 Mon Sep 17 00:00:00 2001 From: Julius Volz Date: Thu, 13 Jun 2013 16:10:05 +0200 Subject: [PATCH] Implement alerts dashboard and expression console links. --- main.go | 7 +- model/metric.go | 8 +++ rules/alerting.go | 121 ++++++++++++++++++++++++++--------- rules/ast/printer.go | 7 +- rules/helpers.go | 6 ++ rules/manager.go | 20 ++++++ rules/recording.go | 15 ++++- rules/rules.go | 7 +- web/Makefile | 2 +- web/alerts.go | 61 ++++++++++++++++++ web/static/css/alerts.css | 48 ++++++++++++++ web/static/js/alerts.js | 5 ++ web/static/js/graph.js | 2 + web/status.go | 2 +- web/templates/_base.html | 1 + web/templates/alerts.html | 41 ++++++++++++ web/templates/databases.html | 2 +- web/templates/graph.html | 1 - web/templates/status.html | 2 +- web/web.go | 2 + 20 files changed, 321 insertions(+), 39 deletions(-) create mode 100644 web/alerts.go create mode 100644 web/static/css/alerts.css create mode 100644 web/static/js/alerts.js create mode 100644 web/templates/alerts.html diff --git a/main.go b/main.go index 2eda11bb5..d33b5d9d6 100644 --- a/main.go +++ b/main.go @@ -230,7 +230,7 @@ func main() { PrometheusStatus: &web.PrometheusStatus{ BuildInfo: BuildInfo, Config: conf.String(), - Rules: ruleManager.Rules(), + RuleManager: ruleManager, TargetPools: targetManager.Pools(), Flags: flags, Birth: time.Now(), @@ -238,6 +238,10 @@ func main() { CurationState: curationState, } + alertsHandler := &web.AlertsHandler{ + RuleManager: ruleManager, + } + databasesHandler := &web.DatabasesHandler{ Incoming: databaseStates, } @@ -252,6 +256,7 @@ func main() { StatusHandler: statusHandler, MetricsHandler: metricsService, DatabasesHandler: databasesHandler, + AlertsHandler: alertsHandler, } prometheus := prometheus{ diff --git a/model/metric.go b/model/metric.go index 6615d1dda..2d3386f42 100644 --- a/model/metric.go +++ b/model/metric.go @@ -94,6 +94,14 @@ func (l LabelSet) ToMetric() Metric { return metric } +func (m Metric) ToLabelSet() LabelSet { + labels := LabelSet{} + for label, value := range m { + labels[label] = value + } + return labels +} + // A Metric is similar to a LabelSet, but the key difference is that a Metric is // a singleton and refers to one and only one stream of samples. type Metric map[LabelName]LabelValue diff --git a/rules/alerting.go b/rules/alerting.go index 04ff5b05a..dd269fc5e 100644 --- a/rules/alerting.go +++ b/rules/alerting.go @@ -15,19 +15,24 @@ package rules import ( "fmt" + "html/template" + "sync" + "time" + "github.com/prometheus/prometheus/model" "github.com/prometheus/prometheus/rules/ast" "github.com/prometheus/prometheus/stats" "github.com/prometheus/prometheus/storage/metric" "github.com/prometheus/prometheus/utility" - "time" ) // States that active alerts can be in. -type alertState int +type AlertState int -func (s alertState) String() string { +func (s AlertState) String() string { switch s { + case INACTIVE: + return "inactive" case PENDING: return "pending" case FIRING: @@ -38,32 +43,35 @@ func (s alertState) String() string { } const ( - PENDING alertState = iota + INACTIVE AlertState = iota + PENDING FIRING ) -// alert is used to track active (pending/firing) alerts over time. -type alert struct { +// Alert is used to track active (pending/firing) alerts over time. +type Alert struct { // The name of the alert. - name string + Name string // The vector element labelset triggering this alert. - metric model.Metric + Labels model.LabelSet // The state of the alert (PENDING or FIRING). - state alertState + State AlertState // The time when the alert first transitioned into PENDING state. - activeSince time.Time + ActiveSince time.Time + // The value of the alert expression for this vector element. + Value model.SampleValue } // sample returns a Sample suitable for recording the alert. -func (a alert) sample(timestamp time.Time, value model.SampleValue) model.Sample { +func (a Alert) sample(timestamp time.Time, value model.SampleValue) model.Sample { recordedMetric := model.Metric{} - for label, value := range a.metric { + for label, value := range a.Labels { recordedMetric[label] = value } recordedMetric[model.MetricNameLabel] = model.AlertMetricName - recordedMetric[model.AlertNameLabel] = model.LabelValue(a.name) - recordedMetric[model.AlertStateLabel] = model.LabelValue(a.state.String()) + recordedMetric[model.AlertNameLabel] = model.LabelValue(a.Name) + recordedMetric[model.AlertStateLabel] = model.LabelValue(a.State.String()) return model.Sample{ Metric: recordedMetric, @@ -83,37 +91,51 @@ type AlertingRule struct { holdDuration time.Duration // Extra labels to attach to the resulting alert sample vectors. labels model.LabelSet + + // Protects the below. + mutex sync.Mutex // A map of alerts which are currently active (PENDING or FIRING), keyed by // the fingerprint of the labelset they correspond to. - activeAlerts map[model.Fingerprint]*alert + activeAlerts map[model.Fingerprint]*Alert } -func (rule AlertingRule) Name() string { return rule.name } +func (rule *AlertingRule) Name() string { return rule.name } -func (rule AlertingRule) EvalRaw(timestamp time.Time, storage *metric.TieredStorage) (ast.Vector, error) { +func (rule *AlertingRule) EvalRaw(timestamp time.Time, storage *metric.TieredStorage) (ast.Vector, error) { return ast.EvalVectorInstant(rule.vector, timestamp, storage, stats.NewTimerGroup()) } -func (rule AlertingRule) Eval(timestamp time.Time, storage *metric.TieredStorage) (ast.Vector, error) { +func (rule *AlertingRule) Eval(timestamp time.Time, storage *metric.TieredStorage) (ast.Vector, error) { // Get the raw value of the rule expression. exprResult, err := rule.EvalRaw(timestamp, storage) if err != nil { return nil, err } + rule.mutex.Lock() + defer rule.mutex.Unlock() + // Create pending alerts for any new vector elements in the alert expression. resultFingerprints := utility.Set{} for _, sample := range exprResult { fp := *model.NewFingerprintFromMetric(sample.Metric) resultFingerprints.Add(fp) - if _, ok := rule.activeAlerts[fp]; !ok { - rule.activeAlerts[fp] = &alert{ - name: rule.name, - metric: sample.Metric, - state: PENDING, - activeSince: timestamp, + alert, ok := rule.activeAlerts[fp] + if !ok { + labels := sample.Metric.ToLabelSet() + if _, ok := labels[model.MetricNameLabel]; ok { + delete(labels, model.MetricNameLabel) } + rule.activeAlerts[fp] = &Alert{ + Name: rule.name, + Labels: sample.Metric.ToLabelSet(), + State: PENDING, + ActiveSince: timestamp, + Value: sample.Value, + } + } else { + alert.Value = sample.Value } } @@ -127,9 +149,9 @@ func (rule AlertingRule) Eval(timestamp time.Time, storage *metric.TieredStorage continue } - if activeAlert.state == PENDING && timestamp.Sub(activeAlert.activeSince) >= rule.holdDuration { + if activeAlert.State == PENDING && timestamp.Sub(activeAlert.ActiveSince) >= rule.holdDuration { vector = append(vector, activeAlert.sample(timestamp, 0)) - activeAlert.state = FIRING + activeAlert.State = FIRING } vector = append(vector, activeAlert.sample(timestamp, 1)) @@ -138,7 +160,7 @@ func (rule AlertingRule) Eval(timestamp time.Time, storage *metric.TieredStorage return vector, nil } -func (rule AlertingRule) ToDotGraph() string { +func (rule *AlertingRule) ToDotGraph() string { graph := fmt.Sprintf(`digraph "Rules" { %#p[shape="box",label="ALERT %s IF FOR %s"]; %#p -> %#p; @@ -147,8 +169,47 @@ func (rule AlertingRule) ToDotGraph() string { return graph } -func (rule AlertingRule) String() string { - return fmt.Sprintf("ALERT %s IF %s FOR %s WITH %s\n", rule.name, rule.vector, utility.DurationToString(rule.holdDuration), rule.labels) +func (rule *AlertingRule) String() string { + return fmt.Sprintf("ALERT %s IF %s FOR %s WITH %s", rule.name, rule.vector, utility.DurationToString(rule.holdDuration), rule.labels) +} + +func (rule *AlertingRule) HTMLSnippet() template.HTML { + alertMetric := model.Metric{ + model.MetricNameLabel: model.AlertMetricName, + model.AlertNameLabel: model.LabelValue(rule.name), + } + return template.HTML(fmt.Sprintf( + `ALERT %s IF %s FOR %s WITH %s`, + ConsoleLinkForExpression(alertMetric.String()), + rule.name, + ConsoleLinkForExpression(rule.vector.String()), + rule.vector, + utility.DurationToString(rule.holdDuration), + rule.labels)) +} + +func (rule *AlertingRule) State() AlertState { + rule.mutex.Lock() + defer rule.mutex.Unlock() + + maxState := INACTIVE + for _, activeAlert := range rule.activeAlerts { + if activeAlert.State > maxState { + maxState = activeAlert.State + } + } + return maxState +} + +func (rule *AlertingRule) ActiveAlerts() []Alert { + rule.mutex.Lock() + defer rule.mutex.Unlock() + + alerts := make([]Alert, 0, len(rule.activeAlerts)) + for _, alert := range rule.activeAlerts { + alerts = append(alerts, *alert) + } + return alerts } // Construct a new AlertingRule. @@ -158,6 +219,6 @@ func NewAlertingRule(name string, vector ast.VectorNode, holdDuration time.Durat vector: vector, holdDuration: holdDuration, labels: labels, - activeAlerts: map[model.Fingerprint]*alert{}, + activeAlerts: map[model.Fingerprint]*Alert{}, } } diff --git a/rules/ast/printer.go b/rules/ast/printer.go index f09f3ff4d..6830f0864 100644 --- a/rules/ast/printer.go +++ b/rules/ast/printer.go @@ -321,7 +321,12 @@ func (node *VectorFunctionCall) String() string { } func (node *VectorAggregation) String() string { - return fmt.Sprintf("%s(%s) BY (%s)", node.aggrType, node.vector, node.groupBy) + aggrString := fmt.Sprintf("%s(%s)", node.aggrType, node.vector) + if len(node.groupBy) > 0 { + return fmt.Sprintf("%s BY (%s)", aggrString, node.groupBy) + } else { + return aggrString + } } func (node *VectorArithExpr) String() string { diff --git a/rules/helpers.go b/rules/helpers.go index a3a501f57..30dd67686 100644 --- a/rules/helpers.go +++ b/rules/helpers.go @@ -15,6 +15,8 @@ package rules import ( "fmt" + "html" + "github.com/prometheus/prometheus/model" "github.com/prometheus/prometheus/rules/ast" "github.com/prometheus/prometheus/utility" @@ -111,3 +113,7 @@ func NewMatrix(vector ast.Node, intervalStr string) (ast.MatrixNode, error) { vectorLiteral := vector.(*ast.VectorLiteral) return ast.NewMatrixLiteral(vectorLiteral, interval), nil } + +func ConsoleLinkForExpression(expr string) string { + return html.EscapeString(fmt.Sprintf(`graph#[{"expr":%q,"tab":1}]`, expr)) +} diff --git a/rules/manager.go b/rules/manager.go index 717e2c986..5bde027b4 100644 --- a/rules/manager.go +++ b/rules/manager.go @@ -28,9 +28,16 @@ type Result struct { } type RuleManager interface { + // Load and add rules from rule files specified in the configuration. AddRulesFromConfig(config config.Config) error + // Start the rule manager's periodic rule evaluation. Run() + // Stop the rule manager's rule evaluation cycles. + Stop() + // Return all rules. Rules() []Rule + // Return all alerting rules. + AlertingRules() []*AlertingRule } type ruleManager struct { @@ -127,3 +134,16 @@ func (m *ruleManager) Rules() []Rule { copy(rules, m.rules) return rules } + +func (m *ruleManager) AlertingRules() []*AlertingRule { + m.Lock() + defer m.Unlock() + + alerts := []*AlertingRule{} + for _, rule := range m.rules { + if alertingRule, ok := rule.(*AlertingRule); ok { + alerts = append(alerts, alertingRule) + } + } + return alerts +} diff --git a/rules/recording.go b/rules/recording.go index 81b713749..bb19ed9ae 100644 --- a/rules/recording.go +++ b/rules/recording.go @@ -15,11 +15,13 @@ package rules import ( "fmt" + "html/template" + "time" + "github.com/prometheus/prometheus/model" "github.com/prometheus/prometheus/rules/ast" "github.com/prometheus/prometheus/stats" "github.com/prometheus/prometheus/storage/metric" - "time" ) // A RecordingRule records its vector expression into new timeseries. @@ -71,6 +73,17 @@ func (rule RecordingRule) String() string { return fmt.Sprintf("%s%s = %s\n", rule.name, rule.labels, rule.vector) } +func (rule RecordingRule) HTMLSnippet() template.HTML { + ruleExpr := rule.vector.String() + return template.HTML(fmt.Sprintf( + `%s%s = %s`, + ConsoleLinkForExpression(rule.name), + rule.name, + rule.labels, + ConsoleLinkForExpression(ruleExpr), + ruleExpr)) +} + // Construct a new RecordingRule. func NewRecordingRule(name string, labels model.LabelSet, vector ast.VectorNode, permanent bool) *RecordingRule { return &RecordingRule{ diff --git a/rules/rules.go b/rules/rules.go index ea7c31246..03a8b0b8b 100644 --- a/rules/rules.go +++ b/rules/rules.go @@ -14,9 +14,11 @@ package rules import ( + "html/template" + "time" + "github.com/prometheus/prometheus/rules/ast" "github.com/prometheus/prometheus/storage/metric" - "time" ) // A Rule encapsulates a vector expression which is evaluated at a specified @@ -33,4 +35,7 @@ type Rule interface { ToDotGraph() string // String returns a human-readable string representation of the rule. String() string + // HTMLSnippet returns a human-readable string representation of the rule, + // decorated with HTML elements for use the web frontend. + HTMLSnippet() template.HTML } diff --git a/web/Makefile b/web/Makefile index adfa2043e..47b9a6b0b 100644 --- a/web/Makefile +++ b/web/Makefile @@ -13,7 +13,7 @@ all: blob-stamp -blob-stamp: static/generated/protocol_buffer.descriptor +blob-stamp: static/generated/protocol_buffer.descriptor templates/* $(MAKE) -C blob touch $@ diff --git a/web/alerts.go b/web/alerts.go new file mode 100644 index 000000000..430f17d30 --- /dev/null +++ b/web/alerts.go @@ -0,0 +1,61 @@ +// Copyright 2013 Prometheus Team +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package web + +import ( + "github.com/prometheus/prometheus/rules" + "net/http" + "sort" + "sync" +) + +type AlertStatus struct { + AlertingRules []*rules.AlertingRule +} + +type AlertsHandler struct { + RuleManager rules.RuleManager + + mutex sync.Mutex +} + +type byAlertStateSorter struct { + alerts []*rules.AlertingRule +} + +func (s byAlertStateSorter) Len() int { + return len(s.alerts) +} + +func (s byAlertStateSorter) Less(i, j int) bool { + return s.alerts[i].State() > s.alerts[j].State() +} + +func (s byAlertStateSorter) Swap(i, j int) { + s.alerts[i], s.alerts[j] = s.alerts[j], s.alerts[i] +} + +func (h *AlertsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) { + h.mutex.Lock() + defer h.mutex.Unlock() + + alerts := h.RuleManager.AlertingRules() + alertsSorter := byAlertStateSorter{alerts: alerts} + sort.Sort(alertsSorter) + + alertStatus := AlertStatus{ + AlertingRules: alertsSorter.alerts, + } + executeTemplate(w, "alerts", alertStatus) +} diff --git a/web/static/css/alerts.css b/web/static/css/alerts.css new file mode 100644 index 000000000..5c57ed237 --- /dev/null +++ b/web/static/css/alerts.css @@ -0,0 +1,48 @@ +.alert_wrapper { + padding: 2px; +} + +.alert_header { + padding: 3px; + cursor: pointer; +} + +.alert_content { + padding: 3px; + display: none; +} + +.alert_header.firing { + background-color: #ff7673; +} + +.alert_header.pending { + background-color: #ffcf40; +} + +.alert_header.inactive { + background-color: #92ed6b; +} + +.alert_description { + margin-left: 3px; + padding: 8px 0 8px 0; +} + +.alert_active_elements { + border: 1px solid #dddddd; +} + +.alert_active_elements th { + background-color: #dddddd; + padding: 0 5px 0 5px; +} + +.alert_active_elements td { + background-color: #eebbbb; + padding: 0 5px 0 5px; +} + +.alert_active_elements tr:hover td { + background-color: #ffcf40; +} diff --git a/web/static/js/alerts.js b/web/static/js/alerts.js new file mode 100644 index 000000000..b66c82b02 --- /dev/null +++ b/web/static/js/alerts.js @@ -0,0 +1,5 @@ +function init() { + $(".alert_header").click(function() {$(this).next().toggle(); }); +} + +$(init); diff --git a/web/static/js/graph.js b/web/static/js/graph.js index b92d51fa2..7aa612508 100644 --- a/web/static/js/graph.js +++ b/web/static/js/graph.js @@ -519,6 +519,7 @@ function parseGraphOptionsFromUrl() { return options; } +// NOTE: This needs to be kept in sync with rules/helpers.go:ConsoleLinkForExpression! function storeGraphOptionsInUrl() { var allGraphsOptions = []; for (var i = 0; i < graphs.length; i++) { @@ -559,4 +560,5 @@ function init() { } }) } + $(init); diff --git a/web/status.go b/web/status.go index f42092548..0a9a7ba1e 100644 --- a/web/status.go +++ b/web/status.go @@ -27,7 +27,7 @@ type PrometheusStatus struct { Config string Curation metric.CurationState Flags map[string]string - Rules []rules.Rule + RuleManager rules.RuleManager TargetPools map[string]*retrieval.TargetPool Birth time.Time diff --git a/web/templates/_base.html b/web/templates/_base.html index 6ad6e054e..2f62d231c 100644 --- a/web/templates/_base.html +++ b/web/templates/_base.html @@ -13,6 +13,7 @@ Graph & Console Status Databases + Alerts {{ define "user_dashboard_link" }}{{ end }} {{ template "user_dashboard_link" .}} diff --git a/web/templates/alerts.html b/web/templates/alerts.html new file mode 100644 index 000000000..0f7407496 --- /dev/null +++ b/web/templates/alerts.html @@ -0,0 +1,41 @@ +{{define "head"}} + + +{{end}} + +{{define "content"}} +

Alerts

+
+ {{range .AlertingRules}} + {{$activeAlerts := .ActiveAlerts}} +
+
+ {{.Name}} ({{len $activeAlerts}} active) +
+
+
+ Rule: {{.HTMLSnippet}} +
+ {{if $activeAlerts}} + + + + + + + + {{range $activeAlerts}} + + + + + + + {{end}} +
LabelsStateActive SinceValue
{{.Labels}}{{.State}}{{.ActiveSince}}{{.Value}}
+ {{end}} +
+
+ {{end}} +
+{{end}} diff --git a/web/templates/databases.html b/web/templates/databases.html index 1958cdf44..f2be7230e 100644 --- a/web/templates/databases.html +++ b/web/templates/databases.html @@ -1,8 +1,8 @@ {{define "head"}}{{end}} {{define "content"}} -

Database Information

+
{{range .States}}

{{.Name}}

diff --git a/web/templates/graph.html b/web/templates/graph.html index 14ee16113..b50cf3c6b 100644 --- a/web/templates/graph.html +++ b/web/templates/graph.html @@ -2,7 +2,6 @@ - diff --git a/web/templates/status.html b/web/templates/status.html index cf90904fa..9ec4eaee5 100644 --- a/web/templates/status.html +++ b/web/templates/status.html @@ -35,7 +35,7 @@

Rules

-
{{range .Rules}}{{.String}}{{end}}
+
{{range .RuleManager.Rules}}{{.HTMLSnippet}}
{{end}}

Targets

diff --git a/web/web.go b/web/web.go index 4d7bf9886..754d3a85d 100644 --- a/web/web.go +++ b/web/web.go @@ -38,6 +38,7 @@ type WebService struct { StatusHandler *StatusHandler DatabasesHandler *DatabasesHandler MetricsHandler *api.MetricsService + AlertsHandler *AlertsHandler } func (w WebService) ServeForever() error { @@ -56,6 +57,7 @@ func (w WebService) ServeForever() error { exp.Handle("/", w.StatusHandler) exp.Handle("/databases", w.DatabasesHandler) + exp.Handle("/alerts", w.AlertsHandler) exp.HandleFunc("/graph", graphHandler) exp.Handle("/api/", gorest.Handle())