General cleanup of rules.
parent 75c920c95e
commit 5e13880201
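
With this cleanup the AlertingRule fields Vector, Labels, Summary and Description become unexported, so code outside the package builds rules through the constructors NewAlertingRule and NewRecordingRule instead of struct literals. A minimal sketch of the resulting usage, written as if inside the rules package (the helper newExampleRules, the alert name, labels, durations and the promql import path are illustrative assumptions, not part of this change):

package rules

import (
    "time"

    clientmodel "github.com/prometheus/client_golang/model"

    "github.com/prometheus/prometheus/promql"
)

// newExampleRules is a hypothetical helper showing the constructor-based API;
// the expressions are assumed to have been parsed elsewhere.
func newExampleRules(alertExpr, recordExpr promql.Expr) (*AlertingRule, *RecordingRule) {
    alerting := NewAlertingRule(
        "ExampleAlert",  // alert name (example value)
        alertExpr,       // vector expression to evaluate
        5*time.Minute,   // hold duration before a pending alert fires
        clientmodel.LabelSet{"severity": "page"}, // extra labels (example)
        "Example summary",
        "Example description",
    )
    recording := NewRecordingRule("example:metric:rate5m", recordExpr, clientmodel.LabelSet{})
    return alerting, recording
}
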
@@ -16,7 +16,6 @@ package rules
 import (
     "fmt"
     "html/template"
     "strings"
     "sync"
     "time"

@@ -28,12 +27,12 @@ import (

 const (
     // AlertMetricName is the metric name for synthetic alert timeseries.
-    AlertMetricName clientmodel.LabelValue = "ALERTS"
+    alertMetricName clientmodel.LabelValue = "ALERTS"

     // AlertNameLabel is the label name indicating the name of an alert.
-    AlertNameLabel clientmodel.LabelName = "alertname"
+    alertNameLabel clientmodel.LabelName = "alertname"
     // AlertStateLabel is the label name indicating the state of an alert.
-    AlertStateLabel clientmodel.LabelName = "alertstate"
+    alertStateLabel clientmodel.LabelName = "alertstate"
 )

 // AlertState denotes the state of an active alert.
@@ -41,11 +40,11 @@ type AlertState int

 func (s AlertState) String() string {
     switch s {
-    case Inactive:
+    case StateInactive:
         return "inactive"
-    case Pending:
+    case StatePending:
         return "pending"
-    case Firing:
+    case StateFiring:
         return "firing"
     default:
         panic("undefined")
@@ -54,13 +53,13 @@ func (s AlertState) String() string {

 const (
     // Inactive alerts are neither firing nor pending.
-    Inactive AlertState = iota
+    StateInactive AlertState = iota
     // Pending alerts have been active for less than the configured
     // threshold duration.
-    Pending
+    StatePending
     // Firing alerts have been active for longer than the configured
     // threshold duration.
-    Firing
+    StateFiring
 )

 // Alert is used to track active (pending/firing) alerts over time.
@@ -84,9 +83,9 @@ func (a Alert) sample(timestamp clientmodel.Timestamp, value clientmodel.SampleV
         recordedMetric[label] = value
     }

-    recordedMetric[clientmodel.MetricNameLabel] = AlertMetricName
-    recordedMetric[AlertNameLabel] = clientmodel.LabelValue(a.Name)
-    recordedMetric[AlertStateLabel] = clientmodel.LabelValue(a.State.String())
+    recordedMetric[clientmodel.MetricNameLabel] = alertMetricName
+    recordedMetric[alertNameLabel] = clientmodel.LabelValue(a.Name)
+    recordedMetric[alertStateLabel] = clientmodel.LabelValue(a.State.String())

     return &promql.Sample{
         Metric: clientmodel.COWMetric{
@@ -103,16 +102,16 @@ type AlertingRule struct {
     // The name of the alert.
     name string
     // The vector expression from which to generate alerts.
-    Vector promql.Expr
+    vector promql.Expr
     // The duration for which a labelset needs to persist in the expression
     // output vector before an alert transitions from Pending to Firing state.
     holdDuration time.Duration
     // Extra labels to attach to the resulting alert sample vectors.
-    Labels clientmodel.LabelSet
+    labels clientmodel.LabelSet
     // Short alert summary, suitable for email subjects.
-    Summary string
+    summary string
     // More detailed alert description.
-    Description string
+    description string

     // Protects the below.
     mutex sync.Mutex
@@ -121,15 +120,36 @@ type AlertingRule struct {
     activeAlerts map[clientmodel.Fingerprint]*Alert
 }

+// NewAlertingRule constructs a new AlertingRule.
+func NewAlertingRule(
+    name string,
+    vector promql.Expr,
+    holdDuration time.Duration,
+    labels clientmodel.LabelSet,
+    summary string,
+    description string,
+) *AlertingRule {
+    return &AlertingRule{
+        name: name,
+        vector: vector,
+        holdDuration: holdDuration,
+        labels: labels,
+        summary: summary,
+        description: description,
+
+        activeAlerts: map[clientmodel.Fingerprint]*Alert{},
+    }
+}
+
 // Name returns the name of the alert.
 func (rule *AlertingRule) Name() string {
     return rule.name
 }

-// Eval evaluates the rule expression and then creates pending alerts and fires
+// eval evaluates the rule expression and then creates pending alerts and fires
 // or removes previously pending alerts accordingly.
-func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
-    query, err := engine.NewInstantQuery(rule.Vector.String(), timestamp)
+func (rule *AlertingRule) eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
+    query, err := engine.NewInstantQuery(rule.vector.String(), timestamp)
     if err != nil {
         return nil, err
     }
@@ -151,14 +171,14 @@ func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
         if alert, ok := rule.activeAlerts[fp]; !ok {
             labels := clientmodel.LabelSet{}
             labels.MergeFromMetric(sample.Metric.Metric)
-            labels = labels.Merge(rule.Labels)
+            labels = labels.Merge(rule.labels)
             if _, ok := labels[clientmodel.MetricNameLabel]; ok {
                 delete(labels, clientmodel.MetricNameLabel)
             }
             rule.activeAlerts[fp] = &Alert{
                 Name: rule.name,
                 Labels: labels,
-                State: Pending,
+                State: StatePending,
                 ActiveSince: timestamp,
                 Value: sample.Value,
             }
@@ -177,9 +197,9 @@ func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
             continue
         }

-        if activeAlert.State == Pending && timestamp.Sub(activeAlert.ActiveSince) >= rule.holdDuration {
+        if activeAlert.State == StatePending && timestamp.Sub(activeAlert.ActiveSince) >= rule.holdDuration {
             vector = append(vector, activeAlert.sample(timestamp, 0))
-            activeAlert.State = Firing
+            activeAlert.State = StateFiring
         }

         vector = append(vector, activeAlert.sample(timestamp, 1))
@@ -189,23 +209,23 @@ func (rule *AlertingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.E
 }

 func (rule *AlertingRule) String() string {
-    return fmt.Sprintf("ALERT %s IF %s FOR %s WITH %s", rule.name, rule.Vector, strutil.DurationToString(rule.holdDuration), rule.Labels)
+    return fmt.Sprintf("ALERT %s IF %s FOR %s WITH %s", rule.name, rule.vector, strutil.DurationToString(rule.holdDuration), rule.labels)
 }

 // HTMLSnippet returns an HTML snippet representing this alerting rule.
 func (rule *AlertingRule) HTMLSnippet(pathPrefix string) template.HTML {
     alertMetric := clientmodel.Metric{
-        clientmodel.MetricNameLabel: AlertMetricName,
-        AlertNameLabel: clientmodel.LabelValue(rule.name),
+        clientmodel.MetricNameLabel: alertMetricName,
+        alertNameLabel: clientmodel.LabelValue(rule.name),
     }
     return template.HTML(fmt.Sprintf(
         `ALERT <a href="%s">%s</a> IF <a href="%s">%s</a> FOR %s WITH %s`,
         pathPrefix+strutil.GraphLinkForExpression(alertMetric.String()),
         rule.name,
-        pathPrefix+strutil.GraphLinkForExpression(rule.Vector.String()),
-        rule.Vector,
+        pathPrefix+strutil.GraphLinkForExpression(rule.vector.String()),
+        rule.vector,
         strutil.DurationToString(rule.holdDuration),
-        rule.Labels))
+        rule.labels))
 }

 // State returns the "maximum" state: firing > pending > inactive.
@@ -213,7 +233,7 @@ func (rule *AlertingRule) State() AlertState {
     rule.mutex.Lock()
     defer rule.mutex.Unlock()

-    maxState := Inactive
+    maxState := StateInactive
     for _, activeAlert := range rule.activeAlerts {
         if activeAlert.State > maxState {
             maxState = activeAlert.State
@@ -233,17 +253,3 @@ func (rule *AlertingRule) ActiveAlerts() []Alert {
     }
     return alerts
 }
-
-// NewAlertingRule constructs a new AlertingRule.
-func NewAlertingRule(name string, vector promql.Expr, holdDuration time.Duration, labels clientmodel.LabelSet, summary string, description string) *AlertingRule {
-    return &AlertingRule{
-        name: name,
-        Vector: vector,
-        holdDuration: holdDuration,
-        Labels: labels,
-        Summary: summary,
-        Description: description,
-
-        activeAlerts: map[clientmodel.Fingerprint]*Alert{},
-    }
-}
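
The renamed state constants keep their iota ordering (StateInactive < StatePending < StateFiring), which is what State() relies on when reporting a rule's "maximum" state. A small illustrative sketch of that comparison, written as if inside the rules package (maxAlertState and its caller are hypothetical helpers, not part of this commit):

package rules

import "fmt"

// maxAlertState mirrors the logic in (*AlertingRule).State(): because the
// constants are declared with iota, a plain numeric comparison yields
// firing > pending > inactive.
func maxAlertState(states []AlertState) AlertState {
    max := StateInactive
    for _, s := range states {
        if s > max {
            max = s
        }
    }
    return max
}

func exampleMaxState() {
    states := []AlertState{StateInactive, StateFiring, StatePending}
    fmt.Println(maxAlertState(states)) // prints "firing" via AlertState.String()
}
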
@@ -39,8 +39,8 @@ const (
     namespace = "prometheus"

     ruleTypeLabel = "rule_type"
-    alertingRuleType = "alerting"
-    recordingRuleType = "recording"
+    ruleTypeAlerting = "alerting"
+    ruleTypeRecording = "recording"
 )

 var (
@@ -173,7 +173,7 @@ func (m *Manager) queueAlertNotifications(rule *AlertingRule, timestamp clientmo

     notifications := make(notification.NotificationReqs, 0, len(activeAlerts))
     for _, aa := range activeAlerts {
-        if aa.State != Firing {
+        if aa.State != StateFiring {
             // BUG: In the future, make AlertManager support pending alerts?
             continue
         }
@@ -205,15 +205,15 @@ func (m *Manager) queueAlertNotifications(rule *AlertingRule, timestamp clientmo
         }

         notifications = append(notifications, &notification.NotificationReq{
-            Summary: expand(rule.Summary),
-            Description: expand(rule.Description),
+            Summary: expand(rule.summary),
+            Description: expand(rule.description),
             Labels: aa.Labels.Merge(clientmodel.LabelSet{
-                AlertNameLabel: clientmodel.LabelValue(rule.Name()),
+                alertNameLabel: clientmodel.LabelValue(rule.Name()),
            }),
            Value: aa.Value,
            ActiveSince: aa.ActiveSince.Time(),
            RuleString: rule.String(),
-           GeneratorURL: m.prometheusURL + strings.TrimLeft(strutil.GraphLinkForExpression(rule.Vector.String()), "/"),
+           GeneratorURL: m.prometheusURL + strings.TrimLeft(strutil.GraphLinkForExpression(rule.vector.String()), "/"),
         })
     }
     m.notificationHandler.SubmitReqs(notifications)
@@ -235,7 +235,7 @@ func (m *Manager) runIteration() {
            defer wg.Done()

            start := time.Now()
-           vector, err := rule.Eval(now, m.queryEngine)
+           vector, err := rule.eval(now, m.queryEngine)
            duration := time.Since(start)

            if err != nil {
@@ -247,11 +247,11 @@ func (m *Manager) runIteration() {
            switch r := rule.(type) {
            case *AlertingRule:
                m.queueAlertNotifications(r, now)
-               evalDuration.WithLabelValues(alertingRuleType).Observe(
+               evalDuration.WithLabelValues(ruleTypeAlerting).Observe(
                    float64(duration / time.Millisecond),
                )
            case *RecordingRule:
-               evalDuration.WithLabelValues(recordingRuleType).Observe(
+               evalDuration.WithLabelValues(ruleTypeRecording).Observe(
                    float64(duration / time.Millisecond),
                )
            default:
@@ -319,7 +319,7 @@ func (m *Manager) loadRuleFiles(filenames ...string) error {
            rule := NewAlertingRule(r.Name, r.Expr, r.Duration, r.Labels, r.Summary, r.Description)
            m.rules = append(m.rules, rule)
        case *promql.RecordStmt:
-           rule := &RecordingRule{r.Name, r.Expr, r.Labels}
+           rule := NewRecordingRule(r.Name, r.Expr, r.Labels)
            m.rules = append(m.rules, rule)
        default:
            panic("retrieval.Manager.LoadRuleFiles: unknown statement type")
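
The renamed rule_type label values observed in runIteration feed a duration metric keyed by ruleTypeLabel; the metric's declaration sits outside this diff. A plausible sketch of such a declaration with client_golang, reusing the constants from this file (the metric Name and Help text shown are illustrative assumptions, not the repository's actual values):

package rules

import "github.com/prometheus/client_golang/prometheus"

// Illustrative sketch only: a SummaryVec labelled by rule type, observed as
// evalDuration.WithLabelValues(ruleTypeAlerting).Observe(...) in runIteration.
var exampleEvalDuration = prometheus.NewSummaryVec(
    prometheus.SummaryOpts{
        Namespace: namespace, // "prometheus"
        Name:      "rule_evaluation_duration_milliseconds", // assumed name
        Help:      "The duration for a rule to execute.",   // assumed help text
    },
    []string{ruleTypeLabel}, // "rule_type"
)
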
@@ -16,7 +16,6 @@ package rules
 import (
     "fmt"
     "html/template"
     "strings"

     clientmodel "github.com/prometheus/client_golang/model"

@@ -31,11 +30,20 @@ type RecordingRule struct {
     labels clientmodel.LabelSet
 }

+// NewRecordingRule returns a new recording rule.
+func NewRecordingRule(name string, vector promql.Expr, labels clientmodel.LabelSet) *RecordingRule {
+    return &RecordingRule{
+        name: name,
+        vector: vector,
+        labels: labels,
+    }
+}
+
 // Name returns the rule name.
 func (rule RecordingRule) Name() string { return rule.name }

-// Eval evaluates the rule and then overrides the metric names and labels accordingly.
-func (rule RecordingRule) Eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
+// eval evaluates the rule and then overrides the metric names and labels accordingly.
+func (rule RecordingRule) eval(timestamp clientmodel.Timestamp, engine *promql.Engine) (promql.Vector, error) {
     query, err := engine.NewInstantQuery(rule.vector.String(), timestamp)
     if err != nil {
         return nil, err
@@ -27,7 +27,7 @@ type Rule interface {
     // Name returns the name of the rule.
     Name() string
     // Eval evaluates the rule, including any associated recording or alerting actions.
-    Eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
+    eval(clientmodel.Timestamp, *promql.Engine) (promql.Vector, error)
     // String returns a human-readable string representation of the rule.
     String() string
     // HTMLSnippet returns a human-readable string representation of the rule,
@@ -186,7 +186,7 @@ func TestAlertingRule(t *testing.T) {
     for i, expectedLines := range evalOutputs {
         evalTime := testStartTime.Add(testSampleInterval * time.Duration(i))

-        res, err := rule.Eval(evalTime, engine)
+        res, err := rule.eval(evalTime, engine)
         if err != nil {
             t.Fatalf("Error during alerting rule evaluation: %s", err)
         }
@@ -63,9 +63,9 @@ func (h *AlertsHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
     alertStatus := AlertStatus{
         AlertingRules: alertsSorter.alerts,
         AlertStateToRowClass: map[rules.AlertState]string{
-            rules.Inactive: "success",
-            rules.Pending: "warning",
-            rules.Firing: "danger",
+            rules.StateInactive: "success",
+            rules.StatePending: "warning",
+            rules.StateFiring: "danger",
         },
     }
     executeTemplate(w, "alerts", alertStatus, h.PathPrefix)