diff --git a/notification/notification.go b/notification/notification.go
index 500c43eaf..acf3b49d7 100644
--- a/notification/notification.go
+++ b/notification/notification.go
@@ -159,7 +159,7 @@ func (n *Handler) nextBatch() []*model.Alert {
 
 // Run dispatches notifications continuously.
 func (n *Handler) Run() {
-	// Just warn one in the beginning to prevent nosiy logs.
+	// Just warn once in the beginning to prevent noisy logs.
 	if n.opts.AlertmanagerURL == "" {
 		log.Warnf("No AlertManager configured, not dispatching any alerts")
 	}
diff --git a/rules/alerting.go b/rules/alerting.go
index 861fdd99b..cb89b8b1a 100644
--- a/rules/alerting.go
+++ b/rules/alerting.go
@@ -63,11 +63,13 @@ func (s AlertState) String() string {
 
 // Alert is the user-level representation of a single instance of an alerting rule.
 type Alert struct {
-	State      AlertState
-	Labels     model.LabelSet
-	Value      model.SampleValue
-	ActiveAt   model.Time
-	ResolvedAt model.Time
+	State  AlertState
+	Labels model.LabelSet
+	// The value at the last evaluation of the alerting expression.
+	Value model.SampleValue
+	// The interval during which the condition of this alert held true.
+	// ResolvedAt will be 0 to indicate a still active alert.
+	ActiveAt, ResolvedAt model.Time
 }
 
 // An AlertingRule generates alerts from its vector expression.
@@ -109,7 +111,6 @@ func (rule *AlertingRule) Name() string {
 }
 
 func (r *AlertingRule) sample(alert *Alert, ts model.Time, set bool) *model.Sample {
-	// Build alert labels in order they can be overwritten.
 	metric := model.Metric(r.labels.Clone())
 
 	for ln, lv := range alert.Labels {
@@ -180,7 +181,7 @@ func (r *AlertingRule) eval(ts model.Time, engine *promql.Engine) (model.Vector,
 			if a.State != StateInactive {
 				vec = append(vec, r.sample(a, ts, false))
 			}
-			// If the alert was previously firing, keep it aroud for a given
+			// If the alert was previously firing, keep it around for a given
 			// retention time so it is reported as resolved to the AlertManager.
 			if a.State == StatePending || (a.ResolvedAt != 0 && ts.Sub(a.ResolvedAt) > resolvedRetention) {
 				delete(r.active, fp)
@@ -203,6 +204,8 @@ func (r *AlertingRule) eval(ts model.Time, engine *promql.Engine) (model.Vector,
 	return vec, nil
 }
 
+// State returns the maximum state of alert instances for this rule.
+// StateFiring > StatePending > StateInactive
 func (r *AlertingRule) State() AlertState {
 	r.mtx.Lock()
 	defer r.mtx.Unlock()
@@ -219,7 +222,7 @@ func (r *AlertingRule) State() AlertState {
 // ActiveAlerts returns a slice of active alerts.
 func (r *AlertingRule) ActiveAlerts() []*Alert {
 	var res []*Alert
-	for _, a := range r.recentAlerts() {
+	for _, a := range r.currentAlerts() {
 		if a.ResolvedAt == 0 {
 			res = append(res, a)
 		}
@@ -227,7 +230,9 @@ func (r *AlertingRule) ActiveAlerts() []*Alert {
 	return res
 }
 
-func (r *AlertingRule) recentAlerts() []*Alert {
+// currentAlerts returns all instances of alerts for this rule. This may include
+// inactive alerts that were previously firing.
+func (r *AlertingRule) currentAlerts() []*Alert {
 	r.mtx.Lock()
 	defer r.mtx.Unlock()
 
diff --git a/rules/manager.go b/rules/manager.go
index 1ff9eefe6..3592e76fb 100644
--- a/rules/manager.go
+++ b/rules/manager.go
@@ -99,6 +99,7 @@ type Rule interface {
 	HTMLSnippet(pathPrefix string) html_template.HTML
 }
 
+// Group is a set of rules that have a logical relation.
 type Group struct {
 	name     string
 	interval time.Duration
@@ -160,6 +161,7 @@ func (g *Group) fingerprint() model.Fingerprint {
 	return l.Fingerprint()
 }
 
+// offset returns until the next consistently slotted evaluation interval.
 func (g *Group) offset() time.Duration {
 	now := time.Now().UnixNano()
 
@@ -175,6 +177,7 @@ func (g *Group) offset() time.Duration {
 	return time.Duration(next - now)
 }
 
+// copyState copies the alerting rule state from the given group.
func (g *Group) copyState(from *Group) {
 	for _, fromRule := range from.rules {
 		far, ok := fromRule.(*AlertingRule)
@@ -193,6 +196,9 @@ func (g *Group) copyState(from *Group) {
 	}
 }
 
+// eval runs a single evaluation cycle in which all rules are evaluated in parallel.
+// In the future a single group will be evaluated sequentially to properly handle
+// rule dependency.
 func (g *Group) eval() {
 	var (
 		now = model.Now()
@@ -239,10 +245,11 @@
 	wg.Wait()
 }
 
+// sendAlerts sends alert notifications for the given rule.
 func (g *Group) sendAlerts(rule *AlertingRule, timestamp model.Time) error {
 	var alerts model.Alerts
-	for _, alert := range rule.recentAlerts() {
+	for _, alert := range rule.currentAlerts() {
 		// Only send actually firing alerts.
 		if alert.State == StatePending {
 			continue
 		}
@@ -407,6 +414,9 @@ func (m *Manager) ApplyConfig(conf *config.Config) bool {
 	return true
 }
 
+// loadGroups reads groups from a list of files.
+// As there's currently no group syntax a single group named "default" containing
+// all rules will be returned.
 func (m *Manager) loadGroups(filenames ...string) (map[string]*Group, error) {
 	groups := map[string]*Group{}
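
The new comment on Group.offset refers to a "consistently slotted evaluation interval": evaluations are aligned to wall-clock multiples of the group's interval rather than to whenever the group happened to start. The following is a minimal, standalone sketch of that idea, not code from this change; the per-group shift parameter (groupShift, e.g. something derived from the group's fingerprint) is an assumption for illustration only.

// Sketch only: illustrates slotted evaluation scheduling, assuming a
// hypothetical per-group shift; the real offset() may compute this differently.
package main

import (
	"fmt"
	"time"
)

// slottedOffset returns the duration until the next evaluation slot.
// Slots are aligned to multiples of interval since the Unix epoch and
// shifted by groupShift so that groups do not all evaluate at once.
func slottedOffset(now time.Time, interval, groupShift time.Duration) time.Duration {
	n := now.UnixNano()
	iv := interval.Nanoseconds()
	shift := groupShift.Nanoseconds() % iv

	base := n - (n % iv) // start of the current interval slot
	next := base + shift
	if next <= n {
		next += iv // already past this slot's shifted point, wait for the next one
	}
	return time.Duration(next - n)
}

func main() {
	wait := slottedOffset(time.Now(), 30*time.Second, 7*time.Second)
	fmt.Printf("next evaluation in %v\n", wait)
}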