alertmanager/manager/manager.go
Julius Volz 0642137a11 Fix alert expiry crashbug.
No, the array representation of a binary heap is of course *not* sorted.

Change-Id: Ib18c9b7e1bee24391f98d73135ac19c77026b168
2014-10-27 23:50:31 +01:00

410 lines
11 KiB
Go

// Copyright 2013 Prometheus Team
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package manager
import (
"container/heap"
"strings"
"sync"
"time"
"github.com/golang/glog"
)
// AlertManager is a store of Alerts that drops entries once they expire.
type AlertManager interface {
	// Receive ingests alert entries into the store. For an alert whose
	// fingerprint is already known, only the existing entry's metadata is
	// updated.
	Receive(Alerts)

	// GetAll returns every stored alert that matches the provided Filters.
	GetAll(Filters) AlertAggregates

	// SetAggregationRules installs the AggregationRules to associate with
	// alerts.
	SetAggregationRules(AggregationRules)

	// Run executes the AlertManager dispatcher loop.
	Run()
}
type (
	// AggregationRules is a list of rules. Where the manager assigns a rule
	// to an alert, it walks the list in order and takes the first match.
	AggregationRules []*AggregationRule

	// AggregationRule creates and manages the scope for received events.
	AggregationRule struct {
		// Filters decide which alerts this rule handles (matched against the
		// alert's labels).
		Filters Filters
		// RepeatRate is the interval added to the time of the last
		// notification to schedule the next one.
		RepeatRate time.Duration
		// NotificationConfigName is handed to the notifier when a
		// notification is queued for an alert handled by this rule.
		NotificationConfigName string
	}
)
// Handles reports whether this rule's Filters match the labels of alert l.
func (r *AggregationRule) Handles(l *Alert) bool {
	matched := r.Filters.Handles(l.Labels)
	return matched
}
// AlertAggregate holds the most recent alert received for one alert
// fingerprint, together with bookkeeping metadata about that alert.
type AlertAggregate struct {
	// Alert is the latest alert ingested into this aggregate.
	Alert *Alert
	// Rule is the AggregationRule assigned to this aggregate; it is nil when
	// no rule matches the alert.
	Rule *AggregationRule

	// Created is the time at which this aggregate was created.
	Created time.Time
	// LastRefreshed is the time of the most recent refresh received into this
	// aggregate.
	LastRefreshed time.Time
	// LastNotification is the time the last notification was sent out for
	// this aggregate.
	LastNotification time.Time
	// NextNotification is the time the next notification is due according to
	// the current Rule's RepeatRate.
	NextNotification time.Time
}
// Ingest stores a as the aggregate's current alert and stamps LastRefreshed
// with the current time.
func (agg *AlertAggregate) Ingest(a *Alert) {
	agg.Alert, agg.LastRefreshed = a, time.Now()
}
type (
	// AlertAggregates is a list of AlertAggregate pointers.
	AlertAggregates []*AlertAggregate

	// aggregatesByLastRefreshed wraps AlertAggregates so that it can be
	// managed as a heap ordered by LastRefreshed time.
	aggregatesByLastRefreshed struct {
		AlertAggregates
	}

	// aggregatesByNextNotification wraps AlertAggregates so that it can be
	// managed as a heap ordered by NextNotification time.
	aggregatesByNextNotification struct {
		AlertAggregates
	}
)
// Methods implementing heap.Interface.

// Len returns the number of aggregates held in the slice.
func (aggs AlertAggregates) Len() int {
	count := len(aggs)
	return count
}
// Less reports whether the aggregate at index i was last refreshed strictly
// before the aggregate at index j.
func (aggs aggregatesByLastRefreshed) Less(i, j int) bool {
	left, right := aggs.AlertAggregates[i], aggs.AlertAggregates[j]
	return left.LastRefreshed.Before(right.LastRefreshed)
}
// Less reports whether the aggregate at index i is due for its next
// notification strictly before the aggregate at index j.
func (aggs aggregatesByNextNotification) Less(i, j int) bool {
	left, right := aggs.AlertAggregates[i], aggs.AlertAggregates[j]
	return left.NextNotification.Before(right.NextNotification)
}
// rebuildFrom rebuilds the aggregatesByNextNotification index from a provided
// authoritative AlertAggregates slice.
//
// The previous contents of the index are discarded (its backing array is
// reused) and replaced by the elements of aa. The result is then heapified by
// NextNotification, because aa arrives in whatever order its owner keeps it
// in and later heap operations on this index require the heap invariant to
// hold.
func (aggs *aggregatesByNextNotification) rebuildFrom(aa AlertAggregates) {
	aggs.AlertAggregates = append(aggs.AlertAggregates[:0], aa...)
	// Re-establish the heap ordering for the freshly copied elements.
	heap.Init(aggs)
}
// Swap exchanges the aggregates at indexes i and j.
func (aggs AlertAggregates) Swap(i, j int) {
	held := aggs[i]
	aggs[i] = aggs[j]
	aggs[j] = held
}
// Push appends agg to the end of the slice. agg must be a *AlertAggregate;
// any other dynamic type makes the type assertion panic.
func (aggs *AlertAggregates) Push(agg interface{}) {
	item := agg.(*AlertAggregate)
	*aggs = append(*aggs, item)
}
// Pop removes the last element of the slice and returns it. Calling it on an
// empty slice panics with an index-out-of-range error.
func (aggs *AlertAggregates) Pop() interface{} {
	last := len(*aggs) - 1
	item := (*aggs)[last]
	*aggs = (*aggs)[:last]
	return item
}
// memoryAlertManager is an AlertManager implementation that keeps all of its
// state in memory only.
type memoryAlertManager struct {
	// minRefreshInterval is the minimum interval for alert refreshes: an
	// aggregate whose last refresh is older than this gets purged.
	minRefreshInterval time.Duration
	// inhibitor filters out inhibited alerts.
	inhibitor *Inhibitor
	// silencer filters out silenced alerts.
	silencer *Silencer
	// notifier dispatches notifications.
	notifier Notifier

	// mu protects all fields below.
	mu sync.Mutex
	// rules is the currently loaded set of AggregationRules.
	rules AggregationRules
	// aggregates is the main AlertAggregates index, keyed by fingerprint.
	aggregates map[AlertFingerprint]*AlertAggregate
	// aggregatesByLastRefreshed is a secondary index by LastRefreshed time.
	aggregatesByLastRefreshed aggregatesByLastRefreshed
	// aggregatesByNextNotification is a secondary index by NextNotification
	// time.
	aggregatesByNextNotification aggregatesByNextNotification
	// filteredAlerts caches the last computed set of uninhibited/unsilenced
	// alerts.
	filteredAlerts AlertLabelSets
	// needsNotificationRefresh records that a change has occurred which
	// requires notification outputs to be recomputed.
	needsNotificationRefresh bool
}
// MemoryAlertManagerOptions bundles the parameters needed to construct a
// memoryAlertManager.
type MemoryAlertManagerOptions struct {
	// Inhibitor filters out inhibited alerts.
	Inhibitor *Inhibitor
	// Silencer filters out silenced alerts.
	Silencer *Silencer
	// Notifier dispatches notifications.
	Notifier Notifier
	// MinRefreshInterval is the minimum interval for alert refreshes before
	// an alert is purged.
	MinRefreshInterval time.Duration
}
// NewMemoryAlertManager builds a memoryAlertManager from the given options
// and returns it as an AlertManager. The fingerprint index starts out empty;
// o must not be nil.
func NewMemoryAlertManager(o *MemoryAlertManagerOptions) AlertManager {
	m := new(memoryAlertManager)
	m.aggregates = make(map[AlertFingerprint]*AlertAggregate)
	m.minRefreshInterval = o.MinRefreshInterval
	m.inhibitor = o.Inhibitor
	m.silencer = o.Silencer
	m.notifier = o.Notifier
	return m
}
// Receive ingests a new list of alert messages (e.g. from the web API). The
// manager's mutex is held for the whole batch.
func (s *memoryAlertManager) Receive(as Alerts) {
	s.mu.Lock()
	defer s.mu.Unlock()

	for i := range as {
		s.ingest(as[i])
	}
}
// ingest feeds an alert into the memoryAlertManager, creating a new
// AlertAggregate for its fingerprint if none exists yet. Callers must hold
// s.mu.
func (s *memoryAlertManager) ingest(a *Alert) {
	fp := a.Fingerprint()

	// Known fingerprint: refresh the existing aggregate and restore the
	// LastRefreshed heap ordering, which the refresh has invalidated.
	if existing, known := s.aggregates[fp]; known {
		existing.Ingest(a)
		heap.Init(&s.aggregatesByLastRefreshed)
		return
	}

	// Unknown fingerprint: create the aggregate, attach the first matching
	// rule (if any) and register it in every index.
	fresh := &AlertAggregate{Created: time.Now()}
	fresh.Ingest(a)
	for _, rule := range s.rules {
		if rule.Handles(fresh.Alert) {
			fresh.Rule = rule
			break
		}
	}

	s.aggregates[fp] = fresh
	heap.Push(&s.aggregatesByLastRefreshed, fresh)
	heap.Push(&s.aggregatesByNextNotification, fresh)
	s.needsNotificationRefresh = true
}
// GetAll returns copies of all AlertAggregates whose alert labels match the
// given set of Filters.
//
// The receiver is a pointer so that s.mu is the manager's own mutex. With a
// value receiver every call would copy the whole struct, lock that private
// copy of the mutex, and read the aggregates map without any synchronization
// against concurrent writers.
func (s *memoryAlertManager) GetAll(f Filters) AlertAggregates {
	s.mu.Lock()
	defer s.mu.Unlock()

	aggs := make(AlertAggregates, 0, len(s.aggregates))
	for _, agg := range s.aggregates {
		if !f.Handles(agg.Alert.Labels) {
			continue
		}
		// Copy the aggregate along with its Rule and Alert structs so the
		// result can be handed to the outside without exposing the manager's
		// own structs. These are shallow struct copies: any reference-typed
		// fields inside them are still shared with the originals.
		aggCopy := *agg
		if agg.Rule != nil {
			rule := *agg.Rule
			aggCopy.Rule = &rule
		}
		alert := *agg.Alert
		aggCopy.Alert = &alert
		aggs = append(aggs, &aggCopy)
	}
	return aggs
}
// SetAggregationRules swaps the currently loaded AggregationRules for the
// given set and re-matches every stored aggregate against it.
func (s *memoryAlertManager) SetAggregationRules(rules AggregationRules) {
	s.mu.Lock()
	defer s.mu.Unlock()

	glog.Infof("Replacing aggregator rules (old: %d, new: %d)...", len(s.rules), len(rules))
	s.rules = rules

	// Every aggregate is reassigned to the first new rule that matches it;
	// an aggregate without a matching rule ends up with a nil Rule.
	for _, agg := range s.aggregates {
		agg.Rule = nil
		for i := 0; i < len(s.rules); i++ {
			candidate := s.rules[i]
			if !candidate.Handles(agg.Alert) {
				continue
			}
			agg.Rule = candidate
			agg.NextNotification = agg.LastNotification.Add(candidate.RepeatRate)
			break
		}
	}

	// NextNotification values may have changed, so restore the heap ordering
	// and request a recomputation of notification outputs.
	heap.Init(&s.aggregatesByNextNotification)
	s.needsNotificationRefresh = true
}
// removeExpiredAggregates drops every aggregate whose last refresh is older
// than minRefreshInterval from all indexes.
func (s *memoryAlertManager) removeExpiredAggregates() {
	s.mu.Lock()
	defer s.mu.Unlock()

	// Keep taking the least recently refreshed aggregate off the heap until
	// the heap runs empty or the first non-expired aggregate shows up.
	for len(s.aggregatesByLastRefreshed.AlertAggregates) > 0 {
		oldest := heap.Pop(&s.aggregatesByLastRefreshed).(*AlertAggregate)
		if time.Since(oldest.LastRefreshed) <= s.minRefreshInterval {
			// Still fresh: put it back and stop, nothing older remains.
			heap.Push(&s.aggregatesByLastRefreshed, oldest)
			break
		}
		delete(s.aggregates, oldest.Alert.Fingerprint())
		s.needsNotificationRefresh = true
	}

	// Bring the NextNotification index back in line with the surviving
	// aggregates whenever a refresh is pending.
	if s.needsNotificationRefresh {
		s.aggregatesByNextNotification.rebuildFrom(s.aggregatesByLastRefreshed.AlertAggregates)
	}
}
// checkNotificationRepeats flags a notification refresh as soon as it finds
// a filtered (uninhibited, unsilenced) alert whose NextNotification time has
// already passed.
func (s *memoryAlertManager) checkNotificationRepeats() {
	s.mu.Lock()
	defer s.mu.Unlock()

	now := time.Now()
	active := s.filteredLabelSets(true)

	for _, agg := range s.aggregatesByNextNotification.AlertAggregates {
		for _, labels := range active {
			if !agg.Alert.Labels.Equal(labels) || !agg.NextNotification.Before(now) {
				continue
			}
			// One overdue alert is enough to require a refresh.
			s.needsNotificationRefresh = true
			return
		}
	}
}
// filteredLabelSets returns the label sets of all active alerts that are
// neither inhibited nor silenced. With useCache set, a previously computed
// non-nil result is returned as is; otherwise the result is recomputed and
// cached.
func (s *memoryAlertManager) filteredLabelSets(useCache bool) AlertLabelSets {
	if !useCache || s.filteredAlerts == nil {
		all := make(AlertLabelSets, 0, len(s.aggregates))
		for _, agg := range s.aggregates {
			all = append(all, agg.Alert.Labels)
		}
		// Inhibition is applied first, silencing second.
		s.filteredAlerts = s.silencer.Filter(s.inhibitor.Filter(all))
	}
	return s.filteredAlerts
}
// refreshNotifications recomputes the set of uninhibited/unsilenced alerts
// and queues a notification for each one that is due and has a Rule,
// rescheduling it according to the Rule's RepeatRate.
func (s *memoryAlertManager) refreshNotifications() {
	s.mu.Lock()
	defer s.mu.Unlock()

	s.needsNotificationRefresh = false

	sent := 0
	for _, labels := range s.filteredLabelSets(false) {
		agg := s.aggregates[labels.Fingerprint()]

		// Not due yet.
		if agg.NextNotification.After(time.Now()) {
			continue
		}
		// Without a rule there is no notification config to send to.
		if agg.Rule == nil {
			continue
		}

		s.notifier.QueueNotification(agg.Alert, agg.Rule.NotificationConfigName)
		agg.LastNotification = time.Now()
		agg.NextNotification = agg.LastNotification.Add(agg.Rule.RepeatRate)
		sent++
	}

	if sent == 0 {
		return
	}
	glog.Infof("Sent %d notifications", sent)
	// NextNotification times were changed above; restore the heap ordering.
	heap.Init(&s.aggregatesByNextNotification)
}
// refreshNeeded reports whether notification outputs have to be recomputed,
// along with a human-readable reason for every trigger that fired. The
// inhibitor and the silencer are both always queried for changes.
func (s *memoryAlertManager) refreshNeeded() (bool, []string) {
	s.mu.Lock()
	defer s.mu.Unlock()

	reasons := make([]string, 0, 3)

	if s.needsNotificationRefresh {
		reasons = append(reasons, "active alerts have changed")
	}
	if s.inhibitor.HasChanged() {
		reasons = append(reasons, "inhibit rules have changed")
	}
	if s.silencer.HasChanged() {
		reasons = append(reasons, "silences have changed")
	}

	// A refresh is needed exactly when at least one reason was recorded.
	return len(reasons) > 0, reasons
}
// checkSanity runs cheap consistency checks on the manager's state and
// panics if either secondary index differs in size from the main fingerprint
// index.
func (s *memoryAlertManager) checkSanity() {
	s.mu.Lock()
	defer s.mu.Unlock()

	total := len(s.aggregates)
	switch {
	case total != len(s.aggregatesByLastRefreshed.AlertAggregates):
		panic("len(aggregates) != len(aggregatesByLastRefreshed)")
	case total != len(s.aggregatesByNextNotification.AlertAggregates):
		panic("len(aggregates) != len(aggregatesByNextNotification)")
	}
}
// runIteration performs one dispatcher pass: it purges expired aggregates,
// checks for notifications that are due to repeat, and recomputes the
// notification outputs if anything requires it.
func (s *memoryAlertManager) runIteration() {
	s.removeExpiredAggregates()
	s.checkNotificationRepeats()

	refresh, reasons := s.refreshNeeded()
	if !refresh {
		return
	}
	glog.Infof("Recomputing notification outputs (%s)", strings.Join(reasons, ", "))
	s.refreshNotifications()
}
// Run drives the memoryAlertManager's main dispatcher loop: once per second
// it sanity-checks the state and runs a single iteration. It never returns.
func (s *memoryAlertManager) Run() {
	iterationTicker := time.NewTicker(time.Second)
	for range iterationTicker.C {
		s.checkSanity()
		s.runIteration()
	}
}