mirror of
https://github.com/prometheus/prometheus
synced 2025-01-11 08:59:37 +00:00
notifier: optionally drain queued notifications before shutting down (#14290)
* Add draining of queued notifications to `notifier.Manager` Signed-off-by: Charles Korn <charles.korn@grafana.com> * Update docs Signed-off-by: Charles Korn <charles.korn@grafana.com> * Address PR feedback Signed-off-by: Charles Korn <charles.korn@grafana.com> * Add more logging Signed-off-by: Charles Korn <charles.korn@grafana.com> * Address offline feedback: remove timeout Signed-off-by: Charles Korn <charles.korn@grafana.com> * Ensure stopping takes priority over further processing, make tests more robust Signed-off-by: Charles Korn <charles.korn@grafana.com> * Make channel unbuffered Signed-off-by: Charles Korn <charles.korn@grafana.com> * Update docs Signed-off-by: Charles Korn <charles.korn@grafana.com> * Fix race in test Signed-off-by: Charles Korn <charles.korn@grafana.com> * Remove unnecessary context Signed-off-by: Charles Korn <charles.korn@grafana.com> * Make Stop safe to call multiple times Signed-off-by: Charles Korn <charles.korn@grafana.com> --------- Signed-off-by: Charles Korn <charles.korn@grafana.com>
This commit is contained in:
parent
f24ce00320
commit
2dd07fbb1b
@ -445,6 +445,9 @@ func main() {
|
||||
serverOnlyFlag(a, "alertmanager.notification-queue-capacity", "The capacity of the queue for pending Alertmanager notifications.").
|
||||
Default("10000").IntVar(&cfg.notifier.QueueCapacity)
|
||||
|
||||
serverOnlyFlag(a, "alertmanager.drain-notification-queue-on-shutdown", "Send any outstanding Alertmanager notifications when shutting down. If false, any outstanding Alertmanager notifications will be dropped when shutting down.").
|
||||
Default("true").BoolVar(&cfg.notifier.DrainOnShutdown)
|
||||
|
||||
// TODO: Remove in Prometheus 3.0.
|
||||
alertmanagerTimeout := a.Flag("alertmanager.timeout", "[DEPRECATED] This flag has no effect.").Hidden().String()
|
||||
|
||||
|
@ -50,6 +50,7 @@ The Prometheus monitoring server
|
||||
| <code class="text-nowrap">--rules.alert.resend-delay</code> | Minimum amount of time to wait before resending an alert to Alertmanager. Use with server mode only. | `1m` |
|
||||
| <code class="text-nowrap">--rules.max-concurrent-evals</code> | Global concurrency limit for independent rules that can run concurrently. When set, "query.max-concurrency" may need to be adjusted accordingly. Use with server mode only. | `4` |
|
||||
| <code class="text-nowrap">--alertmanager.notification-queue-capacity</code> | The capacity of the queue for pending Alertmanager notifications. Use with server mode only. | `10000` |
|
||||
| <code class="text-nowrap">--alertmanager.drain-notification-queue-on-shutdown</code> | Send any outstanding Alertmanager notifications when shutting down. If false, any outstanding Alertmanager notifications will be dropped when shutting down. Use with server mode only. | `true` |
|
||||
| <code class="text-nowrap">--query.lookback-delta</code> | The maximum lookback duration for retrieving metrics during expression evaluations and federation. Use with server mode only. | `5m` |
|
||||
| <code class="text-nowrap">--query.timeout</code> | Maximum time a query may take before being aborted. Use with server mode only. | `2m` |
|
||||
| <code class="text-nowrap">--query.max-concurrency</code> | Maximum number of queries executed concurrently. Use with server mode only. | `20` |
|
||||
|
@ -110,10 +110,11 @@ type Manager struct {
|
||||
|
||||
metrics *alertMetrics
|
||||
|
||||
more chan struct{}
|
||||
mtx sync.RWMutex
|
||||
ctx context.Context
|
||||
cancel func()
|
||||
more chan struct{}
|
||||
mtx sync.RWMutex
|
||||
|
||||
stopOnce *sync.Once
|
||||
stopRequested chan struct{}
|
||||
|
||||
alertmanagers map[string]*alertmanagerSet
|
||||
logger log.Logger
|
||||
@ -121,9 +122,10 @@ type Manager struct {
|
||||
|
||||
// Options are the configurable parameters of a Handler.
|
||||
type Options struct {
|
||||
QueueCapacity int
|
||||
ExternalLabels labels.Labels
|
||||
RelabelConfigs []*relabel.Config
|
||||
QueueCapacity int
|
||||
DrainOnShutdown bool
|
||||
ExternalLabels labels.Labels
|
||||
RelabelConfigs []*relabel.Config
|
||||
// Used for sending HTTP requests to the Alertmanager.
|
||||
Do func(ctx context.Context, client *http.Client, req *http.Request) (*http.Response, error)
|
||||
|
||||
@ -217,8 +219,6 @@ func do(ctx context.Context, client *http.Client, req *http.Request) (*http.Resp
|
||||
|
||||
// NewManager is the manager constructor.
|
||||
func NewManager(o *Options, logger log.Logger) *Manager {
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
|
||||
if o.Do == nil {
|
||||
o.Do = do
|
||||
}
|
||||
@ -227,12 +227,12 @@ func NewManager(o *Options, logger log.Logger) *Manager {
|
||||
}
|
||||
|
||||
n := &Manager{
|
||||
queue: make([]*Alert, 0, o.QueueCapacity),
|
||||
ctx: ctx,
|
||||
cancel: cancel,
|
||||
more: make(chan struct{}, 1),
|
||||
opts: o,
|
||||
logger: logger,
|
||||
queue: make([]*Alert, 0, o.QueueCapacity),
|
||||
more: make(chan struct{}, 1),
|
||||
stopRequested: make(chan struct{}),
|
||||
stopOnce: &sync.Once{},
|
||||
opts: o,
|
||||
logger: logger,
|
||||
}
|
||||
|
||||
queueLenFunc := func() float64 { return float64(n.queueLen()) }
|
||||
@ -298,42 +298,100 @@ func (n *Manager) nextBatch() []*Alert {
|
||||
return alerts
|
||||
}
|
||||
|
||||
// Run dispatches notifications continuously, returning once Stop has been called and all
|
||||
// pending notifications have been drained from the queue (if draining is enabled).
|
||||
//
|
||||
// Dispatching of notifications occurs in parallel to processing target updates to avoid one starving the other.
|
||||
// Refer to https://github.com/prometheus/prometheus/issues/13676 for more details.
|
||||
func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
|
||||
wg := sync.WaitGroup{}
|
||||
wg.Add(2)
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
n.targetUpdateLoop(tsets)
|
||||
}()
|
||||
|
||||
go func() {
|
||||
defer wg.Done()
|
||||
n.sendLoop()
|
||||
n.drainQueue()
|
||||
}()
|
||||
|
||||
wg.Wait()
|
||||
level.Info(n.logger).Log("msg", "Notification manager stopped")
|
||||
}
|
||||
|
||||
// sendLoop continuously consumes the notifications queue and sends alerts to
|
||||
// the configured Alertmanagers.
|
||||
func (n *Manager) sendLoop() {
|
||||
for {
|
||||
// If we've been asked to stop, that takes priority over sending any further notifications.
|
||||
select {
|
||||
case <-n.ctx.Done():
|
||||
case <-n.stopRequested:
|
||||
return
|
||||
case <-n.more:
|
||||
}
|
||||
alerts := n.nextBatch()
|
||||
default:
|
||||
select {
|
||||
case <-n.stopRequested:
|
||||
return
|
||||
|
||||
if !n.sendAll(alerts...) {
|
||||
n.metrics.dropped.Add(float64(len(alerts)))
|
||||
}
|
||||
// If the queue still has items left, kick off the next iteration.
|
||||
if n.queueLen() > 0 {
|
||||
n.setMore()
|
||||
case <-n.more:
|
||||
n.sendOneBatch()
|
||||
|
||||
// If the queue still has items left, kick off the next iteration.
|
||||
if n.queueLen() > 0 {
|
||||
n.setMore()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Run receives updates of target groups and triggers a reload.
|
||||
// The dispatching of notifications occurs in the background to prevent blocking the receipt of target updates.
|
||||
// Refer to https://github.com/prometheus/prometheus/issues/13676 for more details.
|
||||
func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
|
||||
go n.sendLoop()
|
||||
// targetUpdateLoop receives updates of target groups and triggers a reload.
|
||||
func (n *Manager) targetUpdateLoop(tsets <-chan map[string][]*targetgroup.Group) {
|
||||
for {
|
||||
// If we've been asked to stop, that takes priority over processing any further target group updates.
|
||||
select {
|
||||
case <-n.ctx.Done():
|
||||
case <-n.stopRequested:
|
||||
return
|
||||
case ts := <-tsets:
|
||||
n.reload(ts)
|
||||
default:
|
||||
select {
|
||||
case <-n.stopRequested:
|
||||
return
|
||||
case ts := <-tsets:
|
||||
n.reload(ts)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (n *Manager) sendOneBatch() {
|
||||
alerts := n.nextBatch()
|
||||
|
||||
if !n.sendAll(alerts...) {
|
||||
n.metrics.dropped.Add(float64(len(alerts)))
|
||||
}
|
||||
}
|
||||
|
||||
func (n *Manager) drainQueue() {
|
||||
if !n.opts.DrainOnShutdown {
|
||||
if n.queueLen() > 0 {
|
||||
level.Warn(n.logger).Log("msg", "Draining remaining notifications on shutdown is disabled, and some notifications have been dropped", "count", n.queueLen())
|
||||
n.metrics.dropped.Add(float64(n.queueLen()))
|
||||
}
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
level.Info(n.logger).Log("msg", "Draining any remaining notifications...")
|
||||
|
||||
for n.queueLen() > 0 {
|
||||
n.sendOneBatch()
|
||||
}
|
||||
|
||||
level.Info(n.logger).Log("msg", "Remaining notifications drained")
|
||||
}
|
||||
|
||||
func (n *Manager) reload(tgs map[string][]*targetgroup.Group) {
|
||||
n.mtx.Lock()
|
||||
defer n.mtx.Unlock()
|
||||
@ -546,7 +604,7 @@ func (n *Manager) sendAll(alerts ...*Alert) bool {
|
||||
for _, am := range ams.ams {
|
||||
wg.Add(1)
|
||||
|
||||
ctx, cancel := context.WithTimeout(n.ctx, time.Duration(ams.cfg.Timeout))
|
||||
ctx, cancel := context.WithTimeout(context.Background(), time.Duration(ams.cfg.Timeout))
|
||||
defer cancel()
|
||||
|
||||
go func(ctx context.Context, client *http.Client, url string, payload []byte, count int) {
|
||||
@ -624,10 +682,19 @@ func (n *Manager) sendOne(ctx context.Context, c *http.Client, url string, b []b
|
||||
return nil
|
||||
}
|
||||
|
||||
// Stop shuts down the notification handler.
|
||||
// Stop signals the notification manager to shut down and immediately returns.
|
||||
//
|
||||
// Run will return once the notification manager has successfully shut down.
|
||||
//
|
||||
// The manager will optionally drain any queued notifications before shutting down.
|
||||
//
|
||||
// Stop is safe to call multiple times.
|
||||
func (n *Manager) Stop() {
|
||||
level.Info(n.logger).Log("msg", "Stopping notification manager...")
|
||||
n.cancel()
|
||||
|
||||
n.stopOnce.Do(func() {
|
||||
close(n.stopRequested)
|
||||
})
|
||||
}
|
||||
|
||||
// Alertmanager holds Alertmanager endpoint information.
|
||||
|
@ -847,3 +847,173 @@ loop2:
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestStop_DrainingDisabled(t *testing.T) {
|
||||
releaseReceiver := make(chan struct{})
|
||||
receiverReceivedRequest := make(chan struct{}, 2)
|
||||
alertsReceived := atomic.NewInt64(0)
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Let the test know we've received a request.
|
||||
receiverReceivedRequest <- struct{}{}
|
||||
|
||||
var alerts []*Alert
|
||||
|
||||
b, err := io.ReadAll(r.Body)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = json.Unmarshal(b, &alerts)
|
||||
require.NoError(t, err)
|
||||
|
||||
alertsReceived.Add(int64(len(alerts)))
|
||||
|
||||
// Wait for the test to release us.
|
||||
<-releaseReceiver
|
||||
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer func() {
|
||||
server.Close()
|
||||
}()
|
||||
|
||||
m := NewManager(
|
||||
&Options{
|
||||
QueueCapacity: 10,
|
||||
DrainOnShutdown: false,
|
||||
},
|
||||
nil,
|
||||
)
|
||||
|
||||
m.alertmanagers = make(map[string]*alertmanagerSet)
|
||||
|
||||
am1Cfg := config.DefaultAlertmanagerConfig
|
||||
am1Cfg.Timeout = model.Duration(time.Second)
|
||||
|
||||
m.alertmanagers["1"] = &alertmanagerSet{
|
||||
ams: []alertmanager{
|
||||
alertmanagerMock{
|
||||
urlf: func() string { return server.URL },
|
||||
},
|
||||
},
|
||||
cfg: &am1Cfg,
|
||||
}
|
||||
|
||||
notificationManagerStopped := make(chan struct{})
|
||||
|
||||
go func() {
|
||||
defer close(notificationManagerStopped)
|
||||
m.Run(nil)
|
||||
}()
|
||||
|
||||
// Queue two alerts. The first should be immediately sent to the receiver, which should block until we release it later.
|
||||
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")})
|
||||
|
||||
select {
|
||||
case <-receiverReceivedRequest:
|
||||
// Nothing more to do.
|
||||
case <-time.After(time.Second):
|
||||
require.FailNow(t, "gave up waiting for receiver to receive notification of first alert")
|
||||
}
|
||||
|
||||
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")})
|
||||
|
||||
// Stop the notification manager, pause to allow the shutdown to be observed, and then allow the receiver to proceed.
|
||||
m.Stop()
|
||||
time.Sleep(time.Second)
|
||||
close(releaseReceiver)
|
||||
|
||||
// Wait for the notification manager to stop and confirm only the first notification was sent.
|
||||
// The second notification should be dropped.
|
||||
select {
|
||||
case <-notificationManagerStopped:
|
||||
// Nothing more to do.
|
||||
case <-time.After(time.Second):
|
||||
require.FailNow(t, "gave up waiting for notification manager to stop")
|
||||
}
|
||||
|
||||
require.Equal(t, int64(1), alertsReceived.Load())
|
||||
}
|
||||
|
||||
func TestStop_DrainingEnabled(t *testing.T) {
|
||||
releaseReceiver := make(chan struct{})
|
||||
receiverReceivedRequest := make(chan struct{}, 2)
|
||||
alertsReceived := atomic.NewInt64(0)
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
// Let the test know we've received a request.
|
||||
receiverReceivedRequest <- struct{}{}
|
||||
|
||||
var alerts []*Alert
|
||||
|
||||
b, err := io.ReadAll(r.Body)
|
||||
require.NoError(t, err)
|
||||
|
||||
err = json.Unmarshal(b, &alerts)
|
||||
require.NoError(t, err)
|
||||
|
||||
alertsReceived.Add(int64(len(alerts)))
|
||||
|
||||
// Wait for the test to release us.
|
||||
<-releaseReceiver
|
||||
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer func() {
|
||||
server.Close()
|
||||
}()
|
||||
|
||||
m := NewManager(
|
||||
&Options{
|
||||
QueueCapacity: 10,
|
||||
DrainOnShutdown: true,
|
||||
},
|
||||
nil,
|
||||
)
|
||||
|
||||
m.alertmanagers = make(map[string]*alertmanagerSet)
|
||||
|
||||
am1Cfg := config.DefaultAlertmanagerConfig
|
||||
am1Cfg.Timeout = model.Duration(time.Second)
|
||||
|
||||
m.alertmanagers["1"] = &alertmanagerSet{
|
||||
ams: []alertmanager{
|
||||
alertmanagerMock{
|
||||
urlf: func() string { return server.URL },
|
||||
},
|
||||
},
|
||||
cfg: &am1Cfg,
|
||||
}
|
||||
|
||||
notificationManagerStopped := make(chan struct{})
|
||||
|
||||
go func() {
|
||||
defer close(notificationManagerStopped)
|
||||
m.Run(nil)
|
||||
}()
|
||||
|
||||
// Queue two alerts. The first should be immediately sent to the receiver, which should block until we release it later.
|
||||
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-1")})
|
||||
|
||||
select {
|
||||
case <-receiverReceivedRequest:
|
||||
// Nothing more to do.
|
||||
case <-time.After(time.Second):
|
||||
require.FailNow(t, "gave up waiting for receiver to receive notification of first alert")
|
||||
}
|
||||
|
||||
m.Send(&Alert{Labels: labels.FromStrings(labels.AlertName, "alert-2")})
|
||||
|
||||
// Stop the notification manager and allow the receiver to proceed.
|
||||
m.Stop()
|
||||
close(releaseReceiver)
|
||||
|
||||
// Wait for the notification manager to stop and confirm both notifications were sent.
|
||||
select {
|
||||
case <-notificationManagerStopped:
|
||||
// Nothing more to do.
|
||||
case <-time.After(200 * time.Millisecond):
|
||||
require.FailNow(t, "gave up waiting for notification manager to stop")
|
||||
}
|
||||
|
||||
require.Equal(t, int64(2), alertsReceived.Load())
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user