Add timeout option for webhook notifier. (#4137)

* Add timeout option for webhook notifier.

Signed-off-by: Steve Simpson <steve@grafana.com>

* Tweak error handling and de-pointer timeout field.

Signed-off-by: Steve Simpson <steve@grafana.com>

---------

Signed-off-by: Steve Simpson <steve@grafana.com>
This commit is contained in:
Steve Simpson 2024-12-02 15:53:51 +01:00 committed by GitHub
parent eb7aba4e30
commit 0f65e8fa5f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 69 additions and 0 deletions

View File

@ -535,6 +535,10 @@ type WebhookConfig struct {
// Alerts exceeding this threshold will be truncated. Setting this to 0 // Alerts exceeding this threshold will be truncated. Setting this to 0
// allows an unlimited number of alerts. // allows an unlimited number of alerts.
MaxAlerts uint64 `yaml:"max_alerts" json:"max_alerts"` MaxAlerts uint64 `yaml:"max_alerts" json:"max_alerts"`
// Timeout is the maximum time allowed to invoke the webhook. Setting this to 0
// does not impose a timeout.
Timeout time.Duration `yaml:"timeout" json:"timeout"`
} }
// UnmarshalYAML implements the yaml.Unmarshaler interface. // UnmarshalYAML implements the yaml.Unmarshaler interface.

View File

@ -1594,6 +1594,13 @@ url_file: <filepath>
# above this threshold are truncated. When leaving this at its default value of # above this threshold are truncated. When leaving this at its default value of
# 0, all alerts are included. # 0, all alerts are included.
[ max_alerts: <int> | default = 0 ] [ max_alerts: <int> | default = 0 ]
# The maximum time to wait for a webhook request to complete, before failing the
# request and allowing it to be retried. The default value of 0s indicates that
# no timeout should be applied.
# NOTE: This will have no effect if set higher than the group_interval.
[ timeout: <duration> | default = 0s ]
``` ```
The Alertmanager The Alertmanager

View File

@ -112,8 +112,17 @@ func (n *Notifier) Notify(ctx context.Context, alerts ...*types.Alert) (bool, er
url = strings.TrimSpace(string(content)) url = strings.TrimSpace(string(content))
} }
if n.conf.Timeout > 0 {
postCtx, cancel := context.WithTimeoutCause(ctx, n.conf.Timeout, fmt.Errorf("configured webhook timeout reached (%s)", n.conf.Timeout))
defer cancel()
ctx = postCtx
}
resp, err := notify.PostJSON(ctx, n.client, url, &buf) resp, err := notify.PostJSON(ctx, n.client, url, &buf)
if err != nil { if err != nil {
if ctx.Err() != nil {
err = fmt.Errorf("%w: %w", err, context.Cause(ctx))
}
return true, notify.RedactURL(err) return true, notify.RedactURL(err)
} }
defer notify.Drain(resp) defer notify.Drain(resp)

View File

@ -464,3 +464,52 @@ receivers:
t.Log(co.Check()) t.Log(co.Check())
} }
func TestWebhookTimeout(t *testing.T) {
t.Parallel()
// This integration test uses an extended group_interval to check that
// the webhook level timeout has the desired effect, and that notification
// sending is retried in this case.
conf := `
route:
receiver: "default"
group_by: [alertname]
group_wait: 1s
group_interval: 1m
repeat_interval: 1m
receivers:
- name: "default"
webhook_configs:
- url: 'http://%s'
timeout: 500ms
`
at := NewAcceptanceTest(t, &AcceptanceOpts{
Tolerance: 150 * time.Millisecond,
})
co := at.Collector("webhook")
wh := NewWebhook(t, co)
wh.Func = func(ts float64) bool {
// Make some webhook requests slow enough to hit the webhook
// timeout, but not so slow as to hit the dispatcher timeout.
if ts < 3 {
time.Sleep(time.Second)
return true
}
return false
}
am := at.AlertmanagerCluster(fmt.Sprintf(conf, wh.Address()), 1)
am.Push(At(1), Alert("alertname", "test1"))
co.Want(Between(3, 4), Alert("alertname", "test1").Active(1))
at.Run()
t.Log(co.Check())
}