From 1ed94142fc6293f396f77708438c11c3d5b0ca5d Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Tue, 9 Nov 2021 22:08:24 +0000 Subject: [PATCH] remote-write: slow down retries to avoid DDOS (#9634) * remote-write: slow down retries to avoid DDOS Increase the default max retry time from 100ms to 5 seconds. Remote write calls are retried after a recoverable error such as the back-end returning 500. Prometheus waits the minimum time and retries, then doubles the wait on each subsequent retry until the maximum is reached. If some data is still getting through, remote-write will also increase shards, and the default maximum is 200. 200 shards sending every 100ms is 2,000 calls per second, to a back-end that is already in trouble. 5 seconds was chosen to match the default BatchSendDeadline: if we can afford to wait that long for no response, then we can wait the same time to retry. We will reach 5 seconds after 9 successive failures. Signed-off-by: Bryan Boreham * Update config doc for max_backoff change Signed-off-by: Bryan Boreham --- config/config.go | 2 +- docs/configuration/configuration.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/config/config.go b/config/config.go index 502077e68..24441d2bc 100644 --- a/config/config.go +++ b/config/config.go @@ -188,7 +188,7 @@ var ( // Backoff times for retrying a batch of samples on recoverable errors. MinBackoff: model.Duration(30 * time.Millisecond), - MaxBackoff: model.Duration(100 * time.Millisecond), + MaxBackoff: model.Duration(5 * time.Second), } // DefaultMetadataConfig is the default metadata configuration for a remote write endpoint. diff --git a/docs/configuration/configuration.md b/docs/configuration/configuration.md index e3d1f76b8..96e60dfcd 100644 --- a/docs/configuration/configuration.md +++ b/docs/configuration/configuration.md @@ -2752,7 +2752,7 @@ queue_config: # Initial retry delay. Gets doubled for every retry. [ min_backoff: <duration> | default = 30ms ] # Maximum retry delay. 
- [ max_backoff: <duration> | default = 100ms ] + [ max_backoff: <duration> | default = 5s ] # Retry upon receiving a 429 status code from the remote-write storage. # This is experimental and might change in the future. [ retry_on_http_429: <boolean> | default = false ]