From 49a8ce5239d5b05bfb6051301e7bfff6569ce2d9 Mon Sep 17 00:00:00 2001
From: Julien Pivotto
Date: Fri, 12 Feb 2021 09:43:59 +0100
Subject: [PATCH 1/3] Revert "Consider status code 429 as recoverable errors to avoid resharding (#8237)"

This reverts commit cd412470d724becdf85ee381d63bb408fcdbb1e0.

Signed-off-by: Julien Pivotto
---
 storage/remote/client.go        | 26 ++------------------------
 storage/remote/client_test.go   | 29 +----------------------------
 storage/remote/queue_manager.go | 21 +++------------------
 3 files changed, 6 insertions(+), 70 deletions(-)

diff --git a/storage/remote/client.go b/storage/remote/client.go
index e122dcbf4..13502f75e 100644
--- a/storage/remote/client.go
+++ b/storage/remote/client.go
@@ -148,11 +148,8 @@ func NewWriteClient(name string, conf *ClientConfig) (WriteClient, error) {
 	}, nil
 }
 
-const defaultBackoff = 0
-
 type RecoverableError struct {
 	error
-	retryAfter model.Duration
 }
 
 // Store sends a batch of samples to the HTTP endpoint, the request is the proto marshalled
@@ -191,7 +188,7 @@ func (c *Client) Store(ctx context.Context, req []byte) error {
 	if err != nil {
 		// Errors from Client.Do are from (for example) network errors, so are
 		// recoverable.
-		return RecoverableError{err, defaultBackoff}
+		return RecoverableError{err}
 	}
 	defer func() {
 		io.Copy(ioutil.Discard, httpResp.Body)
@@ -207,30 +204,11 @@ func (c *Client) Store(ctx context.Context, req []byte) error {
 		err = errors.Errorf("server returned HTTP status %s: %s", httpResp.Status, line)
 	}
 	if httpResp.StatusCode/100 == 5 {
-		return RecoverableError{err, defaultBackoff}
-	}
-	if httpResp.StatusCode == http.StatusTooManyRequests {
-		return RecoverableError{err, retryAfterDuration(httpResp.Header.Get("Retry-After"))}
+		return RecoverableError{err}
 	}
 	return err
 }
 
-// retryAfterDuration returns the duration for the Retry-After header. In case of any errors, it
-// returns the defaultBackoff as if the header was never supplied.
-func retryAfterDuration(t string) model.Duration {
-	parsedDuration, err := time.Parse(http.TimeFormat, t)
-	if err == nil {
-		s := time.Until(parsedDuration).Seconds()
-		return model.Duration(s) * model.Duration(time.Second)
-	}
-	// The duration can be in seconds.
-	d, err := strconv.Atoi(t)
-	if err != nil {
-		return defaultBackoff
-	}
-	return model.Duration(d) * model.Duration(time.Second)
-}
-
 // Name uniquely identifies the client.
 func (c Client) Name() string {
 	return c.remoteName
diff --git a/storage/remote/client_test.go b/storage/remote/client_test.go
index 82067d3af..93a4c59c7 100644
--- a/storage/remote/client_test.go
+++ b/storage/remote/client_test.go
@@ -49,7 +49,7 @@ func TestStoreHTTPErrorHandling(t *testing.T) {
 		},
 		{
 			code: 500,
-			err:  RecoverableError{errors.New("server returned HTTP status 500 Internal Server Error: " + longErrMessage[:maxErrMsgLen]), defaultBackoff},
+			err:  RecoverableError{errors.New("server returned HTTP status 500 Internal Server Error: " + longErrMessage[:maxErrMsgLen])},
 		},
 	}
 
@@ -83,30 +83,3 @@ func TestStoreHTTPErrorHandling(t *testing.T) {
 		server.Close()
 	}
 }
-
-func TestRetryAfterDuration(t *testing.T) {
-	tc := []struct {
-		name     string
-		tInput   string
-		expected model.Duration
-	}{
-		{
-			name:     "seconds",
-			tInput:   "120",
-			expected: model.Duration(time.Second * 120),
-		},
-		{
-			name:     "date-time default",
-			tInput:   time.RFC1123, // Expected layout is http.TimeFormat, hence an error.
-			expected: defaultBackoff,
-		},
-		{
-			name:     "retry-after not provided",
-			tInput:   "", // Expected layout is http.TimeFormat, hence an error.
-			expected: defaultBackoff,
-		},
-	}
-	for _, c := range tc {
-		require.Equal(t, c.expected, retryAfterDuration(c.tInput), c.name)
-	}
-}
diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go
index a3eb35e46..4e9067659 100644
--- a/storage/remote/queue_manager.go
+++ b/storage/remote/queue_manager.go
@@ -29,7 +29,6 @@ import (
 	"go.uber.org/atomic"
 
 	"github.com/prometheus/client_golang/prometheus"
-	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/pkg/labels"
 	"github.com/prometheus/prometheus/pkg/relabel"
@@ -1043,7 +1042,6 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti
 
 func sendWriteRequestWithBackoff(ctx context.Context, cfg config.QueueConfig, l log.Logger, attempt func(int) error, onRetry func()) error {
 	backoff := cfg.MinBackoff
-	sleepDuration := model.Duration(0)
 	try := 0
 
 	for {
@@ -1060,29 +1058,16 @@ func sendWriteRequestWithBackoff(ctx context.Context, cfg config.QueueConfig, l
 		}
 
 		// If the error is unrecoverable, we should not retry.
-		backoffErr, ok := err.(RecoverableError)
-		if !ok {
+		if _, ok := err.(RecoverableError); !ok {
 			return err
 		}
 
-		sleepDuration = backoff
-		if backoffErr.retryAfter > 0 {
-			sleepDuration = backoffErr.retryAfter
-			level.Info(l).Log("msg", "Retrying after duration specified by Retry-After header", "duration", sleepDuration)
-		} else if backoffErr.retryAfter < 0 {
-			level.Debug(l).Log("msg", "retry-after cannot be in past, retrying using default backoff mechanism")
-		}
-
-		select {
-		case <-ctx.Done():
-		case <-time.After(time.Duration(sleepDuration)):
-		}
-
 		// If we make it this far, we've encountered a recoverable error and will retry.
 		onRetry()
 		level.Warn(l).Log("msg", "Failed to send batch, retrying", "err", err)
 
-		backoff = sleepDuration * 2
+		time.Sleep(time.Duration(backoff))
+		backoff = backoff * 2
 		if backoff > cfg.MaxBackoff {
 			backoff = cfg.MaxBackoff

From 2a9e05b777919702906095c2e9126df0eb3b03c6 Mon Sep 17 00:00:00 2001
From: Julien Pivotto
Date: Fri, 12 Feb 2021 10:21:09 +0100
Subject: [PATCH 2/3] Release 2.25.0-rc.0

Signed-off-by: Julien Pivotto
---
 CHANGELOG.md | 25 +++++++++++++++++++++++++
 VERSION      |  2 +-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 711bd4918..8a105a702 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,28 @@
+## 2.25.0-rc.0 / 2021-02-12
+
+This release includes a new `--enable-feature=` flag that enables
+experimental features. Such features might be changed or removed in the future.
+
+* [FEATURE] **experimental** API: Accept remote_write requests. Behind the --enable-feature=remote-write-receiver flag. #8424
+* [FEATURE] **experimental** PromQL: Add '@ <timestamp>' modifier. Behind the --enable-feature=promql-at-modifier flag. #8121 #8436 #8425
+* [ENHANCEMENT] Add optional name property to testgroup for better test failure output. #8440
+* [ENHANCEMENT] Add warnings into React Panel on the Graph page. #8427
+* [ENHANCEMENT] TSDB: Increase the number of buckets for the compaction duration metric. #8342
+* [ENHANCEMENT] Remote: Allow passing along custom remote_write HTTP headers. #8416
+* [ENHANCEMENT] Mixins: Scope grafana configuration. #8332
+* [ENHANCEMENT] Kubernetes SD: Add endpoint labels metadata. #8273
+* [ENHANCEMENT] UI: Expose total number of label pairs in head in TSDB stats page. #8343
+* [ENHANCEMENT] TSDB: Reload blocks every minute, to detect new blocks and enforce retention more often. #8343
+* [BUGFIX] API: Fix global URL when external address has no port. #8359
+* [BUGFIX] Backfill: Fix error message handling. #8432
+* [BUGFIX] Deprecate unused flag --alertmanager.timeout. #8407
+* [BUGFIX] Mixins: Support remote-write metrics renamed in v2.23 in alerts. #8423
+* [BUGFIX] Remote: Fix garbage collection of dropped series in remote write. #8387
+* [BUGFIX] Remote: Log recoverable remote write errors as warnings. #8412
+* [BUGFIX] TSDB: Remove pre-2.21 temporary blocks on start. #8353.
+* [BUGFIX] UI: Fix duplicated keys on /targets page. #8456
+* [BUGFIX] UI: Fix label name leak into class name. #8459
+
 ## 2.24.1 / 2021-01-20
 
 * [ENHANCEMENT] Cache basic authentication results to significantly improve performance of HTTP endpoints (via an update of prometheus/exporter-toolkit).
diff --git a/VERSION b/VERSION
index 0f5dfbe87..0257f1b3f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.24.1
+2.25.0-rc.0

From 93dcc3c7be310be52d87113cdce4d9ac8297c7fb Mon Sep 17 00:00:00 2001
From: Julien Pivotto
Date: Fri, 12 Feb 2021 23:34:48 +0100
Subject: [PATCH 3/3] Consider status code 429 as recoverable errors to avoid resharding (#8237)

This reverts commit 49a8ce5239d5b05bfb6051301e7bfff6569ce2d9.

This commit is necessary since we only wanted to not have the
functionality in 2.25. It will be improved soon on the main branch.

Co-authored-by: Harkishen-Singh
Signed-off-by: Julien Pivotto
---
 storage/remote/client.go        | 26 ++++++++++++++++++++++++--
 storage/remote/client_test.go   | 29 ++++++++++++++++++++++++++++-
 storage/remote/queue_manager.go | 21 ++++++++++++++++++---
 3 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/storage/remote/client.go b/storage/remote/client.go
index 13502f75e..e122dcbf4 100644
--- a/storage/remote/client.go
+++ b/storage/remote/client.go
@@ -148,8 +148,11 @@ func NewWriteClient(name string, conf *ClientConfig) (WriteClient, error) {
 	}, nil
 }
 
+const defaultBackoff = 0
+
 type RecoverableError struct {
 	error
+	retryAfter model.Duration
 }
 
 // Store sends a batch of samples to the HTTP endpoint, the request is the proto marshalled
@@ -188,7 +191,7 @@ func (c *Client) Store(ctx context.Context, req []byte) error {
 	if err != nil {
 		// Errors from Client.Do are from (for example) network errors, so are
 		// recoverable.
-		return RecoverableError{err}
+		return RecoverableError{err, defaultBackoff}
 	}
 	defer func() {
 		io.Copy(ioutil.Discard, httpResp.Body)
@@ -204,11 +207,30 @@ func (c *Client) Store(ctx context.Context, req []byte) error {
 		err = errors.Errorf("server returned HTTP status %s: %s", httpResp.Status, line)
 	}
 	if httpResp.StatusCode/100 == 5 {
-		return RecoverableError{err}
+		return RecoverableError{err, defaultBackoff}
+	}
+	if httpResp.StatusCode == http.StatusTooManyRequests {
+		return RecoverableError{err, retryAfterDuration(httpResp.Header.Get("Retry-After"))}
 	}
 	return err
 }
 
+// retryAfterDuration returns the duration for the Retry-After header. In case of any errors, it
+// returns the defaultBackoff as if the header was never supplied.
+func retryAfterDuration(t string) model.Duration {
+	parsedDuration, err := time.Parse(http.TimeFormat, t)
+	if err == nil {
+		s := time.Until(parsedDuration).Seconds()
+		return model.Duration(s) * model.Duration(time.Second)
+	}
+	// The duration can be in seconds.
+	d, err := strconv.Atoi(t)
+	if err != nil {
+		return defaultBackoff
+	}
+	return model.Duration(d) * model.Duration(time.Second)
+}
+
 // Name uniquely identifies the client.
 func (c Client) Name() string {
 	return c.remoteName
diff --git a/storage/remote/client_test.go b/storage/remote/client_test.go
index 93a4c59c7..82067d3af 100644
--- a/storage/remote/client_test.go
+++ b/storage/remote/client_test.go
@@ -49,7 +49,7 @@ func TestStoreHTTPErrorHandling(t *testing.T) {
 		},
 		{
 			code: 500,
-			err:  RecoverableError{errors.New("server returned HTTP status 500 Internal Server Error: " + longErrMessage[:maxErrMsgLen])},
+			err:  RecoverableError{errors.New("server returned HTTP status 500 Internal Server Error: " + longErrMessage[:maxErrMsgLen]), defaultBackoff},
 		},
 	}
 
@@ -83,3 +83,30 @@ func TestStoreHTTPErrorHandling(t *testing.T) {
 		server.Close()
 	}
 }
+
+func TestRetryAfterDuration(t *testing.T) {
+	tc := []struct {
+		name     string
+		tInput   string
+		expected model.Duration
+	}{
+		{
+			name:     "seconds",
+			tInput:   "120",
+			expected: model.Duration(time.Second * 120),
+		},
+		{
+			name:     "date-time default",
+			tInput:   time.RFC1123, // Expected layout is http.TimeFormat, hence an error.
+			expected: defaultBackoff,
+		},
+		{
+			name:     "retry-after not provided",
+			tInput:   "", // Expected layout is http.TimeFormat, hence an error.
+			expected: defaultBackoff,
+		},
+	}
+	for _, c := range tc {
+		require.Equal(t, c.expected, retryAfterDuration(c.tInput), c.name)
+	}
+}
diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go
index 4e9067659..a3eb35e46 100644
--- a/storage/remote/queue_manager.go
+++ b/storage/remote/queue_manager.go
@@ -29,6 +29,7 @@ import (
 	"go.uber.org/atomic"
 
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/pkg/labels"
 	"github.com/prometheus/prometheus/pkg/relabel"
@@ -1042,6 +1043,7 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti
 
 func sendWriteRequestWithBackoff(ctx context.Context, cfg config.QueueConfig, l log.Logger, attempt func(int) error, onRetry func()) error {
 	backoff := cfg.MinBackoff
+	sleepDuration := model.Duration(0)
 	try := 0
 
 	for {
@@ -1058,16 +1060,29 @@ func sendWriteRequestWithBackoff(ctx context.Context, cfg config.QueueConfig, l
 		}
 
 		// If the error is unrecoverable, we should not retry.
-		if _, ok := err.(RecoverableError); !ok {
+		backoffErr, ok := err.(RecoverableError)
+		if !ok {
 			return err
 		}
 
+		sleepDuration = backoff
+		if backoffErr.retryAfter > 0 {
+			sleepDuration = backoffErr.retryAfter
+			level.Info(l).Log("msg", "Retrying after duration specified by Retry-After header", "duration", sleepDuration)
+		} else if backoffErr.retryAfter < 0 {
+			level.Debug(l).Log("msg", "retry-after cannot be in past, retrying using default backoff mechanism")
+		}
+
+		select {
+		case <-ctx.Done():
+		case <-time.After(time.Duration(sleepDuration)):
+		}
+
 		// If we make it this far, we've encountered a recoverable error and will retry.
 		onRetry()
 		level.Warn(l).Log("msg", "Failed to send batch, retrying", "err", err)
 
-		time.Sleep(time.Duration(backoff))
-		backoff = backoff * 2
+		backoff = sleepDuration * 2
 		if backoff > cfg.MaxBackoff {
 			backoff = cfg.MaxBackoff