From 49a8ce5239d5b05bfb6051301e7bfff6569ce2d9 Mon Sep 17 00:00:00 2001
From: Julien Pivotto
Date: Fri, 12 Feb 2021 09:43:59 +0100
Subject: [PATCH 1/3] Revert "Consider status code 429 as recoverable errors to avoid resharding (#8237)"

This reverts commit cd412470d724becdf85ee381d63bb408fcdbb1e0.

Signed-off-by: Julien Pivotto
---
 storage/remote/client.go        | 26 ++------------------------
 storage/remote/client_test.go   | 29 +----------------------------
 storage/remote/queue_manager.go | 21 +++------------------
 3 files changed, 6 insertions(+), 70 deletions(-)

diff --git a/storage/remote/client.go b/storage/remote/client.go
index e122dcbf4..13502f75e 100644
--- a/storage/remote/client.go
+++ b/storage/remote/client.go
@@ -148,11 +148,8 @@ func NewWriteClient(name string, conf *ClientConfig) (WriteClient, error) {
 	}, nil
 }
 
-const defaultBackoff = 0
-
 type RecoverableError struct {
 	error
-	retryAfter model.Duration
 }
 
 // Store sends a batch of samples to the HTTP endpoint, the request is the proto marshalled
@@ -191,7 +188,7 @@ func (c *Client) Store(ctx context.Context, req []byte) error {
 	if err != nil {
 		// Errors from Client.Do are from (for example) network errors, so are
 		// recoverable.
-		return RecoverableError{err, defaultBackoff}
+		return RecoverableError{err}
 	}
 	defer func() {
 		io.Copy(ioutil.Discard, httpResp.Body)
@@ -207,30 +204,11 @@ func (c *Client) Store(ctx context.Context, req []byte) error {
 		err = errors.Errorf("server returned HTTP status %s: %s", httpResp.Status, line)
 	}
 	if httpResp.StatusCode/100 == 5 {
-		return RecoverableError{err, defaultBackoff}
-	}
-	if httpResp.StatusCode == http.StatusTooManyRequests {
-		return RecoverableError{err, retryAfterDuration(httpResp.Header.Get("Retry-After"))}
+		return RecoverableError{err}
 	}
 	return err
 }
 
-// retryAfterDuration returns the duration for the Retry-After header. In case of any errors, it
-// returns the defaultBackoff as if the header was never supplied.
-func retryAfterDuration(t string) model.Duration {
-	parsedDuration, err := time.Parse(http.TimeFormat, t)
-	if err == nil {
-		s := time.Until(parsedDuration).Seconds()
-		return model.Duration(s) * model.Duration(time.Second)
-	}
-	// The duration can be in seconds.
-	d, err := strconv.Atoi(t)
-	if err != nil {
-		return defaultBackoff
-	}
-	return model.Duration(d) * model.Duration(time.Second)
-}
-
 // Name uniquely identifies the client.
 func (c Client) Name() string {
 	return c.remoteName
diff --git a/storage/remote/client_test.go b/storage/remote/client_test.go
index 82067d3af..93a4c59c7 100644
--- a/storage/remote/client_test.go
+++ b/storage/remote/client_test.go
@@ -49,7 +49,7 @@ func TestStoreHTTPErrorHandling(t *testing.T) {
 		},
 		{
 			code: 500,
-			err:  RecoverableError{errors.New("server returned HTTP status 500 Internal Server Error: " + longErrMessage[:maxErrMsgLen]), defaultBackoff},
+			err:  RecoverableError{errors.New("server returned HTTP status 500 Internal Server Error: " + longErrMessage[:maxErrMsgLen])},
 		},
 	}
 
@@ -83,30 +83,3 @@ func TestStoreHTTPErrorHandling(t *testing.T) {
 		server.Close()
 	}
 }
-
-func TestRetryAfterDuration(t *testing.T) {
-	tc := []struct {
-		name     string
-		tInput   string
-		expected model.Duration
-	}{
-		{
-			name:     "seconds",
-			tInput:   "120",
-			expected: model.Duration(time.Second * 120),
-		},
-		{
-			name:     "date-time default",
-			tInput:   time.RFC1123, // Expected layout is http.TimeFormat, hence an error.
-			expected: defaultBackoff,
-		},
-		{
-			name:     "retry-after not provided",
-			tInput:   "", // Expected layout is http.TimeFormat, hence an error.
-			expected: defaultBackoff,
-		},
-	}
-	for _, c := range tc {
-		require.Equal(t, c.expected, retryAfterDuration(c.tInput), c.name)
-	}
-}
diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go
index a3eb35e46..4e9067659 100644
--- a/storage/remote/queue_manager.go
+++ b/storage/remote/queue_manager.go
@@ -29,7 +29,6 @@ import (
 	"go.uber.org/atomic"
 
 	"github.com/prometheus/client_golang/prometheus"
-	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/pkg/labels"
 	"github.com/prometheus/prometheus/pkg/relabel"
@@ -1043,7 +1042,6 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti
 
 func sendWriteRequestWithBackoff(ctx context.Context, cfg config.QueueConfig, l log.Logger, attempt func(int) error, onRetry func()) error {
 	backoff := cfg.MinBackoff
-	sleepDuration := model.Duration(0)
 	try := 0
 
 	for {
@@ -1060,29 +1058,16 @@ func sendWriteRequestWithBackoff(ctx context.Context, cfg config.QueueConfig, l
 		}
 
 		// If the error is unrecoverable, we should not retry.
-		backoffErr, ok := err.(RecoverableError)
-		if !ok {
+		if _, ok := err.(RecoverableError); !ok {
 			return err
 		}
 
-		sleepDuration = backoff
-		if backoffErr.retryAfter > 0 {
-			sleepDuration = backoffErr.retryAfter
-			level.Info(l).Log("msg", "Retrying after duration specified by Retry-After header", "duration", sleepDuration)
-		} else if backoffErr.retryAfter < 0 {
-			level.Debug(l).Log("msg", "retry-after cannot be in past, retrying using default backoff mechanism")
-		}
-
-		select {
-		case <-ctx.Done():
-		case <-time.After(time.Duration(sleepDuration)):
-		}
-
 		// If we make it this far, we've encountered a recoverable error and will retry.
 		onRetry()
 		level.Warn(l).Log("msg", "Failed to send batch, retrying", "err", err)
 
-		backoff = sleepDuration * 2
+		time.Sleep(time.Duration(backoff))
+		backoff = backoff * 2
 		if backoff > cfg.MaxBackoff {
 			backoff = cfg.MaxBackoff

From 2a9e05b777919702906095c2e9126df0eb3b03c6 Mon Sep 17 00:00:00 2001
From: Julien Pivotto
Date: Fri, 12 Feb 2021 10:21:09 +0100
Subject: [PATCH 2/3] Release 2.25.0-rc.0

Signed-off-by: Julien Pivotto
---
 CHANGELOG.md | 25 +++++++++++++++++++++++++
 VERSION      |  2 +-
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 711bd4918..8a105a702 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,3 +1,28 @@
+## 2.25.0-rc.0 / 2021-02-12
+
+This release includes a new `--enable-feature=` flag that enables
+experimental features. Such features might be changed or removed in the future.
+
+* [FEATURE] **experimental** API: Accept remote_write requests. Behind the --enable-feature=remote-write-receiver flag. #8424
+* [FEATURE] **experimental** PromQL: Add '@ <timestamp>' modifier. Behind the --enable-feature=promql-at-modifier flag. #8121 #8436 #8425
+* [ENHANCEMENT] Add optional name property to testgroup for better test failure output. #8440
+* [ENHANCEMENT] Add warnings into React Panel on the Graph page. #8427
+* [ENHANCEMENT] TSDB: Increase the number of buckets for the compaction duration metric. #8342
+* [ENHANCEMENT] Remote: Allow passing along custom remote_write HTTP headers. #8416
+* [ENHANCEMENT] Mixins: Scope grafana configuration. #8332
+* [ENHANCEMENT] Kubernetes SD: Add endpoint labels metadata. #8273
+* [ENHANCEMENT] UI: Expose total number of label pairs in head in TSDB stats page. #8343
+* [ENHANCEMENT] TSDB: Reload blocks every minute, to detect new blocks and enforce retention more often. #8343
+* [BUGFIX] API: Fix global URL when external address has no port. #8359
+* [BUGFIX] Backfill: Fix error message handling. #8432
+* [BUGFIX] Deprecate unused flag --alertmanager.timeout. #8407
+* [BUGFIX] Mixins: Support remote-write metrics renamed in v2.23 in alerts. #8423
+* [BUGFIX] Remote: Fix garbage collection of dropped series in remote write. #8387
+* [BUGFIX] Remote: Log recoverable remote write errors as warnings. #8412
+* [BUGFIX] TSDB: Remove pre-2.21 temporary blocks on start. #8353.
+* [BUGFIX] UI: Fix duplicated keys on /targets page. #8456
+* [BUGFIX] UI: Fix label name leak into class name. #8459
+
 ## 2.24.1 / 2021-01-20
 
 * [ENHANCEMENT] Cache basic authentication results to significantly improve performance of HTTP endpoints (via an update of prometheus/exporter-toolkit).
diff --git a/VERSION b/VERSION
index 0f5dfbe87..0257f1b3f 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.24.1
+2.25.0-rc.0

From 93dcc3c7be310be52d87113cdce4d9ac8297c7fb Mon Sep 17 00:00:00 2001
From: Julien Pivotto
Date: Fri, 12 Feb 2021 23:34:48 +0100
Subject: [PATCH 3/3] Consider status code 429 as recoverable errors to avoid resharding (#8237)

This reverts commit 49a8ce5239d5b05bfb6051301e7bfff6569ce2d9.

This commit is necessary since we only wanted to not have the
functionality in 2.25. It will be improved soon on the main branch.

Co-authored-by: Harkishen-Singh
Signed-off-by: Julien Pivotto
---
 storage/remote/client.go        | 26 ++++++++++++++++++++++++--
 storage/remote/client_test.go   | 29 ++++++++++++++++++++++++++++-
 storage/remote/queue_manager.go | 21 ++++++++++++++++++---
 3 files changed, 70 insertions(+), 6 deletions(-)

diff --git a/storage/remote/client.go b/storage/remote/client.go
index 13502f75e..e122dcbf4 100644
--- a/storage/remote/client.go
+++ b/storage/remote/client.go
@@ -148,8 +148,11 @@ func NewWriteClient(name string, conf *ClientConfig) (WriteClient, error) {
 	}, nil
 }
 
+const defaultBackoff = 0
+
 type RecoverableError struct {
 	error
+	retryAfter model.Duration
 }
 
 // Store sends a batch of samples to the HTTP endpoint, the request is the proto marshalled
@@ -188,7 +191,7 @@ func (c *Client) Store(ctx context.Context, req []byte) error {
 	if err != nil {
 		// Errors from Client.Do are from (for example) network errors, so are
 		// recoverable.
-		return RecoverableError{err}
+		return RecoverableError{err, defaultBackoff}
 	}
 	defer func() {
 		io.Copy(ioutil.Discard, httpResp.Body)
@@ -204,11 +207,30 @@ func (c *Client) Store(ctx context.Context, req []byte) error {
 		err = errors.Errorf("server returned HTTP status %s: %s", httpResp.Status, line)
 	}
 	if httpResp.StatusCode/100 == 5 {
-		return RecoverableError{err}
+		return RecoverableError{err, defaultBackoff}
+	}
+	if httpResp.StatusCode == http.StatusTooManyRequests {
+		return RecoverableError{err, retryAfterDuration(httpResp.Header.Get("Retry-After"))}
 	}
 	return err
 }
 
+// retryAfterDuration returns the duration for the Retry-After header. In case of any errors, it
+// returns the defaultBackoff as if the header was never supplied.
+func retryAfterDuration(t string) model.Duration {
+	parsedDuration, err := time.Parse(http.TimeFormat, t)
+	if err == nil {
+		s := time.Until(parsedDuration).Seconds()
+		return model.Duration(s) * model.Duration(time.Second)
+	}
+	// The duration can be in seconds.
+	d, err := strconv.Atoi(t)
+	if err != nil {
+		return defaultBackoff
+	}
+	return model.Duration(d) * model.Duration(time.Second)
+}
+
 // Name uniquely identifies the client.
 func (c Client) Name() string {
 	return c.remoteName
diff --git a/storage/remote/client_test.go b/storage/remote/client_test.go
index 93a4c59c7..82067d3af 100644
--- a/storage/remote/client_test.go
+++ b/storage/remote/client_test.go
@@ -49,7 +49,7 @@ func TestStoreHTTPErrorHandling(t *testing.T) {
 		},
 		{
 			code: 500,
-			err:  RecoverableError{errors.New("server returned HTTP status 500 Internal Server Error: " + longErrMessage[:maxErrMsgLen])},
+			err:  RecoverableError{errors.New("server returned HTTP status 500 Internal Server Error: " + longErrMessage[:maxErrMsgLen]), defaultBackoff},
 		},
 	}
 
@@ -83,3 +83,30 @@ func TestStoreHTTPErrorHandling(t *testing.T) {
 		server.Close()
 	}
 }
+
+func TestRetryAfterDuration(t *testing.T) {
+	tc := []struct {
+		name     string
+		tInput   string
+		expected model.Duration
+	}{
+		{
+			name:     "seconds",
+			tInput:   "120",
+			expected: model.Duration(time.Second * 120),
+		},
+		{
+			name:     "date-time default",
+			tInput:   time.RFC1123, // Expected layout is http.TimeFormat, hence an error.
+			expected: defaultBackoff,
+		},
+		{
+			name:     "retry-after not provided",
+			tInput:   "", // Expected layout is http.TimeFormat, hence an error.
+			expected: defaultBackoff,
+		},
+	}
+	for _, c := range tc {
+		require.Equal(t, c.expected, retryAfterDuration(c.tInput), c.name)
+	}
+}
diff --git a/storage/remote/queue_manager.go b/storage/remote/queue_manager.go
index 4e9067659..a3eb35e46 100644
--- a/storage/remote/queue_manager.go
+++ b/storage/remote/queue_manager.go
@@ -29,6 +29,7 @@ import (
 	"go.uber.org/atomic"
 
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/config"
 	"github.com/prometheus/prometheus/pkg/labels"
 	"github.com/prometheus/prometheus/pkg/relabel"
@@ -1042,6 +1043,7 @@ func (s *shards) sendSamplesWithBackoff(ctx context.Context, samples []prompb.Ti
 
 func sendWriteRequestWithBackoff(ctx context.Context, cfg config.QueueConfig, l log.Logger, attempt func(int) error, onRetry func()) error {
 	backoff := cfg.MinBackoff
+	sleepDuration := model.Duration(0)
 	try := 0
 
 	for {
@@ -1058,16 +1060,29 @@ func sendWriteRequestWithBackoff(ctx context.Context, cfg config.QueueConfig, l
 		}
 
 		// If the error is unrecoverable, we should not retry.
-		if _, ok := err.(RecoverableError); !ok {
+		backoffErr, ok := err.(RecoverableError)
+		if !ok {
 			return err
 		}
 
+		sleepDuration = backoff
+		if backoffErr.retryAfter > 0 {
+			sleepDuration = backoffErr.retryAfter
+			level.Info(l).Log("msg", "Retrying after duration specified by Retry-After header", "duration", sleepDuration)
+		} else if backoffErr.retryAfter < 0 {
+			level.Debug(l).Log("msg", "retry-after cannot be in past, retrying using default backoff mechanism")
+		}
+
+		select {
+		case <-ctx.Done():
+		case <-time.After(time.Duration(sleepDuration)):
+		}
+
 		// If we make it this far, we've encountered a recoverable error and will retry.
 		onRetry()
 		level.Warn(l).Log("msg", "Failed to send batch, retrying", "err", err)
 
-		time.Sleep(time.Duration(backoff))
-		backoff = backoff * 2
+		backoff = sleepDuration * 2
 		if backoff > cfg.MaxBackoff {
 			backoff = cfg.MaxBackoff