Remote writes: retry on recoverable errors. (#2552)

* Remote writes: retry on recoverable errors.

* Add comments

* Review feedback

* Comments

* Review feedback

* Final spelling misteak (I hope).  Plus, record failed samples correctly.
This commit is contained in:
Tom Wilkie 2017-04-06 23:15:41 +01:00 committed by Julius Volz
parent 9775ad4754
commit e5d7bbfc3c
4 changed files with 119 additions and 87 deletions

View File

@ -60,6 +60,10 @@ func NewClient(index int, conf *clientConfig) (*Client, error) {
}, nil
}
type recoverableError struct {
error
}
// Store sends a batch of samples to the HTTP endpoint.
func (c *Client) Store(samples model.Samples) error {
req := &WriteRequest{
@ -97,6 +101,8 @@ func (c *Client) Store(samples model.Samples) error {
httpReq, err := http.NewRequest("POST", c.url.String(), &buf)
if err != nil {
// Errors from NewRequest are from unparseable URLs, so are not
// recoverable.
return err
}
httpReq.Header.Add("Content-Encoding", "snappy")
@ -108,11 +114,17 @@ func (c *Client) Store(samples model.Samples) error {
httpResp, err := ctxhttp.Do(ctx, c.client, httpReq)
if err != nil {
return err
// Errors from client.Do are from (for example) network errors, so are
// recoverable.
return recoverableError{err}
}
defer httpResp.Body.Close()
if httpResp.StatusCode/100 != 2 {
return fmt.Errorf("server returned HTTP status %s", httpResp.Status)
err = fmt.Errorf("server returned HTTP status %s", httpResp.Status)
}
if httpResp.StatusCode/100 == 5 {
return recoverableError{err}
}
return nil
}

View File

@ -33,17 +33,6 @@ const (
subsystem = "remote_storage"
queue = "queue"
// With a maximum of 1000 shards, assuming an average of 100ms remote write
// time and 100 samples per batch, we will be able to push 1M samples/s.
defaultMaxShards = 1000
defaultMaxSamplesPerSend = 100
// defaultQueueCapacity is per shard - at 1000 shards, this will buffer
// 100M samples. It is configured to buffer 1000 batches, which at 100ms
// per batch is 1:40mins.
defaultQueueCapacity = defaultMaxSamplesPerSend * 1000
defaultBatchSendDeadline = 5 * time.Second
// We track samples in/out and how long pushes take using an Exponentially
// Weighted Moving Average.
ewmaWeight = 0.2
@ -58,12 +47,12 @@ const (
)
var (
sentSamplesTotal = prometheus.NewCounterVec(
succeededSamplesTotal = prometheus.NewCounterVec(
prometheus.CounterOpts{
Namespace: namespace,
Subsystem: subsystem,
Name: "sent_samples_total",
Help: "Total number of processed samples sent to remote storage.",
Name: "succeeded_samples_total",
Help: "Total number of samples successfully sent to remote storage.",
},
[]string{queue},
)
@ -72,7 +61,7 @@ var (
Namespace: namespace,
Subsystem: subsystem,
Name: "failed_samples_total",
Help: "Total number of processed samples which failed on send to remote storage.",
Help: "Total number of samples which failed on send to remote storage.",
},
[]string{queue},
)
@ -125,7 +114,7 @@ var (
)
func init() {
prometheus.MustRegister(sentSamplesTotal)
prometheus.MustRegister(succeededSamplesTotal)
prometheus.MustRegister(failedSamplesTotal)
prometheus.MustRegister(droppedSamplesTotal)
prometheus.MustRegister(sentBatchDuration)
@ -134,6 +123,42 @@ func init() {
prometheus.MustRegister(numShards)
}
// QueueManagerConfig is the configuration for the queue used to write to remote
// storage.
type QueueManagerConfig struct {
// Number of samples to buffer per shard before we start dropping them.
QueueCapacity int
// Max number of shards, i.e. amount of concurrency.
MaxShards int
// Maximum number of samples per send.
MaxSamplesPerSend int
// Maximum time sample will wait in buffer.
BatchSendDeadline time.Duration
// Max number of times to retry a batch on recoverable errors.
MaxRetries int
// On recoverable errors, backoff exponentially.
MinBackoff time.Duration
MaxBackoff time.Duration
}
// defaultQueueManagerConfig is the default remote queue configuration.
var defaultQueueManagerConfig = QueueManagerConfig{
// With a maximum of 1000 shards, assuming an average of 100ms remote write
// time and 100 samples per batch, we will be able to push 1M samples/s.
MaxShards: 1000,
MaxSamplesPerSend: 100,
// By default, buffer 1000 batches, which at 100ms per batch is 1:40mins. At
// 1000 shards, this will buffer 100M samples total.
QueueCapacity: 100 * 1000,
BatchSendDeadline: 5 * time.Second,
// Max number of times to retry a batch on recoverable errors.
MaxRetries: 10,
MinBackoff: 30 * time.Millisecond,
MaxBackoff: 100 * time.Millisecond,
}
// StorageClient defines an interface for sending a batch of samples to an
// external timeseries database.
type StorageClient interface {
@ -143,21 +168,13 @@ type StorageClient interface {
Name() string
}
// QueueManagerConfig configures a storage queue.
type QueueManagerConfig struct {
QueueCapacity int // Number of samples to buffer per shard before we start dropping them.
MaxShards int // Max number of shards, i.e. amount of concurrency.
MaxSamplesPerSend int // Maximum number of samples per send.
BatchSendDeadline time.Duration // Maximum time sample will wait in buffer.
ExternalLabels model.LabelSet
RelabelConfigs []*config.RelabelConfig
Client StorageClient
}
// QueueManager manages a queue of samples to be sent to the Storage
// indicated by the provided StorageClient.
type QueueManager struct {
cfg QueueManagerConfig
externalLabels model.LabelSet
relabelConfigs []*config.RelabelConfig
client StorageClient
queueName string
logLimiter *rate.Limiter
@ -173,23 +190,14 @@ type QueueManager struct {
}
// NewQueueManager builds a new QueueManager.
func NewQueueManager(cfg QueueManagerConfig) *QueueManager {
if cfg.QueueCapacity == 0 {
cfg.QueueCapacity = defaultQueueCapacity
}
if cfg.MaxShards == 0 {
cfg.MaxShards = defaultMaxShards
}
if cfg.MaxSamplesPerSend == 0 {
cfg.MaxSamplesPerSend = defaultMaxSamplesPerSend
}
if cfg.BatchSendDeadline == 0 {
cfg.BatchSendDeadline = defaultBatchSendDeadline
}
func NewQueueManager(cfg QueueManagerConfig, externalLabels model.LabelSet, relabelConfigs []*config.RelabelConfig, client StorageClient) *QueueManager {
t := &QueueManager{
cfg: cfg,
queueName: cfg.Client.Name(),
externalLabels: externalLabels,
relabelConfigs: relabelConfigs,
client: client,
queueName: client.Name(),
logLimiter: rate.NewLimiter(logRateLimit, logBurst),
numShards: 1,
reshardChan: make(chan int),
@ -214,14 +222,14 @@ func (t *QueueManager) Append(s *model.Sample) error {
snew = *s
snew.Metric = s.Metric.Clone()
for ln, lv := range t.cfg.ExternalLabels {
for ln, lv := range t.externalLabels {
if _, ok := s.Metric[ln]; !ok {
snew.Metric[ln] = lv
}
}
snew.Metric = model.Metric(
relabel.Process(model.LabelSet(snew.Metric), t.cfg.RelabelConfigs...))
relabel.Process(model.LabelSet(snew.Metric), t.relabelConfigs...))
if snew.Metric == nil {
return nil
@ -467,21 +475,38 @@ func (s *shards) runShard(i int) {
}
func (s *shards) sendSamples(samples model.Samples) {
// Samples are sent to the remote storage on a best-effort basis. If a
// sample isn't sent correctly the first time, it's simply dropped on the
// floor.
begin := time.Now()
err := s.qm.cfg.Client.Store(samples)
duration := time.Since(begin)
if err != nil {
log.Warnf("error sending %d samples to remote storage: %s", len(samples), err)
failedSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
} else {
sentSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
}
sentBatchDuration.WithLabelValues(s.qm.queueName).Observe(duration.Seconds())
s.sendSamplesWithBackoff(samples)
// These counters are used to caclulate the dynamic sharding, and as such
// should be maintained irrespective of success or failure.
s.qm.samplesOut.incr(int64(len(samples)))
s.qm.samplesOutDuration.incr(int64(duration))
s.qm.samplesOutDuration.incr(int64(time.Since(begin)))
}
// sendSamples to the remote storage with backoff for recoverable errors.
func (s *shards) sendSamplesWithBackoff(samples model.Samples) {
backoff := s.qm.cfg.MinBackoff
for retries := s.qm.cfg.MaxRetries; retries > 0; retries-- {
begin := time.Now()
err := s.qm.client.Store(samples)
sentBatchDuration.WithLabelValues(s.qm.queueName).Observe(time.Since(begin).Seconds())
if err == nil {
succeededSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
return
}
log.Warnf("Error sending %d samples to remote storage: %s", len(samples), err)
if _, ok := err.(recoverableError); !ok {
break
}
time.Sleep(backoff)
backoff = backoff * 2
if backoff > s.qm.cfg.MaxBackoff {
backoff = s.qm.cfg.MaxBackoff
}
}
failedSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
}

View File

@ -81,7 +81,7 @@ func (c *TestStorageClient) Name() string {
func TestSampleDelivery(t *testing.T) {
// Let's create an even number of send batches so we don't run into the
// batch timeout case.
n := defaultQueueCapacity * 2
n := defaultQueueManagerConfig.QueueCapacity * 2
samples := make(model.Samples, 0, n)
for i := 0; i < n; i++ {
@ -97,10 +97,9 @@ func TestSampleDelivery(t *testing.T) {
c := NewTestStorageClient()
c.expectSamples(samples[:len(samples)/2])
m := NewQueueManager(QueueManagerConfig{
Client: c,
MaxShards: 1,
})
cfg := defaultQueueManagerConfig
cfg.MaxShards = 1
m := NewQueueManager(cfg, nil, nil, c)
// These should be received by the client.
for _, s := range samples[:len(samples)/2] {
@ -118,7 +117,7 @@ func TestSampleDelivery(t *testing.T) {
func TestSampleDeliveryOrder(t *testing.T) {
ts := 10
n := defaultMaxSamplesPerSend * ts
n := defaultQueueManagerConfig.MaxSamplesPerSend * ts
samples := make(model.Samples, 0, n)
for i := 0; i < n; i++ {
@ -134,11 +133,7 @@ func TestSampleDeliveryOrder(t *testing.T) {
c := NewTestStorageClient()
c.expectSamples(samples)
m := NewQueueManager(QueueManagerConfig{
Client: c,
// Ensure we don't drop samples in this test.
QueueCapacity: n,
})
m := NewQueueManager(defaultQueueManagerConfig, nil, nil, c)
// These should be received by the client.
for _, s := range samples {
@ -199,7 +194,7 @@ func TestSpawnNotMoreThanMaxConcurrentSendsGoroutines(t *testing.T) {
// `MaxSamplesPerSend*Shards` samples should be consumed by the
// per-shard goroutines, and then another `MaxSamplesPerSend`
// should be left on the queue.
n := defaultMaxSamplesPerSend*1 + defaultMaxSamplesPerSend
n := defaultQueueManagerConfig.MaxSamplesPerSend * 2
samples := make(model.Samples, 0, n)
for i := 0; i < n; i++ {
@ -213,11 +208,10 @@ func TestSpawnNotMoreThanMaxConcurrentSendsGoroutines(t *testing.T) {
}
c := NewTestBlockedStorageClient()
m := NewQueueManager(QueueManagerConfig{
Client: c,
QueueCapacity: n,
MaxShards: 1,
})
cfg := defaultQueueManagerConfig
cfg.MaxShards = 1
cfg.QueueCapacity = n
m := NewQueueManager(cfg, nil, nil, c)
m.Start()
@ -246,7 +240,7 @@ func TestSpawnNotMoreThanMaxConcurrentSendsGoroutines(t *testing.T) {
time.Sleep(10 * time.Millisecond)
}
if m.queueLen() != defaultMaxSamplesPerSend {
if m.queueLen() != defaultQueueManagerConfig.MaxSamplesPerSend {
t.Fatalf("Failed to drain QueueManager queue, %d elements left",
m.queueLen(),
)

View File

@ -44,11 +44,12 @@ func (w *Writer) ApplyConfig(conf *config.Config) error {
if err != nil {
return err
}
newQueues = append(newQueues, NewQueueManager(QueueManagerConfig{
Client: c,
ExternalLabels: conf.GlobalConfig.ExternalLabels,
RelabelConfigs: rwConf.WriteRelabelConfigs,
}))
newQueues = append(newQueues, NewQueueManager(
defaultQueueManagerConfig,
conf.GlobalConfig.ExternalLabels,
rwConf.WriteRelabelConfigs,
c,
))
}
for _, q := range w.queues {