Making the number of CPUs used for WAL replay configurable (#12066)

Adds `WALReplayConcurrency` as an option on tsdb `Options` and `HeadOptions`.
If it is not set or set <=0, then `GOMAXPROCS` is used, which matches the previous behaviour.

Signed-off-by: Yuri Nikolic <durica.nikolic@grafana.com>
This commit is contained in:
Đurica Yuri Nikolić 2023-03-07 17:41:33 +01:00 committed by GitHub
parent ff993b279a
commit c9b85afd93
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 56 additions and 37 deletions

View File

@ -125,6 +125,10 @@ type Options struct {
// WALCompression will turn on Snappy compression for records on the WAL.
WALCompression bool
// Maximum number of CPUs that can simultaneously processes WAL replay.
// If it is <=0, then GOMAXPROCS is used.
WALReplayConcurrency int
// StripeSize is the size in entries of the series hash map. Reducing the size will save memory but impact performance.
StripeSize int
@ -782,6 +786,9 @@ func open(dir string, l log.Logger, r prometheus.Registerer, opts *Options, rngs
headOpts.EnableNativeHistograms.Store(opts.EnableNativeHistograms)
headOpts.OutOfOrderTimeWindow.Store(opts.OutOfOrderTimeWindow)
headOpts.OutOfOrderCapMax.Store(opts.OutOfOrderCapMax)
if opts.WALReplayConcurrency > 0 {
headOpts.WALReplayConcurrency = opts.WALReplayConcurrency
}
if opts.IsolationDisabled {
// We only override this flag if isolation is disabled at DB level. We use the default otherwise.
headOpts.IsolationDisabled = opts.IsolationDisabled

View File

@ -18,6 +18,7 @@ import (
"io"
"math"
"path/filepath"
"runtime"
"sync"
"time"
@ -58,6 +59,8 @@ var (
// defaultIsolationDisabled is true if isolation is disabled by default.
defaultIsolationDisabled = false
defaultWALReplayConcurrency = runtime.GOMAXPROCS(0)
)
// Head handles reads and writes of time series data within a time window.
@ -155,6 +158,11 @@ type HeadOptions struct {
EnableMemorySnapshotOnShutdown bool
IsolationDisabled bool
// Maximum number of CPUs that can simultaneously processes WAL replay.
// The default value is GOMAXPROCS.
// If it is set to a negative value or zero, the default value is used.
WALReplayConcurrency int
}
const (
@ -172,6 +180,7 @@ func DefaultHeadOptions() *HeadOptions {
StripeSize: DefaultStripeSize,
SeriesCallback: &noopSeriesLifecycleCallback{},
IsolationDisabled: defaultIsolationDisabled,
WALReplayConcurrency: defaultWALReplayConcurrency,
}
ho.OutOfOrderCapMax.Store(DefaultOutOfOrderCapMax)
return ho
@ -247,6 +256,10 @@ func NewHead(r prometheus.Registerer, l log.Logger, wal, wbl *wlog.WL, opts *Hea
opts.ChunkPool = chunkenc.NewPool()
}
if opts.WALReplayConcurrency <= 0 {
opts.WALReplayConcurrency = defaultWALReplayConcurrency
}
h.chunkDiskMapper, err = chunks.NewChunkDiskMapper(
r,
mmappedChunksDir(opts.ChunkDirRoot),

View File

@ -18,7 +18,6 @@ import (
"math"
"os"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
@ -65,13 +64,13 @@ func (h *Head) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.
// Start workers that each process samples for a partition of the series ID space.
var (
wg sync.WaitGroup
n = runtime.GOMAXPROCS(0)
processors = make([]walSubsetProcessor, n)
concurrency = h.opts.WALReplayConcurrency
processors = make([]walSubsetProcessor, concurrency)
exemplarsInput chan record.RefExemplar
dec record.Decoder
shards = make([][]record.RefSample, n)
histogramShards = make([][]histogramRecord, n)
shards = make([][]record.RefSample, concurrency)
histogramShards = make([][]histogramRecord, concurrency)
decoded = make(chan interface{}, 10)
decodeErr, seriesCreationErr error
@ -116,7 +115,7 @@ func (h *Head) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.
// For CorruptionErr ensure to terminate all workers before exiting.
_, ok := err.(*wlog.CorruptionErr)
if ok || seriesCreationErr != nil {
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
processors[i].closeAndDrain()
}
close(exemplarsInput)
@ -124,8 +123,8 @@ func (h *Head) loadWAL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.
}
}()
wg.Add(n)
for i := 0; i < n; i++ {
wg.Add(concurrency)
for i := 0; i < concurrency; i++ {
processors[i].setup()
go func(wp *walSubsetProcessor) {
@ -276,7 +275,7 @@ Outer:
multiRef[walSeries.Ref] = mSeries.ref
}
idx := uint64(mSeries.ref) % uint64(n)
idx := uint64(mSeries.ref) % uint64(concurrency)
processors[idx].input <- walSubsetProcessorInputItem{walSeriesRef: walSeries.Ref, existingSeries: mSeries}
}
//nolint:staticcheck // Ignore SA6002 relax staticcheck verification.
@ -293,7 +292,7 @@ Outer:
if len(samples) < m {
m = len(samples)
}
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
if shards[i] == nil {
shards[i] = processors[i].reuseBuf()
}
@ -305,10 +304,10 @@ Outer:
if r, ok := multiRef[sam.Ref]; ok {
sam.Ref = r
}
mod := uint64(sam.Ref) % uint64(n)
mod := uint64(sam.Ref) % uint64(concurrency)
shards[mod] = append(shards[mod], sam)
}
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
if len(shards[i]) > 0 {
processors[i].input <- walSubsetProcessorInputItem{samples: shards[i]}
shards[i] = nil
@ -351,7 +350,7 @@ Outer:
if len(samples) < m {
m = len(samples)
}
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
if histogramShards[i] == nil {
histogramShards[i] = processors[i].reuseHistogramBuf()
}
@ -363,10 +362,10 @@ Outer:
if r, ok := multiRef[sam.Ref]; ok {
sam.Ref = r
}
mod := uint64(sam.Ref) % uint64(n)
mod := uint64(sam.Ref) % uint64(concurrency)
histogramShards[mod] = append(histogramShards[mod], histogramRecord{ref: sam.Ref, t: sam.T, h: sam.H})
}
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
if len(histogramShards[i]) > 0 {
processors[i].input <- walSubsetProcessorInputItem{histogramSamples: histogramShards[i]}
histogramShards[i] = nil
@ -388,7 +387,7 @@ Outer:
if len(samples) < m {
m = len(samples)
}
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
if histogramShards[i] == nil {
histogramShards[i] = processors[i].reuseHistogramBuf()
}
@ -400,10 +399,10 @@ Outer:
if r, ok := multiRef[sam.Ref]; ok {
sam.Ref = r
}
mod := uint64(sam.Ref) % uint64(n)
mod := uint64(sam.Ref) % uint64(concurrency)
histogramShards[mod] = append(histogramShards[mod], histogramRecord{ref: sam.Ref, t: sam.T, fh: sam.FH})
}
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
if len(histogramShards[i]) > 0 {
processors[i].input <- walSubsetProcessorInputItem{histogramSamples: histogramShards[i]}
histogramShards[i] = nil
@ -444,7 +443,7 @@ Outer:
}
// Signal termination to each worker and wait for it to close its output channel.
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
processors[i].closeAndDrain()
}
close(exemplarsInput)
@ -685,12 +684,12 @@ func (h *Head) loadWBL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.
lastSeq, lastOff := lastMmapRef.Unpack()
// Start workers that each process samples for a partition of the series ID space.
var (
wg sync.WaitGroup
n = runtime.GOMAXPROCS(0)
processors = make([]wblSubsetProcessor, n)
wg sync.WaitGroup
concurrency = h.opts.WALReplayConcurrency
processors = make([]wblSubsetProcessor, concurrency)
dec record.Decoder
shards = make([][]record.RefSample, n)
shards = make([][]record.RefSample, concurrency)
decodedCh = make(chan interface{}, 10)
decodeErr error
@ -712,15 +711,15 @@ func (h *Head) loadWBL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.
_, ok := err.(*wlog.CorruptionErr)
if ok {
err = &errLoadWbl{err: err}
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
processors[i].closeAndDrain()
}
wg.Wait()
}
}()
wg.Add(n)
for i := 0; i < n; i++ {
wg.Add(concurrency)
for i := 0; i < concurrency; i++ {
processors[i].setup()
go func(wp *wblSubsetProcessor) {
@ -779,17 +778,17 @@ func (h *Head) loadWBL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.
if len(samples) < m {
m = len(samples)
}
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
shards[i] = processors[i].reuseBuf()
}
for _, sam := range samples[:m] {
if r, ok := multiRef[sam.Ref]; ok {
sam.Ref = r
}
mod := uint64(sam.Ref) % uint64(n)
mod := uint64(sam.Ref) % uint64(concurrency)
shards[mod] = append(shards[mod], sam)
}
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
processors[i].input <- shards[i]
}
samples = samples[m:]
@ -816,7 +815,7 @@ func (h *Head) loadWBL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.
mmapMarkerUnknownRefs.Inc()
continue
}
idx := uint64(ms.ref) % uint64(n)
idx := uint64(ms.ref) % uint64(concurrency)
// It is possible that some old sample is being processed in processWALSamples that
// could cause race below. So we wait for the goroutine to empty input the buffer and finish
// processing all old samples after emptying the buffer.
@ -845,7 +844,7 @@ func (h *Head) loadWBL(r *wlog.Reader, multiRef map[chunks.HeadSeriesRef]chunks.
}
// Signal termination to each worker and wait for it to close its output channel.
for i := 0; i < n; i++ {
for i := 0; i < concurrency; i++ {
processors[i].closeAndDrain()
}
wg.Wait()
@ -1381,18 +1380,18 @@ func (h *Head) loadChunkSnapshot() (int, int, map[chunks.HeadSeriesRef]*memSerie
var (
numSeries = 0
unknownRefs = int64(0)
n = runtime.GOMAXPROCS(0)
concurrency = h.opts.WALReplayConcurrency
wg sync.WaitGroup
recordChan = make(chan chunkSnapshotRecord, 5*n)
shardedRefSeries = make([]map[chunks.HeadSeriesRef]*memSeries, n)
errChan = make(chan error, n)
recordChan = make(chan chunkSnapshotRecord, 5*concurrency)
shardedRefSeries = make([]map[chunks.HeadSeriesRef]*memSeries, concurrency)
errChan = make(chan error, concurrency)
refSeries map[chunks.HeadSeriesRef]*memSeries
exemplarBuf []record.RefExemplar
dec record.Decoder
)
wg.Add(n)
for i := 0; i < n; i++ {
wg.Add(concurrency)
for i := 0; i < concurrency; i++ {
go func(idx int, rc <-chan chunkSnapshotRecord) {
defer wg.Done()
defer func() {