prometheus/tsdb/head.go

// Copyright 2017 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tsdb

import (
	"fmt"
	"io"
	"math"
	"path/filepath"
	"sync"
	"time"

	"github.com/go-kit/log"
	"github.com/go-kit/log/level"
	"github.com/oklog/ulid"
	"github.com/pkg/errors"
	"github.com/prometheus/client_golang/prometheus"
	"go.uber.org/atomic"

	"github.com/prometheus/prometheus/config"
	"github.com/prometheus/prometheus/pkg/exemplar"
	"github.com/prometheus/prometheus/pkg/labels"
	"github.com/prometheus/prometheus/storage"
	"github.com/prometheus/prometheus/tsdb/chunkenc"
	"github.com/prometheus/prometheus/tsdb/chunks"
	tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
	"github.com/prometheus/prometheus/tsdb/index"
	"github.com/prometheus/prometheus/tsdb/record"
	"github.com/prometheus/prometheus/tsdb/tombstones"
	"github.com/prometheus/prometheus/tsdb/tsdbutil"
	"github.com/prometheus/prometheus/tsdb/wal"
)

var (
	// ErrInvalidSample is returned if an appended sample is not valid and can't
	// be ingested.
	ErrInvalidSample = errors.New("invalid sample")
	// ErrInvalidExemplar is returned if an appended exemplar is not valid and can't
	// be ingested.
	ErrInvalidExemplar = errors.New("invalid exemplar")
	// ErrAppenderClosed is returned if an appender has already been successfully
	// rolled back or committed.
	ErrAppenderClosed = errors.New("appender closed")
)

// Head handles reads and writes of time series data within a time window.
//
// The time bounds and counters at the top of the struct are atomics; they are
// read and updated (e.g. via CAS in updateMinMaxTime) without taking a mutex.
type Head struct {
	chunkRange               atomic.Int64
	numSeries                atomic.Uint64
	minTime, maxTime         atomic.Int64 // Current min and max of the samples included in the head.
	minValidTime             atomic.Int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block.
	lastWALTruncationTime    atomic.Int64
	lastMemoryTruncationTime atomic.Int64
	lastSeriesID             atomic.Uint64

	// wal may be nil, in which case Init skips the WAL replay and truncateWAL is a no-op.
	// exemplarMetrics and exemplars are (re)created by resetInMemoryState.
	metrics         *headMetrics
	opts            *HeadOptions
	wal             *wal.WAL
	exemplarMetrics *ExemplarMetrics
	exemplars       ExemplarStorage
	logger          log.Logger
	appendPool      sync.Pool
	exemplarsPool   sync.Pool
	seriesPool      sync.Pool
	bytesPool       sync.Pool
	memChunkPool    sync.Pool

	// All series addressable by their ID or hash.
	series *stripeSeries

	// deletedMtx guards the deleted map.
	deletedMtx sync.Mutex
	deleted    map[uint64]int // Deleted series, and what WAL segment they must be kept until.

	postings *index.MemPostings // Postings lists for terms.

	tombstones *tombstones.MemTombstones

	iso *isolation

	// cardinalityMutex guards cardinalityCache and lastPostingsStatsCall.
	cardinalityMutex      sync.Mutex
	cardinalityCache      *index.PostingsStats // Posting stats cache which will expire after 30sec.
	lastPostingsStatsCall time.Duration        // Last posting stats call (PostingsCardinalityStats()) time for caching.

	// chunkDiskMapper is used to write and read Head chunks to/from disk.
	chunkDiskMapper *chunks.ChunkDiskMapper

	// chunkSnapshotMtx is held for the whole duration of truncateMemory and truncateWAL.
	chunkSnapshotMtx sync.Mutex

	closedMtx sync.Mutex
	closed    bool

	// stats carries the WAL replay status; reg is the registerer given to NewHead
	// and is reused when exemplar metrics have to be created.
	stats *HeadStats
	reg   prometheus.Registerer

	// memTruncationInProcess is true while truncateMemory is running (after it has
	// stored lastMemoryTruncationTime); IsQuerierCollidingWithTruncation reads it.
	memTruncationInProcess atomic.Bool
}

// ExemplarStorage is the exemplar store the Head holds in its exemplars field.
// On top of being queryable (storage.ExemplarQueryable) it supports adding,
// validating and iterating over exemplars together with their series labels.
type ExemplarStorage interface {
	storage.ExemplarQueryable
	AddExemplar(labels.Labels, exemplar.Exemplar) error
	ValidateExemplar(labels.Labels, exemplar.Exemplar) error
	IterateExemplars(f func(seriesLabels labels.Labels, e exemplar.Exemplar) error) error
}

// HeadOptions are parameters for the Head block.
//
// Notes on fields without their own comment:
//   - SeriesCallback and ChunkPool are replaced with defaults by NewHead when nil.
//   - EnableExemplarStorage: when false, NewHead forces MaxExemplars to 0 and
//     ApplyConfig is a no-op.
//   - EnableMemorySnapshotOnShutdown: when true, Init first tries to load the
//     chunk snapshot before replaying m-mapped chunks and the WAL.
type HeadOptions struct {
	// Runtime reloadable option. At the top of the struct for 32 bit OS:
	// https://pkg.go.dev/sync/atomic#pkg-note-BUG
	MaxExemplars atomic.Int64

	// ChunkRange must be at least 1; NewHead rejects smaller values.
	ChunkRange int64
	// ChunkDirRoot is the parent directory of the chunks directory.
	ChunkDirRoot         string
	ChunkPool            chunkenc.Pool
	ChunkWriteBufferSize int
	// StripeSize sets the number of entries in the hash map, it must be a power of 2.
	// A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.
	// A smaller StripeSize reduces the memory allocated, but can decrease performance with large number of series.
	StripeSize                     int
	SeriesCallback                 SeriesLifecycleCallback
	EnableExemplarStorage          bool
	EnableMemorySnapshotOnShutdown bool
}

// DefaultHeadOptions returns a freshly allocated HeadOptions populated with the
// package defaults: the default block duration as chunk range, an empty chunk
// directory root, a new chunk pool, the default chunk write buffer size, the
// default stripe size and a no-op series lifecycle callback. Exemplar storage
// and the shutdown memory snapshot stay disabled.
func DefaultHeadOptions() *HeadOptions {
	opts := &HeadOptions{}
	opts.ChunkRange = DefaultBlockDuration
	opts.ChunkDirRoot = ""
	opts.ChunkPool = chunkenc.NewPool()
	opts.ChunkWriteBufferSize = chunks.DefaultWriteBufferSize
	opts.StripeSize = DefaultStripeSize
	opts.SeriesCallback = &noopSeriesLifecycleCallback{}
	return opts
}

// SeriesLifecycleCallback specifies a list of callbacks that will be called during a lifecycle of a series.
// It is always a no-op in Prometheus and mainly meant for external users who import TSDB.
// All the callbacks should be safe to be called concurrently.
// It is up to the user to implement soft or hard consistency by making the callbacks
// atomic or non-atomic. Atomic callbacks can cause performance degradation.
type SeriesLifecycleCallback interface {
	// PreCreation is called before creating a series to indicate if the series can be created.
	// A non nil error means the series should not be created.
	PreCreation(labels.Labels) error
	// PostCreation is called after creating a series to indicate a creation of series.
	PostCreation(labels.Labels)
	// PostDeletion is called after deletion of series.
	PostDeletion(...labels.Labels)
}

// NewHead creates a Head from the given options.
//
// A nil logger is replaced by a no-op logger, a nil stats by NewHeadStats(),
// and nil opts.SeriesCallback / opts.ChunkPool by their defaults (opts is
// mutated in place). When exemplar storage is disabled opts.MaxExemplars is
// forced to 0. wal may be nil.
//
// It returns an error if opts.ChunkRange is smaller than 1, if the in-memory
// state cannot be initialised, or if the chunk disk mapper for the
// "chunks_head" directory under opts.ChunkDirRoot cannot be created.
func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) {
	if l == nil {
		l = log.NewNopLogger()
	}
	if opts.ChunkRange < 1 {
		return nil, errors.Errorf("invalid chunk range %d", opts.ChunkRange)
	}
	if opts.SeriesCallback == nil {
		opts.SeriesCallback = &noopSeriesLifecycleCallback{}
	}
	if stats == nil {
		stats = NewHeadStats()
	}
	// With exemplar storage turned off the exemplar store is sized to zero.
	if !opts.EnableExemplarStorage {
		opts.MaxExemplars.Store(0)
	}

	h := &Head{
		wal:    wal,
		logger: l,
		opts:   opts,
		stats:  stats,
		reg:    r,
	}
	h.memChunkPool.New = func() interface{} {
		return &memChunk{}
	}

	// The in-memory state must exist before the metrics are built: some of
	// the registered gauge functions read it through h.
	if err := h.resetInMemoryState(); err != nil {
		return nil, err
	}
	h.metrics = newHeadMetrics(h, r)

	if opts.ChunkPool == nil {
		opts.ChunkPool = chunkenc.NewPool()
	}

	mapper, err := chunks.NewChunkDiskMapper(
		mmappedChunksDir(opts.ChunkDirRoot),
		opts.ChunkPool,
		opts.ChunkWriteBufferSize,
	)
	h.chunkDiskMapper = mapper
	if err != nil {
		return nil, err
	}
	return h, nil
}

// resetInMemoryState (re)creates every in-memory structure of the Head: the
// exemplar storage, the series stripes, postings, tombstones, the isolation
// state and the deleted-series map. It also resets the chunk range from the
// options and puts the min/max time and the truncation times back to their
// "unset" sentinels.
//
// If the current exemplar storage is a *CircularExemplarStorage its metrics
// are reused; otherwise new exemplar metrics are created with h.reg.
// Nothing is modified when creating the new exemplar storage fails.
func (h *Head) resetInMemoryState() error {
	var em *ExemplarMetrics
	// A comma-ok assertion on a nil interface simply yields ok == false.
	if ce, ok := h.exemplars.(*CircularExemplarStorage); ok {
		em = ce.metrics
	}
	if em == nil {
		em = NewExemplarMetrics(h.reg)
	}

	storage, err := NewCircularExemplarStorage(h.opts.MaxExemplars.Load(), em)
	if err != nil {
		return err
	}
	h.exemplarMetrics, h.exemplars = em, storage

	h.series = newStripeSeries(h.opts.StripeSize, h.opts.SeriesCallback)
	h.postings = index.NewUnorderedMemPostings()
	h.tombstones = tombstones.NewMemTombstones()
	h.iso = newIsolation()
	h.deleted = make(map[uint64]int)

	h.chunkRange.Store(h.opts.ChunkRange)
	// An empty head has min > max, with both at the extremes of int64.
	h.minTime.Store(math.MaxInt64)
	h.maxTime.Store(math.MinInt64)
	h.lastWALTruncationTime.Store(math.MinInt64)
	h.lastMemoryTruncationTime.Store(math.MinInt64)
	return nil
}

// headMetrics groups the Prometheus collectors maintained by the Head.
// They are created, and registered if a registerer is available, by newHeadMetrics.
type headMetrics struct {
	activeAppenders          prometheus.Gauge
	series                   prometheus.GaugeFunc
	seriesCreated            prometheus.Counter
	seriesRemoved            prometheus.Counter
	seriesNotFound           prometheus.Counter
	chunks                   prometheus.Gauge
	chunksCreated            prometheus.Counter
	chunksRemoved            prometheus.Counter
	gcDuration               prometheus.Summary
	samplesAppended          prometheus.Counter
	outOfBoundSamples        prometheus.Counter
	outOfOrderSamples        prometheus.Counter
	walTruncateDuration      prometheus.Summary
	walCorruptionsTotal      prometheus.Counter
	walTotalReplayDuration   prometheus.Gauge
	headTruncateFail         prometheus.Counter
	headTruncateTotal        prometheus.Counter
	checkpointDeleteFail     prometheus.Counter
	checkpointDeleteTotal    prometheus.Counter
	checkpointCreationFail   prometheus.Counter
	checkpointCreationTotal  prometheus.Counter
	mmapChunkCorruptionTotal prometheus.Counter
	snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
}

// newHeadMetrics builds all collectors of headMetrics for the given Head.
//
// If r is non-nil every collector is registered with it via MustRegister
// (which panics on registration failure), together with four additional gauge
// functions that are not kept in the struct: the head min/max time and the
// isolation low/high watermarks. With a nil r the collectors are still created
// and usable, they are just not exposed.
//
// The gauge functions capture h and read it lazily at collection time.
func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
	m := &headMetrics{
		activeAppenders: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "prometheus_tsdb_head_active_appenders",
			Help: "Number of currently active appender transactions",
		}),
		series: prometheus.NewGaugeFunc(prometheus.GaugeOpts{
			Name: "prometheus_tsdb_head_series",
			Help: "Total number of series in the head block.",
		}, func() float64 {
			return float64(h.NumSeries())
		}),
		seriesCreated: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_head_series_created_total",
			Help: "Total number of series created in the head",
		}),
		seriesRemoved: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_head_series_removed_total",
			Help: "Total number of series removed in the head",
		}),
		seriesNotFound: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_head_series_not_found_total",
			Help: "Total number of requests for series that were not found.",
		}),
		chunks: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "prometheus_tsdb_head_chunks",
			Help: "Total number of chunks in the head block.",
		}),
		chunksCreated: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_head_chunks_created_total",
			Help: "Total number of chunks created in the head",
		}),
		chunksRemoved: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_head_chunks_removed_total",
			Help: "Total number of chunks removed in the head",
		}),
		gcDuration: prometheus.NewSummary(prometheus.SummaryOpts{
			Name: "prometheus_tsdb_head_gc_duration_seconds",
			Help: "Runtime of garbage collection in the head block.",
		}),
		walTruncateDuration: prometheus.NewSummary(prometheus.SummaryOpts{
			Name: "prometheus_tsdb_wal_truncate_duration_seconds",
			Help: "Duration of WAL truncation.",
		}),
		walCorruptionsTotal: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_wal_corruptions_total",
			Help: "Total number of WAL corruptions.",
		}),
		walTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{
			Name: "prometheus_tsdb_data_replay_duration_seconds",
			Help: "Time taken to replay the data on disk.",
		}),
		samplesAppended: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_head_samples_appended_total",
			Help: "Total number of appended samples.",
		}),
		outOfBoundSamples: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_out_of_bound_samples_total",
			Help: "Total number of out of bound samples ingestion failed attempts.",
		}),
		outOfOrderSamples: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_out_of_order_samples_total",
			Help: "Total number of out of order samples ingestion failed attempts.",
		}),
		headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_head_truncations_failed_total",
			Help: "Total number of head truncations that failed.",
		}),
		headTruncateTotal: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_head_truncations_total",
			Help: "Total number of head truncations attempted.",
		}),
		checkpointDeleteFail: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_checkpoint_deletions_failed_total",
			Help: "Total number of checkpoint deletions that failed.",
		}),
		checkpointDeleteTotal: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_checkpoint_deletions_total",
			Help: "Total number of checkpoint deletions attempted.",
		}),
		checkpointCreationFail: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_checkpoint_creations_failed_total",
			Help: "Total number of checkpoint creations that failed.",
		}),
		checkpointCreationTotal: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_checkpoint_creations_total",
			Help: "Total number of checkpoint creations attempted.",
		}),
		mmapChunkCorruptionTotal: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_mmap_chunk_corruptions_total",
			Help: "Total number of memory-mapped chunk corruptions.",
		}),
		snapshotReplayErrorTotal: prometheus.NewCounter(prometheus.CounterOpts{
			Name: "prometheus_tsdb_snapshot_replay_error_total",
			Help: "Total number snapshot replays that failed.",
		}),
	}

	// Without a registerer the metrics are only kept in memory.
	if r != nil {
		r.MustRegister(
			m.activeAppenders,
			m.series,
			m.chunks,
			m.chunksCreated,
			m.chunksRemoved,
			m.seriesCreated,
			m.seriesRemoved,
			m.seriesNotFound,
			m.gcDuration,
			m.walTruncateDuration,
			m.walCorruptionsTotal,
			m.walTotalReplayDuration,
			m.samplesAppended,
			m.outOfBoundSamples,
			m.outOfOrderSamples,
			m.headTruncateFail,
			m.headTruncateTotal,
			m.checkpointDeleteFail,
			m.checkpointDeleteTotal,
			m.checkpointCreationFail,
			m.checkpointCreationTotal,
			m.mmapChunkCorruptionTotal,
			m.snapshotReplayErrorTotal,
			// Metrics bound to functions and not needed in tests
			// can be created and registered on the spot.
			prometheus.NewGaugeFunc(prometheus.GaugeOpts{
				Name: "prometheus_tsdb_head_max_time",
				Help: "Maximum timestamp of the head block. The unit is decided by the library consumer.",
			}, func() float64 {
				return float64(h.MaxTime())
			}),
			prometheus.NewGaugeFunc(prometheus.GaugeOpts{
				Name: "prometheus_tsdb_head_min_time",
				Help: "Minimum time bound of the head block. The unit is decided by the library consumer.",
			}, func() float64 {
				return float64(h.MinTime())
			}),
			prometheus.NewGaugeFunc(prometheus.GaugeOpts{
				Name: "prometheus_tsdb_isolation_low_watermark",
				Help: "The lowest TSDB append ID that is still referenced.",
			}, func() float64 {
				return float64(h.iso.lowWatermark())
			}),
			prometheus.NewGaugeFunc(prometheus.GaugeOpts{
				Name: "prometheus_tsdb_isolation_high_watermark",
				Help: "The highest TSDB append ID that has been given out.",
			}, func() float64 {
				return float64(h.iso.lastAppendID())
			}),
		)
	}
	return m
}

// mmappedChunksDir returns the path of the directory holding the head's
// memory-mappable chunk files, i.e. the "chunks_head" directory below dir.
func mmappedChunksDir(dir string) string {
	const headChunksDirName = "chunks_head"
	return filepath.Join(dir, headChunksDirName)
}

// HeadStats are the statistics for the head component of the DB.
// Currently it only carries the progress of the WAL replay.
type HeadStats struct {
	WALReplayStatus *WALReplayStatus
}

// NewHeadStats returns a new HeadStats object whose WALReplayStatus is
// allocated and zero-valued, so it can be used right away.
func NewHeadStats() *HeadStats {
	stats := &HeadStats{}
	stats.WALReplayStatus = &WALReplayStatus{}
	return stats
}

// WALReplayStatus contains status information about the WAL replay.
// The embedded RWMutex protects Min, Max and Current; use GetWALReplayStatus
// to obtain a consistent copy of the three values.
type WALReplayStatus struct {
	sync.RWMutex
	Min     int
	Max     int
	Current int
}

// GetWALReplayStatus returns a copy of the WAL replay status information.
// The three values are read under the read lock; the returned struct has its
// own, unlocked mutex.
func (s *WALReplayStatus) GetWALReplayStatus() WALReplayStatus {
	s.RLock()
	defer s.RUnlock()

	lowest, highest, current := s.Min, s.Max, s.Current
	return WALReplayStatus{Min: lowest, Max: highest, Current: current}
}

// cardinalityCacheExpirationTime is how long the result of
// PostingsCardinalityStats is served from the cache before being recomputed.
const cardinalityCacheExpirationTime = 30 * time.Second

// Init loads data from the write ahead log and prepares the head for writes.
// It should be called before using an appender so that it
// limits the ingested samples to the head min valid time.
//
// The replay happens in this order:
//  1. the chunk snapshot, if EnableMemorySnapshotOnShutdown is set; a failure
//     here is logged and counted, the in-memory state is reset and the replay
//     continues from scratch,
//  2. the on-disk m-mapped chunks; on failure the corrupted files are removed
//     and the load is retried once,
//  3. the last WAL checkpoint, if one exists and is not older than the snapshot,
//  4. the remaining WAL segments up to the last one.
//
// If the Head has no WAL, Init returns after step 2. Errors while locating or
// reading the checkpoint or the WAL segments are returned to the caller.
// On every return path the deferred calls clamp the head min time to
// minValidTime, run a GC and put the postings in order.
func (h *Head) Init(minValidTime int64) error {
	h.minValidTime.Store(minValidTime)
	// Deferred calls run in reverse order: min time clamp, then gc, then EnsureOrder.
	defer h.postings.EnsureOrder()
	defer h.gc() // After loading the wal remove the obsolete data from the head.
	defer func() {
		// Loading of m-mapped chunks and snapshot can make the mint of the Head
		// to go below minValidTime.
		if h.MinTime() < h.minValidTime.Load() {
			h.minTime.Store(h.minValidTime.Load())
		}
	}()

	level.Info(h.logger).Log("msg", "Replaying on-disk memory mappable chunks if any")
	start := time.Now()

	// snapIdx == -1 means that no snapshot was loaded.
	snapIdx, snapOffset := -1, 0
	refSeries := make(map[uint64]*memSeries)

	if h.opts.EnableMemorySnapshotOnShutdown {
		level.Info(h.logger).Log("msg", "Chunk snapshot is enabled, replaying from the snapshot")
		var err error
		snapIdx, snapOffset, refSeries, err = h.loadChunkSnapshot()
		if err != nil {
			snapIdx, snapOffset = -1, 0
			h.metrics.snapshotReplayErrorTotal.Inc()
			level.Error(h.logger).Log("msg", "Failed to load chunk snapshot", "err", err)
			// We clear the partially loaded data to replay fresh from the WAL.
			if err := h.resetInMemoryState(); err != nil {
				return err
			}
		}
		level.Info(h.logger).Log("msg", "Chunk snapshot loading time", "duration", time.Since(start).String())
	}

	mmapChunkReplayStart := time.Now()
	mmappedChunks, err := h.loadMmappedChunks(refSeries)
	if err != nil {
		level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
		if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
			h.metrics.mmapChunkCorruptionTotal.Inc()
		}
		// If this fails, data will be recovered from WAL.
		// Hence we wont lose any data (given WAL is not corrupt).
		mmappedChunks = h.removeCorruptedMmappedChunks(err, refSeries)
	}

	level.Info(h.logger).Log("msg", "On-disk memory mappable chunks replay completed", "duration", time.Since(mmapChunkReplayStart).String())
	if h.wal == nil {
		level.Info(h.logger).Log("msg", "WAL not found")
		return nil
	}

	level.Info(h.logger).Log("msg", "Replaying WAL, this may take a while")

	checkpointReplayStart := time.Now()
	// Backfill the checkpoint first if it exists.
	// A missing checkpoint (record.ErrNotFound) is not an error; err is kept
	// and checked again below to decide whether a checkpoint must be replayed.
	dir, startFrom, err := wal.LastCheckpoint(h.wal.Dir())
	if err != nil && err != record.ErrNotFound {
		return errors.Wrap(err, "find last checkpoint")
	}

	// Find the last segment.
	_, endAt, e := wal.Segments(h.wal.Dir())
	if e != nil {
		return errors.Wrap(e, "finding WAL segments")
	}

	h.startWALReplayStatus(startFrom, endAt)

	multiRef := map[uint64]uint64{}
	// The checkpoint is skipped when the snapshot is more recent than it.
	if err == nil && startFrom >= snapIdx {
		sr, err := wal.NewSegmentsReader(dir)
		if err != nil {
			return errors.Wrap(err, "open checkpoint")
		}
		defer func() {
			if err := sr.Close(); err != nil {
				level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
			}
		}()

		// A corrupted checkpoint is a hard error for now and requires user
		// intervention. There's likely little data that can be recovered anyway.
		if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks); err != nil {
			return errors.Wrap(err, "backfill checkpoint")
		}
		h.updateWALReplayStatusRead(startFrom)
		startFrom++
		level.Info(h.logger).Log("msg", "WAL checkpoint loaded")
	}
	checkpointReplayDuration := time.Since(checkpointReplayStart)

	walReplayStart := time.Now()

	// Segments before the one the snapshot was taken in need no replay.
	if snapIdx > startFrom {
		startFrom = snapIdx
	}
	// Backfill segments from the most recent checkpoint onwards.
	for i := startFrom; i <= endAt; i++ {
		s, err := wal.OpenReadSegment(wal.SegmentName(h.wal.Dir(), i))
		if err != nil {
			return errors.Wrap(err, fmt.Sprintf("open WAL segment: %d", i))
		}

		// Only the segment the snapshot was taken in is read from an offset.
		offset := 0
		if i == snapIdx {
			offset = snapOffset
		}
		// NOTE(review): on the two early exits below the segment s is not closed
		// here; confirm whether NewSegmentBufReaderWithOffset releases it on failure.
		sr, err := wal.NewSegmentBufReaderWithOffset(offset, s)
		if errors.Cause(err) == io.EOF {
			// File does not exist.
			continue
		}
		if err != nil {
			return errors.Wrapf(err, "segment reader (offset=%d)", offset)
		}
		err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks)
		// The reader is closed before the loadWAL error is looked at, so it is
		// released on the error path too.
		if err := sr.Close(); err != nil {
			level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
		}
		if err != nil {
			return err
		}
		level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", endAt)
		h.updateWALReplayStatusRead(i)
	}

	// Despite its name this is the total duration since the start of Init,
	// including snapshot, m-mapped chunks and checkpoint replay.
	walReplayDuration := time.Since(start)
	h.metrics.walTotalReplayDuration.Set(walReplayDuration.Seconds())
	level.Info(h.logger).Log(
		"msg", "WAL replay completed",
		"checkpoint_replay_duration", checkpointReplayDuration.String(),
		"wal_replay_duration", time.Since(walReplayStart).String(),
		"total_replay_duration", walReplayDuration.String(),
	)

	return nil
}

// loadMmappedChunks walks over every chunk the chunk disk mapper knows about
// and sorts it into one of two places:
//   - chunks of a series present in refSeries are appended directly to that
//     series' mmappedChunks (updating the chunk metrics and the head min/max time),
//   - chunks of any other series are collected in the returned map, keyed by
//     series ref.
//
// Chunks whose maxt lies before the head's min valid time are skipped.
// A chunk that does not start after the previous chunk of the same series
// ended makes the iteration fail with an "out of sequence" error. On error the
// returned map is nil, but series in refSeries may already have been modified.
func (h *Head) loadMmappedChunks(refSeries map[uint64]*memSeries) (map[uint64][]*mmappedChunk, error) {
	pending := map[uint64][]*mmappedChunk{}

	visit := func(seriesRef uint64, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error {
		if maxt < h.minValidTime.Load() {
			return nil
		}

		if series, known := refSeries[seriesRef]; known {
			if n := len(series.mmappedChunks); n > 0 && series.mmappedChunks[n-1].maxTime >= mint {
				return errors.Errorf("out of sequence m-mapped chunk for series ref %d", seriesRef)
			}

			h.metrics.chunks.Inc()
			h.metrics.chunksCreated.Inc()
			series.mmappedChunks = append(series.mmappedChunks, &mmappedChunk{
				ref:        chunkRef,
				minTime:    mint,
				maxTime:    maxt,
				numSamples: numSamples,
			})
			h.updateMinMaxTime(mint, maxt)
			// A head chunk that starts at or before the end of this m-mapped chunk
			// was completed and m-mapped after the snapshot was taken, so the
			// snapshot's copy of it is dropped.
			if series.headChunk != nil && maxt >= series.headChunk.minTime {
				series.nextAt = 0
				series.headChunk = nil
				series.app = nil
			}
			return nil
		}

		// Series not loaded yet: queue the chunk for later.
		queued := pending[seriesRef]
		if n := len(queued); n > 0 && queued[n-1].maxTime >= mint {
			return errors.Errorf("out of sequence m-mapped chunk for series ref %d", seriesRef)
		}
		pending[seriesRef] = append(queued, &mmappedChunk{
			ref:        chunkRef,
			minTime:    mint,
			maxTime:    maxt,
			numSamples: numSamples,
		})
		return nil
	}

	if err := h.chunkDiskMapper.IterateAllChunks(visit); err != nil {
		return nil, errors.Wrap(err, "iterate on on-disk chunks")
	}
	return pending, nil
}

// removeCorruptedMmappedChunks asks the chunk disk mapper to delete the chunk
// files affected by err and then loads the m-mapped chunks once more.
// It never fails: if either the deletion or the second load goes wrong, the
// problem is logged and an empty map is returned, i.e. all m-mapped chunks
// collected for not-yet-loaded series are discarded.
func (h *Head) removeCorruptedMmappedChunks(err error, refSeries map[uint64]*memSeries) map[uint64][]*mmappedChunk {
	level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")

	delErr := h.chunkDiskMapper.DeleteCorrupted(err)
	if delErr != nil {
		level.Info(h.logger).Log("msg", "Deletion of mmap chunk files failed, discarding chunk files completely", "err", delErr)
		return map[uint64][]*mmappedChunk{}
	}

	level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
	reloaded, loadErr := h.loadMmappedChunks(refSeries)
	if loadErr == nil {
		return reloaded
	}

	level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", loadErr)
	return map[uint64][]*mmappedChunk{}
}

// ApplyConfig applies the runtime-reloadable parts of cfg to the Head.
// Currently that is only the maximum number of exemplars: the new limit is
// stored in the options and, if it differs from the previous one, the
// exemplar storage is resized accordingly.
//
// It is a no-op when exemplar storage is disabled and always returns nil.
func (h *Head) ApplyConfig(cfg *config.Config) error {
	if !h.opts.EnableExemplarStorage {
		return nil
	}

	// Head uses opts.MaxExemplars in combination with opts.EnableExemplarStorage
	// to decide if it should pass exemplars along to its exemplar storage, so we
	// need to update opts.MaxExemplars here.
	//
	// The new size is read from the config once and used for the comparison,
	// the resize and the log line, so that all three agree even if the atomic
	// is changed concurrently.
	newSize := cfg.StorageConfig.ExemplarsConfig.MaxExemplars
	prevSize := h.opts.MaxExemplars.Load()
	h.opts.MaxExemplars.Store(newSize)

	if prevSize == newSize {
		return nil
	}

	migrated := h.exemplars.(*CircularExemplarStorage).Resize(newSize)
	// Log the plain number, not the atomic wrapper struct.
	level.Info(h.logger).Log("msg", "Exemplar storage resized", "from", prevSize, "to", newSize, "migrated", migrated)
	return nil
}

// PostingsCardinalityStats returns top 10 highest cardinality stats By label and value names.
//
// The result is cached: a cached value younger than
// cardinalityCacheExpirationTime is returned as is, otherwise the stats are
// recomputed from the postings and the cache timestamp is refreshed.
// Timestamps have a resolution of one second.
//
// NOTE(review): the cache is not keyed by statsByLabelName, so a call with a
// different label name within the expiration window gets the stats computed
// for the earlier label name.
func (h *Head) PostingsCardinalityStats(statsByLabelName string) *index.PostingsStats {
	h.cardinalityMutex.Lock()
	defer h.cardinalityMutex.Unlock()

	now := time.Duration(time.Now().Unix()) * time.Second
	if age := now - h.lastPostingsStatsCall; age > cardinalityCacheExpirationTime {
		h.cardinalityCache = nil
	}
	if cached := h.cardinalityCache; cached != nil {
		return cached
	}

	fresh := h.postings.Stats(statsByLabelName)
	h.cardinalityCache = fresh
	// Taken after the computation, which may itself take a while.
	h.lastPostingsStatsCall = time.Duration(time.Now().Unix()) * time.Second
	return fresh
}

// updateMinMaxTime widens the head's time range so that it includes
// [mint, maxt]: the min time is lowered to mint if mint is smaller, and the max
// time is raised to maxt if maxt is larger. Each bound is updated with a
// compare-and-swap loop that retries when another goroutine changed the value
// in between.
func (h *Head) updateMinMaxTime(mint, maxt int64) {
	for cur := h.MinTime(); mint < cur; cur = h.MinTime() {
		if h.minTime.CAS(cur, mint) {
			break
		}
	}
	for cur := h.MaxTime(); maxt > cur; cur = h.MaxTime() {
		if h.maxTime.CAS(cur, maxt) {
			break
		}
	}
}

// SetMinValidTime sets the minimum timestamp the head can ingest.
// The value is stored atomically; Init and truncateMemory overwrite it as well.
func (h *Head) SetMinValidTime(minValidTime int64) {
	h.minValidTime.Store(minValidTime)
}

// Truncate removes old data before mint from the head and WAL.
//
// The in-memory data is truncated first. If the head had no min time yet
// when Truncate was called (the initial call, before any data was loaded), the
// WAL is left untouched.
func (h *Head) Truncate(mint int64) (err error) {
	// Must be evaluated before truncateMemory, which sets the min time.
	firstCall := h.MinTime() == math.MaxInt64

	if err = h.truncateMemory(mint); err != nil {
		return err
	}
	if !firstCall {
		return h.truncateWAL(mint)
	}
	return nil
}

// OverlapsClosedInterval returns true if the head overlaps [mint, maxt],
// both bounds being inclusive.
func (h *Head) OverlapsClosedInterval(mint, maxt int64) bool {
	if h.MinTime() > maxt {
		// The head starts after the interval ends.
		return false
	}
	return mint <= h.MaxTime()
}

// truncateMemory removes old data before mint from the head.
//
// It holds chunkSnapshotMtx for its whole duration. Nothing happens if the
// head's min time is already at or past mint, unless the head is still
// uninitialised (min time == math.MaxInt64). In that initial case only the
// time bounds are set; no GC and no chunk file truncation is done.
//
// Otherwise it waits for open reads overlapping the truncated range, moves
// the min time and min valid time to mint, garbage collects the head and
// truncates the chunk disk mapper. A non-nil return value increments the
// headTruncateFail metric.
func (h *Head) truncateMemory(mint int64) (err error) {
	h.chunkSnapshotMtx.Lock()
	defer h.chunkSnapshotMtx.Unlock()

	// Relies on the named result to see the error of any return path.
	defer func() {
		if err != nil {
			h.metrics.headTruncateFail.Inc()
		}
	}()

	initialize := h.MinTime() == math.MaxInt64

	if h.MinTime() >= mint && !initialize {
		return nil
	}

	// The order of these two Store() should not be changed,
	// i.e. truncation time is set before in-process boolean.
	h.lastMemoryTruncationTime.Store(mint)
	h.memTruncationInProcess.Store(true)
	defer h.memTruncationInProcess.Store(false)

	// We wait for pending queries to end that overlap with this truncation.
	if !initialize {
		h.WaitForPendingReadersInTimeRange(h.MinTime(), mint)
	}

	h.minTime.Store(mint)
	h.minValidTime.Store(mint)

	// Ensure that max time is at least as high as min time.
	for h.MaxTime() < mint {
		h.maxTime.CAS(h.MaxTime(), mint)
	}

	// This was an initial call to Truncate after loading blocks on startup.
	// We haven't read back the WAL yet, so do not attempt to truncate it.
	if initialize {
		return nil
	}

	h.metrics.headTruncateTotal.Inc()
	start := time.Now()

	// gc reports the actual min time of the head after the collection.
	actualMint := h.gc()
	level.Info(h.logger).Log("msg", "Head GC completed", "duration", time.Since(start))
	h.metrics.gcDuration.Observe(time.Since(start).Seconds())
	if actualMint > h.minTime.Load() {
		// The actual mint of the Head is higher than the one asked to truncate.
		appendableMinValidTime := h.appendableMinValidTime()
		if actualMint < appendableMinValidTime {
			h.minTime.Store(actualMint)
			h.minValidTime.Store(actualMint)
		} else {
			// The actual min time is in the appendable window.
			// So we set the mint to the appendableMinValidTime.
			h.minTime.Store(appendableMinValidTime)
			h.minValidTime.Store(appendableMinValidTime)
		}
	}

	// Truncate the chunk m-mapper.
	if err := h.chunkDiskMapper.Truncate(mint); err != nil {
		return errors.Wrap(err, "truncate chunks.HeadReadWriter")
	}
	return nil
}

// WaitForPendingReadersInTimeRange waits for queries overlapping with given range to finish querying.
// The query timeout limits the max wait time of this function implicitly.
// The mint is inclusive and maxt is the truncation time hence exclusive.
//
// The open reads are polled every 500ms until none of them overlaps the range.
func (h *Head) WaitForPendingReadersInTimeRange(mint, maxt int64) {
	// Turn the exclusive upper bound into an inclusive one for the overlap check.
	lastIncluded := maxt - 1

	anyOverlap := func() bool {
		found := false
		h.iso.TraverseOpenReads(func(s *isolationState) bool {
			if s.mint > lastIncluded || mint > s.maxt {
				// Disjoint from the truncation range, keep looking.
				return true
			}
			found = true
			return false
		})
		return found
	}

	for anyOverlap() {
		time.Sleep(500 * time.Millisecond)
	}
}

// IsQuerierCollidingWithTruncation tells whether a querier taken for
// [querierMint, querierMaxt] collides with an in-memory truncation that is
// currently running, and what to do about it:
//
//   - shouldClose: the querier must be closed,
//   - getNew: a new querier has to be created in its place,
//   - newMint: the mint to use for the new range head and querier (only
//     meaningful when getNew is true).
//
// This helps preventing races with the truncation of in-memory data.
//
// NOTE: The querier should already be taken before calling this.
func (h *Head) IsQuerierCollidingWithTruncation(querierMint, querierMaxt int64) (shouldClose, getNew bool, newMint int64) {
	if !h.memTruncationInProcess.Load() {
		return false, false, 0
	}

	// Head truncation is in process. It also means that the block that was
	// created for this truncation range is also available.
	truncTime := h.lastMemoryTruncationTime.Load()

	switch {
	case querierMaxt < truncTime:
		// The whole query range is being truncated away; the data is served by
		// the blocks instead, so the querier is only closed.
		//
		// 1.     |------truncation------|
		//   |---query---|
		// 2.     |------truncation------|
		//              |---query---|
		return true, false, 0

	case querierMint < truncTime:
		// The query starts inside the truncated range but reaches into the part
		// of the Head that stays. The truncation started after the querier was
		// taken, so using it is unsafe and/or might block the truncation.
		// Replace it with a querier that starts at the truncation time; the
		// rest is available in the blocks.
		//
		//      |------truncation------|
		//                        |----query----|
		// turns into
		//      |------truncation------|
		//                             |---qu---|
		return true, true, truncTime

	default:
		// The query lies entirely after the truncated range, nothing to do.
		//
		//      |------truncation------|
		//                              |---query---|
		return false, false, 0
	}
}

// truncateWAL removes old data before mint from the WAL.
//
// It holds chunkSnapshotMtx for its whole duration and does nothing when there
// is no WAL or when mint is not newer than the last WAL truncation time.
// Otherwise it cuts a new segment, writes a checkpoint covering the lower two
// thirds of the remaining segments, truncates the WAL up to that checkpoint and
// deletes older checkpoints.
//
// Only failures up to and including the checkpoint creation are returned;
// failures to truncate the segments or to delete old checkpoints are logged.
// Note that lastWALTruncationTime is updated before any of that work is
// attempted, so a failed run is not repeated for the same mint.
func (h *Head) truncateWAL(mint int64) error {
	h.chunkSnapshotMtx.Lock()
	defer h.chunkSnapshotMtx.Unlock()

	if h.wal == nil || mint <= h.lastWALTruncationTime.Load() {
		return nil
	}
	start := time.Now()
	h.lastWALTruncationTime.Store(mint)

	first, last, err := wal.Segments(h.wal.Dir())
	if err != nil {
		return errors.Wrap(err, "get segment range")
	}
	// Start a new segment, so low ingestion volume TSDB don't have more WAL than
	// needed.
	if err := h.wal.NextSegment(); err != nil {
		return errors.Wrap(err, "next segment")
	}
	last-- // Never consider last segment for checkpoint.
	if last < 0 {
		return nil // no segments yet.
	}
	// The lower two thirds of segments should contain mostly obsolete samples.
	// If we have less than two segments, it's not worth checkpointing yet.
	// With the default 2h blocks, this will keeping up to around 3h worth
	// of WAL segments.
	last = first + (last-first)*2/3
	if last <= first {
		return nil
	}

	// keep reports whether records of the series with the given ID must be
	// carried over into the checkpoint: the series either still exists in the
	// head or is tracked as deleted.
	keep := func(id uint64) bool {
		if h.series.getByID(id) != nil {
			return true
		}
		h.deletedMtx.Lock()
		_, ok := h.deleted[id]
		h.deletedMtx.Unlock()
		return ok
	}
	h.metrics.checkpointCreationTotal.Inc()
	if _, err = wal.Checkpoint(h.logger, h.wal, first, last, keep, mint); err != nil {
		h.metrics.checkpointCreationFail.Inc()
		if _, ok := errors.Cause(err).(*wal.CorruptionErr); ok {
			h.metrics.walCorruptionsTotal.Inc()
		}
		return errors.Wrap(err, "create checkpoint")
	}
	if err := h.wal.Truncate(last + 1); err != nil {
		// If truncating fails, we'll just try again at the next checkpoint.
		// Leftover segments will just be ignored in the future if there's a checkpoint
		// that supersedes them.
		level.Error(h.logger).Log("msg", "truncating segments failed", "err", err)
	}

	// The checkpoint is written and segments before it is truncated, so we no
	// longer need to track deleted series that are before it.
	h.deletedMtx.Lock()
	for ref, segment := range h.deleted {
		if segment < first {
			delete(h.deleted, ref)
		}
	}
	h.deletedMtx.Unlock()

	h.metrics.checkpointDeleteTotal.Inc()
	if err := wal.DeleteCheckpoints(h.wal.Dir(), last); err != nil {
		// Leftover old checkpoints do not cause problems down the line beyond
		// occupying disk space.
		// They will just be ignored since a higher checkpoint exists.
		level.Error(h.logger).Log("msg", "delete old checkpoints", "err", err)
		h.metrics.checkpointDeleteFail.Inc()
	}
	h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())

	level.Info(h.logger).Log("msg", "WAL checkpoint complete",
		"first", first, "last", last, "duration", time.Since(start))

	return nil
}

// Stats is the summary of the head returned by (*Head).Stats: the number of
// series, the head's min and max time, and the postings statistics of the
// head's index.
type Stats struct {
	NumSeries         uint64
	MinTime, MaxTime  int64
	IndexPostingStats *index.PostingsStats
}

// Stats collects the current statistics of the HEAD into a freshly allocated
// Stats value. statsByLabelName is forwarded to PostingsCardinalityStats.
// Note that it is expensive to calculate these.
func (h *Head) Stats(statsByLabelName string) *Stats {
	stats := new(Stats)
	stats.NumSeries = h.NumSeries()
	stats.MaxTime = h.MaxTime()
	stats.MinTime = h.MinTime()
	stats.IndexPostingStats = h.PostingsCardinalityStats(statsByLabelName)
	return stats
}

// RangeHead pairs a Head with a fixed time range given by mint and maxt.
// Its index and chunk readers are created from the underlying head for
// that range.
type RangeHead struct {
	head       *Head
	mint, maxt int64
}

// NewRangeHead returns a *RangeHead over head, restricted to the time range
// described by mint and maxt.
func NewRangeHead(head *Head, mint, maxt int64) *RangeHead {
	rh := new(RangeHead)
	rh.head, rh.mint, rh.maxt = head, mint, maxt
	return rh
}

// Index returns an index reader of the underlying head for the range
// [mint, maxt] of this RangeHead. The returned error is always nil.
func (h *RangeHead) Index() (IndexReader, error) {
	ir := h.head.indexRange(h.mint, h.maxt)
	return ir, nil
}

// Chunks returns a chunk reader of the underlying head for the range of this
// RangeHead, using the isolation state obtained for that same range.
func (h *RangeHead) Chunks() (ChunkReader, error) {
	isoState := h.head.iso.State(h.mint, h.maxt)
	return h.head.chunksRange(h.mint, h.maxt, isoState)
}

// Tombstones returns the tombstones of the underlying head. The returned
// error is always nil.
func (h *RangeHead) Tombstones() (tombstones.Reader, error) {
	var reader tombstones.Reader = h.head.tombstones
	return reader, nil
}

// MinTime returns the lower bound of the range head's time range.
func (h *RangeHead) MinTime() int64 {
	lower := h.mint
	return lower
}

// MaxTime returns the max time of actual data fetch-able from the head.
// This controls the chunks time range, which is the closed interval
// [b.MinTime, b.MaxTime].
func (h *RangeHead) MaxTime() int64 {
	upper := h.maxt
	return upper
}

// BlockMaxTime returns the max time of the potential block created from this head.
// It differs from MaxTime by +1 millisecond because block intervals are
// half-open, [b.MinTime, b.MaxTime), so the block's max time is always one
// greater than the timestamp of the last sample it includes.
func (h *RangeHead) BlockMaxTime() int64 {
	blockMaxt := h.MaxTime()
	blockMaxt++
	return blockMaxt
}

// NumSeries returns the number of series reported by the underlying head.
func (h *RangeHead) NumSeries() uint64 {
	count := h.head.NumSeries()
	return count
}

// Meta returns block meta information for the range head: its own min and
// max time, the ULID of the underlying head, and the head's series count.
func (h *RangeHead) Meta() BlockMeta {
	var meta BlockMeta
	meta.MinTime = h.MinTime()
	meta.MaxTime = h.MaxTime()
	meta.ULID = h.head.Meta().ULID
	meta.Stats.NumSeries = h.NumSeries()
	return meta
}

// String returns a human readable representation of the range head. Keep this
// method: without it the whole struct would be dumped whenever the range head
// is stringified in errors or logs.
func (h *RangeHead) String() string {
	mint, maxt := h.MinTime(), h.MaxTime()
	return fmt.Sprintf("range head (mint: %d, maxt: %d)", mint, maxt)
}

// Delete all samples in the range of [mint, maxt] for series that satisfy the given
// label matchers.
//
// The requested interval is first clamped to the head's current
// [MinTime, MaxTime], and then per series to that series' own time range.
// The resulting tombstones are logged to the WAL (if one is configured)
// before they are added to the in-memory tombstones, so a WAL error leaves
// the in-memory tombstones untouched.
//
// It returns an error if selecting the series, iterating the postings or
// writing to the WAL fails.
func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
	// Do not delete anything beyond the currently valid range.
	mint, maxt = clampInterval(mint, maxt, h.MinTime(), h.MaxTime())

	ir := h.indexRange(mint, maxt)

	p, err := PostingsForMatchers(ir, ms...)
	if err != nil {
		return errors.Wrap(err, "select series")
	}

	var stones []tombstones.Stone
	for p.Next() {
		series := h.series.getByID(p.At())
		if series == nil {
			// Garbage collection removes a series from the stripes before its
			// ref is removed from the postings, so a ref returned by the
			// postings may no longer resolve to a series. There is nothing
			// left to delete for it.
			level.Debug(h.logger).Log("msg", "Series not found in Head.Delete")
			continue
		}

		series.RLock()
		t0, t1 := series.minTime(), series.maxTime()
		series.RUnlock()
		// math.MinInt64 is what minTime/maxTime report for a series without chunks.
		if t0 == math.MinInt64 || t1 == math.MinInt64 {
			continue
		}
		// Delete only until the current values and not beyond.
		t0, t1 = clampInterval(mint, maxt, t0, t1)
		stones = append(stones, tombstones.Stone{Ref: p.At(), Intervals: tombstones.Intervals{{Mint: t0, Maxt: t1}}})
	}
	if p.Err() != nil {
		return p.Err()
	}
	if h.wal != nil {
		var enc record.Encoder
		if err := h.wal.Log(enc.Tombstones(stones, nil)); err != nil {
			return err
		}
	}
	for _, s := range stones {
		h.tombstones.AddInterval(s.Ref, s.Intervals[0])
	}

	return nil
}

// gc removes data before the head's minimum timestamp from the head.
// It returns the actual min time of the chunks present in the Head, as
// reported by the series stripes.
func (h *Head) gc() int64 {
	// Only data strictly lower than this timestamp must be deleted.
	cutoff := h.MinTime()

	// Drop old chunks and collect the refs of the series that can be
	// deleted entirely.
	removedRefs, removedChunks, actualMint := h.series.gc(cutoff)

	numRemoved := len(removedRefs)
	h.metrics.seriesRemoved.Add(float64(numRemoved))
	h.metrics.chunksRemoved.Add(float64(removedChunks))
	h.metrics.chunks.Sub(float64(removedChunks))
	h.numSeries.Sub(uint64(numRemoved))

	// The removed series must disappear from the postings lists as well.
	h.postings.Delete(removedRefs)

	// Drop tombstones that refer to removed series or lie before the cutoff.
	h.tombstones.DeleteTombstones(removedRefs)
	h.tombstones.TruncateBefore(cutoff)

	if h.wal == nil {
		return actualMint
	}

	// Remember the last WAL segment for every removed series. The series
	// records have to be kept until we are past that segment, because the
	// WAL will still hold sample records with these ref IDs; without the
	// series records, WAL replay (or any other WAL reader) would have no
	// labels for those refs and could not use the samples.
	// The error returned by wal.Segments is discarded here.
	_, lastSegment, _ := wal.Segments(h.wal.Dir())
	h.deletedMtx.Lock()
	for ref := range removedRefs {
		h.deleted[ref] = lastSegment
	}
	h.deletedMtx.Unlock()

	return actualMint
}

// Tombstones returns a reader over the head's tombstones. The returned error
// is always nil.
func (h *Head) Tombstones() (tombstones.Reader, error) {
	var reader tombstones.Reader = h.tombstones
	return reader, nil
}

// NumSeries returns the number of active series in the head, as tracked by
// the head's series counter.
func (h *Head) NumSeries() uint64 {
	count := h.numSeries.Load()
	return count
}

// Meta returns meta information about the head.
// The head is dynamic, so the min time, max time and series count reflect the
// moment of the call. The ULID is a fixed placeholder built from the bytes of
// "______head______".
func (h *Head) Meta() BlockMeta {
	var headID ulid.ULID
	copy(headID[:], "______head______")

	meta := BlockMeta{ULID: headID}
	meta.MinTime = h.MinTime()
	meta.MaxTime = h.MaxTime()
	meta.Stats.NumSeries = h.NumSeries()
	return meta
}

// MinTime returns the lowest time bound on visible data in the head.
func (h *Head) MinTime() int64 {
	mint := h.minTime.Load()
	return mint
}

// MaxTime returns the highest timestamp seen in data of the head.
func (h *Head) MaxTime() int64 {
	maxt := h.maxTime.Load()
	return maxt
}

// compactable reports whether the head has a compactable range, i.e. whether
// the time span covered by the head exceeds 1.5 times the chunk range.
// The extra 0.5 acts as a buffer of the appendable window.
func (h *Head) compactable() bool {
	span := h.MaxTime() - h.MinTime()
	// Integer arithmetic: halve first, then triple.
	threshold := h.chunkRange.Load() / 2 * 3
	return span > threshold
}

// Close marks the head as closed, then closes the chunk disk mapper and the
// WAL (when one is configured). If both closed without error and
// EnableMemorySnapshotOnShutdown is set, it also takes a snapshot of the
// in-memory chunks. All errors encountered are returned combined.
func (h *Head) Close() error {
	h.closedMtx.Lock()
	defer h.closedMtx.Unlock()

	h.closed = true

	closeErrs := tsdb_errors.NewMulti(h.chunkDiskMapper.Close())
	if h.wal != nil {
		closeErrs.Add(h.wal.Close())
	}

	// Only snapshot when everything above closed cleanly.
	if closeErrs.Err() == nil && h.opts.EnableMemorySnapshotOnShutdown {
		closeErrs.Add(h.performChunkSnapshot())
	}
	return closeErrs.Err()
}

// String returns a human readable representation of the TSDB head. Keep this
// method: without it the whole struct would be dumped whenever the head is
// stringified in errors or logs.
func (h *Head) String() string {
	const name = "head"
	return name
}

// getOrCreate returns the series for the given hash and label set, creating
// it under a freshly allocated series ID when it does not exist yet. The
// boolean reports whether the series was created by this call.
func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, error) {
	// Going straight to `getOrCreateWithID` would be semantically sufficient,
	// but then a new series would be created on every sample inserted via
	// Add(), which causes allocations and makes our series IDs rather random
	// and harder to compress in postings.
	if existing := h.series.getByHash(hash, lset); existing != nil {
		return existing, false, nil
	}

	// Optimistically assume that we are the first one to create the series.
	return h.getOrCreateWithID(h.lastSeriesID.Inc(), hash, lset)
}

// getOrCreateWithID returns the series for the given hash and label set,
// storing a new series with the given id if none exists. When a series is
// created, the creation metric and series counter are incremented and the
// series is added to the postings. The boolean reports whether the series was
// created by this call.
func (h *Head) getOrCreateWithID(id, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
	newSeries := func() *memSeries {
		return newMemSeries(lset, id, h.chunkRange.Load(), &h.memChunkPool)
	}

	s, created, err := h.series.getOrSet(hash, lset, newSeries)
	switch {
	case err != nil:
		return nil, false, err
	case !created:
		return s, false, nil
	}

	h.metrics.seriesCreated.Inc()
	h.numSeries.Inc()
	h.postings.Add(id, lset)

	return s, true, nil
}

// seriesHashmap is a simple hashmap for memSeries by their label set. It is built
// on top of a regular hashmap and holds a slice of series to resolve hash collisions.
// Its methods require the hash to be submitted with it to avoid re-computations throughout
// the code.
// The methods do no locking of their own.
type seriesHashmap map[uint64][]*memSeries

// get returns the series stored under hash whose label set equals lset, or
// nil if there is none.
func (m seriesHashmap) get(hash uint64, lset labels.Labels) *memSeries {
	bucket := m[hash]
	for idx := range bucket {
		if candidate := bucket[idx]; labels.Equal(candidate.lset, lset) {
			return candidate
		}
	}
	return nil
}

// set stores s under hash. A series already stored under hash with an equal
// label set is replaced in place; otherwise s is appended to the entries for
// that hash.
func (m seriesHashmap) set(hash uint64, s *memSeries) {
	bucket := m[hash]
	for idx := range bucket {
		if !labels.Equal(bucket[idx].lset, s.lset) {
			continue
		}
		bucket[idx] = s
		return
	}
	m[hash] = append(bucket, s)
}

// del removes every series stored under hash whose label set equals lset.
// The remaining series are stored in a new slice; if none remain, the hash
// key itself is deleted from the map.
func (m seriesHashmap) del(hash uint64, lset labels.Labels) {
	var kept []*memSeries
	for _, candidate := range m[hash] {
		if labels.Equal(candidate.lset, lset) {
			continue
		}
		kept = append(kept, candidate)
	}

	if len(kept) > 0 {
		m[hash] = kept
		return
	}
	delete(m, hash)
}

const (
	// DefaultStripeSize is the default number of entries to allocate in the stripeSeries hash map.
	// NOTE(review): stripeSeries selects a stripe with `x & (size-1)`, which
	// presumably requires the size to be a power of two, as this default is.
	DefaultStripeSize = 1 << 14
)

// stripeSeries locks modulo ranges of IDs and hashes to reduce lock contention.
// The locks are padded to not be on the same cache line. Filling the padded space
// with the maps was profiled to be slower – likely due to the additional pointer
// dereferences.
//
// series maps a series ref to its memSeries and is striped by the ref;
// hashes maps a label-set hash to its series and is striped by the hash.
// In both cases the stripe index is the key masked with size-1, and locks[i]
// is taken when accessing series[i] or hashes[i]. seriesLifecycleCallback is
// invoked around series creation and after series deletion.
type stripeSeries struct {
	size                    int
	series                  []map[uint64]*memSeries
	hashes                  []seriesHashmap
	locks                   []stripeLock
	seriesLifecycleCallback SeriesLifecycleCallback
}

// stripeLock is the per-stripe lock of stripeSeries: a sync.RWMutex followed
// by 40 bytes of padding.
type stripeLock struct {
	sync.RWMutex
	// Padding to avoid multiple locks being on the same cache line.
	_ [40]byte
}

// newStripeSeries returns a stripeSeries with stripeSize stripes. Every
// stripe gets its own empty ref map, empty hash map and lock. seriesCallback
// is stored as the series lifecycle callback.
func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *stripeSeries {
	ss := &stripeSeries{size: stripeSize, seriesLifecycleCallback: seriesCallback}
	ss.series = make([]map[uint64]*memSeries, stripeSize)
	ss.hashes = make([]seriesHashmap, stripeSize)
	ss.locks = make([]stripeLock, stripeSize)

	for i := 0; i < stripeSize; i++ {
		ss.series[i] = make(map[uint64]*memSeries)
		ss.hashes[i] = make(seriesHashmap)
	}
	return ss
}

// gc garbage collects old chunks that are strictly before mint and removes
// series entirely that have no chunks left.
//
// It returns the set of refs of the removed series, the number of removed
// chunks, and the smallest minTime reported by the series that were kept
// (or mint itself when no series was kept).
func (s *stripeSeries) gc(mint int64) (map[uint64]struct{}, int, int64) {
	var (
		deleted                  = map[uint64]struct{}{}
		deletedForCallback       = []labels.Labels{}
		rmChunks                 = 0
		actualMint         int64 = math.MaxInt64
	)
	// Run through all series and truncate old chunks. Mark those with no
	// chunks left as deleted and store their ID.
	for i := 0; i < s.size; i++ {
		s.locks[i].Lock()

		for hash, all := range s.hashes[i] {
			for _, series := range all {
				series.Lock()
				rmChunks += series.truncateChunksBefore(mint)

				// A series is kept while it still has any chunk or has
				// samples waiting to be committed.
				if len(series.mmappedChunks) > 0 || series.headChunk != nil || series.pendingCommit {
					seriesMint := series.minTime()
					if seriesMint < actualMint {
						actualMint = seriesMint
					}
					series.Unlock()
					continue
				}

				// The series is gone entirely. We need to keep the series lock
				// and make sure we have acquired the stripe locks for hash and ID of the
				// series alike.
				// If we don't hold them all, there's a very small chance that a series receives
				// samples again while we are half-way into deleting it.
				// j is the stripe of the series ref; i is the stripe of its hash.
				j := int(series.ref) & (s.size - 1)

				// The lock of stripe i is already held, so only take j if it differs.
				if i != j {
					s.locks[j].Lock()
				}

				deleted[series.ref] = struct{}{}
				s.hashes[i].del(hash, series.lset)
				delete(s.series[j], series.ref)
				deletedForCallback = append(deletedForCallback, series.lset)

				if i != j {
					s.locks[j].Unlock()
				}

				series.Unlock()
			}
		}

		s.locks[i].Unlock()

		// The callback runs once per stripe, after the stripe lock has been
		// released; the slice is then reset for reuse by the next stripe.
		s.seriesLifecycleCallback.PostDeletion(deletedForCallback...)
		deletedForCallback = deletedForCallback[:0]
	}

	// No series was kept, so fall back to the requested mint.
	if actualMint == math.MaxInt64 {
		actualMint = mint
	}

	return deleted, rmChunks, actualMint
}

// getByID returns the series stored under the given ref, or nil if there is
// none. The lookup happens under the read lock of the ref's stripe.
func (s *stripeSeries) getByID(id uint64) *memSeries {
	stripe := id & uint64(s.size-1)
	mtx := &s.locks[stripe]

	mtx.RLock()
	found := s.series[stripe][id]
	mtx.RUnlock()

	return found
}

// getByHash returns the series stored under hash whose label set equals lset,
// or nil if there is none. The lookup happens under the read lock of the
// hash's stripe.
func (s *stripeSeries) getByHash(hash uint64, lset labels.Labels) *memSeries {
	stripe := hash & uint64(s.size-1)
	mtx := &s.locks[stripe]

	mtx.RLock()
	found := s.hashes[stripe].get(hash, lset)
	mtx.RUnlock()

	return found
}

// getOrSet returns the series already stored for the given hash and label
// set, or stores and returns the series produced by createSeries.
//
// The boolean reports whether a new series was stored by this call. If a
// series already exists it is returned with a nil error, regardless of the
// PreCreation result. If none exists and PreCreation returned an error, that
// error is returned and nothing is stored. Unless PreCreation fails,
// createSeries is invoked before the lookup, so its result is discarded when
// a series already exists.
func (s *stripeSeries) getOrSet(hash uint64, lset labels.Labels, createSeries func() *memSeries) (*memSeries, bool, error) {
	// PreCreation is called here to avoid calling it inside the lock.
	// It is not necessary to call it just before creating a series,
	// rather it gives a 'hint' whether to create a series or not.
	preCreationErr := s.seriesLifecycleCallback.PreCreation(lset)

	// Create the series, unless the PreCreation() callback as failed.
	// If failed, we'll not allow to create a new series anyway.
	var series *memSeries
	if preCreationErr == nil {
		series = createSeries()
	}

	// First phase: look up / insert by hash under the hash stripe's lock.
	i := hash & uint64(s.size-1)
	s.locks[i].Lock()

	if prev := s.hashes[i].get(hash, lset); prev != nil {
		s.locks[i].Unlock()
		return prev, false, nil
	}
	if preCreationErr == nil {
		s.hashes[i].set(hash, series)
	}
	s.locks[i].Unlock()

	if preCreationErr != nil {
		// The callback prevented creation of series.
		return nil, false, preCreationErr
	}
	// Setting the series in the s.hashes marks the creation of series
	// as any further calls to this methods would return that series.
	s.seriesLifecycleCallback.PostCreation(series.lset)

	// Second phase: register the series by ref under the ref stripe's lock.
	i = series.ref & uint64(s.size-1)

	s.locks[i].Lock()
	s.series[i][series.ref] = series
	s.locks[i].Unlock()

	return series, true, nil
}

// sample is a single timestamp/value pair. Its T and V methods return the
// timestamp and the value respectively.
type sample struct {
	t int64
	v float64
}

// newSample returns a sample with timestamp t and value v as a tsdbutil.Sample.
func newSample(t int64, v float64) tsdbutil.Sample {
	return sample{t: t, v: v}
}

// T returns the timestamp of the sample.
func (s sample) T() int64 {
	return s.t
}

// V returns the value of the sample.
func (s sample) V() float64 {
	return s.v
}

// memSeries is the in-memory representation of a series. None of its methods
// are goroutine safe and it is the caller's responsibility to lock it.
// The embedded RWMutex is the lock callers are expected to hold.
type memSeries struct {
	sync.RWMutex

	ref           uint64        // Series reference (ID) this series is stored under.
	lset          labels.Labels // Label set identifying the series.
	mmappedChunks []*mmappedChunk
	mmMaxTime     int64 // Max time of any mmapped chunk, only used during WAL replay.
	headChunk     *memChunk
	chunkRange    int64
	firstChunkID  int // Advanced by the number of chunks removed on truncation.

	nextAt        int64 // Timestamp at which to cut the next chunk.
	sampleBuf     [4]sample
	pendingCommit bool // Whether there are samples waiting to be committed to this series.

	app chunkenc.Appender // Current appender for the chunk.

	memChunkPool *sync.Pool

	txs *txRing
}

// newMemSeries returns a memSeries for the given label set, ref, chunk range
// and chunk pool. The series starts without chunks, with nextAt set to
// math.MinInt64 and a transaction ring created with newTxRing(4).
func newMemSeries(lset labels.Labels, id uint64, chunkRange int64, memChunkPool *sync.Pool) *memSeries {
	return &memSeries{
		ref:          id,
		lset:         lset,
		chunkRange:   chunkRange,
		nextAt:       math.MinInt64,
		memChunkPool: memChunkPool,
		txs:          newTxRing(4),
	}
}

// minTime returns the min time of the first mmapped chunk if there is one,
// otherwise the min time of the head chunk. It returns math.MinInt64 when
// the series has no chunks at all.
func (s *memSeries) minTime() int64 {
	switch {
	case len(s.mmappedChunks) > 0:
		return s.mmappedChunks[0].minTime
	case s.headChunk != nil:
		return s.headChunk.minTime
	default:
		return math.MinInt64
	}
}

// maxTime returns the max time of the head chunk if there is one, otherwise
// the max time of the last mmapped chunk. It returns math.MinInt64 when the
// series has no chunks at all.
func (s *memSeries) maxTime() int64 {
	if hc := s.head(); hc != nil {
		return hc.maxTime
	}
	if n := len(s.mmappedChunks); n > 0 {
		return s.mmappedChunks[n-1].maxTime
	}
	return math.MinInt64
}

// truncateChunksBefore drops the chunks of the series whose max time lies
// before mint and returns how many were dropped. firstChunkID is advanced by
// that number, so the IDs of the remaining chunks stay unchanged.
func (s *memSeries) truncateChunksBefore(mint int64) (removed int) {
	if hc := s.headChunk; hc != nil && hc.maxTime < mint {
		// The head chunk itself is too old, so every mmapped chunk goes too.
		removed = len(s.mmappedChunks) + 1
		s.headChunk = nil
		s.mmappedChunks = nil
		s.firstChunkID += removed
		return removed
	}

	if len(s.mmappedChunks) == 0 {
		return removed
	}

	// Count the leading chunks that end before mint.
	for removed < len(s.mmappedChunks) && s.mmappedChunks[removed].maxTime < mint {
		removed++
	}
	// Shift the surviving chunks to the front, reusing the backing array.
	s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removed:]...)
	s.firstChunkID += removed
	return removed
}

// cleanupAppendIDsBelow cleans up older appendIDs by delegating to the
// series' transaction ring. The series lock has to be held by the caller.
func (s *memSeries) cleanupAppendIDsBelow(bound uint64) {
	ring := s.txs
	ring.cleanupAppendIDsBelow(bound)
}

// head returns the current head chunk of the series, which may be nil.
func (s *memSeries) head() *memChunk {
	hc := s.headChunk
	return hc
}

// memChunk holds a chunk together with its min and max time.
type memChunk struct {
	chunk            chunkenc.Chunk
	minTime, maxTime int64
}

// OverlapsClosedInterval reports whether the chunk's [minTime, maxTime]
// overlaps the closed interval [mint, maxt].
func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool {
	chunkMint, chunkMaxt := mc.minTime, mc.maxTime
	return overlapsClosedInterval(chunkMint, chunkMaxt, mint, maxt)
}

// overlapsClosedInterval reports whether the closed intervals [mint1, maxt1]
// and [mint2, maxt2] share at least one point. Intervals that merely touch at
// a boundary count as overlapping.
func overlapsClosedInterval(mint1, maxt1, mint2, maxt2 int64) bool {
	// Two closed intervals are disjoint exactly when one starts after the
	// other ends.
	disjoint := mint1 > maxt2 || mint2 > maxt1
	return !disjoint
}

// mmappedChunk describes chunk data on disk that can be mmapped: the
// reference to the chunk in the chunk disk mapper, its number of samples and
// its min and max time.
type mmappedChunk struct {
	ref              chunks.ChunkDiskMapperRef
	numSamples       uint16
	minTime, maxTime int64
}

// OverlapsClosedInterval reports whether the chunk's [minTime, maxTime]
// overlaps the closed interval [mint, maxt].
func (mc *mmappedChunk) OverlapsClosedInterval(mint, maxt int64) bool {
	chunkMint, chunkMaxt := mc.minTime, mc.maxTime
	return overlapsClosedInterval(chunkMint, chunkMaxt, mint, maxt)
}

// noopSeriesLifecycleCallback is a series lifecycle callback whose methods do
// nothing: PreCreation always returns nil and PostCreation and PostDeletion
// have empty bodies.
type noopSeriesLifecycleCallback struct{}

func (noopSeriesLifecycleCallback) PreCreation(labels.Labels) error { return nil }
func (noopSeriesLifecycleCallback) PostCreation(labels.Labels)      {}
func (noopSeriesLifecycleCallback) PostDeletion(...labels.Labels)   {}

// Size returns the sum of the WAL size (when a WAL is configured) and the
// chunk disk mapper size. Errors from either size lookup are ignored.
func (h *Head) Size() int64 {
	var total int64
	if h.wal != nil {
		walSize, _ := h.wal.Size()
		total += walSize
	}
	cdmSize, _ := h.chunkDiskMapper.Size()
	return total + cdmSize
}

// Size returns the size of the underlying head.
func (h *RangeHead) Size() int64 {
	size := h.head.Size()
	return size
}

// startWALReplayStatus initializes the WAL replay status under its lock:
// Min and Current are set to startFrom and Max is set to last.
func (h *Head) startWALReplayStatus(startFrom, last int) {
	h.stats.WALReplayStatus.Lock()
	h.stats.WALReplayStatus.Min, h.stats.WALReplayStatus.Max = startFrom, last
	h.stats.WALReplayStatus.Current = startFrom
	h.stats.WALReplayStatus.Unlock()
}

// updateWALReplayStatusRead records current as the Current value of the WAL
// replay status, under the status lock.
func (h *Head) updateWALReplayStatusRead(current int) {
	h.stats.WALReplayStatus.Lock()
	h.stats.WALReplayStatus.Current = current
	h.stats.WALReplayStatus.Unlock()
}
-												Add liecence file and headers

											
										
										
											2017-04-10 18:59:45 +00:00
+								// Copyright 2017 The Prometheus Authors
 								// Licensed under the Apache License, Version 2.0 (the "License");
 								// you may not use this file except in compliance with the License.
 								// You may obtain a copy of the License at
 								//
 								// http://www.apache.org/licenses/LICENSE-2.0
 								//
 								// Unless required by applicable law or agreed to in writing, software
 								// distributed under the License is distributed on an "AS IS" BASIS,
 								// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 								// See the License for the specific language governing permissions and
 								// limitations under the License.
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 12:16:11 +00:00
+								package tsdb
 								import (
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 18:33:28 +00:00
+									"fmt"
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+									"io"
-												Fix last timestamp initialization

This initializes the chunkDesc's last timestamp to the minimum
value so initial samples with a timestamp of 0 (e.g. in tests)
are not accidentally dropped.

											
										
										
											2017-01-04 13:06:40 +00:00
+									"math"
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									"path/filepath"
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 12:16:11 +00:00
+									"sync"
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									"time"
-												Switch append refs to string

											
										
										
											2017-05-17 14:43:01 +00:00
-												Switched to go-kit/log

Signed-off-by: Levi Harrison <git@leviharrison.dev>

											
										
										
											2021-06-11 16:17:59 +00:00
+									"github.com/go-kit/log"
 									"github.com/go-kit/log/level"
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 08:04:48 +00:00
+									"github.com/oklog/ulid"
-												Move stats into meta.json file, cleanup, docs

											
										
										
											2017-01-19 10:22:47 +00:00
+									"github.com/pkg/errors"
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									"github.com/prometheus/client_golang/prometheus"
-												Move away from testutil, refactor imports (#8087)

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-10-22 09:00:08 +00:00
+									"go.uber.org/atomic"
-												Exemplar resize (#8974)

* Create experimental circular buffer resize method, benchmarks

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* Optimize exemplar resize to only replay as many exemplars as needed

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* More comments, benchmark AddExemplar

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* optimizations

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* comment

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* Slight refactor of resize benchmark + make use of resize via runtime
reloadable storage config.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Some more config related changes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Address some review comments.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Address more review comments.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Refactor to remove usage of noopExemplarStorage and avoid race condition
when resizing from Head code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix or add comments to clarify some of the new behaviour.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* fix potential panics related to negative exemplar buffer lengths

Signed-off-by: Callum Styan <callumstyan@gmail.com>

Co-authored-by: Callum Styan <callumstyan@gmail.com>
											
										
										
											2021-07-20 04:52:57 +00:00
+									"github.com/prometheus/prometheus/config"
-												Add circular in-memory exemplars storage (#6635)

* Add circular in-memory exemplars storage

Signed-off-by: Callum Styan <callumstyan@gmail.com>
Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>
Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
Signed-off-by: Martin Disibio <mdisibio@gmail.com>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>
Co-authored-by: Martin Disibio <mdisibio@gmail.com>

* Fix some comments, clean up exemplar metrics struct and exemplar tests.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix exemplar query api null vs empty array issue.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>
Co-authored-by: Martin Disibio <mdisibio@gmail.com>
											
										
										
											2021-03-16 09:47:45 +00:00
+									"github.com/prometheus/prometheus/pkg/exemplar"
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 19:53:33 +00:00
+									"github.com/prometheus/prometheus/pkg/labels"
-												Unify Iterator interfaces. All point to storage now.

This is part of https://github.com/prometheus/prometheus/pull/5882 that can be done to simplify things.
All todos I added will be fixed in follow up PRs.

* querier.Querier, querier.Appender, querier.SeriesSet, and querier.Series interfaces merged
with storage interface.go. All imports that.
* querier.SeriesIterator replaced by chunkenc.Iterator
* Added chunkenc.Iterator.Seek method and tests for xor implementation (?)
* Since we properly handle SelectParams for Select methods I adjusted min max
based on that. This should help in terms of performance for queries with functions like offset.
* added Seek to deletedIterator and test.
* storage/tsdb was removed as it was only a unnecessary glue with incompatible structs.

No logic was changed, only different source of abstractions, so no need for benchmarks.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

											
										
										
											2020-02-06 15:58:38 +00:00
+									"github.com/prometheus/prometheus/storage"
-												Cleanup after merging tsdb into prometheus

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-08-13 08:34:14 +00:00
+									"github.com/prometheus/prometheus/tsdb/chunkenc"
 									"github.com/prometheus/prometheus/tsdb/chunks"
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									tsdb_errors "github.com/prometheus/prometheus/tsdb/errors"
-												Cleanup after merging tsdb into prometheus

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-08-13 08:34:14 +00:00
+									"github.com/prometheus/prometheus/tsdb/index"
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 09:15:41 +00:00
+									"github.com/prometheus/prometheus/tsdb/record"
 									"github.com/prometheus/prometheus/tsdb/tombstones"
-												tsdb: Added ChunkQueryable implementations to db; unified MergeSeriesSets and vertical to single struct. (#7069)

* tsdb: Added ChunkQueryable implementations to db; unified compactor, querier and fanout block iterating.

Chained to https://github.com/prometheus/prometheus/pull/7059

* NewMerge(Chunk)Querier now takes multiple primaries allowing tsdb DB code to use it.
* Added single SeriesEntry / ChunkEntry for all series implementations.
* Unified all vertical, and non vertical for compact and querying to single
merge series / chunk sets by reusing VerticalSeriesMergeFunc for overlapping algorithm (same logic as before)
* Added block (Base/Chunk/)Querier for block querying. We then use populateAndTomb(Base/Chunk/) to iterate over chunks or samples.
* Refactored endpoint tests and querier tests to include subtests.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed comments from Brian and Beorn.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed snapshot test and added chunk iterator support for DBReadOnly.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed race when iterating over Ats first.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed tests.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed populate block tests.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed endpoints test.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed test.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Added test & fixed case of head open chunk.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed DBReadOnly tests and bug producing 1 sample chunks.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Added cases for partial block overlap for multiple full chunks.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Added extra tests for chunk meta after compaction.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed small vertical merge bug and added more tests for that.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>
											
										
										
											2020-07-31 15:03:02 +00:00
+									"github.com/prometheus/prometheus/tsdb/tsdbutil"
-												Cleanup after merging tsdb into prometheus

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-08-13 08:34:14 +00:00
+									"github.com/prometheus/prometheus/tsdb/wal"
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 12:16:11 +00:00
+								)
-												Write to WAL before appending to memory storage

											
										
										
											2017-01-17 15:33:58 +00:00
+								var (
-												tsdb: error on series with duplicate labels (#6664)

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-01-20 11:05:27 +00:00
+									// ErrInvalidSample is returned if an appended sample is not valid and can't
 									// be ingested.
 									ErrInvalidSample = errors.New("invalid sample")
-												Add circular in-memory exemplars storage (#6635)

* Add circular in-memory exemplars storage

Signed-off-by: Callum Styan <callumstyan@gmail.com>
Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>
Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
Signed-off-by: Martin Disibio <mdisibio@gmail.com>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>
Co-authored-by: Martin Disibio <mdisibio@gmail.com>

* Fix some comments, clean up exemplar metrics struct and exemplar tests.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix exemplar query api null vs empty array issue.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>
Co-authored-by: Martin Disibio <mdisibio@gmail.com>
											
										
										
											2021-03-16 09:47:45 +00:00
+									// ErrInvalidExemplar is returned if an appended exemplar is not valid and can't
 									// be ingested.
 									ErrInvalidExemplar = errors.New("invalid exemplar")
-												TSDB: Error when we commit/rollback twice (#7593)

* TSDB: Error when we commit/rollback twice

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-07-22 09:57:38 +00:00
+									// ErrAppenderClosed is returned if an appender has already been successfully
-												fix misspell (#7764)

Signed-off-by: Zhou Hao <zhouhao@cn.fujitsu.com>
											
										
										
											2020-08-07 07:57:25 +00:00
+									// rolled back or committed.
-												TSDB: Error when we commit/rollback twice (#7593)

* TSDB: Error when we commit/rollback twice

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-07-22 09:57:38 +00:00
+									ErrAppenderClosed = errors.New("appender closed")
-												Write to WAL before appending to memory storage

											
										
										
											2017-01-17 15:33:58 +00:00
+								)
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 22:39:17 +00:00
+								// Head handles reads and writes of time series data within a time window.
 								type Head struct {
-												tsdb: Block Head GC till pending readers are done reading (#9081)

* tsdb: Block Head GC till pending readers are done reading

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments 2

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix the exclusiveness of maxt in WaitForPendingReadersInTimeRange

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-07-20 08:47:20 +00:00
+									chunkRange               atomic.Int64
 									numSeries                atomic.Uint64
 									minTime, maxTime         atomic.Int64 // Current min and max of the samples included in the head.
 									minValidTime             atomic.Int64 // Mint allowed to be added to the head. It shouldn't be lower than the maxt of the last persisted block.
 									lastWALTruncationTime    atomic.Int64
 									lastMemoryTruncationTime atomic.Int64
 									lastSeriesID             atomic.Uint64
-												Merge the 2.13 release branch to master (#6117)


											
										
										
											2019-10-09 15:41:46 +00:00
-												Exemplar resize (#8974)

* Create experimental circular buffer resize method, benchmarks

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* Optimize exemplar resize to only replay as many exemplars as needed

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* More comments, benchmark AddExemplar

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* optimizations

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* comment

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* Slight refactor of resize benchmark + make use of resize via runtime
reloadable storage config.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Some more config related changes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Address some review comments.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Address more review comments.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Refactor to remove usage of noopExemplarStorage and avoid race condition
when resizing from Head code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix or add comments to clarify some of the new behaviour.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* fix potential panics related to negative exemplar buffer lengths

Signed-off-by: Callum Styan <callumstyan@gmail.com>

Co-authored-by: Callum Styan <callumstyan@gmail.com>
											
										
										
											2021-07-20 04:52:57 +00:00
+									metrics         *headMetrics
 									opts            *HeadOptions
 									wal             *wal.WAL
 									exemplarMetrics *ExemplarMetrics
 									exemplars       ExemplarStorage
 									logger          log.Logger
 									appendPool      sync.Pool
 									exemplarsPool   sync.Pool
 									seriesPool      sync.Pool
 									bytesPool       sync.Pool
 									memChunkPool    sync.Pool
-												Count writer references on head blocks

											
										
										
											2017-02-04 10:53:52 +00:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									// All series addressable by their ID or hash.
-												Combine NewHead() args into a HeadOptions struct (#8452)

* Combine NewHead() args into a HeadOptions struct

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>

* remove overrides params

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>

* address pr feedback

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>
											
										
										
											2021-02-09 14:12:48 +00:00
+									series *stripeSeries
-												Consolidate mem index into HeadBlock

											
										
										
											2016-12-22 00:12:28 +00:00
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 13:16:24 +00:00
+									deletedMtx sync.Mutex
 									deleted    map[uint64]int // Deleted series, and what WAL segment they must be kept until.
-												Fix punctuation nits

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2020-02-17 18:37:09 +00:00
+									postings *index.MemPostings // Postings lists for terms.
-												Head Cardinality Status Page (#6125)

* Adding TSDB Head Stats like cardinality to Status Page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Moving mutx to Head

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variabls

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variabls and html

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Removing unwanted whitespaces

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding Tests, Banchmarks and Max Heap for Postings Stats

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Remove generated asset file that is no longer used

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Changing comment and variable name for more readability

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Using time.Duration in postings status function and removing refresh button from web page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

											
										
										
											2019-11-05 02:06:13 +00:00
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 15:38:00 +00:00
+									tombstones *tombstones.MemTombstones
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+									iso *isolation
-												Head Cardinality Status Page (#6125)

* Adding TSDB Head Stats like cardinality to Status Page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Moving mutx to Head

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variabls

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Renaming variabls and html

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Removing unwanted whitespaces

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding Tests, Banchmarks and Max Heap for Postings Stats

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Adding more tests for postingstats and web handler

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Remove generated asset file that is no longer used

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

* Changing comment and variable name for more readability

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

* Using time.Duration in postings status function and removing refresh button from web page

Signed-off-by: Sharad Gaur <sgaur@splunk.com>

											
										
										
											2019-11-05 02:06:13 +00:00
+									cardinalityMutex      sync.Mutex
-												Fix punctuation nits

Signed-off-by: beorn7 <beorn@grafana.com>

											
										
										
											2020-02-17 18:37:09 +00:00
+									cardinalityCache      *index.PostingsStats // Posting stats cache which will expire after 30sec.
 									lastPostingsStatsCall time.Duration        // Last posting stats call (PostingsCardinalityStats()) time for caching.
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
 									// chunkDiskMapper is used to write and read Head chunks to/from disk.
 									chunkDiskMapper *chunks.ChunkDiskMapper
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 09:03:23 +00:00
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+									chunkSnapshotMtx sync.Mutex
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 09:03:23 +00:00
+									closedMtx sync.Mutex
 									closed    bool
-												React UI: Add Starting Screen (#8662)

* Added walreplay API endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added starting page to react-ui

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Documented the new endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed typos

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>

* Removed logo

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed isResponding to isUnexpected

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added DB stats object

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Updated starting page to work with new fields

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 2)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 3)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 4)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 5)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed const to let

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 6)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Remove SetStats method

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added comma

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed api

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed to triple equals

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed data response types

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Don't return pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed version

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed interface issue

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed copying lock value error

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>
											
										
										
											2021-06-05 14:29:32 +00:00
 									stats *HeadStats
-												Exemplar resize (#8974)

* Create experimental circular buffer resize method, benchmarks

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* Optimize exemplar resize to only replay as many exemplars as needed

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* More comments, benchmark AddExemplar

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* optimizations

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* comment

Signed-off-by: Martin Disibio <mdisibio@gmail.com>

* Slight refactor of resize benchmark + make use of resize via runtime
reloadable storage config.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Some more config related changes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Address some review comments.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Address more review comments.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Refactor to remove usage of noopExemplarStorage and avoid race condition
when resizing from Head code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix or add comments to clarify some of the new behaviour.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* fix potential panics related to negative exemplar buffer lengths

Signed-off-by: Callum Styan <callumstyan@gmail.com>

Co-authored-by: Callum Styan <callumstyan@gmail.com>
											
										
										
											2021-07-20 04:52:57 +00:00
+									reg   prometheus.Registerer
-												tsdb: Block Head GC till pending readers are done reading (#9081)

* tsdb: Block Head GC till pending readers are done reading

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments 2

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix the exclusiveness of maxt in WaitForPendingReadersInTimeRange

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-07-20 08:47:20 +00:00
 									memTruncationInProcess atomic.Bool
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 12:16:11 +00:00
+								}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								type ExemplarStorage interface {
 									storage.ExemplarQueryable
 									AddExemplar(labels.Labels, exemplar.Exemplar) error
 									ValidateExemplar(labels.Labels, exemplar.Exemplar) error
-												Exemplars in snapshot (#9255)

* Exemplars in snapshot

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Add docs

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-30 14:04:38 +00:00
+									IterateExemplars(f func(seriesLabels labels.Labels, e exemplar.Exemplar) error) error
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								}
-												Combine NewHead() args into a HeadOptions struct (#8452)

* Combine NewHead() args into a HeadOptions struct

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>

* remove overrides params

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>

* address pr feedback

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>
											
										
										
											2021-02-09 14:12:48 +00:00
+								// HeadOptions are parameters for the Head block.
 								type HeadOptions struct {
-												Merge release 2.29 in main (#9196)

* PromQL: Fix start and end keywords masking label and metric names

This commit fixes an issue with the "at modifier" that introduced two
new keywords: `start` and `end`. In grouping options and in metric
names, these keywords took precedence over metric or label names, so
that those metrics and labels could no longer be referenced.

Signed-off-by: Clayton Peters <clayton.peters@man.com>

* Add in additional tests for metrics and/or labels called start/end.

Signed-off-by: Clayton Peters <clayton.peters@man.com>

* *: Cut 2.29.0-rc.0

Signed-off-by: Frederic Branczyk <fbranczyk@gmail.com>

* VERSION: bump to 2.29.0-rc.0

Signed-off-by: Frederic Branczyk <fbranczyk@gmail.com>

* Remove experimental wording on size-based retention

Followup of #9004

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

* Fix PR reference in changelog

Signed-off-by: George Brighton <george@gebn.co.uk>

* Describe EC2 availability zone IDs at most once per refresh (#9142)

Signed-off-by: George Brighton <george@gebn.co.uk>

* Describe EC2 availability zones at most once per SD load

Closes #9142.

Signed-off-by: George Brighton <george@gebn.co.uk>

* Incorporate feedback

Signed-off-by: George Brighton <george@gebn.co.uk>

* Integrate feedback

Signed-off-by: George Brighton <george@gebn.co.uk>

* Add a compatibility note for macOS users.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

* *: Cut v2.29.0-rc.1

Signed-off-by: Frederic Branczyk <fbranczyk@gmail.com>

* Fix `kuma_sd` targetgroup reporting (#9157)

* Bundle all xDS targets into a single group

Signed-off-by: austin ce <austin.cawley@gmail.com>

* *: cut v2.29.0-rc.2

Signed-off-by: Frederic Branczyk <fbranczyk@gmail.com>

* Rename links

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* bump codemirror-promql to 0.17.0

Signed-off-by: Augustin Husson <husson.augustin@gmail.com>

* *: cut v2.29.0

Signed-off-by: Frederic Branczyk <fbranczyk@gmail.com>

* tsdb: align atomically accessed int64 (#9192)

This prevents a panic in 32-bit archs:
https://pkg.go.dev/sync/atomic#pkg-note-BUG

Fixed #9190

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

* Release 2.29.1 (#9193)

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

Co-authored-by: Clayton Peters <clayton.peters@man.com>
Co-authored-by: Frederic Branczyk <fbranczyk@gmail.com>
Co-authored-by: George Brighton <george@gebn.co.uk>
Co-authored-by: Austin Cawley-Edwards <austin.cawley@gmail.com>
Co-authored-by: Levi Harrison <git@leviharrison.dev>
Co-authored-by: Augustin Husson <husson.augustin@gmail.com>
											
										
										
											2021-08-12 16:38:06 +00:00
+									// Runtime reloadable option. At the top of the struct for 32 bit OS:
 									// https://pkg.go.dev/sync/atomic#pkg-note-BUG
 									MaxExemplars atomic.Int64
-												Combine NewHead() args into a HeadOptions struct (#8452)

* Combine NewHead() args into a HeadOptions struct

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>

* remove overrides params

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>

* address pr feedback

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>
											
										
										
											2021-02-09 14:12:48 +00:00
+									ChunkRange int64
 									// ChunkDirRoot is the parent directory of the chunks directory.
 									ChunkDirRoot         string
 									ChunkPool            chunkenc.Pool
 									ChunkWriteBufferSize int
 									// StripeSize sets the number of entries in the hash map, it must be a power of 2.
 									// A larger StripeSize will allocate more memory up-front, but will increase performance when handling a large number of series.
 									// A smaller StripeSize reduces the memory allocated, but can decrease performance with large number of series.
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+									StripeSize                     int
 									SeriesCallback                 SeriesLifecycleCallback
 									EnableExemplarStorage          bool
 									EnableMemorySnapshotOnShutdown bool
-												Combine NewHead() args into a HeadOptions struct (#8452)

* Combine NewHead() args into a HeadOptions struct

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>

* remove overrides params

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>

* address pr feedback

Signed-off-by: Dustin Hooten <dustinhooten@gmail.com>
											
										
										
											2021-02-09 14:12:48 +00:00
+								}
 								// DefaultHeadOptions returns a newly allocated HeadOptions filled in with the
 								// package defaults. Fields that are not assigned below keep their zero value.
 								func DefaultHeadOptions() *HeadOptions {
 									defaults := new(HeadOptions)
 									defaults.ChunkRange = DefaultBlockDuration
 									// No chunk directory root is chosen by default.
 									defaults.ChunkDirRoot = ""
 									defaults.ChunkPool = chunkenc.NewPool()
 									defaults.ChunkWriteBufferSize = chunks.DefaultWriteBufferSize
 									defaults.StripeSize = DefaultStripeSize
 									defaults.SeriesCallback = &noopSeriesLifecycleCallback{}
 									return defaults
 								}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								// SeriesLifecycleCallback specifies a list of callbacks that will be called during a lifecycle of a series.
 								// It is always a no-op in Prometheus and mainly meant for external users who import TSDB.
 								// All the callbacks should be safe to be called concurrently.
 								// It is up to the user to implement soft or hard consistency by making the callbacks
 								// atomic or non-atomic. Atomic callbacks can degrade performance.
 								type SeriesLifecycleCallback interface {
 									// PreCreation is called before a series is created and decides whether
 									// the creation may go ahead: a non-nil error means the series should not
 									// be created. The argument is presumably the label set of the series
 									// about to be created.
 									PreCreation(labels.Labels) error
 									// PostCreation is called after a series has been created. It returns
 									// nothing, so it cannot veto or report a failure.
 									PostCreation(labels.Labels)
 									// PostDeletion is called after series have been deleted. It is variadic,
 									// so a single call can carry any number of label sets.
 									PostDeletion(...labels.Labels)
 								}
 								// NewHead constructs a Head from the given registerer, logger, WAL, options
 								// and stats.
 								//
 								// Missing inputs are defaulted: a nil l becomes a no-op logger, a nil stats
 								// becomes NewHeadStats(), a nil opts.SeriesCallback becomes a no-op callback
 								// and a nil opts.ChunkPool becomes chunkenc.NewPool(). The opts defaults, and
 								// the zeroing of opts.MaxExemplars when opts.EnableExemplarStorage is false,
 								// are written into the caller's opts, which the returned Head keeps a pointer
 								// to. opts itself is dereferenced unconditionally and must not be nil.
 								//
 								// An error is returned if opts.ChunkRange is smaller than 1, if
 								// resetInMemoryState fails, or if the chunk disk mapper cannot be created.
 								func NewHead(r prometheus.Registerer, l log.Logger, wal *wal.WAL, opts *HeadOptions, stats *HeadStats) (*Head, error) {
 									var err error
 									if l == nil {
 										l = log.NewNopLogger()
 									}
 									// Reject an invalid chunk range before anything is constructed.
 									if opts.ChunkRange < 1 {
 										return nil, errors.Errorf("invalid chunk range %d", opts.ChunkRange)
 									}
 									if opts.SeriesCallback == nil {
 										opts.SeriesCallback = &noopSeriesLifecycleCallback{}
 									}
 									if stats == nil {
 										stats = NewHeadStats()
 									}
-												Fix panic on failed snapshot replay and don't hard fail replay on disabled exemplars (#9438)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-10-05 05:21:25 +00:00
+									if !opts.EnableExemplarStorage {
 										// Exemplar storage is disabled: force the exemplar limit to 0.
 										opts.MaxExemplars.Store(0)
 									}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+									h := &Head{
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+										wal:    wal,
 										logger: l,
 										opts:   opts,
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+										memChunkPool: sync.Pool{
 											New: func() interface{} {
 												return &memChunk{}
 											},
 										},
 										stats: stats,
 										reg:   r,
 									}
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+									if err := h.resetInMemoryState(); err != nil {
 										return nil, err
 									}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+									h.metrics = newHeadMetrics(h, r)
 									// Default the chunk pool; it is handed to the chunk disk mapper below.
 									if opts.ChunkPool == nil {
 										opts.ChunkPool = chunkenc.NewPool()
 									}
 									// The disk mapper works on the directory that mmappedChunksDir derives
 									// from opts.ChunkDirRoot.
 									h.chunkDiskMapper, err = chunks.NewChunkDiskMapper(
 										mmappedChunksDir(opts.ChunkDirRoot),
 										opts.ChunkPool,
 										opts.ChunkWriteBufferSize,
 									)
 									if err != nil {
 										return nil, err
 									}
 									return h, nil
 								}
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+								func (h *Head) resetInMemoryState() error {
 									// resetInMemoryState (re)initialises the in-memory fields of h: exemplar
 									// metrics and storage, series, postings, tombstones, isolation, the deleted
 									// map, the chunk range and the min/max/truncation time trackers. It returns
 									// an error only if the exemplar storage cannot be created, in which case
 									// no field of h has been modified yet.
 									var err error
-												Fix panic on failed snapshot replay and don't hard fail replay on disabled exemplars (#9438)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-10-05 05:21:25 +00:00
+									var em *ExemplarMetrics
 									// Reuse the metrics of an existing CircularExemplarStorage when there is
 									// one; otherwise create new metrics from h.reg.
 									// NOTE(review): presumably this avoids registering the exemplar metrics a
 									// second time when the state is reset again; confirm.
 									if h.exemplars != nil {
 										ce, ok := h.exemplars.(*CircularExemplarStorage)
 										if ok {
 											em = ce.metrics
 										}
 									}
 									if em == nil {
 										em = NewExemplarMetrics(h.reg)
 									}
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+									es, err := NewCircularExemplarStorage(h.opts.MaxExemplars.Load(), em)
 									if err != nil {
 										return err
 									}
 									h.exemplarMetrics = em
 									h.exemplars = es
 									h.series = newStripeSeries(h.opts.StripeSize, h.opts.SeriesCallback)
 									h.postings = index.NewUnorderedMemPostings()
 									h.tombstones = tombstones.NewMemTombstones()
 									h.iso = newIsolation()
 									h.deleted = map[uint64]int{}
 									h.chunkRange.Store(h.opts.ChunkRange)
 									// minTime starts at the largest and maxTime at the smallest int64, i.e.
 									// min > max; presumably this marks a head that holds no samples yet.
 									h.minTime.Store(math.MaxInt64)
 									h.maxTime.Store(math.MinInt64)
 									h.lastWALTruncationTime.Store(math.MinInt64)
 									h.lastMemoryTruncationTime.Store(math.MinInt64)
 									return nil
 								}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+								type headMetrics struct {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									activeAppenders          prometheus.Gauge
 									series                   prometheus.GaugeFunc
 									seriesCreated            prometheus.Counter
 									seriesRemoved            prometheus.Counter
 									seriesNotFound           prometheus.Counter
 									chunks                   prometheus.Gauge
 									chunksCreated            prometheus.Counter
 									chunksRemoved            prometheus.Counter
 									gcDuration               prometheus.Summary
 									samplesAppended          prometheus.Counter
 									outOfBoundSamples        prometheus.Counter
 									outOfOrderSamples        prometheus.Counter
 									walTruncateDuration      prometheus.Summary
 									walCorruptionsTotal      prometheus.Counter
-												Add tsdb startup duration metric (#7737)

* Add tsdb wal replay duration metric

Signed-off-by: Max Neverov <neverov.max@gmail.com>
											
										
										
											2020-09-21 16:25:05 +00:00
+									walTotalReplayDuration   prometheus.Gauge
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									headTruncateFail         prometheus.Counter
 									headTruncateTotal        prometheus.Counter
 									checkpointDeleteFail     prometheus.Counter
 									checkpointDeleteTotal    prometheus.Counter
 									checkpointCreationFail   prometheus.Counter
 									checkpointCreationTotal  prometheus.Counter
 									mmapChunkCorruptionTotal prometheus.Counter
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+									snapshotReplayErrorTotal prometheus.Counter // Will be either 0 or 1.
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+								}
 								func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics {
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+									m := &headMetrics{
 										activeAppenders: prometheus.NewGauge(prometheus.GaugeOpts{
 											Name: "prometheus_tsdb_head_active_appenders",
 											Help: "Number of currently active appender transactions",
 										}),
 										series: prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 											Name: "prometheus_tsdb_head_series",
 											Help: "Total number of series in the head block.",
 										}, func() float64 {
 											return float64(h.NumSeries())
 										}),
 										seriesCreated: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_series_created_total",
 											Help: "Total number of series created in the head",
 										}),
 										seriesRemoved: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_series_removed_total",
 											Help: "Total number of series removed in the head",
 										}),
 										seriesNotFound: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_series_not_found_total",
 											Help: "Total number of requests for series that were not found.",
 										}),
 										chunks: prometheus.NewGauge(prometheus.GaugeOpts{
 											Name: "prometheus_tsdb_head_chunks",
 											Help: "Total number of chunks in the head block.",
 										}),
 										chunksCreated: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_chunks_created_total",
 											Help: "Total number of chunks created in the head",
 										}),
 										chunksRemoved: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_chunks_removed_total",
 											Help: "Total number of chunks removed in the head",
 										}),
 										gcDuration: prometheus.NewSummary(prometheus.SummaryOpts{
 											Name: "prometheus_tsdb_head_gc_duration_seconds",
 											Help: "Runtime of garbage collection in the head block.",
 										}),
 										walTruncateDuration: prometheus.NewSummary(prometheus.SummaryOpts{
 											Name: "prometheus_tsdb_wal_truncate_duration_seconds",
 											Help: "Duration of WAL truncation.",
 										}),
 										walCorruptionsTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_wal_corruptions_total",
 											Help: "Total number of WAL corruptions.",
 										}),
-												Add tsdb startup duration metric (#7737)

* Add tsdb wal replay duration metric

Signed-off-by: Max Neverov <neverov.max@gmail.com>
											
										
										
											2020-09-21 16:25:05 +00:00
+										walTotalReplayDuration: prometheus.NewGauge(prometheus.GaugeOpts{
 											Name: "prometheus_tsdb_data_replay_duration_seconds",
 											Help: "Time taken to replay the data on disk.",
 										}),
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+										samplesAppended: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_samples_appended_total",
 											Help: "Total number of appended samples.",
 										}),
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+										outOfBoundSamples: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_out_of_bound_samples_total",
 											Help: "Total number of out of bound samples ingestion failed attempts.",
 										}),
 										outOfOrderSamples: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_out_of_order_samples_total",
 											Help: "Total number of out of order samples ingestion failed attempts.",
 										}),
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+										headTruncateFail: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_truncations_failed_total",
 											Help: "Total number of head truncations that failed.",
 										}),
 										headTruncateTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_head_truncations_total",
 											Help: "Total number of head truncations attempted.",
 										}),
 										checkpointDeleteFail: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_checkpoint_deletions_failed_total",
 											Help: "Total number of checkpoint deletions that failed.",
 										}),
 										checkpointDeleteTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_checkpoint_deletions_total",
 											Help: "Total number of checkpoint deletions attempted.",
 										}),
 										checkpointCreationFail: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_checkpoint_creations_failed_total",
 											Help: "Total number of checkpoint creations that failed.",
 										}),
 										checkpointCreationTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_checkpoint_creations_total",
 											Help: "Total number of checkpoint creations attempted.",
 										}),
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+										mmapChunkCorruptionTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_mmap_chunk_corruptions_total",
 											Help: "Total number of memory-mapped chunk corruptions.",
 										}),
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+										snapshotReplayErrorTotal: prometheus.NewCounter(prometheus.CounterOpts{
 											Name: "prometheus_tsdb_snapshot_replay_error_total",
 											Help: "Total number snapshot replays that failed.",
 										}),
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
 									if r != nil {
 										r.MustRegister(
 											m.activeAppenders,
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+											m.series,
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+											m.chunks,
 											m.chunksCreated,
 											m.chunksRemoved,
 											m.seriesCreated,
 											m.seriesRemoved,
-												head: track number of series not found errors in metric

											
										
										
											2017-10-12 13:25:12 +00:00
+											m.seriesNotFound,
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+											m.gcDuration,
 											m.walTruncateDuration,
-												re-add the missing prometheus_tsdb_wal_corruptions_total (#473)

closes https://github.com/prometheus/tsdb/issues/471

after implementing the new WAL this metric was missing so adding it again.
Also added it in a test to make sure it works as expected.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-18 10:24:56 +00:00
+											m.walCorruptionsTotal,
-												Add tsdb startup duration metric (#7737)

* Add tsdb wal replay duration metric

Signed-off-by: Max Neverov <neverov.max@gmail.com>
											
										
										
											2020-09-21 16:25:05 +00:00
+											m.walTotalReplayDuration,
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+											m.samplesAppended,
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+											m.outOfBoundSamples,
 											m.outOfOrderSamples,
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+											m.headTruncateFail,
 											m.headTruncateTotal,
-												Add new metrics.

1. 'prometheus_tsdb_wal_truncate_fail' for failed WAL truncation.
2. 'prometheus_tsdb_checkpoint_delete_fail' for failed old checkpoint delete.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 11:49:09 +00:00
+											m.checkpointDeleteFail,
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+											m.checkpointDeleteTotal,
 											m.checkpointCreationFail,
 											m.checkpointCreationTotal,
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+											m.mmapChunkCorruptionTotal,
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+											m.snapshotReplayErrorTotal,
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+											// Metrics bound to functions and not needed in tests
 											// can be created and registered on the spot.
 											prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 												Name: "prometheus_tsdb_head_max_time",
 												Help: "Maximum timestamp of the head block. The unit is decided by the library consumer.",
 											}, func() float64 {
 												return float64(h.MaxTime())
 											}),
 											prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 												Name: "prometheus_tsdb_head_min_time",
 												Help: "Minimum time bound of the head block. The unit is decided by the library consumer.",
 											}, func() float64 {
 												return float64(h.MinTime())
 											}),
 											prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 												Name: "prometheus_tsdb_isolation_low_watermark",
 												Help: "The lowest TSDB append ID that is still referenced.",
 											}, func() float64 {
 												return float64(h.iso.lowWatermark())
 											}),
 											prometheus.NewGaugeFunc(prometheus.GaugeOpts{
 												Name: "prometheus_tsdb_isolation_high_watermark",
 												Help: "The highest TSDB append ID that has been given out.",
 											}, func() float64 {
-												Optimise lowWatermark in Isolation (#7332)

* Track open appenders in doubly-linked list to make lowWatermark O(1).
* Use RW locks.
* Added BenchmarkIsolationWithState.

Signed-off-by: Peter Štibraný <peter.stibrany@grafana.com>
											
										
										
											2020-06-03 18:09:05 +00:00
+												return float64(h.iso.lastAppendID())
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+											}),
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+										)
 									}
 									return m
 								}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
// mmappedChunksDir returns the path of the "chunks_head" directory inside dir,
// built with filepath.Join.
func mmappedChunksDir(dir string) string {
	const headChunksDirName = "chunks_head"
	return filepath.Join(dir, headChunksDirName)
}
-												React UI: Add Starting Screen (#8662)

* Added walreplay API endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added starting page to react-ui

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Documented the new endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed typos

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>

* Removed logo

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed isResponding to isUnexpected

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added DB stats object

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Updated starting page to work with new fields

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 2)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 3)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 4)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 5)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed const to let

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 6)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Remove SetStats method

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added comma

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed api

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed to triple equals

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed data response types

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Don't return pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed version

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed interface issue

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed copying lock value error

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>
											
										
										
											2021-06-05 14:29:32 +00:00
+								// HeadStats are the statistics for the head component of the DB.
 								type HeadStats struct {
 									// WALReplayStatus is held by pointer: the pointed-to value embeds a
 									// sync.RWMutex and therefore must not be copied.
 									WALReplayStatus *WALReplayStatus
 								}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								// NewHeadStats returns a new HeadStats object.
 								func NewHeadStats() *HeadStats {
 									return &HeadStats{
 										WALReplayStatus: &WALReplayStatus{},
-												wal: parallelize sample processing

											
										
										
											2017-10-07 13:55:11 +00:00
+									}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								}
-												Ensure workers terminated fully before reading unknownRefs

											
										
										
											2017-10-11 08:12:29 +00:00
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								// WALReplayStatus contains status information about the WAL replay.
 								// GetWALReplayStatus reads Min, Max and Current while holding the read lock
 								// of the embedded RWMutex.
 								type WALReplayStatus struct {
 									sync.RWMutex
 									// NOTE(review): Min and Max look like the first and last WAL segment
 									// indexes to replay (Init passes startFrom and endAt to
 									// startWALReplayStatus) — confirm against that helper.
 									Min     int
 									Max     int
 									// NOTE(review): presumably the segment most recently replayed — confirm.
 									Current int
 								}
 								// GetWALReplayStatus returns the WAL replay status information.
 								func (s *WALReplayStatus) GetWALReplayStatus() WALReplayStatus {
 									s.RLock()
 									defer s.RUnlock()
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 15:39:22 +00:00
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+									return WALReplayStatus{
 										Min:     s.Min,
 										Max:     s.Max,
 										Current: s.Current,
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+									}
 								}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								// cardinalityCacheExpirationTime is a fixed duration of 30 seconds.
 								// NOTE(review): presumably the lifetime of cached cardinality statistics;
 								// its use is outside this view — confirm.
 								const cardinalityCacheExpirationTime = time.Duration(30) * time.Second
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+								// Init loads data from the write ahead log and prepares the head for writes.
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+								// It should be called before using an appender so that it
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 10:30:49 +00:00
+								// limits the ingested samples to the head min valid time.
-												Exemplars in snapshot (#9255)

* Exemplars in snapshot

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Add docs

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-30 14:04:38 +00:00
+								func (h *Head) Init(minValidTime int64) error {
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/head: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There are no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+									h.minValidTime.Store(minValidTime)
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+									defer h.postings.EnsureOrder()
-												no overlapping on compaction when an existing block is not within default boundaries. (#461)

closes https://github.com/prometheus/prometheus/issues/4643

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-12-04 10:30:49 +00:00
+									defer h.gc() // After loading the wal remove the obsolete data from the head.
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+									defer func() {
 										// Loading of m-mapped chunks and snapshot can make the mint of the Head
 										// to go below minValidTime.
 										if h.MinTime() < h.minValidTime.Load() {
 											h.minTime.Store(h.minValidTime.Load())
 										}
 									}()
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
-												Replay m-map chunks irrespective of WAL (#7589)

* Replay m-map chunks irrespective of WAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* More logs

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-07-16 13:04:08 +00:00
+									level.Info(h.logger).Log("msg", "Replaying on-disk memory mappable chunks if any")
-												Log WAL replay duration

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-03-03 14:11:14 +00:00
+									start := time.Now()
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+									snapIdx, snapOffset := -1, 0
 									refSeries := make(map[uint64]*memSeries)
 									if h.opts.EnableMemorySnapshotOnShutdown {
 										level.Info(h.logger).Log("msg", "Chunk snapshot is enabled, replaying from the snapshot")
-												Exemplars in snapshot (#9255)

* Exemplars in snapshot

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Add docs

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix lint

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-30 14:04:38 +00:00
+										var err error
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+										snapIdx, snapOffset, refSeries, err = h.loadChunkSnapshot()
 										if err != nil {
 											snapIdx, snapOffset = -1, 0
 											h.metrics.snapshotReplayErrorTotal.Inc()
 											level.Error(h.logger).Log("msg", "Failed to load chunk snapshot", "err", err)
 											// We clear the partially loaded data to replay fresh from the WAL.
 											if err := h.resetInMemoryState(); err != nil {
 												return err
 											}
 										}
 										level.Info(h.logger).Log("msg", "Chunk snapshot loading time", "duration", time.Since(start).String())
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+									}
 									mmapChunkReplayStart := time.Now()
 									mmappedChunks, err := h.loadMmappedChunks(refSeries)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									if err != nil {
 										level.Error(h.logger).Log("msg", "Loading on-disk chunks failed", "err", err)
 										if _, ok := errors.Cause(err).(*chunks.CorruptionErr); ok {
 											h.metrics.mmapChunkCorruptionTotal.Inc()
 										}
 										// If this fails, data will be recovered from WAL.
 										// Hence we wont lose any data (given WAL is not corrupt).
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+										mmappedChunks = h.removeCorruptedMmappedChunks(err, refSeries)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									}
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+									level.Info(h.logger).Log("msg", "On-disk memory mappable chunks replay completed", "duration", time.Since(mmapChunkReplayStart).String())
-												Replay m-map chunks irrespective of WAL (#7589)

* Replay m-map chunks irrespective of WAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* More logs

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-07-16 13:04:08 +00:00
+									if h.wal == nil {
 										level.Info(h.logger).Log("msg", "WAL not found")
 										return nil
 									}
 									level.Info(h.logger).Log("msg", "Replaying WAL, this may take a while")
 									checkpointReplayStart := time.Now()
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+									// Backfill the checkpoint first if it exists.
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 09:15:41 +00:00
+									dir, startFrom, err := wal.LastCheckpoint(h.wal.Dir())
 									if err != nil && err != record.ErrNotFound {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+										return errors.Wrap(err, "find last checkpoint")
 									}
-												update checkpoint replay status (#8898)

* Consider  wal checkpoint replay status

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>

* Fix tests failed

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>

* Update checkpoint replay status

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>
											
										
										
											2021-07-13 10:08:07 +00:00
 									// Find the last segment.
 									_, endAt, e := wal.Segments(h.wal.Dir())
 									if e != nil {
 										return errors.Wrap(e, "finding WAL segments")
 									}
 									h.startWALReplayStatus(startFrom, endAt)
-												Handle multiple refs for the same series when WAL reading. (#623)

This can happen if a given series is created/truncated/recreated.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-06-06 13:28:54 +00:00
+									multiRef := map[uint64]uint64{}
-												Do not replay checkpoint if it is covered by snapshot (#9226)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-25 16:18:55 +00:00
+									if err == nil && startFrom >= snapIdx {
-												refactor NewSegmentsRangeReader to take multi WAL ranges (#449)

* refactor NewSegmentsRangeReader to take multi WAL ranges

In case of an error when checkpointing the WAL the error doesn't show
the exact WAL index that is corrupted. This is because it uses
MultiReader to read multiple WAL files.
This refactoring allows the NewSegmentsRangeReader to take more than a
single WAL range and it reads all of the ranges by iterating each one.

this changes the logs from
create checkpoint: read segments: corruption after 4841144384 bytes:...
to
create checkpoint: read segments: corruption in segment
data/wal/00017351 at 123142208: ...

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>

											
										
										
											2018-11-30 14:46:16 +00:00
+										sr, err := wal.NewSegmentsReader(dir)
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+										if err != nil {
 											return errors.Wrap(err, "open checkpoint")
 										}
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 15:39:22 +00:00
+										defer func() {
 											if err := sr.Close(); err != nil {
-												Capitalizing first letter of all log lines (#7043)

Signed-off-by: Marek Slabicki <thaniri@gmail.com>
											
										
										
											2020-04-11 08:22:18 +00:00
+												level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 15:39:22 +00:00
+											}
 										}()
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
 										// A corrupted checkpoint is a hard error for now and requires user
 										// intervention. There's likely little data that can be recovered anyway.
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+										if err := h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks); err != nil {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+											return errors.Wrap(err, "backfill checkpoint")
 										}
-												update checkpoint replay status (#8898)

* Consider  wal checkpoint replay status

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>

* Fix tests failed

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>

* Update checkpoint replay status

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>
											
										
										
											2021-07-13 10:08:07 +00:00
+										h.updateWALReplayStatusRead(startFrom)
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 15:23:52 +00:00
+										startFrom++
-												Add logging during WAL replay

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>

											
										
										
											2019-07-13 17:10:44 +00:00
+										level.Info(h.logger).Log("msg", "WAL checkpoint loaded")
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+									}
-												Replay m-map chunks irrespective of WAL (#7589)

* Replay m-map chunks irrespective of WAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* More logs

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-07-16 13:04:08 +00:00
+									checkpointReplayDuration := time.Since(checkpointReplayStart)
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
-												Replay m-map chunks irrespective of WAL (#7589)

* Replay m-map chunks irrespective of WAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* More logs

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-07-16 13:04:08 +00:00
+									walReplayStart := time.Now()
-												React UI: Add Starting Screen (#8662)

* Added walreplay API endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added starting page to react-ui

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Documented the new endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed typos

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>

* Removed logo

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed isResponding to isUnexpected

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added DB stats object

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Updated starting page to work with new fields

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 2)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 3)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 4)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 5)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed const to let

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 6)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Remove SetStats method

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added comma

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed api

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed to triple equals

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed data response types

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Don't return pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed version

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed interface issue

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed copying lock value error

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>
											
										
										
											2021-06-05 14:29:32 +00:00
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+									if snapIdx > startFrom {
 										startFrom = snapIdx
 									}
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 18:33:28 +00:00
+									// Backfill segments from the most recent checkpoint onwards.
-												update checkpoint replay status (#8898)

* Consider  wal checkpoint replay status

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>

* Fix tests failed

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>

* Update checkpoint replay status

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>
											
										
										
											2021-07-13 10:08:07 +00:00
+									for i := startFrom; i <= endAt; i++ {
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 18:33:28 +00:00
+										s, err := wal.OpenReadSegment(wal.SegmentName(h.wal.Dir(), i))
 										if err != nil {
 											return errors.Wrap(err, fmt.Sprintf("open WAL segment: %d", i))
 										}
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+										offset := 0
 										if i == snapIdx {
 											offset = snapOffset
 										}
 										sr, err := wal.NewSegmentBufReaderWithOffset(offset, s)
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+										if errors.Cause(err) == io.EOF {
 											// File does not exist.
 											continue
 										}
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+										if err != nil {
 											return errors.Wrapf(err, "segment reader (offset=%d)", offset)
 										}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in the m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+										err = h.loadWAL(wal.NewReader(sr), multiRef, mmappedChunks)
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 15:39:22 +00:00
+										if err := sr.Close(); err != nil {
-												Capitalizing first letter of all log lines (#7043)

Signed-off-by: Marek Slabicki <thaniri@gmail.com>
											
										
										
											2020-04-11 08:22:18 +00:00
+											level.Warn(h.logger).Log("msg", "Error while closing the wal segments reader", "err", err)
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 18:33:28 +00:00
+										}
-												move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
											
										
										
											2019-06-14 15:39:22 +00:00
+										if err != nil {
 											return err
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 18:33:28 +00:00
+										}
-												update checkpoint replay status (#8898)

* Consider  wal checkpoint replay status

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>

* Fix tests failed

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>

* Update checkpoint replay status

Signed-off-by: XiaoYu Zhang <ideoutrea@163.com>
											
										
										
											2021-07-13 10:08:07 +00:00
+										level.Info(h.logger).Log("msg", "WAL segment loaded", "segment", i, "maxSegment", endAt)
-												React UI: Add Starting Screen (#8662)

* Added walreplay API endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added starting page to react-ui

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Documented the new endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed typos

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>

* Removed logo

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed isResponding to isUnexpected

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added DB stats object

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Updated starting page to work with new fields

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 2)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 3)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 4)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 5)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed const to let

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 6)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Remove SetStats method

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added comma

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed api

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed to triple equals

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed data response types

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Don't return pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed version

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed interface issue

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed copying lock value error

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>
											
										
										
											2021-06-05 14:29:32 +00:00
+										h.updateWALReplayStatusRead(i)
-												wal: parallelize sample processing

											
										
										
											2017-10-07 13:55:11 +00:00
+									}
-												Always create a new clean segment when starting the WAL. (#608)

* Always create a new clean segment when starting the WAL.
* Ensure we flush the last page after repairing and before recreating the
new segment in Repair.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-05-24 18:33:28 +00:00
-												Add tsdb startup duration metric (#7737)

* Add tsdb wal replay duration metric

Signed-off-by: Max Neverov <neverov.max@gmail.com>
											
										
										
											2020-09-21 16:25:05 +00:00
+									walReplayDuration := time.Since(start)
 									h.metrics.walTotalReplayDuration.Set(walReplayDuration.Seconds())
-												Replay m-map chunks irrespective of WAL (#7589)

* Replay m-map chunks irrespective of WAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* More logs

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-07-16 13:04:08 +00:00
+									level.Info(h.logger).Log(
 										"msg", "WAL replay completed",
 										"checkpoint_replay_duration", checkpointReplayDuration.String(),
 										"wal_replay_duration", time.Since(walReplayStart).String(),
-												Add tsdb startup duration metric (#7737)

* Add tsdb wal replay duration metric

Signed-off-by: Max Neverov <neverov.max@gmail.com>
											
										
										
											2020-09-21 16:25:05 +00:00
+										"total_replay_duration", walReplayDuration.String(),
-												Replay m-map chunks irrespective of WAL (#7589)

* Replay m-map chunks irrespective of WAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* More logs

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-07-16 13:04:08 +00:00
+									)
-												Log WAL replay duration

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-03-03 14:11:14 +00:00
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									return nil
-												Make WAL for HeadBlock composeable.

											
										
										
											2017-05-13 16:14:18 +00:00
+								}
-												Handle compaction trigger and reinitializing in DB

											
										
										
											2017-01-06 11:37:28 +00:00
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+								func (h *Head) loadMmappedChunks(refSeries map[uint64]*memSeries) (map[uint64][]*mmappedChunk, error) {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in the m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									mmappedChunks := map[uint64][]*mmappedChunk{}
-												TSDB: Use a dedicated head chunk reference type (#9501)

* Use dedicated Ref type

Throughout the code base, there are reference types masked as
regular integers.  Let's use dedicated types.  They are
equivalent, but clearer semantically.
This also makes it trivial to find where they are used,
and from uses, find the centralized docs.

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* postpone some work until after possible return

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* clarify

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* rename feedback

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* skip header is up to caller

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>
											
										
										
											2021-10-13 12:14:32 +00:00
+									if err := h.chunkDiskMapper.IterateAllChunks(func(seriesRef uint64, chunkRef chunks.ChunkDiskMapperRef, mint, maxt int64, numSamples uint16) error {
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/head: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There are no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+										if maxt < h.minValidTime.Load() {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in the m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+											return nil
 										}
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+										ms, ok := refSeries[seriesRef]
 										if !ok {
 											slice := mmappedChunks[seriesRef]
 											if len(slice) > 0 && slice[len(slice)-1].maxTime >= mint {
 												return errors.Errorf("out of sequence m-mapped chunk for series ref %d", seriesRef)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in the m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+											}
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
 											slice = append(slice, &mmappedChunk{
 												ref:        chunkRef,
 												minTime:    mint,
 												maxTime:    maxt,
 												numSamples: numSamples,
 											})
 											mmappedChunks[seriesRef] = slice
 											return nil
 										}
 										if len(ms.mmappedChunks) > 0 && ms.mmappedChunks[len(ms.mmappedChunks)-1].maxTime >= mint {
 											return errors.Errorf("out of sequence m-mapped chunk for series ref %d", seriesRef)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in the m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+										}
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+										h.metrics.chunks.Inc()
 										h.metrics.chunksCreated.Inc()
 										ms.mmappedChunks = append(ms.mmappedChunks, &mmappedChunk{
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in the m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+											ref:        chunkRef,
 											minTime:    mint,
 											maxTime:    maxt,
 											numSamples: numSamples,
 										})
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+										h.updateMinMaxTime(mint, maxt)
 										if ms.headChunk != nil && maxt >= ms.headChunk.minTime {
 											// The head chunk was completed and was m-mapped after taking the snapshot.
 											// Hence remove this chunk.
 											ms.nextAt = 0
 											ms.headChunk = nil
 											ms.app = nil
 										}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in the m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+										return nil
 									}); err != nil {
 										return nil, errors.Wrap(err, "iterate on on-disk chunks")
 									}
 									return mmappedChunks, nil
 								}
 								// removeCorruptedMmappedChunks attempts to delete the corrupted mmapped chunks and if it fails, it clears all the previously
 								// loaded mmapped chunks.
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+								func (h *Head) removeCorruptedMmappedChunks(err error, refSeries map[uint64]*memSeries) map[uint64][]*mmappedChunk {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in the m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									level.Info(h.logger).Log("msg", "Deleting mmapped chunk files")
 									if err := h.chunkDiskMapper.DeleteCorrupted(err); err != nil {
 										level.Info(h.logger).Log("msg", "Deletion of mmap chunk files failed, discarding chunk files completely", "err", err)
 										return map[uint64][]*mmappedChunk{}
 									}
 									level.Info(h.logger).Log("msg", "Deletion of mmap chunk files successful, reattempting m-mapping the on-disk chunks")
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+									mmappedChunks, err := h.loadMmappedChunks(refSeries)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
- Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of m-mapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its m-mapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in the m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									if err != nil {
 										level.Error(h.logger).Log("msg", "Loading on-disk chunks failed, discarding chunk files completely", "err", err)
 										mmappedChunks = map[uint64][]*mmappedChunk{}
 									}
 									return mmappedChunks
 								}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								// ApplyConfig applies the exemplar settings of cfg to the Head.
 								// It does nothing when exemplar storage is disabled or when the configured
 								// maximum number of exemplars equals the current one; otherwise the
 								// circular exemplar storage is resized to the new maximum.
 								// The returned error is always nil in this implementation.
 								func (h *Head) ApplyConfig(cfg *config.Config) error {
 									if !h.opts.EnableExemplarStorage {
 										return nil
 									}
 									// Head uses opts.MaxExemplars in combination with opts.EnableExemplarStorage
 									// to decide if it should pass exemplars along to its exemplar storage, so we
 									// need to update opts.MaxExemplars here.
 									prevSize := h.opts.MaxExemplars.Load()
 									h.opts.MaxExemplars.Store(cfg.StorageConfig.ExemplarsConfig.MaxExemplars)
 									// Load the new size once so the comparison, the resize and the log
 									// line all refer to the same value.
 									newSize := h.opts.MaxExemplars.Load()
 									if prevSize == newSize {
 										return nil
 									}
 									migrated := h.exemplars.(*CircularExemplarStorage).Resize(newSize)
 									// Log the loaded integer rather than the atomic wrapper itself.
 									level.Info(h.logger).Log("msg", "Exemplar storage resized", "from", prevSize, "to", newSize, "migrated", migrated)
 									return nil
 								}
 								// PostingsCardinalityStats returns the postings cardinality statistics for
 								// the given label name. The result is cached on the Head and reused until it
 								// is older than cardinalityCacheExpirationTime (second granularity).
 								//
 								// NOTE(review): the cached value is returned regardless of statsByLabelName,
 								// so a call with a different label name inside the expiry window gets the
 								// stats computed for the earlier label name.
 								func (h *Head) PostingsCardinalityStats(statsByLabelName string) *index.PostingsStats {
 									h.cardinalityMutex.Lock()
 									defer h.cardinalityMutex.Unlock()
 									now := time.Duration(time.Now().Unix()) * time.Second
 									if age := now - h.lastPostingsStatsCall; age > cardinalityCacheExpirationTime {
 										// The cached entry is too old: drop it and recompute below.
 										h.cardinalityCache = nil
 									} else if cached := h.cardinalityCache; cached != nil {
 										return cached
 									}
 									h.cardinalityCache = h.postings.Stats(statsByLabelName)
 									// Stamp the cache after the computation has finished.
 									h.lastPostingsStatsCall = time.Duration(time.Now().Unix()) * time.Second
 									return h.cardinalityCache
 								}
 								// updateMinMaxTime widens the Head's time range so that it covers mint and
 								// maxt. Each bound is only ever moved outwards, using a compare-and-swap
 								// retry loop that re-reads the bound whenever the swap fails.
 								func (h *Head) updateMinMaxTime(mint, maxt int64) {
 									// Lower the minimum while mint is still below the current value.
 									for cur := h.MinTime(); mint < cur; cur = h.MinTime() {
 										if h.minTime.CAS(cur, mint) {
 											break
 										}
 									}
 									// Raise the maximum while maxt is still above the current value.
 									for cur := h.MaxTime(); maxt > cur; cur = h.MaxTime() {
 										if h.maxTime.CAS(cur, maxt) {
 											break
 										}
 									}
 								}
 								// SetMinValidTime sets the minimum timestamp the head can ingest.
 								// The given value overwrites h.minValidTime unconditionally via Store;
 								// it is not compared with the value currently held.
 								func (h *Head) SetMinValidTime(minValidTime int64) {
 									h.minValidTime.Store(minValidTime)
 								}
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+								// Truncate removes old data before mint from the head and WAL.
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+								func (h *Head) Truncate(mint int64) (err error) {
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+									initialize := h.MinTime() == math.MaxInt64
 									if err := h.truncateMemory(mint); err != nil {
 										return err
 									}
 									if initialize {
 										return nil
 									}
 									return h.truncateWAL(mint)
 								}
-												Call delete on head if interval overlaps (#9151)

* Call delete on head if interval overlaps

Signed-off-by: darshanime <deathbullet@gmail.com>

* Garbage collect tombstones during head gc

Signed-off-by: darshanime <deathbullet@gmail.com>

* Truncate tombstones before min time during head gc

Signed-off-by: darshanime <deathbullet@gmail.com>

* Lock less by deleting all keys in a single pass

Signed-off-by: darshanime <deathbullet@gmail.com>

* Pass map to DeleteTombstones

Signed-off-by: darshanime <deathbullet@gmail.com>

* Create new slice to replace old one

Signed-off-by: darshanime <deathbullet@gmail.com>
											
										
										
											2021-09-16 06:50:03 +00:00
+								// OverlapsClosedInterval reports whether the Head's [MinTime, MaxTime] range
 								// intersects the closed interval [mint, maxt].
 								func (h *Head) OverlapsClosedInterval(mint, maxt int64) bool {
 									// The Head starts after the interval ends: no overlap.
 									if h.MinTime() > maxt {
 										return false
 									}
 									// Otherwise they overlap unless the interval starts after the Head ends.
 									return mint <= h.MaxTime()
 								}
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+								// truncateMemory removes old data before mint from the head.
 								func (h *Head) truncateMemory(mint int64) (err error) {
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+									h.chunkSnapshotMtx.Lock()
 									defer h.chunkSnapshotMtx.Unlock()
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+									defer func() {
 										if err != nil {
 											h.metrics.headTruncateFail.Inc()
 										}
 									}()
-												tsdb: Block Head GC till pending readers are done reading (#9081)

* tsdb: Block Head GC till pending readers are done reading

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments 2

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix the exclusiveness of maxt in WaitForPendingReadersInTimeRange

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-07-20 08:47:20 +00:00
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 21:19:32 +00:00
+									initialize := h.MinTime() == math.MaxInt64
-												Filter WAL data in Head, misc fixes

											
										
										
											2017-09-06 14:20:37 +00:00
-												Properly initialize head time

This fixes various issues when initializing the head time range
under different starting conditions.

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-25 21:19:32 +00:00
+									if h.MinTime() >= mint && !initialize {
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 12:38:49 +00:00
+										return nil
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									}
-												tsdb: Block Head GC till pending readers are done reading (#9081)

* tsdb: Block Head GC till pending readers are done reading

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments 2

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix the exclusiveness of maxt in WaitForPendingReadersInTimeRange

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-07-20 08:47:20 +00:00
 									// The order of these two Store() should not be changed,
 									// i.e. truncation time is set before in-process boolean.
 									h.lastMemoryTruncationTime.Store(mint)
 									h.memTruncationInProcess.Store(true)
 									defer h.memTruncationInProcess.Store(false)
 									// We wait for pending queries to end that overlap with this truncation.
 									if !initialize {
 										h.WaitForPendingReadersInTimeRange(h.MinTime(), mint)
 									}
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/head: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There is no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+									h.minTime.Store(mint)
 									h.minValidTime.Store(mint)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
-												Fix min/max time handling and concurrent crc32 usage

											
										
										
											2017-09-07 11:04:02 +00:00
+									// Ensure that max time is at least as high as min time.
 									for h.MaxTime() < mint {
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/head: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There is no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+										h.maxTime.CAS(h.MaxTime(), mint)
-												Fix min/max time handling and concurrent crc32 usage

											
										
										
											2017-09-07 11:04:02 +00:00
+									}
-												Filter WAL data in Head, misc fixes

											
										
										
											2017-09-06 14:20:37 +00:00
+									// This was an initial call to Truncate after loading blocks on startup.
 									// We haven't read back the WAL yet, so do not attempt to truncate it.
 									if initialize {
 										return nil
 									}
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+									h.metrics.headTruncateTotal.Inc()
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									start := time.Now()
-												Set the min time of Head properly after truncation (#8212)

* Set the min time of Head properly after truncation

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix lint

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Enhance compaction plan logic for completely deleted small block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-11-25 13:03:30 +00:00
+									actualMint := h.gc()
-												Capitalizing first letter of all log lines (#7043)

Signed-off-by: Marek Slabicki <thaniri@gmail.com>
											
										
										
											2020-04-11 08:22:18 +00:00
+									level.Info(h.logger).Log("msg", "Head GC completed", "duration", time.Since(start))
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									h.metrics.gcDuration.Observe(time.Since(start).Seconds())
-												Set the min time of Head properly after truncation (#8212)

* Set the min time of Head properly after truncation

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix lint

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Enhance compaction plan logic for completely deleted small block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-11-25 13:03:30 +00:00
+									if actualMint > h.minTime.Load() {
 										// The actual mint of the Head is higher than the one asked to truncate.
 										appendableMinValidTime := h.appendableMinValidTime()
 										if actualMint < appendableMinValidTime {
 											h.minTime.Store(actualMint)
 											h.minValidTime.Store(actualMint)
 										} else {
 											// The actual min time is in the appendable window.
 											// So we set the mint to the appendableMinValidTime.
 											h.minTime.Store(appendableMinValidTime)
 											h.minValidTime.Store(appendableMinValidTime)
 										}
 									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									// Truncate the chunk m-mapper.
 									if err := h.chunkDiskMapper.Truncate(mint); err != nil {
 										return errors.Wrap(err, "truncate chunks.HeadReadWriter")
 									}
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+									return nil
 								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
-												tsdb: Block Head GC till pending readers are done reading (#9081)

* tsdb: Block Head GC till pending readers are done reading

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments 2

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix the exclusiveness of maxt in WaitForPendingReadersInTimeRange

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-07-20 08:47:20 +00:00
+								// WaitForPendingReadersInTimeRange blocks until no open read overlaps the
 								// given time range, polling every 500ms.
 								// mint is inclusive; maxt is the truncation time and hence exclusive.
 								// There is no explicit timeout here: the query timeout implicitly limits
 								// how long this function can wait.
 								func (h *Head) WaitForPendingReadersInTimeRange(mint, maxt int64) {
 									// Turn the exclusive upper bound into an inclusive one for the overlap test.
 									lastT := maxt - 1
 									for {
 										pending := false
 										h.iso.TraverseOpenReads(func(st *isolationState) bool {
 											if st.mint <= lastT && mint <= st.maxt {
 												// This reader overlaps with the truncation range.
 												pending = true
 											}
 											// Return false once an overlapping reader has been seen
 											// (presumably this ends the traversal early).
 											return !pending
 										})
 										if !pending {
 											return
 										}
 										time.Sleep(500 * time.Millisecond)
 									}
 								}
 								// IsQuerierCollidingWithTruncation returns if the current querier needs to be closed and if a new querier
 								// has to be created. In the latter case, the method also returns the new mint to be used for creating the
 								// new range head and the new querier. This methods helps preventing races with the truncation of in-memory data.
 								//
 								// NOTE: The querier should already be taken before calling this.
-												Format Go source files using 'gofumpt -w -s -extra'

Part of #9557

Signed-off-by: Mateusz Gozdek <mgozdekof@gmail.com>

											
										
										
											2021-10-22 08:06:44 +00:00
+								func (h *Head) IsQuerierCollidingWithTruncation(querierMint, querierMaxt int64) (shouldClose, getNew bool, newMint int64) {
-												tsdb: Block Head GC till pending readers are done reading (#9081)

* tsdb: Block Head GC till pending readers are done reading

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments 2

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix the exclusiveness of maxt in WaitForPendingReadersInTimeRange

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-07-20 08:47:20 +00:00
+									if !h.memTruncationInProcess.Load() {
 										return false, false, 0
 									}
 									// Head truncation is in process. It also means that the block that was
 									// created for this truncation range is also available.
 									// Check if we took a querier that overlaps with this truncation.
 									memTruncTime := h.lastMemoryTruncationTime.Load()
 									if querierMaxt < memTruncTime {
 										// Head compaction has happened and this time range is being truncated.
 										// This query doesn't overlap with the Head any longer.
 										// We should close this querier to avoid races and the data would be
 										// available with the blocks below.
 										// Cases:
 										// 1.     |------truncation------|
 										//   |---query---|
 										// 2.     |------truncation------|
 										//              |---query---|
 										return true, false, 0
 									}
 									if querierMint < memTruncTime {
 										// The truncation time is not same as head mint that we saw above but the
 										// query still overlaps with the Head.
 										// The truncation started after we got the querier. So it is not safe
 										// to use this querier and/or might block truncation. We should get
 										// a new querier for the new Head range while remaining will be available
 										// in the blocks below.
 										// Case:
 										//      |------truncation------|
 										//                        |----query----|
 										// Turns into
 										//      |------truncation------|
 										//                             |---qu---|
 										return true, true, memTruncTime
 									}
 									// Other case is this, which is a no-op
 									//      |------truncation------|
 									//                              |---query---|
 									return false, false, 0
 								}
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+								// truncateWAL removes old data before mint from the WAL.
 								func (h *Head) truncateWAL(mint int64) error {
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+									h.chunkSnapshotMtx.Lock()
 									defer h.chunkSnapshotMtx.Unlock()
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+									if h.wal == nil || mint <= h.lastWALTruncationTime.Load() {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+										return nil
 									}
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+									start := time.Now()
 									h.lastWALTruncationTime.Store(mint)
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
-												Refactor WAL.Segments method to be part of the wal package (#6477)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-09-01 09:16:57 +00:00
+									first, last, err := wal.Segments(h.wal.Dir())
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+									if err != nil {
 										return errors.Wrap(err, "get segment range")
 									}
-												Start a new WAL segment on head truncation. (#605)

This reduces disk space usage to not be a minimum of 3 128MB files
in small setups. This will possibly also help debug wal data issues,
by making things a bit more deterministic.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-06-07 10:35:02 +00:00
+									// Start a new segment, so low ingestion volume TSDB don't have more WAL than
 									// needed.
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+									if err := h.wal.NextSegment(); err != nil {
-												Start a new WAL segment on head truncation. (#605)

This reduces disk space usage to not be a minimum of 3 128MB files
in small setups. This will possibly also help debug wal data issues,
by making things a bit more deterministic.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-06-07 10:35:02 +00:00
+										return errors.Wrap(err, "next segment")
 									}
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 15:23:52 +00:00
+									last-- // Never consider last segment for checkpoint.
 									if last < 0 {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+										return nil // no segments yet.
 									}
-												Reduce how much old WAL we keep around. (#7098)

Previously we were keeping up to around 6 hours of WAL around by
removing 1/3 every hour. This was excessive, so switch to removing 2/3,
which will keep up to around 3 hours of WAL around.

This will roughly halve the size of the WAL and halve startup time for
those who are I/O bound. This may increase the checkpoint size for
those with certain churn patterns, but by much less than we're saving
from the segments.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2020-04-07 10:25:57 +00:00
+									// The lower two thirds of segments should contain mostly obsolete samples.
 									// If we have less than two segments, it's not worth checkpointing yet.
 									// With the default 2h blocks, this will keeping up to around 3h worth
 									// of WAL segments.
 									last = first + (last-first)*2/3
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 15:23:52 +00:00
+									if last <= first {
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+										return nil
 									}
-												Use boolean function instead of postings to drop WAL series

There is no guarantee or requirement for WAL writers to only add
series entries in increasing order of IDs. A postings list cannot look
back and thus unordered WAL entries would skip over IDs to not truncate
from the WAL.
We replace it with a simple boolean check function that does not require
order.

											
										
										
											2017-09-21 09:02:30 +00:00
+									keep := func(id uint64) bool {
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 13:16:24 +00:00
+										if h.series.getByID(id) != nil {
 											return true
 										}
 										h.deletedMtx.Lock()
 										_, ok := h.deleted[id]
 										h.deletedMtx.Unlock()
 										return ok
-												[WIP]: WAL implementation

Signed-off-by: Goutham Veeramachaneni <cs14btech11014@iith.ac.in>

											
										
										
											2017-08-31 09:39:22 +00:00
+									}
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+									h.metrics.checkpointCreationTotal.Inc()
-												Log when starting to create a checkpoint (#7581)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-07-15 13:45:37 +00:00
+									if _, err = wal.Checkpoint(h.logger, h.wal, first, last, keep, mint); err != nil {
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+										h.metrics.checkpointCreationFail.Inc()
-												Increments WAL corruption metric on WAL corruption during checkpointing (#7491)

* Increments wal corruption metric on error during checkpointing

Signed-off-by: Harkishen-Singh <harkishensingh@hotmail.com>

* check for wal corruption error

Signed-off-by: Harkishen-Singh <harkishensingh@hotmail.com>
											
										
										
											2020-07-05 05:55:42 +00:00
+										if _, ok := errors.Cause(err).(*wal.CorruptionErr); ok {
 											h.metrics.walCorruptionsTotal.Inc()
 										}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+										return errors.Wrap(err, "create checkpoint")
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									}
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 15:23:52 +00:00
+									if err := h.wal.Truncate(last + 1); err != nil {
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+										// If truncating fails, we'll just try again at the next checkpoint.
 										// Leftover segments will just be ignored in the future if there's a checkpoint
 										// that supersedes them.
 										level.Error(h.logger).Log("msg", "truncating segments failed", "err", err)
 									}
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 13:16:24 +00:00
 									// The checkpoint is written and segments before it is truncated, so we no
 									// longer need to track deleted series that are before it.
 									h.deletedMtx.Lock()
 									for ref, segment := range h.deleted {
 										if segment < first {
 											delete(h.deleted, ref)
 										}
 									}
 									h.deletedMtx.Unlock()
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+									h.metrics.checkpointDeleteTotal.Inc()
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 09:15:41 +00:00
+									if err := wal.DeleteCheckpoints(h.wal.Dir(), last); err != nil {
-												Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2018-09-25 13:48:33 +00:00
+										// Leftover old checkpoints do not cause problems down the line beyond
 										// occupying disk space.
 										// They will just be ignored since a higher checkpoint exists.
 										level.Error(h.logger).Log("msg", "delete old checkpoints", "err", err)
 										h.metrics.checkpointDeleteFail.Inc()
 									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds())
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 12:38:49 +00:00
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+									level.Info(h.logger).Log("msg", "WAL checkpoint complete",
-												more descriptive var names and some more logging. (#405)

* more descriptive checkpoint var names and some more logging.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-11 15:23:52 +00:00
+										"first", first, "last", last, "duration", time.Since(start))
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
-												Add tests for GC and chunk truncation

											
										
										
											2017-09-01 12:38:49 +00:00
+									return nil
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+								}
-												Fixed wrongly handled not ready TSDB on web and API. (#7182)

* fix federate endpoint panic

Signed-off-by: yeya24 <yb532204897@gmail.com>

* Fixed all cases of not ready TSDB being wrongly handled.

* Fixed issue for federation.
* Ensured this will never happen again thanks to interfaces
* Fixes same issue for stats.
* Added tests for readiness.
* Fixed bug in stats. It was:
   status.MaxTime = db.Head().MaxTime()
   status.MinTime = db.Head().MaxTime()


Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed Brian's comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed Brian's comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

Co-authored-by: Bartlomiej Plotka <bwplotka@gmail.com>
											
										
										
											2020-04-29 16:16:14 +00:00
+								type Stats struct {
 									// NumSeries is the series count, filled from Head.NumSeries by Head.Stats.
 									NumSeries         uint64
 									// MinTime and MaxTime are filled from Head.MinTime and Head.MaxTime
 									// respectively by Head.Stats.
 									MinTime, MaxTime  int64
 									// IndexPostingStats is filled from Head.PostingsCardinalityStats for the
 									// label name passed to Head.Stats.
 									IndexPostingStats *index.PostingsStats
 								}
 								// Stats gathers the current HEAD statistics into a freshly allocated Stats
 								// value. statsByLabelName is forwarded to PostingsCardinalityStats. Note that
 								// computing these statistics is expensive.
 								func (h *Head) Stats(statsByLabelName string) *Stats {
 									var snapshot Stats
 									// The accessors are invoked in the same order as the fields were
 									// originally populated.
 									snapshot.NumSeries = h.NumSeries()
 									snapshot.MaxTime = h.MaxTime()
 									snapshot.MinTime = h.MinTime()
 									snapshot.IndexPostingStats = h.PostingsCardinalityStats(statsByLabelName)
 									return &snapshot
 								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 09:50:24 +00:00
+								type RangeHead struct {
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Queriers against the DB no longer hold a read lock for their entire
lifecycle. This prevents long-running queriers from starving new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 13:21:46 +00:00
+									head       *Head
 									mint, maxt int64
 								}
-												Reset comment

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-25 23:17:56 +00:00
+								// NewRangeHead returns a *RangeHead.
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 09:50:24 +00:00
+								func NewRangeHead(head *Head, mint, maxt int64) *RangeHead {
 									// Wrap the given head together with the requested time bounds.
 									rh := new(RangeHead)
 									rh.head = head
 									rh.mint = mint
 									rh.maxt = maxt
 									return rh
 								}
-												Revert head posting optimization

This reverts commit 52630ad0c735f2dce4ce5bb851acb6c5d7df5eb1.

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>

											
										
										
											2020-03-25 19:13:47 +00:00
+								func (h *RangeHead) Index() (IndexReader, error) {
 									return h.head.indexRange(h.mint, h.maxt), nil
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Queriers against the DB no longer hold a read lock for their entire
lifecycle. This prevents long-running queriers from starving new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 13:21:46 +00:00
+								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 09:50:24 +00:00
+								func (h *RangeHead) Chunks() (ChunkReader, error) {
-												tsdb: Block Head GC till pending readers are done reading (#9081)

* tsdb: Block Head GC till pending readers are done reading

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix review comments 2

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>

* Fix the exclusiveness of maxt in WaitForPendingReadersInTimeRange

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-07-20 08:47:20 +00:00
+									return h.head.chunksRange(h.mint, h.maxt, h.head.iso.State(h.mint, h.maxt))
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Queriers against the DB no longer hold a read lock for their entire
lifecycle. This prevents long-running queriers from starving new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 13:21:46 +00:00
+								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 09:50:24 +00:00
+								func (h *RangeHead) Tombstones() (tombstones.Reader, error) {
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 15:38:00 +00:00
+									return h.head.tombstones, nil
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Queriers against the DB no longer hold a read lock for their entire
lifecycle. This prevents long-running queriers from starving new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 13:21:46 +00:00
+								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 09:50:24 +00:00
+								func (h *RangeHead) MinTime() int64 {
-												Vertical query merging and compaction (#370)

* Vertical series iterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Select overlapped blocks first in compactor Plan()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Added vertical compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Code cleanup and comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add benchmark for compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Perform vertical compaction only when blocks are overlapping.

Actions for vertical compaction:
* Sorting chunk metas
* Calling chunks.MergeOverlappingChunks on the chunks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Benchmark for vertical compaction

* BenchmarkNormalCompaction => BenchmarkCompaction
* Moved the benchmark from db_test.go to compact_test.go

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Benchmark for query iterator and seek for non overlapping blocks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Vertical query merge only for overlapping blocks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplify logging in Compact(...)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Updated CHANGELOG.md

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Calculate overlapping inside populateBlock

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* MinTime and MaxTime for BlockReader.

Using this to find overlapping blocks in populateBlock()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Sort blocks w.r.t. MinTime in reload()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Log about overlapping in LeveledCompactor.write() instead of returning bool

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Log about overlapping inside LeveledCompactor.populateBlock()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Refactor createBlock to take optional []Series

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* review1

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>

* Updated CHANGELOG and minor nits

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* nits

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Updated CHANGELOG

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Refactor iterator and seek benchmarks for Querier.

Also has as overlapping blocks.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Additional test case

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* genSeries takes optional labels. Updated BenchmarkQueryIterator and BenchmarkQuerySeek.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Split genSeries into genSeries and populateSeries

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Check error in benchmark

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Warn about overlapping blocks in reload()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-02-14 13:29:41 +00:00
+									return h.mint
 								}
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+								// MaxTime returns the max time of actual data fetch-able from the head.
 								// This controls the chunks time range which is closed [b.MinTime, b.MaxTime].
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 09:50:24 +00:00
+								func (h *RangeHead) MaxTime() int64 {
-												Vertical query merging and compaction (#370)

* Vertical series iterator

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Select overlapped blocks first in compactor Plan()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Added vertical compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Code cleanup and comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add benchmark for compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Perform vertical compaction only when blocks are overlapping.

Actions for vertical compaction:
* Sorting chunk metas
* Calling chunks.MergeOverlappingChunks on the chunks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Benchmark for vertical compaction

* BenchmarkNormalCompaction => BenchmarkCompaction
* Moved the benchmark from db_test.go to compact_test.go

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Benchmark for query iterator and seek for non overlapping blocks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Vertical query merge only for overlapping blocks

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplify logging in Compact(...)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Updated CHANGELOG.md

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Calculate overlapping inside populateBlock

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* MinTime and MaxTime for BlockReader.

Using this to find overlapping blocks in populateBlock()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Sort blocks w.r.t. MinTime in reload()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Log about overlapping in LeveledCompactor.write() instead of returning bool

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Log about overlapping inside LeveledCompactor.populateBlock()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Refactor createBlock to take optional []Series

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* review1

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>

* Updated CHANGELOG and minor nits

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* nits

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Updated CHANGELOG

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Refactor iterator and seek benchmarks for Querier.

Also has as overlapping blocks.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Additional test case

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* genSeries takes optional labels. Updated BenchmarkQueryIterator and BenchmarkQuerySeek.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Split genSeries into genSeries and populateSeries

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Check error in benchmark

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Warn about overlapping blocks in reload()

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-02-14 13:29:41 +00:00
+									return h.maxt
 								}
-												Create a checkpoint only at the end of Compact call (#8067)

* Create a checkpoint only at the end of Compact call

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix Bartek's offline reviews

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Introduce TruncateInMemory and TruncateWAL

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Small enhancements and test fixing attempts

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix tests

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add TestOneCheckpointPerCompactCall

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Don't truncate WAL on block compaction

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Simplified the algo.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Better protection around calling truncateWAL, truncate WAL on Head compaction error

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

Co-authored-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-10-19 15:27:08 +00:00
+								// BlockMaxTime returns the max time of the potential block created from this head.
 								// It's different to MaxTime as we need to add +1 millisecond to block maxt because block
 								// intervals are half-open: [b.MinTime, b.MaxTime). A block's max time is therefore always 1 greater than the max time of the data it includes.
 								func (h *RangeHead) BlockMaxTime() int64 {
 									// Start from the inclusive upper bound and shift it by one.
 									inclusiveMaxt := h.MaxTime()
 									return inclusiveMaxt + 1
 								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 09:50:24 +00:00
+								func (h *RangeHead) NumSeries() uint64 {
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 08:04:48 +00:00
+									return h.head.NumSeries()
 								}
-												Break DB.Compact and DB.compactHead and DB.compactBlocks. Add DB.CompactHead.

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-02-14 09:50:24 +00:00
+								func (h *RangeHead) Meta() BlockMeta {
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 08:04:48 +00:00
+									return BlockMeta{
 										MinTime: h.MinTime(),
 										MaxTime: h.MaxTime(),
 										ULID:    h.head.Meta().ULID,
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+										Stats: BlockStats{
 											NumSeries: h.NumSeries(),
 										},
-												Fix race condition between gc and committing (#378)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
											
										
										
											2018-09-17 16:58:42 +00:00
+									}
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
-												Breakdown tsdb/head.go into multiple files (#9147)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 12:14:26 +00:00
+								// String returns an human readable representation of the range head. It's important to
 								// keep this function in order to avoid the struct dump when the head is stringified in
 								// errors or logs.
 								func (h *RangeHead) String() string {
 									return fmt.Sprintf("range head (mint: %d, maxt: %d)", h.MinTime(), h.MaxTime())
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+								}
 								// Delete all samples in the range of [mint, maxt] for series that satisfy the given
 								// label matchers.
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 19:53:33 +00:00
+								func (h *Head) Delete(mint, maxt int64, ms ...*labels.Matcher) error {
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									// Do not delete anything beyond the currently valid range.
 									mint, maxt = clampInterval(mint, maxt, h.MinTime(), h.MaxTime())
 									ir := h.indexRange(mint, maxt)
-												Select series with label unset for != and !~

Fixes https://github.com/prometheus/prometheus/issues/3575

Signed-off-by: Goutham Veeramachaneni <cs14btech11014@iith.ac.in>
Signed-off-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2017-12-17 18:08:21 +00:00
+									p, err := PostingsForMatchers(ir, ms...)
-												Add explicit error to Querier.Select

This has been a frequent source of debugging pain since errors are
potentially delayed to a much later point. They bubble up in an
unrelated execution path.

											
										
										
											2017-11-13 11:16:58 +00:00
+									if err != nil {
 										return errors.Wrap(err, "select series")
 									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 09:15:41 +00:00
+									var stones []tombstones.Stone
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									for p.Next() {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+										series := h.series.getByID(p.At())
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
-												tsdb: fix races around head chunks (#6985)

* tsdb: fix races around head chunks

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-03-16 12:59:22 +00:00
+										series.RLock()
-												Fix crash when a series has no block

											
										
										
											2018-02-07 13:43:21 +00:00
+										t0, t1 := series.minTime(), series.maxTime()
-												tsdb: fix races around head chunks (#6985)

* tsdb: fix races around head chunks

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-03-16 12:59:22 +00:00
+										series.RUnlock()
-												Fix crash when a series has no block

											
										
										
											2018-02-07 13:43:21 +00:00
+										if t0 == math.MinInt64 || t1 == math.MinInt64 {
 											continue
 										}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+										// Delete only until the current values and not beyond.
-												Fix crash when a series has no block

											
										
										
											2018-02-07 13:43:21 +00:00
+										t0, t1 = clampInterval(mint, maxt, t0, t1)
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 15:38:00 +00:00
+										stones = append(stones, tombstones.Stone{Ref: p.At(), Intervals: tombstones.Intervals{{Mint: t0, Maxt: t1}}})
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									}
 									if p.Err() != nil {
 										return p.Err()
 									}
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+									if h.wal != nil {
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 15:38:00 +00:00
+										var enc record.Encoder
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+										if err := h.wal.Log(enc.Tombstones(stones, nil)); err != nil {
 											return err
 										}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									}
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 15:38:00 +00:00
+									for _, s := range stones {
 										h.tombstones.AddInterval(s.Ref, s.Intervals[0])
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									}
-												Dont store stones in head, delete samples directly

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-01-08 17:08:41 +00:00
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
+									return nil
 								}
-												Fix typos in comments (#254)

a the -> the
timestmap -> timestamp
badded -> padded
its -> it is
callers -> caller's
											
										
										
											2018-01-13 17:51:50 +00:00
+								// gc removes data before the minimum timestamp from the head.
-												Set the min time of Head properly after truncation (#8212)

* Set the min time of Head properly after truncation

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix lint

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Enhance compaction plan logic for completely deleted small block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-11-25 13:03:30 +00:00
+								// It returns the actual min times of the chunks present in the Head.
 								func (h *Head) gc() int64 {
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
amangement and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 22:39:17 +00:00
+									// Only data strictly lower than this timestamp must be deleted.
 									mint := h.MinTime()
-												Properly track and write meta file

											
										
										
											2017-01-19 13:01:38 +00:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									// Drop old chunks and remember series IDs and hashes if they can be
 									// deleted entirely.
-												Set the min time of Head properly after truncation (#8212)

* Set the min time of Head properly after truncation

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix lint

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Enhance compaction plan logic for completely deleted small block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-11-25 13:03:30 +00:00
+									deleted, chunksRemoved, actualMint := h.series.gc(mint)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									seriesRemoved := len(deleted)
-												Trigger reload correctly on interrupted compaction

											
										
										
											2017-03-20 09:41:43 +00:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									h.metrics.seriesRemoved.Add(float64(seriesRemoved))
 									h.metrics.chunksRemoved.Add(float64(chunksRemoved))
 									h.metrics.chunks.Sub(float64(chunksRemoved))
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/heaad: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There is no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+									h.numSeries.Sub(uint64(seriesRemoved))
-												Add separate head mutex

Introduce a seperate mutex for the head blocks to avoid a race where
a post-compaction reload may run between switching the DB's base mutex
to create a new head block in an appender.

											
										
										
											2017-03-04 15:50:48 +00:00
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 14:34:49 +00:00
+									// Remove deleted series IDs from the postings lists.
 									h.postings.Delete(deleted)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
-												Call delete on head if interval overlaps (#9151)

* Call delete on head if interval overlaps

Signed-off-by: darshanime <deathbullet@gmail.com>

* Garbage collect tombstones during head gc

Signed-off-by: darshanime <deathbullet@gmail.com>

* Truncate tombstones before min time during head gc

Signed-off-by: darshanime <deathbullet@gmail.com>

* Lock less by deleting all keys in a single pass

Signed-off-by: darshanime <deathbullet@gmail.com>

* Pass map to DeleteTombstones

Signed-off-by: darshanime <deathbullet@gmail.com>

* Create new slice to replace old one

Signed-off-by: darshanime <deathbullet@gmail.com>
											
										
										
											2021-09-16 06:50:03 +00:00
+									// Remove tombstones referring to the deleted series.
 									h.tombstones.DeleteTombstones(deleted)
 									h.tombstones.TruncateBefore(mint)
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 13:16:24 +00:00
+									if h.wal != nil {
-												Refactor WAL.Segments method to be part of the wal package (#6477)

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-09-01 09:16:57 +00:00
+										_, last, _ := wal.Segments(h.wal.Dir())
-												Keep series that are still in WAL in checkpoints (#577)

If all the samples are deleted for a series,
we should still keep the series in the WAL as
anything else reading the WAL will still care
about it in order to understand the samples.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>
											
										
										
											2019-04-09 13:16:24 +00:00
+										h.deletedMtx.Lock()
 										// Keep series records until we're past segment 'last'
 										// because the WAL will still have samples records with
 										// this ref ID. If we didn't keep these series records then
 										// on start up when we replay the WAL, or any other code
 										// that reads the WAL, wouldn't be able to use those
 										// samples since we would have no labels for that ref ID.
 										for ref := range deleted {
 											h.deleted[ref] = last
 										}
 										h.deletedMtx.Unlock()
 									}
-												Set the min time of Head properly after truncation (#8212)

* Set the min time of Head properly after truncation

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix lint

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Enhance compaction plan logic for completely deleted small block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-11-25 13:03:30 +00:00
+									return actualMint
-												Add Queryable interface to Block

This adds the Queryable interface to the Block interface. Head and
persisted blocks now implement their own Querier() method and thus
isolate customization (e.g. remapPostings) more cleanly.

											
										
										
											2017-03-20 09:21:21 +00:00
+								}
-												Add more verbose error handling for closing, reduce locking

This commit introduces error returns in various places and is explicit
about closing persisted blocks.
{Index,Chunk,Tombstone}Readers are more consistent about their Close()
method. Whenever a reader is retrieved, the corresponding close method
must eventually be called. We use this to track pending readers against
persisted blocks.

Querier's against the DB no longer hold a read lock for their entire
lifecycle. This avoids long running queriers to starve new ones when we
have to acquire a write lock when reloading blocks.

											
										
										
											2017-10-09 13:21:46 +00:00
+								// Tombstones returns a new reader over the head's tombstones
-												Move WAL watcher code to tsdb/wal package. (#5999)

* Move WAL watcher code to tsdb/wal package.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Fix tests after moving WAL watcher code.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

* Lint fixes.

Signed-off-by: Callum Styan <callumstyan@gmail.com>

											
										
										
											2019-09-19 09:15:41 +00:00
+								func (h *Head) Tombstones() (tombstones.Reader, error) {
-												Bring back tombstones to Head block (#6542)

* Bring back tombstones to Head block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add test cases

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Cleanup

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2020-01-20 15:38:00 +00:00
+									return h.tombstones, nil
-												Compact head block early

Let older head blocks be compacted once the newest once has samples at
50% of its total range. This allows the memory of the compacted blocks
to be released and garbage collected before a new head block gets
created. Thereby the number of head blocks is 1 or 2 instead of 2 or 3
and memory spikes are reduced.

											
										
										
											2017-06-25 17:02:02 +00:00
+								}
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 08:04:48 +00:00
+								// NumSeries returns the number of active series in the head.
 								func (h *Head) NumSeries() uint64 {
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/heaad: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There is no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+									return h.numSeries.Load()
-												Open db in Read only mode (#588)

* Added db read only open mode and use it for the tsdb cli.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2019-07-23 08:04:48 +00:00
+								}
 								// Meta returns meta information about the head.
 								// MinTime, MaxTime and the series count are read at call time, so the
 								// result changes as the head changes. The ULID is a fixed placeholder built
 								// from the 16 bytes "______head______".
 								func (h *Head) Meta() BlockMeta {
 									var headID ulid.ULID
 									copy(headID[:], "______head______")

 									meta := BlockMeta{
 										MinTime: h.MinTime(),
 										MaxTime: h.MaxTime(),
 										ULID:    headID,
 									}
 									meta.Stats = BlockStats{NumSeries: h.NumSeries()}
 									return meta
 								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 22:39:17 +00:00
+								// MinTime returns the lowest time bound on visible data in the head.
 								func (h *Head) MinTime() int64 {
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/heaad: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There is no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+									return h.minTime.Load()
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drasitcally speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 18:18:51 +00:00
+								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 22:39:17 +00:00
+								// MaxTime returns the highest timestamp seen in data of the head.
 								func (h *Head) MaxTime() int64 {
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/heaad: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There is no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+									return h.maxTime.Load()
-												Expose series references to clients

This exposes a reference number of a series represented by a label set
to clients.
Subsequent samples can be directly added via the reference rather than
repeatedly passing in the full labels. This drasitcally speeds up the
append process.

The appender chain uses different sections of the reference number for
assignment to child appenders and invalidating reference numbers as
necessary.

Clients can either pass out reference numbers themselves or have their
own optimized lookup, i.e. by directly associating unparsed metric
descriptors strings with reference numbers.

											
										
										
											2017-01-12 18:18:51 +00:00
+								}
-												Add Head.compactable method (#542)

* Add Head.compactable method

Signed-off-by: zhulongcheng <zhulongcheng.me@gmail.com>
											
										
										
											2019-04-01 08:19:06 +00:00
+								// compactable returns whether the head has a compactable range.
 								// The head has a compactable range when the head time range is 1.5 times the chunk range.
 								// The 0.5 acts as a buffer of the appendable window.
 								func (h *Head) compactable() bool {
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/heaad: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There is no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+									return h.MaxTime()-h.MinTime() > h.chunkRange.Load()/2*3
-												Add Head.compactable method (#542)

* Add Head.compactable method

Signed-off-by: zhulongcheng <zhulongcheng.me@gmail.com>
											
										
										
											2019-04-01 08:19:06 +00:00
+								}
-												Close WAL when closing the DB

Also, the `wal` field of the `DB` was not used anywhere, so this removes
it.

											
										
										
											2017-11-10 20:19:39 +00:00
+								// Close flushes the WAL and closes the head.
-												Snapshot in-memory chunks on shutdown for faster restarts (#7229)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-06 16:51:01 +00:00
+								// It also takes a snapshot of in-memory chunks if enabled.
-												Close WAL when closing the DB

Also, the `wal` field of the `DB` was not used anywhere, so this removes
it.

											
										
										
											2017-11-10 20:19:39 +00:00
+								func (h *Head) Close() error {
-												More explicit chunks and  head error handling. (#7277)


											
										
										
											2020-05-22 09:03:23 +00:00
+									h.closedMtx.Lock()
 									defer h.closedMtx.Unlock()
 									h.closed = true
-												MultiError: Refactored MultiError for more concise and safe usage. (#8066)

* MultiError: Refactored MultiError for more concise and safe usage.

* Less lines
* Goland IDE was marking every usage of old MultiError "potential nil" error
* It was easy to forgot using Err() when error was returned, now it's safely assured on compile time.

NOTE: Potentially I would rename package to merrors. (: In different PR.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed review comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fix after rebase.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>
											
										
										
											2020-10-28 15:24:58 +00:00
+									errs := tsdb_errors.NewMulti(h.chunkDiskMapper.Close())
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									if h.wal != nil {
-												MultiError: Refactored MultiError for more concise and safe usage. (#8066)

* MultiError: Refactored MultiError for more concise and safe usage.

* Less lines
* Goland IDE was marking every usage of old MultiError "potential nil" error
* It was easy to forgot using Err() when error was returned, now it's safely assured on compile time.

NOTE: Potentially I would rename package to merrors. (: In different PR.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed review comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fix after rebase.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>
											
										
										
											2020-10-28 15:24:58 +00:00
+										errs.Add(h.wal.Close())
-												Integrate new WAL and checkpoints

Remove the old WAL and drop in the new one

Signed-off-by: Fabian Reinartz <freinartz@google.com>

											
										
										
											2018-05-17 13:04:32 +00:00
+									}
-												Take snapshot only after closing the WAL (#9328)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-09-13 13:00:41 +00:00
+									if errs.Err() == nil && h.opts.EnableMemorySnapshotOnShutdown {
 										errs.Add(h.performChunkSnapshot())
 									}
-												MultiError: Refactored MultiError for more concise and safe usage. (#8066)

* MultiError: Refactored MultiError for more concise and safe usage.

* Less lines
* Goland IDE was marking every usage of old MultiError "potential nil" error
* It was easy to forgot using Err() when error was returned, now it's safely assured on compile time.

NOTE: Potentially I would rename package to merrors. (: In different PR.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed review comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fix after rebase.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>
											
										
										
											2020-10-28 15:24:58 +00:00
+									return errs.Err()
-												Close WAL when closing the DB

Also, the `wal` field of the `DB` was not used anywhere, so this removes
it.

											
										
										
											2017-11-10 20:19:39 +00:00
+								}
-												Fix TSDB head struct dump on querier error (#8379)

* Fix TSDB head struct dump on querier error

Signed-off-by: Marco Pracucci <marco@pracucci.com>

* Added mint/maxt to RangeHead.String()

Signed-off-by: Marco Pracucci <marco@pracucci.com>
											
										
										
											2021-01-21 10:37:29 +00:00
+								// String returns a human-readable representation of the TSDB head. It's important to
 								// keep this function in order to avoid the struct dump when the head is stringified in
 								// errors or logs.
 								func (h *Head) String() string {
 									return "head"
 								}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+								func (h *Head) getOrCreate(hash uint64, lset labels.Labels) (*memSeries, bool, error) {
-												TSDB: optimize series creation on PreCreation() failure (#8620)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
											
										
										
											2021-03-18 15:23:50 +00:00
+									// Just using `getOrCreateWithID` below would be semantically sufficient, but we'd create
-												Simplify series create logic in head

											
										
										
											2017-09-18 10:28:56 +00:00
+									// a new series on every sample inserted via Add(), which causes allocations
 									// and makes our series IDs rather random and harder to compress in postings.
 									s := h.series.getByHash(hash, lset)
 									if s != nil {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+										return s, false, nil
-												Simplify series create logic in head

											
										
										
											2017-09-18 10:28:56 +00:00
+									}
-												Refactor WAL into Head and misc improvements

											
										
										
											2017-08-30 16:34:54 +00:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									// Optimistically assume that we are the first one to create the series.
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/heaad: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There is no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+									id := h.lastSeriesID.Inc()
-												Create series with ID recorded in WAL when reading it back

											
										
										
											2017-09-19 08:20:19 +00:00
 									return h.getOrCreateWithID(id, hash, lset)
 								}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+								func (h *Head) getOrCreateWithID(id, hash uint64, lset labels.Labels) (*memSeries, bool, error) {
-												TSDB: optimize series creation on PreCreation() failure (#8620)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
											
										
										
											2021-03-18 15:23:50 +00:00
+									s, created, err := h.series.getOrSet(hash, lset, func() *memSeries {
 										return newMemSeries(lset, id, h.chunkRange.Load(), &h.memChunkPool)
 									})
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+									if err != nil {
 										return nil, false, err
 									}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									if !created {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+										return s, false, nil
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									}
-												Simplify series create logic in head

											
										
										
											2017-09-18 10:28:56 +00:00
+									h.metrics.seriesCreated.Inc()
-												tsdb: Replace sync/atomic with uber-go/atomic in tsdb (#7659)

* tsdb/chunks: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb/heaad: Replace sync/atomic with uber-go/atomic

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* vendor: Make go.uber.org/atomic a direct dependency

There is no modifications to go.sum and vendor/ because
it was already vendored.

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>

* tsdb: Remove comments referring to the sync/atomic alignment bug

Related: https://golang.org/pkg/sync/atomic/#pkg-note-BUG

Signed-off-by: Javier Palomo <javier.palomo.almena@gmail.com>
											
										
										
											2020-07-28 04:42:42 +00:00
+									h.numSeries.Inc()
-												Simplify series create logic in head

											
										
										
											2017-09-18 10:28:56 +00:00
-												Fixed race between compact (gc, populate) and head append causing unknown symbol error. (#7560)

* Fixed race between compact (gc, populate) and head append causing unknown symbol error.

Fixes https://github.com/prometheus/prometheus/issues/7373

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed comments.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>
											
										
										
											2020-07-14 08:36:22 +00:00
+									h.postings.Add(id, lset)
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+									return s, true, nil
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+								}
 								// seriesHashmap is a simple hashmap for memSeries by their label set. It is built
 								// on top of a regular hashmap and holds a slice of series to resolve hash collisions.
 								// Its methods require the hash to be submitted with it to avoid re-computations throughout
 								// the code.
 								//
 								// Series that share a hash are told apart by comparing their label sets with
 								// labels.Equal. The map's own methods take no lock; callers such as
 								// stripeSeries.getByHash and stripeSeries.gc hold the stripe lock around each call.
 								type seriesHashmap map[uint64][]*memSeries
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 12:16:11 +00:00
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+								func (m seriesHashmap) get(hash uint64, lset labels.Labels) *memSeries {
 									for _, s := range m[hash] {
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 19:53:33 +00:00
+										if labels.Equal(s.lset, lset) {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+											return s
 										}
 									}
 									return nil
 								}
 								func (m seriesHashmap) set(hash uint64, s *memSeries) {
 									l := m[hash]
 									for i, prev := range l {
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 19:53:33 +00:00
+										if labels.Equal(prev.lset, s.lset) {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+											l[i] = s
 											return
 										}
 									}
 									m[hash] = append(l, s)
 								}
 								func (m seriesHashmap) del(hash uint64, lset labels.Labels) {
 									var rem []*memSeries
 									for _, s := range m[hash] {
-												Port tsdb to use pkg/labels. (#6326)

* Port tsdb to use pkg/labels.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Get tests passing.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Remove useless cast.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Appease linters.

Signed-off-by: Tom Wilkie <tom.wilkie@gmail.com>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-11-18 19:53:33 +00:00
+										if !labels.Equal(s.lset, lset) {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+											rem = append(rem, s)
 										}
 									}
 									if len(rem) == 0 {
 										delete(m, hash)
 									} else {
 										m[hash] = rem
 									}
 								}
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-30 07:12:43 +00:00
+								const (
 									// DefaultStripeSize is the default number of entries to allocate in the stripeSeries hash map.
 									DefaultStripeSize = 1 << 14
 								)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+								// stripeSeries locks modulo ranges of IDs and hashes to reduce lock contention.
-												Fix typos in comments (#254)

a the -> the
timestmap -> timestamp
badded -> padded
its -> it is
callers -> caller's
											
										
										
											2018-01-13 17:51:50 +00:00
+								// The locks are padded to not be on the same cache line. Filling the padded space
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+								// with the maps was profiled to be slower – likely due to the additional pointer
 								// dereferences.
 								type stripeSeries struct {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+									size                    int
 									series                  []map[uint64]*memSeries
 									hashes                  []seriesHashmap
 									locks                   []stripeLock
 									seriesLifecycleCallback SeriesLifecycleCallback
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+								}
 								// stripeLock is the lock type guarding a single stripe of stripeSeries; one is
 								// allocated per stripe in the locks slice.
 								type stripeLock struct {
 									// Embedded so Lock/RLock and friends can be called on the stripeLock directly.
 									sync.RWMutex
 									// Padding to avoid multiple locks being on the same cache line.
 									_ [40]byte
 								}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+								func newStripeSeries(stripeSize int, seriesCallback SeriesLifecycleCallback) *stripeSeries {
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-30 07:12:43 +00:00
+									s := &stripeSeries{
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+										size:                    stripeSize,
 										series:                  make([]map[uint64]*memSeries, stripeSize),
 										hashes:                  make([]seriesHashmap, stripeSize),
 										locks:                   make([]stripeLock, stripeSize),
 										seriesLifecycleCallback: seriesCallback,
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-30 07:12:43 +00:00
+									}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
 									for i := range s.series {
 										s.series[i] = map[uint64]*memSeries{}
 									}
 									for i := range s.hashes {
 										s.hashes[i] = seriesHashmap{}
 									}
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 12:02:38 +00:00
+									return s
-												Add new interfaces and skeleton

											
										
										
											2016-12-04 12:16:11 +00:00
+								}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+								// gc garbage collects old chunks that are strictly before mint and removes
 								// series entirely that have no chunks left.
-												Set the min time of Head properly after truncation (#8212)

* Set the min time of Head properly after truncation

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix lint

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Enhance compaction plan logic for completely deleted small block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-11-25 13:03:30 +00:00
+								func (s *stripeSeries) gc(mint int64) (map[uint64]struct{}, int, int64) {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									var (
-												Set the min time of Head properly after truncation (#8212)

* Set the min time of Head properly after truncation

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix lint

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Enhance compaction plan logic for completely deleted small block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-11-25 13:03:30 +00:00
+										deleted                  = map[uint64]struct{}{}
 										deletedForCallback       = []labels.Labels{}
 										rmChunks                 = 0
 										actualMint         int64 = math.MaxInt64
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									)
 									// Run through all series and truncate old chunks. Mark those with no
-												Filter WAL data in Head, misc fixes

											
										
										
											2017-09-06 14:20:37 +00:00
+									// chunks left as deleted and store their ID.
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-30 07:12:43 +00:00
+									for i := 0; i < s.size; i++ {
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+										s.locks[i].Lock()
 										for hash, all := range s.hashes[i] {
 											for _, series := range all {
-												Fix various races

											
										
										
											2017-09-08 06:48:19 +00:00
+												series.Lock()
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+												rmChunks += series.truncateChunksBefore(mint)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+												if len(series.mmappedChunks) > 0 || series.headChunk != nil || series.pendingCommit {
-												Set the min time of Head properly after truncation (#8212)

* Set the min time of Head properly after truncation

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix lint

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Enhance compaction plan logic for completely deleted small block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-11-25 13:03:30 +00:00
+													seriesMint := series.minTime()
 													if seriesMint < actualMint {
 														actualMint = seriesMint
 													}
-												Fix various races

											
										
										
											2017-09-08 06:48:19 +00:00
+													series.Unlock()
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+													continue
 												}
 												// The series is gone entirely. We need to keep the series lock
 												// and make sure we have acquired the stripe locks for hash and ID of the
 												// series alike.
 												// If we don't hold them all, there's a very small chance that a series receives
 												// samples again while we are half-way into deleting it.
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-30 07:12:43 +00:00
+												j := int(series.ref) & (s.size - 1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
 												if i != j {
 													s.locks[j].Lock()
 												}
 												deleted[series.ref] = struct{}{}
 												s.hashes[i].del(hash, series.lset)
 												delete(s.series[j], series.ref)
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+												deletedForCallback = append(deletedForCallback, series.lset)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
 												if i != j {
 													s.locks[j].Unlock()
 												}
-												Fix various races

											
										
										
											2017-09-08 06:48:19 +00:00
+												series.Unlock()
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+											}
 										}
 										s.locks[i].Unlock()
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
 										s.seriesLifecycleCallback.PostDeletion(deletedForCallback...)
 										deletedForCallback = deletedForCallback[:0]
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									}
-												Set the min time of Head properly after truncation (#8212)

* Set the min time of Head properly after truncation

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix lint

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Enhance compaction plan logic for completely deleted small block

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Fix review comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-11-25 13:03:30 +00:00
+									if actualMint == math.MaxInt64 {
 										actualMint = mint
 									}
 									return deleted, rmChunks, actualMint
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+								}
 								func (s *stripeSeries) getByID(id uint64) *memSeries {
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-30 07:12:43 +00:00
+									i := id & uint64(s.size-1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
 									s.locks[i].RLock()
 									series := s.series[i][id]
 									s.locks[i].RUnlock()
 									return series
 								}
 								func (s *stripeSeries) getByHash(hash uint64, lset labels.Labels) *memSeries {
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-30 07:12:43 +00:00
+									i := hash & uint64(s.size-1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
 									s.locks[i].RLock()
 									series := s.hashes[i].get(hash, lset)
 									s.locks[i].RUnlock()
 									return series
 								}
-												TSDB: optimize series creation on PreCreation() failure (#8620)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
											
										
										
											2021-03-18 15:23:50 +00:00
+								func (s *stripeSeries) getOrSet(hash uint64, lset labels.Labels, createSeries func() *memSeries) (*memSeries, bool, error) {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+									// PreCreation is called here to avoid calling it inside the lock.
 									// It is not necessary to call it just before creating a series,
 									// rather it gives a 'hint' whether to create a series or not.
-												TSDB: optimize series creation on PreCreation() failure (#8620)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
											
										
										
											2021-03-18 15:23:50 +00:00
+									preCreationErr := s.seriesLifecycleCallback.PreCreation(lset)
 									// Create the series, unless the PreCreation() callback has failed.
 									// If it failed, creating a new series is not allowed anyway.
 									var series *memSeries
 									if preCreationErr == nil {
 										series = createSeries()
 									}
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+									i := hash & uint64(s.size-1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									s.locks[i].Lock()
-												TSDB: optimize series creation on PreCreation() failure (#8620)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
											
										
										
											2021-03-18 15:23:50 +00:00
+									if prev := s.hashes[i].get(hash, lset); prev != nil {
-												Add missing unlock on early return

											
										
										
											2017-09-18 09:23:22 +00:00
+										s.locks[i].Unlock()
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+										return prev, false, nil
 									}
-												TSDB: optimize series creation on PreCreation() failure (#8620)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
											
										
										
											2021-03-18 15:23:50 +00:00
+									if preCreationErr == nil {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+										s.hashes[i].set(hash, series)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+									}
 									s.locks[i].Unlock()
-												TSDB: optimize series creation on PreCreation() failure (#8620)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
											
										
										
											2021-03-18 15:23:50 +00:00
+									if preCreationErr != nil {
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+										// The callback prevented creation of series.
-												TSDB: optimize series creation on PreCreation() failure (#8620)

Signed-off-by: Marco Pracucci <marco@pracucci.com>
											
										
										
											2021-03-18 15:23:50 +00:00
+										return nil, false, preCreationErr
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+									}
 									// Setting the series in s.hashes marks the creation of the series,
 									// as any further calls to this method would return that series.
 									s.seriesLifecycleCallback.PostCreation(series.lset)
-												made stripe size configurable (#6644)

Signed-off-by: Thor <thansen@digitalocean.com>
											
										
										
											2020-01-30 07:12:43 +00:00
+									i = series.ref & uint64(s.size-1)
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
 									s.locks[i].Lock()
 									s.series[i][series.ref] = series
 									s.locks[i].Unlock()
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
+									return series, true, nil
-												Replace single head lock with granular locks

This adds various new locks to replace the single big lock on
the head. All parts now must be COW as they may be held by clients
after initial retrieval.
Series by ID and hashes are now held in a stripe lock to reduce
contention and total holding time during GC. This should reduce
starvation of readers.

											
										
										
											2017-09-05 09:45:18 +00:00
+								}
-												Move BufferedSeriesIterator in own package

This functionality is useful for a lot of clients but not relevant to
the TSDB's core features.

											
										
										
											2017-03-24 09:20:39 +00:00
+								type sample struct {
 									t int64
 									v float64
 								}
-												tsdb: Added ChunkQueryable implementations to db; unified MergeSeriesSets and vertical to single struct. (#7069)

* tsdb: Added ChunkQueryable implementations to db; unified compactor, querier and fanout block iterating.

Chained to https://github.com/prometheus/prometheus/pull/7059

* NewMerge(Chunk)Querier now takes multiple primaries, allowing tsdb DB code to use it.
* Added single SeriesEntry / ChunkEntry for all series implementations.
* Unified all vertical, and non vertical for compact and querying to single
merge series / chunk sets by reusing VerticalSeriesMergeFunc for overlapping algorithm (same logic as before)
* Added block (Base/Chunk/)Querier for block querying. We then use populateAndTomb(Base/Chunk/) to iterate over chunks or samples.
* Refactored endpoint tests and querier tests to include subtests.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Addressed comments from Brian and Beorn.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed snapshot test and added chunk iterator support for DBReadOnly.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed race when iterating over Ats first.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed tests.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed populate block tests.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed endpoints test.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed test.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Added test & fixed case of head open chunk.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed DBReadOnly tests and bug producing 1 sample chunks.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Added cases for partial block overlap for multiple full chunks.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Added extra tests for chunk meta after compaction.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>

* Fixed small vertical merge bug and added more tests for that.

Signed-off-by: Bartlomiej Plotka <bwplotka@gmail.com>
											
										
										
											2020-07-31 15:03:02 +00:00
+								func newSample(t int64, v float64) tsdbutil.Sample { return sample{t, v} }
 								func (s sample) T() int64                          { return s.t }
 								func (s sample) V() float64                        { return s.v }
-												refactor util funcs to allow re-usage. (#419)

* refactor util funcs to allow reusage.

Signed-off-by: Krasi Georgiev <kgeorgie@redhat.com>
											
										
										
											2018-10-25 20:06:19 +00:00
-												Fix various races

											
										
										
											2017-09-08 06:48:19 +00:00
+								// memSeries is the in-memory representation of a series. None of its methods
-												Fix typos in comments (#254)

a the -> the
timestmap -> timestamp
badded -> padded
its -> it is
callers -> caller's
											
										
										
											2018-01-13 17:51:50 +00:00
+								// are goroutine safe and it is the caller's responsibility to lock it.
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 12:02:38 +00:00
+								type memSeries struct {
-												tsdb: fix races around head chunks (#6985)

* tsdb: fix races around head chunks

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-03-16 12:59:22 +00:00
+									sync.RWMutex
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added

											
										
										
											2017-01-09 15:51:39 +00:00
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									ref           uint64
 									lset          labels.Labels
 									mmappedChunks []*mmappedChunk
-												Optimise WAL loading by removing extra map and caching min-time (#9160)

* BenchmarkLoadWAL: close WAL after use

So that goroutines are stopped and resources released

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* BenchmarkLoadWAL: make series IDs co-prime with #workers

Series are distributed across workers by taking the modulus of the
ID with the number of workers, so multiples of 100 are a poor choice.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* BenchmarkLoadWAL: simulate mmapped chunks

Real Prometheus cuts chunks every 120 samples, then skips those samples
when re-reading the WAL. Simulate this by creating a single mapped chunk
for each series, since the max time is all the reader looks at.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* Fix comment

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* Remove series map from processWALSamples()

The locks that is commented to reduce contention in are now sharded
32,000 ways, so won't be contended. Removing the map saves memory and
goes just as fast.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* loadWAL: Cache the last mmapped chunk time

So we can skip calling append() for samples it will reject.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* Improvements from code review

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* Full stops and capitals on comments

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* Cache max time in both places mmappedChunks is updated

Including refactor to extract function `setMMappedChunks`, to reduce
code duplication.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>

* Update head min/max time when mmapped chunks added

This ensures we have the correct values if no WAL samples are added for
that series.

Note that `mSeries.maxTime()` was always `math.MinInt64` before, since
that function doesn't consider mmapped chunks.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>
											
										
										
											2021-08-10 09:23:31 +00:00
+									mmMaxTime     int64 // Max time of any mmapped chunk, only used during WAL replay.
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									headChunk     *memChunk
 									chunkRange    int64
 									firstChunkID  int
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added

											
										
										
											2017-01-09 15:51:39 +00:00
-												Fix race condition between gc and committing (#378)

Signed-off-by: Chris Marchbanks <csmarchbanks@gmail.com>
											
										
										
											2018-09-17 16:58:42 +00:00
+									nextAt        int64 // Timestamp at which to cut the next chunk.
 									sampleBuf     [4]sample
 									pendingCommit bool // Whether there are samples waiting to be committed to this series.
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added

											
										
										
											2017-01-09 15:51:39 +00:00
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 14:34:49 +00:00
+									app chunkenc.Appender // Current appender for the chunk.
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									memChunkPool *sync.Pool
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
+									txs *txRing
-												Make concurrent head chunk reads safe, fix misc races

This adds a 4 sample buffer to every head chunk. The XOR
compression scheme may edit bytes in place. The minimum size
of a sample is 2 bits. So keeping the last 4 samples in an in-memory
buffer makes it safe to query the preceding ones while samples
are added

											
										
										
											2017-01-09 15:51:39 +00:00
+								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+								func newMemSeries(lset labels.Labels, id uint64, chunkRange int64, memChunkPool *sync.Pool) *memSeries {
-												Dont store stones in head, delete samples directly

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-01-08 17:08:41 +00:00
+									s := &memSeries{
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+										lset:         lset,
 										ref:          id,
 										chunkRange:   chunkRange,
 										nextAt:       math.MinInt64,
 										txs:          newTxRing(4),
 										memChunkPool: memChunkPool,
-												Dont store stones in head, delete samples directly

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

											
										
										
											2019-01-08 17:08:41 +00:00
+									}
 									return s
 								}
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
management and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 22:39:17 +00:00
+								func (s *memSeries) minTime() int64 {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									if len(s.mmappedChunks) > 0 {
 										return s.mmappedChunks[0].minTime
-												Fix crash when a series has no block

											
										
										
											2018-02-07 13:43:21 +00:00
+									}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									if s.headChunk != nil {
 										return s.headChunk.minTime
 									}
 									return math.MinInt64
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
amangement and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 22:39:17 +00:00
+								}
 								func (s *memSeries) maxTime() int64 {
-												Fix crash when a series has no block

											
										
										
											2018-02-07 13:43:21 +00:00
+									c := s.head()
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+									if c != nil {
 										return c.maxTime
 									}
 									if len(s.mmappedChunks) > 0 {
 										return s.mmappedChunks[len(s.mmappedChunks)-1].maxTime
-												Fix crash when a series has no block

											
										
										
											2018-02-07 13:43:21 +00:00
+									}
-												Fix bugs and add enhancements to the chunk snapshot (#9185)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-17 17:08:16 +00:00
+									return math.MinInt64
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
amangement and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 22:39:17 +00:00
+								}
-												Test no panic after a WAL corruption (#7625)

* no panic the head memseries has chunks in it

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>

* fix a panic when querying after a wal corruption.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>

* review nits

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>

* Add test for reading the data after a wal corruption.

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>

Update tsdb/db_test.go

Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>

Update tsdb/db_test.go

Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>

* spellings

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>

Co-authored-by: Ganesh Vernekar <15064823+codesome@users.noreply.github.com>
											
										
										
											2020-07-21 07:02:13 +00:00
+								// truncateChunksBefore removes all chunks from the series that
 								// have no timestamp at or after mint.
 								// Chunk IDs remain unchanged.
-												Add various metrics

											
										
										
											2017-08-30 15:38:25 +00:00
+								func (s *memSeries) truncateChunksBefore(mint int64) (removed int) {
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									if s.headChunk != nil && s.headChunk.maxTime < mint {
 										// If head chunk is truncated, we can truncate all mmapped chunks.
-												tsdb: Remove duplicate variables. (#8239)

Signed-off-by: johncming <johncming@yahoo.com>
											
										
										
											2020-11-30 08:55:33 +00:00
+										removed = 1 + len(s.mmappedChunks)
 										s.firstChunkID += removed
-												Precalculate memSeries.head

This is read far more than it changes.
This cuts ~14% off walltme and ~27% off CPU for WAL reading.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 13:28:56 +00:00
+										s.headChunk = nil
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+										s.mmappedChunks = nil
-												tsdb: Remove duplicate variables. (#8239)

Signed-off-by: johncming <johncming@yahoo.com>
											
										
										
											2020-11-30 08:55:33 +00:00
+										return removed
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									}
 									if len(s.mmappedChunks) > 0 {
 										for i, c := range s.mmappedChunks {
 											if c.maxTime >= mint {
 												break
 											}
-												tsdb: Remove duplicate variables. (#8239)

Signed-off-by: johncming <johncming@yahoo.com>
											
										
										
											2020-11-30 08:55:33 +00:00
+											removed = i + 1
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+										}
-												tsdb: Remove duplicate variables. (#8239)

Signed-off-by: johncming <johncming@yahoo.com>
											
										
										
											2020-11-30 08:55:33 +00:00
+										s.mmappedChunks = append(s.mmappedChunks[:0], s.mmappedChunks[removed:]...)
 										s.firstChunkID += removed
-												Precalculate memSeries.head

This is read far more than it changes.
This cuts ~14% off walltme and ~27% off CPU for WAL reading.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 13:28:56 +00:00
+									}
-												tsdb: Remove duplicate variables. (#8239)

Signed-off-by: johncming <johncming@yahoo.com>
											
										
										
											2020-11-30 08:55:33 +00:00
+									return removed
-												Remove multiple heads

This changes the structure to a single WAL backed by a single head
block.
Parts of the head block can be compacted. This relieves us from any head
amangement and greatly simplifies any consistency and isolation concerns
by just having a single head.

											
										
										
											2017-08-28 22:39:17 +00:00
+								}
-												Implement isolation

This has been ported from https://github.com/prometheus/tsdb/pull/306.

Original implementation by @brian-brazil, explained in detail in the
2nd half of this talk:
https://promcon.io/2017-munich/talks/staleness-in-prometheus-2-0/

The implementation was then processed by @gouthamve into the PR linked
above. Relevant slide deck:
https://docs.google.com/presentation/d/1-ICg7PEmDHYcITykD2SR2xwg56Tzf4gr8zfz1OerY5Y/edit?usp=drivesdk

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Brian Brazil <brian.brazil@robustperception.io>
Co-authored-by: Goutham Veeramachaneni <gouthamve@gmail.com>

											
										
										
											2020-02-12 19:22:27 +00:00
 								// cleanupAppendIDsBelow cleans up older appendIDs. It has to be called
 								// after acquiring the lock.
 								// It forwards bound unchanged to the cleanupAppendIDsBelow method of s.txs.
 								func (s *memSeries) cleanupAppendIDsBelow(bound uint64) {
 									s.txs.cleanupAppendIDsBelow(bound)
 								}
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 12:02:38 +00:00
+								func (s *memSeries) head() *memChunk {
-												Precalculate memSeries.head

This is read far more than it changes.
This cuts ~14% off walltme and ~27% off CPU for WAL reading.

Signed-off-by: Brian Brazil <brian.brazil@robustperception.io>

											
										
										
											2018-10-31 13:28:56 +00:00
+									return s.headChunk
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 12:02:38 +00:00
+								}
 								type memChunk struct {
-												Move index and chunk encoders to own packages

											
										
										
											2017-11-30 14:34:49 +00:00
+									chunk            chunkenc.Chunk
-												Replace single head chunk per series with memSeries

This adds a memory series holding several chunk to replace
the single head chunk per series so far.
This is necessary for uniform maximum chunk sizes in cases
where some series have higher frequency samples than others.

											
										
										
											2017-01-11 12:02:38 +00:00
+									minTime, maxTime int64
 								}
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-mapped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for its mmapped chunks in the map created before and add them to that series.

If a head chunk is corrupted, the corrupted one and all chunks after it are deleted, and the data after the corruption is recovered from the existing WAL, which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+								// OverlapsClosedInterval returns true if the chunk overlaps [mint, maxt].
-												Make interval overlap comparisons more explicit

Blocks are half-open intervals [a, b), while all other intervals
(chunks, head, ...) are closed intervals [a, b].

Make that distinction explicit by defining `OverlapsClosedInterval()`
methods for blocks and chunks, and using them in place of the more
generic `intervalOverlap()` function.

This change also fixes `db.Querier()` and `db.Delete()`, which could
previously return one extraneous block at the end of the specified
interval.

Signed-off-by: Benoît Knecht <benoit.knecht@fsfe.org>

											
										
										
											2018-07-02 08:23:36 +00:00
+								func (mc *memChunk) OverlapsClosedInterval(mint, maxt int64) bool {
-												Enhanced WAL replay for duplicate series record (#7438)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 14:33:54 +00:00
+									return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)
 								}
 								// overlapsClosedInterval reports whether the closed intervals
 								// [mint1, maxt1] and [mint2, maxt2] share at least one point. Because both
 								// ends are inclusive, intervals that merely touch at an endpoint overlap.
 								func overlapsClosedInterval(mint1, maxt1, mint2, maxt2 int64) bool {
 									return mint1 <= maxt2 && mint2 <= maxt1
 								}
-												TSDB: Use a dedicated head chunk reference type (#9501)

* Use dedicated Ref type

Throughout the code base, there are reference types masked as
regular integers.  Let's use dedicated types.  They are
equivalent, but clearer semantically.
This also makes it trivial to find where they are used,
and from uses, find the centralized docs.

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* postpone some work until after possible return

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* clarify

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* rename feedback

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* skip header is up to caller

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>
											
										
										
											2021-10-13 12:14:32 +00:00
+								// mappedChunks describes chunk data on disk that can be mmapped
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+								type mmappedChunk struct {
-												TSDB: Use a dedicated head chunk reference type (#9501)

* Use dedicated Ref type

Throughout the code base, there are reference types masked as
regular integers.  Let's use dedicated types.  They are
equivalent, but clearer semantically.
This also makes it trivial to find where they are used,
and from uses, find the centralized docs.

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* postpone some work until after possible return

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* clarify

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* rename feedback

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>

* skip header is up to caller

Signed-off-by: Dieter Plaetinck <dieter@grafana.com>
											
										
										
											2021-10-13 12:14:32 +00:00
+									ref              chunks.ChunkDiskMapperRef
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h Wal reply time
30% less wal reply time - 4m31 vs 3m36
2h Wal reply time
20% less wal reply time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+									numSamples       uint16
 									minTime, maxTime int64
 								}
 								// Returns true if the chunk overlaps [mint, maxt].
 								func (mc *mmappedChunk) OverlapsClosedInterval(mint, maxt int64) bool {
-												Enhanced WAL replay for duplicate series record (#7438)

Signed-off-by: Ganesh Vernekar <ganeshvern@gmail.com>
											
										
										
											2021-08-03 14:33:54 +00:00
+									return overlapsClosedInterval(mc.minTime, mc.maxTime, mint, maxt)
-												M-map full chunks of Head from disk (#6679)

When appending to the head and a chunk is full it is flushed to the disk and m-mapped (memory mapped) to free up memory

Prom startup now happens in these stages
 - Iterate the m-maped chunks from disk and keep a map of series reference to its slice of mmapped chunks.
- Iterate the WAL as usual. Whenever we create a new series, look for it's mmapped chunks in the map created before and add it to that series.

If a head chunk is corrupted the currpted one and all chunks after that are deleted and the data after the corruption is recovered from the existing WAL which means that a corruption in m-mapped files results in NO data loss.

[Mmaped chunks format](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/head_chunks.md)  - main difference is that the chunk for mmaping now also includes series reference because there is no index for mapping series to chunks.
[The block chunks](https://github.com/prometheus/prometheus/blob/master/tsdb/docs/format/chunks.md) are accessed from the index which includes the offsets for the chunks in the chunks file - example - chunks of series ID have offsets 200, 500 etc in the chunk files.
In case of mmaped chunks, the offsets are stored in memory and accessed from that. During WAL replay, these offsets are restored by iterating all m-mapped chunks as stated above by matching the series id present in the chunk header and offset of that chunk in that file.

**Prombench results**

_WAL Replay_

1h WAL replay time
30% less WAL replay time - 4m31 vs 3m36
2h WAL replay time
20% less WAL replay time - 8m16 vs 7m

_Memory During WAL Replay_

High Churn:
10-15% less RAM -  32gb vs 28gb
20% less RAM after compaction 34gb vs 27gb
No Churn:
20-30% less RAM -  23gb vs 18gb
40% less RAM after compaction 32.5gb vs 20gb

Screenshots are in [this comment](https://github.com/prometheus/prometheus/pull/6679#issuecomment-621678932)


Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-06 15:30:00 +00:00
+								}
-												Callbacks for lifecycle of series in TSDB (#7159)

* Callbacks for lifecycle of series in TSDB

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>

* Add more comments

Signed-off-by: Ganesh Vernekar <cs15btech11018@iith.ac.in>
											
										
										
											2020-05-20 13:22:08 +00:00
 								// noopSeriesLifecycleCallback is a series lifecycle callback whose
 								// methods all do nothing.
 								type noopSeriesLifecycleCallback struct{}

 								// PreCreation ignores its argument and always returns a nil error.
 								func (noopSeriesLifecycleCallback) PreCreation(_ labels.Labels) error {
 									return nil
 								}

 								// PostCreation ignores its argument and does nothing.
 								func (noopSeriesLifecycleCallback) PostCreation(_ labels.Labels) {
 								}

 								// PostDeletion ignores its arguments and does nothing.
 								func (noopSeriesLifecycleCallback) PostDeletion(_ ...labels.Labels) {
 								}
-												Promtool tsdb list now prints block sizes (#7993)

* promtool tsdb list now prints blocks' size

Signed-off-by: arthursens <arthursens2005@gmail.com>
											
										
										
											2020-10-12 21:15:40 +00:00
 								func (h *Head) Size() int64 {
 									var walSize int64
 									if h.wal != nil {
 										walSize, _ = h.wal.Size()
 									}
-												Calculate head chunk size based on actual disk usage (#8139)

Signed-off-by: Julien Pivotto <roidelapluie@inuits.eu>
											
										
										
											2020-11-03 10:04:59 +00:00
+									cdmSize, _ := h.chunkDiskMapper.Size()
 									return walSize + cdmSize
-												Promtool tsdb list now prints block sizes (#7993)

* promtool tsdb list now prints blocks' size

Signed-off-by: arthursens <arthursens2005@gmail.com>
											
										
										
											2020-10-12 21:15:40 +00:00
+								}
 								// Size reports the size of the wrapped head; it delegates
 								// directly to the underlying head's Size method.
 								func (h *RangeHead) Size() int64 {
 									headSize := h.head.Size()
 									return headSize
 								}
-												React UI: Add Starting Screen (#8662)

* Added walreplay API endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added starting page to react-ui

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Documented the new endpoint

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed typos

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>

* Removed logo

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed isResponding to isUnexpected

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed width of progress bar

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added DB stats object

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Updated starting page to work with new fields

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 2)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 3)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 4)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (and also implementing a method this time) (pt. 5)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed const to let

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Passing nil (pt. 6)

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Remove SetStats method

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Added comma

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed api

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed to triple equals

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed data response types

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Don't return pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Changed version

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed interface issue

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed pointer

Signed-off-by: Levi Harrison <git@leviharrison.dev>

* Fixed copying lock value error

Signed-off-by: Levi Harrison <git@leviharrison.dev>

Co-authored-by: Julius Volz <julius.volz@gmail.com>
											
										
										
											2021-06-05 14:29:32 +00:00
 								// startWALReplayStatus initialises the WAL replay progress kept in
 								// h.stats.WALReplayStatus: Min and Current are set to startFrom and
 								// Max is set to last. All three fields are written while holding the
 								// status lock.
 								func (h *Head) startWALReplayStatus(startFrom, last int) {
 									h.stats.WALReplayStatus.Lock()
 									defer h.stats.WALReplayStatus.Unlock()

 									// Replay begins at the lower bound, so Current starts equal to Min.
 									h.stats.WALReplayStatus.Current = startFrom
 									h.stats.WALReplayStatus.Min = startFrom
 									h.stats.WALReplayStatus.Max = last
 								}
 								// updateWALReplayStatusRead records current as the Current value of
 								// h.stats.WALReplayStatus. The write happens while holding the status
 								// lock, which is released when the function returns.
 								func (h *Head) updateWALReplayStatusRead(current int) {
 									h.stats.WALReplayStatus.Lock()
 									defer h.stats.WALReplayStatus.Unlock()
 									h.stats.WALReplayStatus.Current = current
 								}