prometheus/storage/local/storage.go

// Copyright 2014 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package local contains the local time series storage used by Prometheus.
package local

import (
	"container/list"
	"sync"
	"sync/atomic"
	"time"

	"github.com/golang/glog"
	"github.com/prometheus/client_golang/prometheus"

	clientmodel "github.com/prometheus/client_golang/model"

	"github.com/prometheus/prometheus/storage/metric"
)

const (
	evictRequestsCap = 1024
	chunkLen         = 1024

	// See waitForNextFP.
	fpMaxWaitDuration = 10 * time.Second
	fpMinWaitDuration = 20 * time.Millisecond // A small multiple of disk seek time.
	fpMaxSweepTime    = 6 * time.Hour

	maxEvictInterval = time.Minute
	headChunkTimeout = time.Hour // Close head chunk if not touched for that long.

	appendWorkers  = 8 // Should be enough to not make appending a bottleneck.
	appendQueueCap = 2 * appendWorkers
)

type storageState uint

const (
	storageStarting storageState = iota
	storageServing
	storageStopping
)

type persistRequest struct {
	fingerprint clientmodel.Fingerprint
	chunkDesc   *chunkDesc
}

type evictRequest struct {
	cd    *chunkDesc
	evict bool
}

type memorySeriesStorage struct {
	fpLocker   *fingerprintLocker
	fpToSeries *seriesMap

	loopStopping, loopStopped  chan struct{}
	maxMemoryChunks            int
	dropAfter                  time.Duration
	checkpointInterval         time.Duration
	checkpointDirtySeriesLimit int

	appendQueue         chan *clientmodel.Sample
	appendLastTimestamp clientmodel.Timestamp // The timestamp of the last sample sent to the append queue.
	appendWaitGroup     sync.WaitGroup        // To wait for all appended samples to be processed.

	persistQueue    chan persistRequest
	persistQueueCap int // Not actually the cap of above channel. See handlePersistQueue.
	persistStopped  chan struct{}

	persistence *persistence

	countPersistedHeadChunks chan struct{}

	evictList                   *list.List
	evictRequests               chan evictRequest
	evictStopping, evictStopped chan struct{}

	persistLatency              prometheus.Summary
	persistErrors               prometheus.Counter
	persistQueueCapacity        prometheus.Metric
	persistQueueLength          prometheus.Gauge
	numSeries                   prometheus.Gauge
	seriesOps                   *prometheus.CounterVec
	ingestedSamplesCount        prometheus.Counter
	invalidPreloadRequestsCount prometheus.Counter
}

// MemorySeriesStorageOptions contains options needed by
// NewMemorySeriesStorage. It is not safe to leave any of those at their zero
// values.
type MemorySeriesStorageOptions struct {
	MemoryChunks               int           // How many chunks to keep in memory.
	PersistenceStoragePath     string        // Location of persistence files.
	PersistenceRetentionPeriod time.Duration // Chunks at least that old are dropped.
	PersistenceQueueCapacity   int           // Capacity of queue for chunks to be persisted.
	CheckpointInterval         time.Duration // How often to checkpoint the series map and head chunks.
	CheckpointDirtySeriesLimit int           // How many dirty series will trigger an early checkpoint.
	Dirty                      bool          // Force the storage to consider itself dirty on startup.
}

// NewMemorySeriesStorage returns a newly allocated Storage. Storage.Serve still
// has to be called to start the storage.
func NewMemorySeriesStorage(o *MemorySeriesStorageOptions) (Storage, error) {
	p, err := newPersistence(o.PersistenceStoragePath, chunkLen, o.Dirty)
	if err != nil {
		return nil, err
	}
	glog.Info("Loading series map and head chunks...")
	fpToSeries, err := p.loadSeriesMapAndHeads()
	if err != nil {
		return nil, err
	}
	glog.Infof("%d series loaded.", fpToSeries.length())
	numSeries := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: namespace,
		Subsystem: subsystem,
		Name:      "memory_series",
		Help:      "The current number of series in memory.",
	})
	numSeries.Set(float64(fpToSeries.length()))

	s := &memorySeriesStorage{
		fpLocker:   newFingerprintLocker(1024),
		fpToSeries: fpToSeries,

		loopStopping:               make(chan struct{}),
		loopStopped:                make(chan struct{}),
		maxMemoryChunks:            o.MemoryChunks,
		dropAfter:                  o.PersistenceRetentionPeriod,
		checkpointInterval:         o.CheckpointInterval,
		checkpointDirtySeriesLimit: o.CheckpointDirtySeriesLimit,

		appendLastTimestamp: clientmodel.Earliest,
		appendQueue:         make(chan *clientmodel.Sample, appendQueueCap),

		// The actual buffering happens within handlePersistQueue, so
		// cap of persistQueue just has to be enough to not block while
		// handlePersistQueue is writing to disk (20ms or so).
		persistQueue:    make(chan persistRequest, 1024),
		persistQueueCap: o.PersistenceQueueCapacity,
		persistStopped:  make(chan struct{}),
		persistence:     p,

		countPersistedHeadChunks: make(chan struct{}, 100),

		evictList:     list.New(),
		evictRequests: make(chan evictRequest, evictRequestsCap),
		evictStopping: make(chan struct{}),
		evictStopped:  make(chan struct{}),

		persistLatency: prometheus.NewSummary(prometheus.SummaryOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "persist_latency_microseconds",
			Help:      "A summary of latencies for persisting each chunk.",
		}),
		persistErrors: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "persist_errors_total",
			Help:      "The total number of errors while persisting chunks.",
		}),
		persistQueueCapacity: prometheus.MustNewConstMetric(
			prometheus.NewDesc(
				prometheus.BuildFQName(namespace, subsystem, "persist_queue_capacity"),
				"The total capacity of the persist queue.",
				nil, nil,
			),
			prometheus.GaugeValue, float64(o.PersistenceQueueCapacity),
		),
		persistQueueLength: prometheus.NewGauge(prometheus.GaugeOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "persist_queue_length",
			Help:      "The current number of chunks waiting in the persist queue.",
		}),
		numSeries: numSeries,
		seriesOps: prometheus.NewCounterVec(
			prometheus.CounterOpts{
				Namespace: namespace,
				Subsystem: subsystem,
				Name:      "series_ops_total",
				Help:      "The total number of series operations by their type.",
			},
			[]string{opTypeLabel},
		),
		ingestedSamplesCount: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "ingested_samples_total",
			Help:      "The total number of samples ingested.",
		}),
		invalidPreloadRequestsCount: prometheus.NewCounter(prometheus.CounterOpts{
			Namespace: namespace,
			Subsystem: subsystem,
			Name:      "invalid_preload_requests_total",
			Help:      "The total number of preload requests referring to a non-existent series. This is an indication of outdated label indexes.",
		}),
	}

	for i := 0; i < appendWorkers; i++ {
		go func() {
			for sample := range s.appendQueue {
				s.appendSample(sample)
				s.appendWaitGroup.Done()
			}
		}()
	}

	return s, nil
}

// Start implements Storage.
func (s *memorySeriesStorage) Start() {
	go s.handleEvictList()
	go s.handlePersistQueue()
	go s.loop()
}

// Stop implements Storage.
func (s *memorySeriesStorage) Stop() error {
	glog.Info("Stopping local storage...")

	glog.Info("Draining append queue...")
	close(s.appendQueue)
	s.appendWaitGroup.Wait()
	glog.Info("Append queue drained.")

	glog.Info("Stopping maintenance loop...")
	close(s.loopStopping)
	<-s.loopStopped

	glog.Info("Stopping persist queue...")
	close(s.persistQueue)
	<-s.persistStopped

	glog.Info("Stopping chunk eviction...")
	close(s.evictStopping)
	<-s.evictStopped

	// One final checkpoint of the series map and the head chunks.
	if err := s.persistence.checkpointSeriesMapAndHeads(s.fpToSeries, s.fpLocker); err != nil {
		return err
	}

	if err := s.persistence.close(); err != nil {
		return err
	}
	glog.Info("Local storage stopped.")
	return nil
}

// WaitForIndexing implements Storage.
func (s *memorySeriesStorage) WaitForIndexing() {
	// First let all goroutines appending samples stop.
	s.appendWaitGroup.Wait()
	// Only then wait for the persistence to index them.
	s.persistence.waitForIndexing()
}

// NewIterator implements storage.
func (s *memorySeriesStorage) NewIterator(fp clientmodel.Fingerprint) SeriesIterator {
	s.fpLocker.Lock(fp)
	defer s.fpLocker.Unlock(fp)

	series, ok := s.fpToSeries.get(fp)
	if !ok {
		// Oops, no series for fp found. That happens if, after
		// preloading is done, the whole series is identified as old
		// enough for purging and hence purged for good. As there is no
		// data left to iterate over, return an iterator that will never
		// return any values.
		return nopSeriesIterator{}
	}
	return series.newIterator(
		func() { s.fpLocker.Lock(fp) },
		func() { s.fpLocker.Unlock(fp) },
	)
}

// NewPreloader implements Storage.
func (s *memorySeriesStorage) NewPreloader() Preloader {
	return &memorySeriesPreloader{
		storage: s,
	}
}

// GetFingerprintsForLabelMatchers implements Storage.
func (s *memorySeriesStorage) GetFingerprintsForLabelMatchers(labelMatchers metric.LabelMatchers) clientmodel.Fingerprints {
	var result map[clientmodel.Fingerprint]struct{}
	for _, matcher := range labelMatchers {
		intersection := map[clientmodel.Fingerprint]struct{}{}
		switch matcher.Type {
		case metric.Equal:
			fps, err := s.persistence.getFingerprintsForLabelPair(
				metric.LabelPair{
					Name:  matcher.Name,
					Value: matcher.Value,
				},
			)
			if err != nil {
				glog.Error("Error getting fingerprints for label pair: ", err)
			}
			if len(fps) == 0 {
				return nil
			}
			for _, fp := range fps {
				if _, ok := result[fp]; ok || result == nil {
					intersection[fp] = struct{}{}
				}
			}
		default:
			values, err := s.persistence.getLabelValuesForLabelName(matcher.Name)
			if err != nil {
				glog.Errorf("Error getting label values for label name %q: %v", matcher.Name, err)
			}
			matches := matcher.Filter(values)
			if len(matches) == 0 {
				return nil
			}
			for _, v := range matches {
				fps, err := s.persistence.getFingerprintsForLabelPair(
					metric.LabelPair{
						Name:  matcher.Name,
						Value: v,
					},
				)
				if err != nil {
					glog.Error("Error getting fingerprints for label pair: ", err)
				}
				for _, fp := range fps {
					if _, ok := result[fp]; ok || result == nil {
						intersection[fp] = struct{}{}
					}
				}
			}
		}
		if len(intersection) == 0 {
			return nil
		}
		result = intersection
	}

	fps := make(clientmodel.Fingerprints, 0, len(result))
	for fp := range result {
		fps = append(fps, fp)
	}
	return fps
}

// GetLabelValuesForLabelName implements Storage.
func (s *memorySeriesStorage) GetLabelValuesForLabelName(labelName clientmodel.LabelName) clientmodel.LabelValues {
	lvs, err := s.persistence.getLabelValuesForLabelName(labelName)
	if err != nil {
		glog.Errorf("Error getting label values for label name %q: %v", labelName, err)
	}
	return lvs
}

// GetMetricForFingerprint implements Storage.
func (s *memorySeriesStorage) GetMetricForFingerprint(fp clientmodel.Fingerprint) clientmodel.COWMetric {
	s.fpLocker.Lock(fp)
	defer s.fpLocker.Unlock(fp)

	series, ok := s.fpToSeries.get(fp)
	if ok {
		// Wrap the returned metric in a copy-on-write (COW) metric here because
		// the caller might mutate it.
		return clientmodel.COWMetric{
			Metric: series.metric,
		}
	}
	metric, err := s.persistence.getArchivedMetric(fp)
	if err != nil {
		glog.Errorf("Error retrieving archived metric for fingerprint %v: %v", fp, err)
	}
	return clientmodel.COWMetric{
		Metric: metric,
	}
}

// AppendSamples implements Storage.
func (s *memorySeriesStorage) AppendSamples(samples clientmodel.Samples) {
	for _, sample := range samples {
		if sample.Timestamp != s.appendLastTimestamp {
			// Timestamp has changed. We have to wait for processing
			// of all appended samples before proceeding. Otherwise,
			// we might violate the storage contract that each
			// sample appended to a given series has to have a
			// timestamp greater or equal to the previous sample
			// appended to that series.
			s.appendWaitGroup.Wait()
			s.appendLastTimestamp = sample.Timestamp
		}
		s.appendWaitGroup.Add(1)
		s.appendQueue <- sample
	}
}

func (s *memorySeriesStorage) appendSample(sample *clientmodel.Sample) {
	fp := sample.Metric.Fingerprint()
	s.fpLocker.Lock(fp)
	series := s.getOrCreateSeries(fp, sample.Metric)
	chunkDescsToPersist := series.add(fp, &metric.SamplePair{
		Value:     sample.Value,
		Timestamp: sample.Timestamp,
	})
	s.fpLocker.Unlock(fp)
	s.ingestedSamplesCount.Inc()

	if len(chunkDescsToPersist) == 0 {
		return
	}
	// Queue only outside of the locked area, processing the persistQueue
	// requires the same lock!
	for _, cd := range chunkDescsToPersist {
		s.persistQueue <- persistRequest{fp, cd}
	}
	// Count that a head chunk was persisted, but only best effort, i.e. we
	// don't want to block here.
	select {
	case s.countPersistedHeadChunks <- struct{}{}: // Counted.
	default: // Meh...
	}
}

func (s *memorySeriesStorage) getOrCreateSeries(fp clientmodel.Fingerprint, m clientmodel.Metric) *memorySeries {
	series, ok := s.fpToSeries.get(fp)
	if !ok {
		unarchived, firstTime, err := s.persistence.unarchiveMetric(fp)
		if err != nil {
			glog.Errorf("Error unarchiving fingerprint %v: %v", fp, err)
		}
		if unarchived {
			s.seriesOps.WithLabelValues(unarchive).Inc()
		} else {
			// This was a genuinely new series, so index the metric.
			s.persistence.indexMetric(fp, m)
			s.seriesOps.WithLabelValues(create).Inc()
		}
		series = newMemorySeries(m, !unarchived, firstTime)
		s.fpToSeries.put(fp, series)
		s.numSeries.Inc()
	}
	return series
}

func (s *memorySeriesStorage) preloadChunksForRange(
	fp clientmodel.Fingerprint,
	from clientmodel.Timestamp, through clientmodel.Timestamp,
	stalenessDelta time.Duration,
) ([]*chunkDesc, error) {
	s.fpLocker.Lock(fp)
	defer s.fpLocker.Unlock(fp)

	series, ok := s.fpToSeries.get(fp)
	if !ok {
		has, first, last, err := s.persistence.hasArchivedMetric(fp)
		if err != nil {
			return nil, err
		}
		if !has {
			s.invalidPreloadRequestsCount.Inc()
			return nil, nil
		}
		if from.Add(-stalenessDelta).Before(last) && through.Add(stalenessDelta).After(first) {
			metric, err := s.persistence.getArchivedMetric(fp)
			if err != nil {
				return nil, err
			}
			series = s.getOrCreateSeries(fp, metric)
		} else {
			return nil, nil
		}
	}
	return series.preloadChunksForRange(from, through, fp, s)
}

func (s *memorySeriesStorage) handleEvictList() {
	ticker := time.NewTicker(maxEvictInterval)
	count := 0

	for {
		// To batch up evictions a bit, this tries evictions at least
		// once per evict interval, but earlier if the number of evict
		// requests with evict==true that have happened since the last
		// evict run is more than maxMemoryChunks/1000.
		select {
		case req := <-s.evictRequests:
			if req.evict {
				req.cd.evictListElement = s.evictList.PushBack(req.cd)
				count++
				if count > s.maxMemoryChunks/1000 {
					s.maybeEvict()
					count = 0
				}
			} else {
				if req.cd.evictListElement != nil {
					s.evictList.Remove(req.cd.evictListElement)
					req.cd.evictListElement = nil
				}
			}
		case <-ticker.C:
			if s.evictList.Len() > 0 {
				s.maybeEvict()
			}
		case <-s.evictStopping:
			// Drain evictRequests forever in a goroutine to not let
			// requesters hang.
			go func() {
				for {
					<-s.evictRequests
				}
			}()
			ticker.Stop()
			glog.Info("Chunk eviction stopped.")
			close(s.evictStopped)
			return
		}
	}
}

// maybeEvict is a local helper method. Must only be called by handleEvictList.
func (s *memorySeriesStorage) maybeEvict() {
	numChunksToEvict := int(atomic.LoadInt64(&numMemChunks)) - s.maxMemoryChunks
	if numChunksToEvict <= 0 {
		return
	}
	chunkDescsToEvict := make([]*chunkDesc, numChunksToEvict)
	for i := range chunkDescsToEvict {
		e := s.evictList.Front()
		if e == nil {
			break
		}
		cd := e.Value.(*chunkDesc)
		cd.evictListElement = nil
		chunkDescsToEvict[i] = cd
		s.evictList.Remove(e)
	}
	// Do the actual eviction in a goroutine as we might otherwise deadlock,
	// in the following way: A chunk was unpinned completely and therefore
	// scheduled for eviction. At the time we actually try to evict it,
	// another goroutine is pinning the chunk. The pinning goroutine has
	// currently locked the chunk and tries to send the evict request (to
	// remove the chunk from the evict list) to the evictRequests
	// channel. The send blocks because evictRequests is full. However, the
	// goroutine that is supposed to empty the channel is waiting for the
	// chunkDesc lock to try to evict the chunk.
	go func() {
		for _, cd := range chunkDescsToEvict {
			if cd == nil {
				break
			}
			cd.maybeEvict()
			// We don't care if the eviction succeeds. If the chunk
			// was pinned in the meantime, it will be added to the
			// evict list once it gets unpinned again.
		}
	}()
}

func (s *memorySeriesStorage) handlePersistQueue() {
	chunkMaps := chunkMaps{}
	chunkCount := 0

	persistMostConsecutiveChunks := func() {
		fp, cds := chunkMaps.pop()
		if err := s.persistChunks(fp, cds); err != nil {
			// Need to put chunks back for retry.
			for _, cd := range cds {
				chunkMaps.add(fp, cd)
			}
			return
		}
		chunkCount -= len(cds)
		s.persistQueueLength.Set(float64(chunkCount))
	}

loop:
	for {
		if chunkCount >= s.persistQueueCap && chunkCount > 0 {
			glog.Warningf("%d chunks queued for persistence. Ingestion pipeline will backlog.", chunkCount)
			persistMostConsecutiveChunks()
		}
		select {
		case req, ok := <-s.persistQueue:
			if !ok {
				break loop
			}
			chunkMaps.add(req.fingerprint, req.chunkDesc)
			chunkCount++
		default:
			if chunkCount > 0 {
				persistMostConsecutiveChunks()
				continue loop
			}
			// If we are here, there is nothing to do right now. So
			// just wait for a persist request to come in.
			req, ok := <-s.persistQueue
			if !ok {
				break loop
			}
			chunkMaps.add(req.fingerprint, req.chunkDesc)
			chunkCount++
		}
		s.persistQueueLength.Set(float64(chunkCount))
	}

	// Drain all requests.
	for _, m := range chunkMaps {
		for fp, cds := range m {
			if s.persistChunks(fp, cds) == nil {
				chunkCount -= len(cds)
				if (chunkCount+len(cds))/1000 > chunkCount/1000 {
					glog.Infof(
						"Still draining persist queue, %d chunks left to persist...",
						chunkCount,
					)
				}
				s.persistQueueLength.Set(float64(chunkCount))
			}
		}
	}

	glog.Info("Persist queue drained and stopped.")
	close(s.persistStopped)
}

func (s *memorySeriesStorage) persistChunks(fp clientmodel.Fingerprint, cds []*chunkDesc) error {
	start := time.Now()
	chunks := make([]chunk, len(cds))
	for i, cd := range cds {
		chunks[i] = cd.chunk
	}
	s.fpLocker.Lock(fp)
	offset, err := s.persistence.persistChunks(fp, chunks)
	if series, seriesInMemory := s.fpToSeries.get(fp); err == nil && seriesInMemory && series.chunkDescsOffset == -1 {
		// This is the first chunk persisted for a newly created
		// series that had prior chunks on disk. Finally, we can
		// set the chunkDescsOffset.
		series.chunkDescsOffset = offset
	}
	s.fpLocker.Unlock(fp)
	s.persistLatency.Observe(float64(time.Since(start)) / float64(time.Microsecond))
	if err != nil {
		s.persistErrors.Inc()
		glog.Error("Error persisting chunks: ", err)
		s.persistence.setDirty(true)
		return err
	}
	for _, cd := range cds {
		cd.unpin(s.evictRequests)
	}
	chunkOps.WithLabelValues(persistAndUnpin).Add(float64(len(cds)))
	return nil
}

// waitForNextFP waits an estimated duration, after which we want to process
// another fingerprint so that we will process all fingerprints in a tenth of
// s.dropAfter assuming that the system is doing nothing else, e.g. if we want
// to drop chunks after 40h, we want to cycle through all fingerprints within
// 4h. However, the maximum sweep time is capped at fpMaxSweepTime. Furthermore,
// this method will always wait for at least fpMinWaitDuration and never longer
// than fpMaxWaitDuration. If s.loopStopped is closed, it will return false
// immediately. The estimation is based on the total number of fingerprints as
// passed in.
func (s *memorySeriesStorage) waitForNextFP(numberOfFPs int) bool {
	d := fpMaxWaitDuration
	if numberOfFPs != 0 {
		sweepTime := s.dropAfter / 10
		if sweepTime > fpMaxSweepTime {
			sweepTime = fpMaxSweepTime
		}
		d = sweepTime / time.Duration(numberOfFPs)
		if d < fpMinWaitDuration {
			d = fpMinWaitDuration
		}
		if d > fpMaxWaitDuration {
			d = fpMaxWaitDuration
		}
	}
	t := time.NewTimer(d)
	select {
	case <-t.C:
		return true
	case <-s.loopStopping:
		return false
	}
}

// cycleThroughMemoryFingerprints returns a channel that emits fingerprints for
// series in memory in a throttled fashion. It continues to cycle through all
// fingerprints in memory until s.loopStopping is closed.
func (s *memorySeriesStorage) cycleThroughMemoryFingerprints() chan clientmodel.Fingerprint {
	memoryFingerprints := make(chan clientmodel.Fingerprint)
	go func() {
		var fpIter <-chan clientmodel.Fingerprint

		defer func() {
			if fpIter != nil {
				for range fpIter {
					// Consume the iterator.
				}
			}
			close(memoryFingerprints)
		}()

		for {
			// Initial wait, also important if there are no FPs yet.
			if !s.waitForNextFP(s.fpToSeries.length()) {
				return
			}
			begin := time.Now()
			fpIter = s.fpToSeries.fpIter()
			count := 0
			for fp := range fpIter {
				select {
				case memoryFingerprints <- fp:
				case <-s.loopStopping:
					return
				}
				s.waitForNextFP(s.fpToSeries.length())
				count++
			}
			if count > 0 {
				glog.Infof(
					"Completed maintenance sweep through %d in-memory fingerprints in %v.",
					count, time.Since(begin),
				)
			}
		}
	}()

	return memoryFingerprints
}

// cycleThroughArchivedFingerprints returns a channel that emits fingerprints
// for archived series in a throttled fashion. It continues to cycle through all
// archived fingerprints until s.loopStopping is closed.
func (s *memorySeriesStorage) cycleThroughArchivedFingerprints() chan clientmodel.Fingerprint {
	archivedFingerprints := make(chan clientmodel.Fingerprint)
	go func() {
		defer close(archivedFingerprints)

		for {
			archivedFPs, err := s.persistence.getFingerprintsModifiedBefore(
				clientmodel.TimestampFromTime(time.Now()).Add(-s.dropAfter),
			)
			if err != nil {
				glog.Error("Failed to lookup archived fingerprint ranges: ", err)
				s.waitForNextFP(0)
				continue
			}
			// Initial wait, also important if there are no FPs yet.
			if !s.waitForNextFP(len(archivedFPs)) {
				return
			}
			begin := time.Now()
			for _, fp := range archivedFPs {
				select {
				case archivedFingerprints <- fp:
				case <-s.loopStopping:
					return
				}
				s.waitForNextFP(len(archivedFPs))
			}
			if len(archivedFPs) > 0 {
				glog.Infof(
					"Completed maintenance sweep through %d archived fingerprints in %v.",
					len(archivedFPs), time.Since(begin),
				)
			}
		}
	}()
	return archivedFingerprints
}

func (s *memorySeriesStorage) loop() {
	checkpointTimer := time.NewTimer(s.checkpointInterval)

	// We take the number of head chunks persisted since the last checkpoint
	// as an approximation for the number of series that are "dirty",
	// i.e. whose head chunk is different from the one in the most recent
	// checkpoint or for which the fact that the head chunk has been
	// persisted is not reflected in the most recent checkpoint. This count
	// could overestimate the number of dirty series, but it's good enough
	// as a heuristic.
	headChunksPersistedSinceLastCheckpoint := 0

	defer func() {
		checkpointTimer.Stop()
		glog.Info("Maintenance loop stopped.")
		close(s.loopStopped)
	}()

	memoryFingerprints := s.cycleThroughMemoryFingerprints()
	archivedFingerprints := s.cycleThroughArchivedFingerprints()

loop:
	for {
		select {
		case <-s.loopStopping:
			break loop
		case <-checkpointTimer.C:
			s.persistence.checkpointSeriesMapAndHeads(s.fpToSeries, s.fpLocker)
			headChunksPersistedSinceLastCheckpoint = 0
			checkpointTimer.Reset(s.checkpointInterval)
		case fp := <-memoryFingerprints:
			s.maintainMemorySeries(fp, clientmodel.TimestampFromTime(time.Now()).Add(-s.dropAfter))
		case fp := <-archivedFingerprints:
			s.maintainArchivedSeries(fp, clientmodel.TimestampFromTime(time.Now()).Add(-s.dropAfter))
		case <-s.countPersistedHeadChunks:
			headChunksPersistedSinceLastCheckpoint++
			// Check if we have enough "dirty" series so that we need an early checkpoint.
			// As described above, we take the headChunksPersistedSinceLastCheckpoint as a
			// heuristic for "dirty" series. However, if we are already backlogging
			// chunks to be persisted, creating a checkpoint would be counterproductive,
			// as it would slow down chunk persisting even more, while in a situation like
			// that, the best we can do for crash recovery is to work through the persist
			// queue as quickly as possible. So only checkpoint if s.persistQueue is
			// at most 20% full.
			if headChunksPersistedSinceLastCheckpoint >= s.checkpointDirtySeriesLimit &&
				len(s.persistQueue) < cap(s.persistQueue)/5 {
				checkpointTimer.Reset(0)
			}
		}
	}
	// Wait until both channels are closed.
	for range memoryFingerprints {
	}
	for range archivedFingerprints {
	}
}

// maintainMemorySeries first purges the series from old chunks. If the series
// still exists after that, it proceeds with the following steps: It closes the
// head chunk if it was not touched in a while. It archives a series if all
// chunks are evicted. It evicts chunkDescs if there are too many.
func (s *memorySeriesStorage) maintainMemorySeries(fp clientmodel.Fingerprint, beforeTime clientmodel.Timestamp) {
	var headChunkToPersist *chunkDesc
	s.fpLocker.Lock(fp)
	defer func() {
		s.fpLocker.Unlock(fp)
		// Queue outside of lock!
		if headChunkToPersist != nil {
			s.persistQueue <- persistRequest{fp, headChunkToPersist}
			// Count that a head chunk was persisted, but only best effort, i.e. we
			// don't want to block here.
			select {
			case s.countPersistedHeadChunks <- struct{}{}: // Counted.
			default: // Meh...
			}
		}
	}()

	series, ok := s.fpToSeries.get(fp)
	if !ok {
		// Series is actually not in memory, perhaps archived or dropped in the meantime.
		return
	}

	defer s.seriesOps.WithLabelValues(memoryMaintenance).Inc()

	if s.purgeMemorySeries(fp, series, beforeTime) {
		// Series is gone now, we are done.
		return
	}

	iOldestNotEvicted := -1
	for i, cd := range series.chunkDescs {
		if !cd.isEvicted() {
			iOldestNotEvicted = i
			break
		}
	}

	// Archive if all chunks are evicted.
	if iOldestNotEvicted == -1 {
		s.fpToSeries.del(fp)
		s.numSeries.Dec()
		// Make sure we have a head chunk descriptor (a freshly
		// unarchived series has none).
		if len(series.chunkDescs) == 0 {
			cds, err := s.loadChunkDescs(fp, clientmodel.Latest)
			if err != nil {
				glog.Errorf(
					"Could not load chunk descriptors prior to archiving metric %v, metric will not be archived: %v",
					series.metric, err,
				)
				return
			}
			series.chunkDescs = cds
		}
		if err := s.persistence.archiveMetric(
			fp, series.metric, series.firstTime(), series.head().lastTime(),
		); err != nil {
			glog.Errorf("Error archiving metric %v: %v", series.metric, err)
			return
		}
		s.seriesOps.WithLabelValues(archive).Inc()
		return
	}
	// If we are here, the series is not archived, so check for chunkDesc
	// eviction next and then if the head chunk needs to be persisted.
	series.evictChunkDescs(iOldestNotEvicted)
	if !series.headChunkPersisted && time.Now().Sub(series.head().lastTime().Time()) > headChunkTimeout {
		series.headChunkPersisted = true
		// Since we cannot modify the head chunk from now on, we
		// don't need to bother with cloning anymore.
		series.headChunkUsedByIterator = false
		headChunkToPersist = series.head()
	}
}

// purgeMemorySeries drops chunks older than beforeTime from the provided memory
// series. The caller must have locked fp. If the series contains no chunks
// after dropping old chunks, it is purged entirely. In that case, the method
// returns true.
func (s *memorySeriesStorage) purgeMemorySeries(fp clientmodel.Fingerprint, series *memorySeries, beforeTime clientmodel.Timestamp) bool {
	if !series.firstTime().Before(beforeTime) {
		// Oldest sample not old enough.
		return false
	}
	newFirstTime, numDroppedFromPersistence, allDroppedFromPersistence, err := s.persistence.dropChunks(fp, beforeTime)
	if err != nil {
		glog.Error("Error dropping persisted chunks: ", err)
	}
	numDroppedFromMemory, allDroppedFromMemory := series.dropChunks(beforeTime)
	if allDroppedFromPersistence && allDroppedFromMemory {
		s.fpToSeries.del(fp)
		s.numSeries.Dec()
		s.seriesOps.WithLabelValues(memoryPurge).Inc()
		s.persistence.unindexMetric(fp, series.metric)
		return true
	}
	if series.chunkDescsOffset != -1 {
		series.savedFirstTime = newFirstTime
		series.chunkDescsOffset += numDroppedFromMemory - numDroppedFromPersistence
		if series.chunkDescsOffset < 0 {
			panic("dropped more chunks from persistence than from memory")
		}
	}
	return false
}

// maintainArchivedSeries drops chunks older than beforeTime from an archived
// series. If the series contains no chunks after that, it is purged entirely.
func (s *memorySeriesStorage) maintainArchivedSeries(fp clientmodel.Fingerprint, beforeTime clientmodel.Timestamp) {
	s.fpLocker.Lock(fp)
	defer s.fpLocker.Unlock(fp)

	has, firstTime, lastTime, err := s.persistence.hasArchivedMetric(fp)
	if err != nil {
		glog.Error("Error looking up archived time range: ", err)
		return
	}
	if !has || !firstTime.Before(beforeTime) {
		// Oldest sample not old enough, or metric purged or unarchived in the meantime.
		return
	}

	defer s.seriesOps.WithLabelValues(archiveMaintenance).Inc()

	newFirstTime, _, allDropped, err := s.persistence.dropChunks(fp, beforeTime)
	if err != nil {
		glog.Error("Error dropping persisted chunks: ", err)
	}
	if allDropped {
		if err := s.persistence.purgeArchivedMetric(fp); err != nil {
			glog.Errorf("Error purging archived metric for fingerprint %v: %v", fp, err)
			return
		}
		s.seriesOps.WithLabelValues(archivePurge).Inc()
		return
	}
	s.persistence.updateArchivedTimeRange(fp, newFirstTime, lastTime)
}

// See persistence.loadChunks for detailed explanation.
func (s *memorySeriesStorage) loadChunks(fp clientmodel.Fingerprint, indexes []int, indexOffset int) ([]chunk, error) {
	return s.persistence.loadChunks(fp, indexes, indexOffset)
}

// See persistence.loadChunkDescs for detailed explanation.
func (s *memorySeriesStorage) loadChunkDescs(fp clientmodel.Fingerprint, beforeTime clientmodel.Timestamp) ([]*chunkDesc, error) {
	return s.persistence.loadChunkDescs(fp, beforeTime)
}

// Describe implements prometheus.Collector.
func (s *memorySeriesStorage) Describe(ch chan<- *prometheus.Desc) {
	s.persistence.Describe(ch)

	ch <- s.persistLatency.Desc()
	ch <- s.persistErrors.Desc()
	ch <- s.persistQueueCapacity.Desc()
	ch <- s.persistQueueLength.Desc()
	ch <- s.numSeries.Desc()
	s.seriesOps.Describe(ch)
	ch <- s.ingestedSamplesCount.Desc()
	ch <- s.invalidPreloadRequestsCount.Desc()

	ch <- numMemChunksDesc
}

// Collect implements prometheus.Collector.
func (s *memorySeriesStorage) Collect(ch chan<- prometheus.Metric) {
	s.persistence.Collect(ch)

	ch <- s.persistLatency
	ch <- s.persistErrors
	ch <- s.persistQueueCapacity
	ch <- s.persistQueueLength
	ch <- s.numSeries
	s.seriesOps.Collect(ch)
	ch <- s.ingestedSamplesCount
	ch <- s.invalidPreloadRequestsCount

	ch <- prometheus.MustNewConstMetric(
		numMemChunksDesc,
		prometheus.GaugeValue,
		float64(atomic.LoadInt64(&numMemChunks)))
}

// chunkMaps is a slice of maps with chunkDescs to be persisted.
// Each chunk map contains n consecutive chunks to persist, where
// n is the index+1.
type chunkMaps []map[clientmodel.Fingerprint][]*chunkDesc

// add adds a chunk to chunkMaps.
func (cm *chunkMaps) add(fp clientmodel.Fingerprint, cd *chunkDesc) {
	// Runtime of this method is linear with the number of
	// chunkMaps. However, we expect only ever very few maps.
	numMaps := len(*cm)
	for i, m := range *cm {
		if cds, ok := m[fp]; ok {
			// Found our fp! Add cd and level up.
			cds = append(cds, cd)
			delete(m, fp)
			if i == numMaps-1 {
				*cm = append(*cm, map[clientmodel.Fingerprint][]*chunkDesc{})
			}
			(*cm)[i+1][fp] = cds
			return
		}
	}
	// Our fp isn't contained in cm yet. Add it to the first map (and add a
	// first map if there is none).
	if numMaps == 0 {
		*cm = chunkMaps{map[clientmodel.Fingerprint][]*chunkDesc{}}
	}
	(*cm)[0][fp] = []*chunkDesc{cd}
}

// pop retrieves and removes a fingerprint with all its chunks. It chooses one
// of the fingerprints with the most chunks. It panics if cm has no entries.
func (cm *chunkMaps) pop() (clientmodel.Fingerprint, []*chunkDesc) {
	m := (*cm)[len(*cm)-1]
	for fp, cds := range m {
		delete(m, fp)
		// Prune empty maps from top level.
		for len(m) == 0 {
			*cm = (*cm)[:len(*cm)-1]
			if len(*cm) == 0 {
				break
			}
			m = (*cm)[len(*cm)-1]
		}
		return fp, cds
	}
	panic("popped from empty chunkMaps")
}