vendor: update tsdb (#2840)

2017-06-13 07:44:07 +02:00 · 2017-06-13 07:44:07 +02:00 · 7640960469
parent baf5b0f0fc
commit 7640960469
11 changed files with 1116 additions and 202 deletions
--- a/vendor/github.com/prometheus/tsdb/block.go
+++ b/vendor/github.com/prometheus/tsdb/block.go
@ -1,4 +1,5 @@
 // Copyright 2017 The Prometheus Authors
+
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
@ -21,6 +22,7 @@ import (

 	"github.com/oklog/ulid"
 	"github.com/pkg/errors"
+	"github.com/prometheus/tsdb/labels"
 )

 // DiskBlock handles reads against a Block of time series data.
@ -37,6 +39,12 @@ type DiskBlock interface {
 	// Chunks returns a ChunkReader over the block's data.
 	Chunks() ChunkReader

+	// Tombstones returns a TombstoneReader over the block's deleted data.
+	Tombstones() TombstoneReader
+
+	// Delete deletes data from the block.
+	Delete(mint, maxt int64, ms ...labels.Matcher) error
+
 	// Close releases all underlying resources of the block.
 	Close() error
 }
@ -45,6 +53,7 @@ type DiskBlock interface {
 type Block interface {
 	DiskBlock
 	Queryable
+	Snapshottable
 }

 // headBlock is a regular block that can still be appended to.
@ -53,6 +62,11 @@ type headBlock interface {
 	Appendable
 }

+// Snapshottable defines an entity that can be backed up online.
+type Snapshottable interface {
+	// Snapshot writes a copy of the entity's data below dir.
+	Snapshot(dir string) error
+}
+
 // Appendable defines an entity to which data can be appended.
 type Appendable interface {
 	// Appender returns a new Appender against an underlying store.
@ -78,16 +92,27 @@ type BlockMeta struct {
 	MaxTime int64 `json:"maxTime"`

 	// Stats about the contents of the block.
-	Stats struct {
-		NumSamples uint64 `json:"numSamples,omitempty"`
-		NumSeries  uint64 `json:"numSeries,omitempty"`
-		NumChunks  uint64 `json:"numChunks,omitempty"`
-	} `json:"stats,omitempty"`
+	Stats BlockStats `json:"stats,omitempty"`

 	// Information on compactions the block was created from.
-	Compaction struct {
-		Generation int `json:"generation"`
-	} `json:"compaction"`
+	Compaction BlockMetaCompaction `json:"compaction"`
+}
+
+// BlockStats contains stats about contents of a block.
+// All fields are left out of the JSON encoding when they are zero.
+// NumTombstones is set by the blocks' Delete methods to the number of
+// entries held by the block's tombstone reader.
+type BlockStats struct {
+	NumSamples    uint64 `json:"numSamples,omitempty"`
+	NumSeries     uint64 `json:"numSeries,omitempty"`
+	NumChunks     uint64 `json:"numChunks,omitempty"`
+	NumTombstones uint64 `json:"numTombstones,omitempty"`
+}
+
+// BlockMetaCompaction holds information about compactions a block went through.
+type BlockMetaCompaction struct {
+	// Maximum number of compaction cycles any source block has
+	// gone through. compactBlockMetas sets it to one more than the
+	// highest generation among the blocks being compacted.
+	Generation int `json:"generation"`
+	// ULIDs of all source head blocks that went into the block.
+	// compactBlockMetas collects them without duplicates and in
+	// ascending ULID order; a generation-0 input contributes its own ULID.
+	Sources []ulid.ULID `json:"sources,omitempty"`
 }

 const (
@ -136,7 +161,7 @@ func writeMetaFile(dir string, meta *BlockMeta) error {
 	var merr MultiError
 	if merr.Add(enc.Encode(&blockMeta{Version: 1, BlockMeta: meta})); merr.Err() != nil {
 		merr.Add(f.Close())
-		return merr
+		return merr.Err()
 	}
 	if err := f.Close(); err != nil {
 		return err
@ -150,6 +175,8 @@ type persistedBlock struct {

 	chunkr *chunkReader
 	indexr *indexReader
+
+	tombstones tombstoneReader
 }

 func newPersistedBlock(dir string) (*persistedBlock, error) {
@ -167,11 +194,17 @@ func newPersistedBlock(dir string) (*persistedBlock, error) {
 		return nil, err
 	}

+	tr, err := readTombstones(dir)
+	if err != nil {
+		return nil, err
+	}
+
 	pb := &persistedBlock{
-		dir:    dir,
-		meta:   *meta,
-		chunkr: cr,
-		indexr: ir,
+		dir:        dir,
+		meta:       *meta,
+		chunkr:     cr,
+		indexr:     ir,
+		tombstones: tr,
 	}
 	return pb, nil
 }
@ -191,21 +224,124 @@ func (pb *persistedBlock) String() string {

 func (pb *persistedBlock) Querier(mint, maxt int64) Querier {
 	return &blockQuerier{
-		mint:   mint,
-		maxt:   maxt,
-		index:  pb.Index(),
-		chunks: pb.Chunks(),
+		mint:       mint,
+		maxt:       maxt,
+		index:      pb.Index(),
+		chunks:     pb.Chunks(),
+		tombstones: pb.Tombstones(),
 	}
 }

 func (pb *persistedBlock) Dir() string         { return pb.dir }
 func (pb *persistedBlock) Index() IndexReader  { return pb.indexr }
 func (pb *persistedBlock) Chunks() ChunkReader { return pb.chunkr }
-func (pb *persistedBlock) Meta() BlockMeta     { return pb.meta }
+func (pb *persistedBlock) Tombstones() TombstoneReader {
+	return pb.tombstones
+}
+func (pb *persistedBlock) Meta() BlockMeta { return pb.meta }
+
+// Delete records a tombstone for every series selected by the matchers ms
+// that has at least one chunk overlapping [mint, maxt]. The tombstones are
+// merged into the block's existing ones and written to the block directory
+// together with the updated meta file. This method only writes the tombstone
+// and meta files; it does not touch chunk data.
+func (pb *persistedBlock) Delete(mint, maxt int64, ms ...labels.Matcher) error {
+	pr := newPostingsReader(pb.indexr)
+	p, absent := pr.Select(ms...)
+
+	ir := pb.indexr
+
+	// Choose only valid postings which have chunks in the time-range.
+	stones := map[uint32]intervals{}
+
+Outer:
+	for p.Next() {
+		lset, chunks, err := ir.Series(p.At())
+		if err != nil {
+			return err
+		}
+
+		// Skip series that carry a non-empty value for any label listed
+		// in absent.
+		for _, abs := range absent {
+			if lset.Get(abs) != "" {
+				continue Outer
+			}
+		}
+
+		// The first overlapping chunk is enough: one interval is recorded
+		// per series and the remaining chunks are not inspected.
+		for _, chk := range chunks {
+			if intervalOverlap(mint, maxt, chk.MinTime, chk.MaxTime) {
+				// Delete only until the current values and not beyond:
+				// clamp to the span from the first chunk's MinTime to the
+				// last chunk's MaxTime.
+				tmin, tmax := clampInterval(mint, maxt, chunks[0].MinTime, chunks[len(chunks)-1].MaxTime)
+				stones[p.At()] = intervals{{tmin, tmax}}
+				continue Outer
+			}
+		}
+	}
+
+	if p.Err() != nil {
+		return p.Err()
+	}
+
+	// Merge the current and new tombstones.
+	for k, v := range stones {
+		pb.tombstones.add(k, v[0])
+	}
+
+	// Persist the tombstones before the meta file that counts them.
+	if err := writeTombstoneFile(pb.dir, pb.tombstones); err != nil {
+		return err
+	}
+
+	pb.meta.Stats.NumTombstones = uint64(len(pb.tombstones))
+	return writeMetaFile(pb.dir, &pb.meta)
+}
+
+// Snapshot creates a hardlinked copy of the block inside dir, in a
+// sub-directory named after the block's ULID. The meta, index and tombstone
+// files are linked, followed by every file of the block's chunk directory.
+func (pb *persistedBlock) Snapshot(dir string) error {
+	dstDir := filepath.Join(dir, pb.meta.ULID.String())
+	if err := os.MkdirAll(dstDir, 0777); err != nil {
+		return errors.Wrap(err, "create snapshot block dir")
+	}
+
+	dstChunks := chunkDir(dstDir)
+	if err := os.MkdirAll(dstChunks, 0777); err != nil {
+		return errors.Wrap(err, "create snapshot chunk dir")
+	}
+
+	// Hardlink meta, index and tombstones.
+	topLevel := []string{metaFilename, indexFilename, tombstoneFilename}
+	for _, name := range topLevel {
+		src, dst := filepath.Join(pb.dir, name), filepath.Join(dstDir, name)
+		if err := os.Link(src, dst); err != nil {
+			return errors.Wrapf(err, "create snapshot %s", name)
+		}
+	}
+
+	// Hardlink everything found in the block's chunk directory.
+	srcChunks := chunkDir(pb.dir)
+	entries, err := ioutil.ReadDir(srcChunks)
+	if err != nil {
+		return errors.Wrap(err, "ReadDir the current chunk dir")
+	}
+	for _, entry := range entries {
+		src, dst := filepath.Join(srcChunks, entry.Name()), filepath.Join(dstChunks, entry.Name())
+		if err := os.Link(src, dst); err != nil {
+			return errors.Wrap(err, "hardlink a chunk")
+		}
+	}
+
+	return nil
+}

 func chunkDir(dir string) string { return filepath.Join(dir, "chunks") }
 func walDir(dir string) string   { return filepath.Join(dir, "wal") }

+// clampInterval limits the interval [a, b] to the bounds [mint, maxt]: the
+// lower end is raised to mint when it lies below it and the upper end is
+// lowered to maxt when it lies above it. The two ends are adjusted
+// independently of each other.
+func clampInterval(a, b, mint, maxt int64) (int64, int64) {
+	lo := a
+	if mint > lo {
+		lo = mint
+	}
+
+	hi := b
+	if maxt < hi {
+		hi = maxt
+	}
+
+	return lo, hi
+}
+
 type mmapFile struct {
 	f *os.File
 	b []byte
--- a/vendor/github.com/prometheus/tsdb/chunks.go
+++ b/vendor/github.com/prometheus/tsdb/chunks.go
@ -54,6 +54,46 @@ func (cm *ChunkMeta) writeHash(h hash.Hash) error {
 	return nil
 }

+// deletedIterator wraps an Iterator and makes sure any deleted samples, i.e.
+// samples whose timestamp falls into one of the given intervals, are not
+// returned.
+type deletedIterator struct {
+	// it is the wrapped iterator that supplies the samples.
+	it chunks.Iterator
+
+	// intervals holds the deleted time ranges still to be checked. Next
+	// shrinks it from the front as the iteration moves past them.
+	// NOTE(review): presumably sorted by time and non-overlapping, with the
+	// wrapped iterator yielding ascending timestamps -- confirm.
+	intervals intervals
+}
+
+// At returns the sample the wrapped iterator is currently positioned at.
+func (it *deletedIterator) At() (int64, float64) {
+	return it.it.At()
+}
+
+// Next advances the wrapped iterator to the next sample that is not covered
+// by any remaining interval. It returns false once the wrapped iterator is
+// exhausted.
+func (it *deletedIterator) Next() bool {
+Outer:
+	for it.it.Next() {
+		ts, _ := it.it.At()
+
+		for _, tr := range it.intervals {
+			// The sample is deleted: move on to the next sample.
+			if tr.inBounds(ts) {
+				continue Outer
+			}
+
+			// The sample lies past this interval: drop the front interval so
+			// later calls no longer test it. The range loop keeps walking the
+			// slice value it started with, so the next interval is checked.
+			if ts > tr.maxt {
+				it.intervals = it.intervals[1:]
+				continue
+			}
+
+			// The sample is neither inside this interval nor past it.
+			return true
+		}
+
+		// No interval is left to check against.
+		return true
+	}
+
+	return false
+}
+
+// Err returns the error of the wrapped iterator.
+func (it *deletedIterator) Err() error {
+	return it.it.Err()
+}
+
 // ChunkWriter serializes a time block of chunked series data.
 type ChunkWriter interface {
 	// WriteChunks writes several chunks. The Chunk field of the ChunkMetas
--- a/vendor/github.com/prometheus/tsdb/compact.go
+++ b/vendor/github.com/prometheus/tsdb/compact.go
@ -26,6 +26,7 @@ import (
 	"github.com/oklog/ulid"
 	"github.com/pkg/errors"
 	"github.com/prometheus/client_golang/prometheus"
+	"github.com/prometheus/tsdb/chunks"
 	"github.com/prometheus/tsdb/labels"
 )

@ -70,7 +71,7 @@ func newCompactorMetrics(r prometheus.Registerer) *compactorMetrics {
 		Name: "tsdb_compactions_failed_total",
 		Help: "Total number of compactions that failed for the partition.",
 	})
-	m.duration = prometheus.NewHistogram(prometheus.HistogramOpts{
+	m.duration = prometheus.NewSummary(prometheus.SummaryOpts{
 		Name: "tsdb_compaction_duration",
 		Help: "Duration of compaction runs.",
 	})
@ -165,17 +166,35 @@ func (c *compactor) match(dirs []dirMeta) bool {
 	return uint64(dirs[len(dirs)-1].meta.MaxTime-dirs[0].meta.MinTime) <= c.opts.maxBlockRange
 }

-func mergeBlockMetas(blocks ...Block) (res BlockMeta) {
-	m0 := blocks[0].Meta()
+func compactBlockMetas(blocks ...BlockMeta) (res BlockMeta) {
+	res.MinTime = blocks[0].MinTime
+	res.MaxTime = blocks[len(blocks)-1].MaxTime

-	res.MinTime = m0.MinTime
-	res.MaxTime = blocks[len(blocks)-1].Meta().MaxTime
-
-	res.Compaction.Generation = m0.Compaction.Generation + 1
+	sources := map[ulid.ULID]struct{}{}

 	for _, b := range blocks {
-		res.Stats.NumSamples += b.Meta().Stats.NumSamples
+		res.Stats.NumSamples += b.Stats.NumSamples
+
+		if b.Compaction.Generation > res.Compaction.Generation {
+			res.Compaction.Generation = b.Compaction.Generation
+		}
+		for _, s := range b.Compaction.Sources {
+			sources[s] = struct{}{}
+		}
+		// If it's an in memory block, its ULID goes into the sources.
+		if b.Compaction.Generation == 0 {
+			sources[b.ULID] = struct{}{}
+		}
 	}
+	res.Compaction.Generation++
+
+	for s := range sources {
+		res.Compaction.Sources = append(res.Compaction.Sources, s)
+	}
+	sort.Slice(res.Compaction.Sources, func(i, j int) bool {
+		return res.Compaction.Sources[i].Compare(res.Compaction.Sources[j]) < 0
+	})
+
 	return res
 }

@ -219,6 +238,7 @@ func (c *compactor) write(uid ulid.ULID, blocks ...Block) (err error) {
 		if err != nil {
 			c.metrics.failed.Inc()
 		}
+		c.metrics.ran.Inc()
 		c.metrics.duration.Observe(time.Since(t).Seconds())
 	}(time.Now())

@ -244,7 +264,7 @@ func (c *compactor) write(uid ulid.ULID, blocks ...Block) (err error) {
 		return errors.Wrap(err, "open index writer")
 	}

-	meta, err := c.populate(blocks, indexw, chunkw)
+	meta, err := populateBlock(blocks, indexw, chunkw)
 	if err != nil {
 		return errors.Wrap(err, "write compaction")
 	}
@ -261,6 +281,11 @@ func (c *compactor) write(uid ulid.ULID, blocks ...Block) (err error) {
 		return errors.Wrap(err, "close index writer")
 	}

+	// Create an empty tombstones file.
+	if err := writeTombstoneFile(tmp, newEmptyTombstoneReader()); err != nil {
+		return errors.Wrap(err, "write new tombstones file")
+	}
+
 	// Block successfully written, make visible and remove old ones.
 	if err := renameFile(tmp, dir); err != nil {
 		return errors.Wrap(err, "rename block dir")
@ -275,6 +300,8 @@ func (c *compactor) write(uid ulid.ULID, blocks ...Block) (err error) {
 	if err != nil {
 		return errors.Wrap(err, "sync block dir")
 	}
+	defer df.Close()
+
 	if err := fileutil.Fsync(df); err != nil {
 		return errors.Wrap(err, "sync block dir")
 	}
@ -282,17 +309,20 @@ func (c *compactor) write(uid ulid.ULID, blocks ...Block) (err error) {
 	return nil
 }

-// populate fills the index and chunk writers with new data gathered as the union
+// populateBlock fills the index and chunk writers with new data gathered as the union
 // of the provided blocks. It returns meta information for the new block.
-func (c *compactor) populate(blocks []Block, indexw IndexWriter, chunkw ChunkWriter) (*BlockMeta, error) {
+func populateBlock(blocks []Block, indexw IndexWriter, chunkw ChunkWriter) (*BlockMeta, error) {
 	var set compactionSet
+	var metas []BlockMeta

 	for i, b := range blocks {
+		metas = append(metas, b.Meta())
+
 		all, err := b.Index().Postings("", "")
 		if err != nil {
 			return nil, err
 		}
-		s := newCompactionSeriesSet(b.Index(), b.Chunks(), all)
+		s := newCompactionSeriesSet(b.Index(), b.Chunks(), b.Tombstones(), all)

 		if i == 0 {
 			set = s
@ -309,18 +339,40 @@ func (c *compactor) populate(blocks []Block, indexw IndexWriter, chunkw ChunkWri
 		postings = &memPostings{m: make(map[term][]uint32, 512)}
 		values   = map[string]stringset{}
 		i        = uint32(0)
-		meta     = mergeBlockMetas(blocks...)
+		meta     = compactBlockMetas(metas...)
 	)

 	for set.Next() {
-		lset, chunks := set.At()
-		if err := chunkw.WriteChunks(chunks...); err != nil {
+		lset, chks, dranges := set.At() // The chunks here are not fully deleted.
+
+		if len(dranges) > 0 {
+			// Re-encode the chunk to not have deleted values.
+			for _, chk := range chks {
+				if intervalOverlap(dranges[0].mint, dranges[len(dranges)-1].maxt, chk.MinTime, chk.MaxTime) {
+					newChunk := chunks.NewXORChunk()
+					app, err := newChunk.Appender()
+					if err != nil {
+						return nil, err
+					}
+
+					it := &deletedIterator{it: chk.Chunk.Iterator(), intervals: dranges}
+					for it.Next() {
+						ts, v := it.At()
+						app.Append(ts, v)
+					}
+
+					chk.Chunk = newChunk
+				}
+			}
+		}
+
+		if err := chunkw.WriteChunks(chks...); err != nil {
 			return nil, err
 		}

-		indexw.AddSeries(i, lset, chunks...)
+		indexw.AddSeries(i, lset, chks...)

-		meta.Stats.NumChunks += uint64(len(chunks))
+		meta.Stats.NumChunks += uint64(len(chks))
 		meta.Stats.NumSeries++

 		for _, l := range lset {
@ -370,25 +422,28 @@ func (c *compactor) populate(blocks []Block, indexw IndexWriter, chunkw ChunkWri

 type compactionSet interface {
 	Next() bool
-	At() (labels.Labels, []*ChunkMeta)
+	At() (labels.Labels, []*ChunkMeta, intervals)
 	Err() error
 }

 type compactionSeriesSet struct {
-	p      Postings
-	index  IndexReader
-	chunks ChunkReader
+	p          Postings
+	index      IndexReader
+	chunks     ChunkReader
+	tombstones TombstoneReader

-	l   labels.Labels
-	c   []*ChunkMeta
-	err error
+	l         labels.Labels
+	c         []*ChunkMeta
+	intervals intervals
+	err       error
 }

-func newCompactionSeriesSet(i IndexReader, c ChunkReader, p Postings) *compactionSeriesSet {
+func newCompactionSeriesSet(i IndexReader, c ChunkReader, t TombstoneReader, p Postings) *compactionSeriesSet {
 	return &compactionSeriesSet{
-		index:  i,
-		chunks: c,
-		p:      p,
+		index:      i,
+		chunks:     c,
+		tombstones: t,
+		p:          p,
 	}
 }

@ -397,10 +452,25 @@ func (c *compactionSeriesSet) Next() bool {
 		return false
 	}

+	c.intervals = c.tombstones.Get(c.p.At())
+
 	c.l, c.c, c.err = c.index.Series(c.p.At())
 	if c.err != nil {
 		return false
 	}
+
+	// Remove completely deleted chunks.
+	if len(c.intervals) > 0 {
+		chks := make([]*ChunkMeta, 0, len(c.c))
+		for _, chk := range c.c {
+			if !(interval{chk.MinTime, chk.MaxTime}.isSubrange(c.intervals)) {
+				chks = append(chks, chk)
+			}
+		}
+
+		c.c = chks
+	}
+
 	for _, chk := range c.c {
 		chk.Chunk, c.err = c.chunks.Chunk(chk.Ref)
 		if c.err != nil {
@ -418,16 +488,17 @@ func (c *compactionSeriesSet) Err() error {
 	return c.p.Err()
 }

-func (c *compactionSeriesSet) At() (labels.Labels, []*ChunkMeta) {
-	return c.l, c.c
+func (c *compactionSeriesSet) At() (labels.Labels, []*ChunkMeta, intervals) {
+	return c.l, c.c, c.intervals
 }

 type compactionMerger struct {
 	a, b compactionSet

-	aok, bok bool
-	l        labels.Labels
-	c        []*ChunkMeta
+	aok, bok  bool
+	l         labels.Labels
+	c         []*ChunkMeta
+	intervals intervals
 }

 type compactionSeries struct {
@ -455,8 +526,8 @@ func (c *compactionMerger) compare() int {
 	if !c.bok {
 		return -1
 	}
-	a, _ := c.a.At()
-	b, _ := c.b.At()
+	a, _, _ := c.a.At()
+	b, _, _ := c.b.At()
 	return labels.Compare(a, b)
 }

@ -468,17 +539,21 @@ func (c *compactionMerger) Next() bool {
 	d := c.compare()
 	// Both sets contain the current series. Chain them into a single one.
 	if d > 0 {
-		c.l, c.c = c.b.At()
+		c.l, c.c, c.intervals = c.b.At()
 		c.bok = c.b.Next()
 	} else if d < 0 {
-		c.l, c.c = c.a.At()
+		c.l, c.c, c.intervals = c.a.At()
 		c.aok = c.a.Next()
 	} else {
-		l, ca := c.a.At()
-		_, cb := c.b.At()
+		l, ca, ra := c.a.At()
+		_, cb, rb := c.b.At()
+		for _, r := range rb {
+			ra = ra.add(r)
+		}

 		c.l = l
 		c.c = append(ca, cb...)
+		c.intervals = ra

 		c.aok = c.a.Next()
 		c.bok = c.b.Next()
@ -493,8 +568,8 @@ func (c *compactionMerger) Err() error {
 	return c.b.Err()
 }

-func (c *compactionMerger) At() (labels.Labels, []*ChunkMeta) {
-	return c.l, c.c
+func (c *compactionMerger) At() (labels.Labels, []*ChunkMeta, intervals) {
+	return c.l, c.c, c.intervals
 }

 func renameFile(from, to string) error {
@ -510,6 +585,8 @@ func renameFile(from, to string) error {
 	if err != nil {
 		return err
 	}
+	defer pdir.Close()
+
 	if err = fileutil.Fsync(pdir); err != nil {
 		return err
 	}
--- a/vendor/github.com/prometheus/tsdb/db.go
+++ b/vendor/github.com/prometheus/tsdb/db.go
@ -119,16 +119,49 @@ type DB struct {
 	compactc chan struct{}
 	donec    chan struct{}
 	stopc    chan struct{}
+
+	// cmtx is used to control compactions and deletions.
+	cmtx       sync.Mutex
+	compacting bool
 }

 type dbMetrics struct {
+	activeAppenders      prometheus.Gauge
+	loadedBlocks         prometheus.GaugeFunc
+	reloads              prometheus.Counter
+	reloadsFailed        prometheus.Counter
+	reloadDuration       prometheus.Summary
 	samplesAppended      prometheus.Counter
 	compactionsTriggered prometheus.Counter
 }

-func newDBMetrics(r prometheus.Registerer) *dbMetrics {
+func newDBMetrics(db *DB, r prometheus.Registerer) *dbMetrics {
 	m := &dbMetrics{}

+	m.activeAppenders = prometheus.NewGauge(prometheus.GaugeOpts{
+		Name: "tsdb_active_appenders",
+		Help: "Number of currently active appender transactions",
+	})
+	m.loadedBlocks = prometheus.NewGaugeFunc(prometheus.GaugeOpts{
+		Name: "tsdb_blocks_loaded",
+		Help: "Number of currently loaded data blocks",
+	}, func() float64 {
+		db.mtx.RLock()
+		defer db.mtx.RUnlock()
+		return float64(len(db.blocks))
+	})
+	m.reloads = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "tsdb_reloads_total",
+		Help: "Number of times the database reloaded block data from disk.",
+	})
+	m.reloadsFailed = prometheus.NewCounter(prometheus.CounterOpts{
+		Name: "tsdb_reloads_failures_total",
+		Help: "Number of times the database failed to reload black data from disk.",
+	})
+	m.reloadDuration = prometheus.NewSummary(prometheus.SummaryOpts{
+		Name: "tsdb_reload_duration_seconds",
+		Help: "Duration of block reloads.",
+	})
 	m.samplesAppended = prometheus.NewCounter(prometheus.CounterOpts{
 		Name: "tsdb_samples_appended_total",
 		Help: "Total number of appended sampledb.",
@ -140,6 +173,11 @@ func newDBMetrics(r prometheus.Registerer) *dbMetrics {

 	if r != nil {
 		r.MustRegister(
+			m.activeAppenders,
+			m.loadedBlocks,
+			m.reloads,
+			m.reloadsFailed,
+			m.reloadDuration,
 			m.samplesAppended,
 			m.compactionsTriggered,
 		)
@ -163,14 +201,16 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db
 	}

 	db = &DB{
-		dir:      dir,
-		logger:   l,
-		metrics:  newDBMetrics(r),
-		opts:     opts,
-		compactc: make(chan struct{}, 1),
-		donec:    make(chan struct{}),
-		stopc:    make(chan struct{}),
+		dir:        dir,
+		logger:     l,
+		opts:       opts,
+		compactc:   make(chan struct{}, 1),
+		donec:      make(chan struct{}),
+		stopc:      make(chan struct{}),
+		compacting: true,
 	}
+	db.metrics = newDBMetrics(db, r)
+
 	if !opts.NoLockfile {
 		absdir, err := filepath.Abs(dir)
 		if err != nil {
@ -198,6 +238,11 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db
 	return db, nil
 }

+// Dir returns the directory of the database, i.e. the dir value the DB
+// was created with in Open.
+func (db *DB) Dir() string {
+	return db.dir
+}
+
 func (db *DB) run() {
 	defer close(db.donec)

@ -261,6 +306,9 @@ func (db *DB) retentionCutoff() (bool, error) {
 }

 func (db *DB) compact() (changes bool, err error) {
+	db.cmtx.Lock()
+	defer db.cmtx.Unlock()
+
 	db.headmtx.RLock()

 	// Check whether we have pending head blocks that are ready to be persisted.
@ -338,6 +386,8 @@ func retentionCutoff(dir string, mint int64) (bool, error) {
 	if err != nil {
 		return false, errors.Wrapf(err, "open directory")
 	}
+	defer df.Close()
+
 	dirs, err := blockDirs(dir)
 	if err != nil {
 		return false, errors.Wrapf(err, "list block dirs %s", dir)
@ -374,7 +424,15 @@ func (db *DB) getBlock(id ulid.ULID) (Block, bool) {
 	return nil, false
 }

-func (db *DB) reloadBlocks() error {
+func (db *DB) reloadBlocks() (err error) {
+	defer func(t time.Time) {
+		if err != nil {
+			db.metrics.reloadsFailed.Inc()
+		}
+		db.metrics.reloads.Inc()
+		db.metrics.reloadDuration.Observe(time.Since(t).Seconds())
+	}(time.Now())
+
 	var cs []io.Closer
 	defer func() { closeAll(cs...) }()

@ -418,6 +476,7 @@ func (db *DB) reloadBlocks() error {
 	if err := validateBlockSequence(blocks); err != nil {
 		return errors.Wrap(err, "invalid block sequence")
 	}
+
 	// Close all opened blocks that no longer exist after we returned all locks.
 	for _, b := range db.blocks {
 		if _, ok := exist[b.Meta().ULID]; !ok {
@ -447,7 +506,7 @@ func validateBlockSequence(bs []Block) error {
 	prev := bs[0]
 	for _, b := range bs[1:] {
 		if b.Meta().MinTime < prev.Meta().MaxTime {
-			return errors.Errorf("block time ranges overlap", b.Meta().MinTime, prev.Meta().MaxTime)
+			return errors.Errorf("block time ranges overlap (%d, %d)", b.Meta().MinTime, prev.Meta().MaxTime)
 		}
 	}
 	return nil
@ -478,8 +537,47 @@ func (db *DB) Close() error {
 	return merr.Err()
 }

+// DisableCompactions disables compactions.
+//
+// It does so by locking cmtx and returning with the lock still held, so
+// everything else that locks cmtx (compact, Delete, Snapshot) blocks until
+// EnableCompactions is called. It does nothing when compactions are already
+// disabled.
+// NOTE(review): db.compacting is checked before any lock is taken, so
+// concurrent calls to DisableCompactions/EnableCompactions look racy --
+// confirm that callers serialize them.
+func (db *DB) DisableCompactions() {
+	if db.compacting {
+		db.cmtx.Lock()
+		db.compacting = false
+		db.logger.Log("msg", "compactions disabled")
+	}
+}
+
+// EnableCompactions enables compactions.
+//
+// It unlocks cmtx, which DisableCompactions leaves locked, and does nothing
+// when compactions are already enabled.
+// NOTE(review): db.compacting is checked without holding a lock, so
+// concurrent calls to DisableCompactions/EnableCompactions look racy --
+// confirm that callers serialize them.
+func (db *DB) EnableCompactions() {
+	if !db.compacting {
+		db.cmtx.Unlock()
+		db.compacting = true
+		db.logger.Log("msg", "compactions enabled")
+	}
+}
+
+// Snapshot writes the current data to the directory.
+//
+// Every block currently held by the DB is snapshotted into dir through its
+// own Snapshot method. The first failure aborts the run and is returned.
+func (db *DB) Snapshot(dir string) error {
+	// Take cmtx before mtx, the same order Delete uses. Acquiring the two
+	// locks the other way round can deadlock against a concurrent Delete,
+	// with each goroutine holding the lock the other one is waiting for.
+	db.cmtx.Lock()
+	defer db.cmtx.Unlock()
+
+	db.mtx.Lock() // To block any appenders.
+	defer db.mtx.Unlock()
+
+	for _, b := range db.blocks {
+		db.logger.Log("msg", "snapshotting block", "block", b)
+		if err := b.Snapshot(dir); err != nil {
+			return errors.Wrap(err, "error snapshotting headblock")
+		}
+	}
+
+	return nil
+}
+
 // Appender returns a new Appender on the database.
 func (db *DB) Appender() Appender {
+	db.metrics.activeAppenders.Inc()
+
 	db.mtx.RLock()
 	return &dbAppender{db: db}
 }
@ -619,6 +717,7 @@ func (db *DB) ensureHead(t int64) error {
 }

 func (a *dbAppender) Commit() error {
+	defer a.db.metrics.activeAppenders.Dec()
 	defer a.db.mtx.RUnlock()

 	// Commits to partial appenders must be concurrent as concurrent appenders
@ -649,6 +748,7 @@ func (a *dbAppender) Commit() error {
 }

 func (a *dbAppender) Rollback() error {
+	defer a.db.metrics.activeAppenders.Dec()
 	defer a.db.mtx.RUnlock()

 	var g errgroup.Group
@ -660,6 +760,30 @@ func (a *dbAppender) Rollback() error {
 	return g.Wait()
 }

+// Delete implements deletion of metrics.
+//
+// Every block returned by blocksForInterval(mint, maxt) is asked to delete
+// the data of the series selected by ms. The per-block deletions run
+// concurrently and the first error reported by any of them is returned.
+func (db *DB) Delete(mint, maxt int64, ms ...labels.Matcher) error {
+	// Hold cmtx and then mtx for the whole operation.
+	db.cmtx.Lock()
+	defer db.cmtx.Unlock()
+	db.mtx.Lock()
+	defer db.mtx.Unlock()
+
+	var g errgroup.Group
+
+	for _, blk := range db.blocksForInterval(mint, maxt) {
+		blk := blk // Give each goroutine its own copy of the loop variable.
+		g.Go(func() error {
+			return blk.Delete(mint, maxt, ms...)
+		})
+	}
+
+	return g.Wait()
+}
+
 // appendable returns a copy of a slice of HeadBlocks that can still be appended to.
 func (db *DB) appendable() (r []headBlock) {
 	switch len(db.heads) {
@ -673,13 +797,8 @@ func (db *DB) appendable() (r []headBlock) {
 }

 func intervalOverlap(amin, amax, bmin, bmax int64) bool {
-	if bmin >= amin && bmin <= amax {
-		return true
-	}
-	if amin >= bmin && amin <= bmax {
-		return true
-	}
-	return false
+	// Checks Overlap: http://stackoverflow.com/questions/3269434/
+	return amin <= bmax && bmin <= amax
 }

 func intervalContains(min, max, t int64) bool {
--- a/vendor/github.com/prometheus/tsdb/encoding_helpers.go
+++ b/vendor/github.com/prometheus/tsdb/encoding_helpers.go
@ -22,6 +22,7 @@ func (e *encbuf) putByte(c byte)     { e.b = append(e.b, c) }

 func (e *encbuf) putBE32int(x int)      { e.putBE32(uint32(x)) }
 func (e *encbuf) putBE64int(x int)      { e.putBE64(uint64(x)) }
+func (e *encbuf) putBE64int64(x int64)  { e.putBE64(uint64(x)) }
 func (e *encbuf) putUvarint32(x uint32) { e.putUvarint64(uint64(x)) }
 func (e *encbuf) putUvarint(x int)      { e.putUvarint64(uint64(x)) }

@ -71,8 +72,10 @@ type decbuf struct {
 	e error
 }

-func (d *decbuf) uvarint() int { return int(d.uvarint64()) }
-func (d *decbuf) be32int() int { return int(d.be32()) }
+func (d *decbuf) uvarint() int      { return int(d.uvarint64()) }
+func (d *decbuf) uvarint32() uint32 { return uint32(d.uvarint64()) }
+func (d *decbuf) be32int() int      { return int(d.be32()) }
+func (d *decbuf) be64int64() int64  { return int64(d.be64()) }

 func (d *decbuf) uvarintStr() string {
 	l := d.uvarint64()
@ -140,6 +143,19 @@ func (d *decbuf) be32() uint32 {
 	return x
 }

+// byte consumes the first byte of the buffer and returns it. It returns 0
+// without consuming anything when the decbuf already carries an error, and
+// sets the error to errInvalidSize when the buffer is empty.
+func (d *decbuf) byte() byte {
+	if d.e != nil {
+		return 0
+	}
+	if len(d.b) == 0 {
+		d.e = errInvalidSize
+		return 0
+	}
+	first, rest := d.b[0], d.b[1:]
+	d.b = rest
+	return first
+}
+
 func (d *decbuf) decbuf(l int) decbuf {
 	if d.e != nil {
 		return decbuf{e: d.e}
--- a/vendor/github.com/prometheus/tsdb/head.go
+++ b/vendor/github.com/prometheus/tsdb/head.go
@ -69,6 +69,8 @@ type HeadBlock struct {
 	values   map[string]stringset // label names to possible values
 	postings *memPostings         // postings lists for terms

+	tombstones tombstoneReader
+
 	meta BlockMeta
 }

@ -97,6 +99,7 @@ func TouchHeadBlock(dir string, mint, maxt int64) (string, error) {
 	}); err != nil {
 		return "", err
 	}
+
 	return dir, renameFile(tmp, dir)
 }

@ -108,13 +111,14 @@ func OpenHeadBlock(dir string, l log.Logger, wal WAL) (*HeadBlock, error) {
 	}

 	h := &HeadBlock{
-		dir:      dir,
-		wal:      wal,
-		series:   []*memSeries{nil}, // 0 is not a valid posting, filled with nil.
-		hashes:   map[uint64][]*memSeries{},
-		values:   map[string]stringset{},
-		postings: &memPostings{m: make(map[term][]uint32)},
-		meta:     *meta,
+		dir:        dir,
+		wal:        wal,
+		series:     []*memSeries{nil}, // 0 is not a valid posting, filled with nil.
+		hashes:     map[uint64][]*memSeries{},
+		values:     map[string]stringset{},
+		postings:   &memPostings{m: make(map[term][]uint32)},
+		meta:       *meta,
+		tombstones: newEmptyTombstoneReader(),
 	}
 	return h, h.init()
 }
@ -122,16 +126,19 @@ func OpenHeadBlock(dir string, l log.Logger, wal WAL) (*HeadBlock, error) {
 func (h *HeadBlock) init() error {
 	r := h.wal.Reader()

-	for r.Next() {
-		series, samples := r.At()
-
+	seriesFunc := func(series []labels.Labels) error {
 		for _, lset := range series {
 			h.create(lset.Hash(), lset)
 			h.meta.Stats.NumSeries++
 		}
+
+		return nil
+	}
+	samplesFunc := func(samples []RefSample) error {
 		for _, s := range samples {
 			if int(s.Ref) >= len(h.series) {
-				return errors.Errorf("unknown series reference %d (max %d); abort WAL restore", s.Ref, len(h.series))
+				return errors.Errorf("unknown series reference %d (max %d); abort WAL restore",
+					s.Ref, len(h.series))
 			}
 			h.series[s.Ref].append(s.T, s.V)

@ -140,8 +147,24 @@ func (h *HeadBlock) init() error {
 			}
 			h.meta.Stats.NumSamples++
 		}
+
+		return nil
 	}
-	return errors.Wrap(r.Err(), "consume WAL")
+	deletesFunc := func(stones []Stone) error {
+		for _, s := range stones {
+			for _, itv := range s.intervals {
+				h.tombstones.add(s.ref, itv)
+			}
+		}
+
+		return nil
+	}
+
+	if err := r.Read(seriesFunc, samplesFunc, deletesFunc); err != nil {
+		return errors.Wrap(err, "consume WAL")
+	}
+
+	return nil
 }

 // inBounds returns true if the given timestamp is within the valid
@ -195,6 +218,114 @@ func (h *HeadBlock) Meta() BlockMeta {
 	return m
 }

+// Tombstones returns the TombstoneReader against the block.
+func (h *HeadBlock) Tombstones() TombstoneReader {
+	return h.tombstones
+}
+
+// Delete implements headBlock.
+//
+// For every series selected by ms it tombstones [mint, maxt], clamped to the
+// time span of the series' own data. Series without any data inside
+// [mint, maxt] are left alone. The tombstones are written to the WAL first
+// and only then added to the in-memory tombstones.
+func (h *HeadBlock) Delete(mint int64, maxt int64, ms ...labels.Matcher) error {
+	ir := h.Index()
+
+	pr := newPostingsReader(ir)
+	p, absent := pr.Select(ms...)
+
+	var stones []Stone
+
+Outer:
+	for p.Next() {
+		ref := p.At()
+		series := h.series[ref]
+
+		// Skip series that carry a non-empty value for any label listed
+		// in absent.
+		for _, abs := range absent {
+			if series.lset.Get(abs) != "" {
+				continue Outer
+			}
+		}
+
+		// Delete only until the current values and not beyond.
+		// NOTE(review): assumes the series holds at least one chunk -- confirm.
+		tmin, tmax := clampInterval(mint, maxt, series.chunks[0].minTime, series.head().maxTime)
+
+		// The series has no data inside [mint, maxt]: clamping then yields an
+		// inverted interval. Recording it would tombstone nothing, and
+		// persistedBlock.Delete likewise skips series without overlapping data.
+		if tmin > tmax {
+			continue
+		}
+		stones = append(stones, Stone{ref, intervals{{tmin, tmax}}})
+	}
+
+	if p.Err() != nil {
+		return p.Err()
+	}
+	if err := h.wal.LogDeletes(stones); err != nil {
+		return err
+	}
+
+	for _, s := range stones {
+		h.tombstones.add(s.ref, s.intervals[0])
+	}
+
+	h.meta.Stats.NumTombstones = uint64(len(h.tombstones))
+	return nil
+}
+
+// Snapshot persists the current state of the headblock to the given directory.
+// The data is written as a block under a newly generated ULID: it is first
+// assembled in a ".tmp" directory and then renamed into place. A head block
+// without any series is skipped and nil is returned.
+// TODO(gouthamve): Snapshot must be called when there are no active appenders.
+// This has been ensured by acquiring a Lock on DB.mtx, but this limitation should
+// be removed in the future.
+func (h *HeadBlock) Snapshot(snapshotDir string) error {
+	if h.meta.Stats.NumSeries == 0 {
+		return nil
+	}
+
+	// The snapshot gets a ULID of its own rather than the head block's.
+	entropy := rand.New(rand.NewSource(time.Now().UnixNano()))
+	uid := ulid.MustNew(ulid.Now(), entropy)
+
+	dir := filepath.Join(snapshotDir, uid.String())
+	tmp := dir + ".tmp"
+
+	// Start from a clean temporary directory.
+	if err := os.RemoveAll(tmp); err != nil {
+		return err
+	}
+
+	if err := os.MkdirAll(tmp, 0777); err != nil {
+		return err
+	}
+
+	// Populate chunk and index files into temporary directory with
+	// the data of this head block.
+	chunkw, err := newChunkWriter(chunkDir(tmp))
+	if err != nil {
+		return errors.Wrap(err, "open chunk writer")
+	}
+	indexw, err := newIndexWriter(tmp)
+	if err != nil {
+		return errors.Wrap(err, "open index writer")
+	}
+
+	meta, err := populateBlock([]Block{h}, indexw, chunkw)
+	if err != nil {
+		return errors.Wrap(err, "write snapshot")
+	}
+	meta.ULID = uid
+
+	if err = writeMetaFile(tmp, meta); err != nil {
+		return errors.Wrap(err, "write merged meta")
+	}
+
+	if err = chunkw.Close(); err != nil {
+		return errors.Wrap(err, "close chunk writer")
+	}
+	if err = indexw.Close(); err != nil {
+		return errors.Wrap(err, "close index writer")
+	}
+
+	// Create an empty tombstones file. populateBlock is handed the head
+	// block's tombstones along with its index and chunks.
+	if err := writeTombstoneFile(tmp, newEmptyTombstoneReader()); err != nil {
+		return errors.Wrap(err, "write new tombstones file")
+	}
+
+	// Block successfully written, make visible
+	if err := renameFile(tmp, dir); err != nil {
+		return errors.Wrap(err, "rename block dir")
+	}
+
+	return nil
+}
+
 // Dir returns the directory of the block.
 func (h *HeadBlock) Dir() string { return h.dir }

@ -217,10 +348,12 @@ func (h *HeadBlock) Querier(mint, maxt int64) Querier {
 	series := h.series[:]

 	return &blockQuerier{
-		mint:   mint,
-		maxt:   maxt,
-		index:  h.Index(),
-		chunks: h.Chunks(),
+		mint:       mint,
+		maxt:       maxt,
+		index:      h.Index(),
+		chunks:     h.Chunks(),
+		tombstones: h.Tombstones(),
+
 		postingsMapper: func(p Postings) Postings {
 			ep := make([]uint32, 0, 64)

@ -388,15 +521,17 @@ func (a *headAppender) AddFast(ref string, t int64, v float64) error {
 	return nil
 }

-func (a *headAppender) createSeries() {
+func (a *headAppender) createSeries() error {
 	if len(a.newSeries) == 0 {
-		return
+		return nil
 	}
 	a.newLabels = make([]labels.Labels, 0, len(a.newSeries))
 	base0 := len(a.series)

 	a.mtx.RUnlock()
+	defer a.mtx.RLock()
 	a.mtx.Lock()
+	defer a.mtx.Unlock()

 	base1 := len(a.series)

@ -416,15 +551,22 @@ func (a *headAppender) createSeries() {
 		a.create(l.hash, l.labels)
 	}

-	a.mtx.Unlock()
-	a.mtx.RLock()
+	// Write all new series to the WAL.
+	if err := a.wal.LogSeries(a.newLabels); err != nil {
+		return errors.Wrap(err, "WAL log series")
+	}
+
+	return nil
 }

 func (a *headAppender) Commit() error {
 	defer atomic.AddUint64(&a.activeWriters, ^uint64(0))
 	defer putHeadAppendBuffer(a.samples)
+	defer a.mtx.RUnlock()

-	a.createSeries()
+	if err := a.createSeries(); err != nil {
+		return err
+	}

 	// We have to update the refs of samples for series we just created.
 	for i := range a.samples {
@ -434,11 +576,10 @@ func (a *headAppender) Commit() error {
 		}
 	}

-	// Write all new series and samples to the WAL and add it to the
+	// Write all new samples to the WAL and add them to the
 	// in-mem database on success.
-	if err := a.wal.Log(a.newLabels, a.samples); err != nil {
-		a.mtx.RUnlock()
-		return err
+	if err := a.wal.LogSamples(a.samples); err != nil {
+		return errors.Wrap(err, "WAL log samples")
 	}

 	total := uint64(len(a.samples))
@ -449,8 +590,6 @@ func (a *headAppender) Commit() error {
 		}
 	}

-	a.mtx.RUnlock()
-
 	atomic.AddUint64(&a.meta.Stats.NumSamples, total)
 	atomic.AddUint64(&a.meta.Stats.NumSeries, uint64(len(a.newSeries)))

@ -538,6 +677,7 @@ func (h *headIndexReader) Series(ref uint32) (labels.Labels, []*ChunkMeta, error
 	if int(ref) >= len(h.series) {
 		return nil, nil, ErrNotFound
 	}
+
 	s := h.series[ref]
 	if s == nil {
 		return nil, nil, ErrNotFound
@ -584,12 +724,7 @@ func (h *HeadBlock) get(hash uint64, lset labels.Labels) *memSeries {
 }

 func (h *HeadBlock) create(hash uint64, lset labels.Labels) *memSeries {
-	s := &memSeries{
-		lset: lset,
-		ref:  uint32(len(h.series)),
-	}
-	// create the initial chunk and appender
-	s.cut()
+	s := newMemSeries(lset, uint32(len(h.series)), h.meta.MaxTime)

 	// Allocate empty space until we can insert at the given index.
 	h.series = append(h.series, s)
@ -624,15 +759,18 @@ type memSeries struct {
 	lset   labels.Labels
 	chunks []*memChunk

+	nextAt    int64 // timestamp at which to cut the next chunk.
+	maxt      int64 // maximum timestamp for the series.
 	lastValue float64
 	sampleBuf [4]sample

 	app chunks.Appender // Current appender for the chunk.
 }

-func (s *memSeries) cut() *memChunk {
+func (s *memSeries) cut(mint int64) *memChunk {
 	c := &memChunk{
 		chunk:   chunks.NewXORChunk(),
+		minTime: mint,
 		maxTime: math.MinInt64,
 	}
 	s.chunks = append(s.chunks, c)
@ -641,32 +779,47 @@ func (s *memSeries) cut() *memChunk {
 	if err != nil {
 		panic(err)
 	}
-
 	s.app = app
 	return c
 }

+// newMemSeries returns a memSeries for the label set lset with reference id.
+// maxt is stored as the maximum timestamp of the series. No chunk is created
+// here, and nextAt starts out at math.MinInt64.
+func newMemSeries(lset labels.Labels, id uint32, maxt int64) *memSeries {
+	return &memSeries{
+		ref:    id,
+		lset:   lset,
+		nextAt: math.MinInt64,
+		maxt:   maxt,
+	}
+}
+
 func (s *memSeries) append(t int64, v float64) bool {
+	const samplesPerChunk = 120
+
 	s.mtx.Lock()
 	defer s.mtx.Unlock()

 	var c *memChunk

-	if s.head().samples > 130 {
-		c = s.cut()
-		c.minTime = t
-	} else {
-		c = s.head()
-		// Skip duplicate and out of order samples.
-		if c.maxTime >= t {
-			return false
-		}
+	if len(s.chunks) == 0 {
+		c = s.cut(t)
+	}
+	c = s.head()
+	if c.maxTime >= t {
+		return false
+	}
+	if c.samples > samplesPerChunk/4 && t >= s.nextAt {
+		c = s.cut(t)
 	}
 	s.app.Append(t, v)

 	c.maxTime = t
 	c.samples++

+	if c.samples == samplesPerChunk/4 {
+		s.nextAt = computeChunkEndTime(c.minTime, c.maxTime, s.maxt)
+	}
+
 	s.lastValue = v

 	s.sampleBuf[0] = s.sampleBuf[1]
@ -677,6 +830,17 @@ func (s *memSeries) append(t int64, v float64) bool {
 	return true
 }

+// computeChunkEndTime estimates the end timestamp of a chunk from the chunk's
+// start timestamp, the timestamp of its most recent sample (cur) and the
+// upper bound (max) up to which data is inserted.
+// It assumes the chunk is 1/4 full at cur: the range from start to max is
+// divided into as many equally sized chunks as fit at the projected chunk
+// width, and the end of the first one is returned. If not even one projected
+// chunk fits into the range, max is returned.
+func computeChunkEndTime(start, cur, max int64) int64 {
+	span := max - start
+	// Number of projected chunks (four times the width seen so far) in span.
+	n := span / (4 * (cur - start + 1))
+	if n == 0 {
+		return max
+	}
+	return start + span/n
+}
+
 func (s *memSeries) iterator(i int) chunks.Iterator {
 	c := s.chunks[i]

--- a/vendor/github.com/prometheus/tsdb/index.go
+++ b/vendor/github.com/prometheus/tsdb/index.go
@ -39,6 +39,8 @@ const (
 	indexFormatV1 = 1
 )

+const indexFilename = "index"
+
 const compactionPageBytes = minSectorSize * 64

 type indexWriterSeries struct {
@ -138,7 +140,7 @@ func newIndexWriter(dir string) (*indexWriter, error) {
 	if err != nil {
 		return nil, err
 	}
-	f, err := os.OpenFile(filepath.Join(dir, "index"), os.O_CREATE|os.O_WRONLY, 0666)
+	f, err := os.OpenFile(filepath.Join(dir, indexFilename), os.O_CREATE|os.O_WRONLY, 0666)
 	if err != nil {
 		return nil, err
 	}
@ -569,11 +571,7 @@ func newIndexReader(dir string) (*indexReader, error) {
 		return nil, errors.Wrap(err, "read label index table")
 	}
 	r.postings, err = r.readOffsetTable(r.toc.postingsTable)
-	if err != nil {
-		return nil, errors.Wrap(err, "read postings table")
-	}
-
-	return r, nil
+	return r, errors.Wrap(err, "read postings table")
 }

 func (r *indexReader) readTOC() error {
--- a/vendor/github.com/prometheus/tsdb/querier.go
+++ b/vendor/github.com/prometheus/tsdb/querier.go
@ -126,8 +126,9 @@ func (q *querier) Close() error {

 // blockQuerier provides querying access to a single block database.
 type blockQuerier struct {
-	index  IndexReader
-	chunks ChunkReader
+	index      IndexReader
+	chunks     ChunkReader
+	tombstones TombstoneReader

 	postingsMapper func(Postings) Postings

@ -149,6 +150,8 @@ func (q *blockQuerier) Select(ms ...labels.Matcher) SeriesSet {
 				p:      p,
 				index:  q.index,
 				absent: absent,
+
+				tombstones: q.tombstones,
 			},
 			chunks: q.chunks,
 			mint:   q.mint,
@ -366,29 +369,35 @@ func (s *mergedSeriesSet) Next() bool {

 type chunkSeriesSet interface {
 	Next() bool
-	At() (labels.Labels, []*ChunkMeta)
+	At() (labels.Labels, []*ChunkMeta, intervals)
 	Err() error
 }

 // baseChunkSeries loads the label set and chunk references for a postings
 // list from an index. It filters out series that have labels set that should be unset.
 type baseChunkSeries struct {
-	p      Postings
-	index  IndexReader
-	absent []string // labels that must be unset in results.
+	p          Postings
+	index      IndexReader
+	tombstones TombstoneReader
+	absent     []string // labels that must be unset in results.

-	lset labels.Labels
-	chks []*ChunkMeta
-	err  error
+	lset      labels.Labels
+	chks      []*ChunkMeta
+	intervals intervals
+	err       error
 }

-func (s *baseChunkSeries) At() (labels.Labels, []*ChunkMeta) { return s.lset, s.chks }
-func (s *baseChunkSeries) Err() error                        { return s.err }
+// At returns the label set, chunk metas and deleted intervals of the series
+// that the last successful call to Next positioned the set on.
+func (s *baseChunkSeries) At() (labels.Labels, []*ChunkMeta, intervals) {
+	return s.lset, s.chks, s.intervals
+}
+
+// Err returns the error recorded while iterating, if any.
+func (s *baseChunkSeries) Err() error { return s.err }

 func (s *baseChunkSeries) Next() bool {
 Outer:
 	for s.p.Next() {
-		lset, chunks, err := s.index.Series(s.p.At())
+		ref := s.p.At()
+		lset, chunks, err := s.index.Series(ref)
 		if err != nil {
 			s.err = err
 			return false
@ -403,6 +412,19 @@ Outer:

 		s.lset = lset
 		s.chks = chunks
+		s.intervals = s.tombstones.Get(s.p.At())
+
+		if len(s.intervals) > 0 {
+			// Only those chunks that are not entirely deleted.
+			chks := make([]*ChunkMeta, 0, len(s.chks))
+			for _, chk := range s.chks {
+				if !(interval{chk.MinTime, chk.MaxTime}.isSubrange(s.intervals)) {
+					chks = append(chks, chk)
+				}
+			}
+
+			s.chks = chks
+		}

 		return true
 	}
@ -420,17 +442,20 @@ type populatedChunkSeries struct {
 	chunks     ChunkReader
 	mint, maxt int64

-	err  error
-	chks []*ChunkMeta
-	lset labels.Labels
+	err       error
+	chks      []*ChunkMeta
+	lset      labels.Labels
+	intervals intervals
 }

-func (s *populatedChunkSeries) At() (labels.Labels, []*ChunkMeta) { return s.lset, s.chks }
-func (s *populatedChunkSeries) Err() error                        { return s.err }
+// At returns the label set, chunk metas and deleted intervals stored for the
+// current series by Next.
+func (s *populatedChunkSeries) At() (labels.Labels, []*ChunkMeta, intervals) {
+	return s.lset, s.chks, s.intervals
+}
+
+// Err returns the error stored on the set, if any.
+func (s *populatedChunkSeries) Err() error { return s.err }

 func (s *populatedChunkSeries) Next() bool {
 	for s.set.Next() {
-		lset, chks := s.set.At()
+		lset, chks, dranges := s.set.At()

 		for len(chks) > 0 {
 			if chks[0].MaxTime >= s.mint {
@ -457,6 +482,7 @@ func (s *populatedChunkSeries) Next() bool {

 		s.lset = lset
 		s.chks = chks
+		s.intervals = dranges

 		return true
 	}
@ -477,8 +503,15 @@ type blockSeriesSet struct {

 func (s *blockSeriesSet) Next() bool {
 	for s.set.Next() {
-		lset, chunks := s.set.At()
-		s.cur = &chunkSeries{labels: lset, chunks: chunks, mint: s.mint, maxt: s.maxt}
+		lset, chunks, dranges := s.set.At()
+		s.cur = &chunkSeries{
+			labels: lset,
+			chunks: chunks,
+			mint:   s.mint,
+			maxt:   s.maxt,
+
+			intervals: dranges,
+		}
 		return true
 	}
 	if s.set.Err() != nil {
@ -497,6 +530,8 @@ type chunkSeries struct {
 	chunks []*ChunkMeta // in-order chunk refs

 	mint, maxt int64
+
+	intervals intervals
 }

 func (s *chunkSeries) Labels() labels.Labels {
@ -504,7 +539,7 @@ func (s *chunkSeries) Labels() labels.Labels {
 }

 func (s *chunkSeries) Iterator() SeriesIterator {
-	return newChunkSeriesIterator(s.chunks, s.mint, s.maxt)
+	return newChunkSeriesIterator(s.chunks, s.intervals, s.mint, s.maxt)
 }

 // SeriesIterator iterates over the data of a time series.
@ -601,16 +636,24 @@ type chunkSeriesIterator struct {
 	cur chunks.Iterator

 	maxt, mint int64
+
+	intervals intervals
 }

-func newChunkSeriesIterator(cs []*ChunkMeta, mint, maxt int64) *chunkSeriesIterator {
+func newChunkSeriesIterator(cs []*ChunkMeta, dranges intervals, mint, maxt int64) *chunkSeriesIterator {
+	it := cs[0].Chunk.Iterator()
+	if len(dranges) > 0 {
+		it = &deletedIterator{it: it, intervals: dranges}
+	}
 	return &chunkSeriesIterator{
 		chunks: cs,
 		i:      0,
-		cur:    cs[0].Chunk.Iterator(),
+		cur:    it,

 		mint: mint,
 		maxt: maxt,
+
+		intervals: dranges,
 	}
 }

@ -645,6 +688,9 @@ func (it *chunkSeriesIterator) Seek(t int64) (ok bool) {

 	it.i = x
 	it.cur = it.chunks[x].Chunk.Iterator()
+	if len(it.intervals) > 0 {
+		it.cur = &deletedIterator{it: it.cur, intervals: it.intervals}
+	}

 	for it.cur.Next() {
 		t0, _ := it.cur.At()
@ -676,6 +722,9 @@ func (it *chunkSeriesIterator) Next() bool {

 	it.i++
 	it.cur = it.chunks[it.i].Chunk.Iterator()
+	if len(it.intervals) > 0 {
+		it.cur = &deletedIterator{it: it.cur, intervals: it.intervals}
+	}

 	return it.Next()
 }
--- a/vendor/github.com/prometheus/tsdb/tombstones.go
+++ b/vendor/github.com/prometheus/tsdb/tombstones.go
@ -0,0 +1,223 @@
+// Copyright 2017 The Prometheus Authors
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tsdb
+
+import (
+	"encoding/binary"
+	"fmt"
+	"hash/crc32"
+	"io"
+	"io/ioutil"
+	"os"
+	"path/filepath"
+
+	"github.com/pkg/errors"
+)
+
+const tombstoneFilename = "tombstones"
+
+const (
+	// MagicTombstone is 4 bytes at the head of a tombstone file.
+	MagicTombstone = 0x130BA30
+
+	tombstoneFormatV1 = 1
+)
+
+// writeTombstoneFile persists all tombstones in tr to the tombstones file
+// inside dir.
+//
+// The file starts with the magic number (putBE32) and the format byte,
+// followed by one record per deleted interval (series reference via
+// putUvarint32, mint and maxt via putVarint64) and ends with a CRC32
+// (Castagnoli) checksum that covers the records only, not the header.
+//
+// Data is written to a temporary file which is synced and closed before it
+// is renamed into place. If any step fails the temporary file is removed.
+func writeTombstoneFile(dir string, tr tombstoneReader) (err error) {
+	path := filepath.Join(dir, tombstoneFilename)
+	tmp := path + ".tmp"
+	hash := crc32.New(crc32.MakeTable(crc32.Castagnoli))
+
+	f, err := os.Create(tmp)
+	if err != nil {
+		return err
+	}
+	defer func() {
+		// f is reset to nil once it was closed explicitly below.
+		if f != nil {
+			f.Close()
+		}
+		if err != nil {
+			// Best effort: do not leave a partial temporary file behind.
+			os.Remove(tmp)
+		}
+	}()
+
+	buf := encbuf{b: make([]byte, 3*binary.MaxVarintLen64)}
+	buf.reset()
+	// Write the meta.
+	buf.putBE32(MagicTombstone)
+	buf.putByte(tombstoneFormatV1)
+	if _, err = f.Write(buf.get()); err != nil {
+		return err
+	}
+
+	// Records go to the file and into the checksum at the same time.
+	mw := io.MultiWriter(f, hash)
+
+	for k, v := range tr {
+		for _, itv := range v {
+			buf.reset()
+			buf.putUvarint32(k)
+			buf.putVarint64(itv.mint)
+			buf.putVarint64(itv.maxt)
+
+			if _, err = mw.Write(buf.get()); err != nil {
+				return err
+			}
+		}
+	}
+
+	if _, err = f.Write(hash.Sum(nil)); err != nil {
+		return err
+	}
+	if err = f.Sync(); err != nil {
+		return err
+	}
+
+	// Close before renaming: a failed close means the data may be incomplete,
+	// and some platforms refuse to rename a file that is still open.
+	closeErr := f.Close()
+	f = nil
+	if closeErr != nil {
+		return closeErr
+	}
+
+	return renameFile(tmp, path)
+}
+
+// Stone holds the information on the posting and time-range
+// that is deleted.
+type Stone struct {
+	ref       uint32    // reference of the series the deletion applies to
+	intervals intervals // deleted time ranges of that series
+}
+
+// TombstoneReader provides lookup access to tombstones, i.e. the deleted
+// time ranges, by series reference.
+type TombstoneReader interface {
+	// Get returns the deleted intervals recorded for the series with the
+	// given reference.
+	Get(ref uint32) intervals
+}
+
+// readTombstones loads the tombstones file in dir into memory.
+//
+// It validates the header (magic number and format byte) and the trailing
+// 4-byte CRC32 (Castagnoli) checksum over the records before decoding the
+// records, each consisting of a series reference followed by the mint and
+// maxt of one deleted interval. Intervals of the same series are merged
+// while loading.
+func readTombstones(dir string) (tombstoneReader, error) {
+	const (
+		headerSize   = 5 // 4 bytes magic number plus 1 format byte.
+		checksumSize = 4
+	)
+
+	b, err := ioutil.ReadFile(filepath.Join(dir, tombstoneFilename))
+	if err != nil {
+		return nil, err
+	}
+
+	// A valid file holds at least the header and the checksum. Rejecting
+	// anything shorter up front reports truncation as a size error instead
+	// of a misleading magic number or format error.
+	if len(b) < headerSize+checksumSize {
+		return nil, errors.Wrap(errInvalidSize, "tombstones header")
+	}
+
+	d := &decbuf{b: b[:len(b)-checksumSize]}
+
+	// Decode the header and check for decoding errors before interpreting
+	// the decoded values.
+	mg := d.be32()
+	flag := d.byte()
+	if d.err() != nil {
+		return nil, d.err()
+	}
+	if mg != MagicTombstone {
+		return nil, fmt.Errorf("invalid magic number %x", mg)
+	}
+	if flag != tombstoneFormatV1 {
+		return nil, fmt.Errorf("invalid tombstone format %x", flag)
+	}
+
+	// Verify checksum over the remaining bytes, i.e. the records.
+	hash := crc32.New(crc32.MakeTable(crc32.Castagnoli))
+	if _, err := hash.Write(d.get()); err != nil {
+		return nil, errors.Wrap(err, "write to hash")
+	}
+	if binary.BigEndian.Uint32(b[len(b)-checksumSize:]) != hash.Sum32() {
+		return nil, errors.New("checksum did not match")
+	}
+
+	stonesMap := newEmptyTombstoneReader()
+	for d.len() > 0 {
+		k := d.uvarint32()
+		mint := d.varint64()
+		maxt := d.varint64()
+		if d.err() != nil {
+			return nil, d.err()
+		}
+
+		stonesMap.add(k, interval{mint, maxt})
+	}
+
+	return stonesMap, nil
+}
+
+// tombstoneReader is an in-memory TombstoneReader that maps a series
+// reference to the deleted intervals of that series.
+type tombstoneReader map[uint32]intervals
+
+// newTombstoneReader wraps ts as a tombstoneReader without copying it.
+func newTombstoneReader(ts map[uint32]intervals) tombstoneReader {
+	var tr tombstoneReader = ts
+	return tr
+}
+
+// newEmptyTombstoneReader returns a tombstoneReader without any entries.
+func newEmptyTombstoneReader() tombstoneReader {
+	return tombstoneReader{}
+}
+
+// Get returns the intervals stored for ref, or nil if there are none.
+func (t tombstoneReader) Get(ref uint32) intervals {
+	itvs := t[ref]
+	return itvs
+}
+
+// add merges itv into the intervals already stored for ref.
+func (t tombstoneReader) add(ref uint32, itv interval) {
+	merged := t[ref].add(itv)
+	t[ref] = merged
+}
+
+// interval is a closed time range [mint, maxt].
+type interval struct {
+	mint, maxt int64
+}
+
+// inBounds reports whether t lies within the interval, bounds included.
+func (tr interval) inBounds(t int64) bool {
+	return t >= tr.mint && t <= tr.maxt
+}
+
+// isSubrange reports whether tr is fully contained in a single interval
+// of dranges.
+func (tr interval) isSubrange(dranges intervals) bool {
+	for _, r := range dranges {
+		if r.inBounds(tr.mint) && r.inBounds(tr.maxt) {
+			return true
+		}
+	}
+
+	return false
+}
+
+// intervals is a list of intervals. add expects and maintains a list that is
+// sorted by time and whose entries neither overlap nor touch each other.
+type intervals []interval
+
+// add returns the intervals that result from adding n to itvs.
+// The existing intervals must be sorted and must not overlap.
+//
+// Every existing interval that overlaps n or is directly adjacent to it
+// (e.g. [1,2] and [3,5]) is merged with n into one interval, including the
+// case where n spans several existing intervals. The result is always a
+// newly allocated slice; itvs itself is left untouched, so slices handed out
+// earlier are never modified behind the back of their holders.
+func (itvs intervals) add(n interval) intervals {
+	merged := make(intervals, 0, len(itvs)+1)
+	placed := false
+
+	for _, r := range itvs {
+		switch {
+		// r ends before n starts and is not adjacent to it. The first
+		// comparison guarantees r.maxt+1 cannot overflow.
+		case r.maxt < n.mint && r.maxt+1 < n.mint:
+			merged = append(merged, r)
+
+		// r starts after n ends and is not adjacent to it. The first
+		// comparison guarantees r.mint-1 cannot overflow.
+		case r.mint > n.maxt && r.mint-1 > n.maxt:
+			if !placed {
+				merged = append(merged, n)
+				placed = true
+			}
+			merged = append(merged, r)
+
+		// r overlaps or touches n: absorb it into n.
+		default:
+			if r.mint < n.mint {
+				n.mint = r.mint
+			}
+			if r.maxt > n.maxt {
+				n.maxt = r.maxt
+			}
+		}
+	}
+
+	if !placed {
+		merged = append(merged, n)
+	}
+	return merged
+}
--- a/vendor/github.com/prometheus/tsdb/wal.go
+++ b/vendor/github.com/prometheus/tsdb/wal.go
@ -46,8 +46,18 @@ const (
 	WALEntrySymbols WALEntryType = 1
 	WALEntrySeries  WALEntryType = 2
 	WALEntrySamples WALEntryType = 3
+	WALEntryDeletes WALEntryType = 4
 )

+// SamplesCB is the callback that receives each batch of samples decoded
+// while reading the WAL.
+type SamplesCB func([]RefSample) error
+
+// SeriesCB is the callback that receives each batch of series label sets
+// decoded while reading the WAL.
+type SeriesCB func([]labels.Labels) error
+
+// DeletesCB is the callback that receives each batch of deletes (tombstones)
+// decoded while reading the WAL.
+type DeletesCB func([]Stone) error
+
 // SegmentWAL is a write ahead log for series data.
 type SegmentWAL struct {
 	mtx sync.Mutex
@ -71,15 +81,15 @@ type SegmentWAL struct {
 // It must be completely read before new entries are logged.
 type WAL interface {
 	Reader() WALReader
-	Log([]labels.Labels, []RefSample) error
+	LogSeries([]labels.Labels) error
+	LogSamples([]RefSample) error
+	LogDeletes([]Stone) error
 	Close() error
 }

 // WALReader reads entries from a WAL.
 type WALReader interface {
-	At() ([]labels.Labels, []RefSample)
-	Next() bool
-	Err() error
+	Read(SeriesCB, SamplesCB, DeletesCB) error
 }

 // RefSample is a timestamp/value pair associated with a reference to a series.
@ -141,13 +151,40 @@ func (w *SegmentWAL) Reader() WALReader {
 }

 // Log writes a batch of new series labels and samples to the log.
-func (w *SegmentWAL) Log(series []labels.Labels, samples []RefSample) error {
+//func (w *SegmentWAL) Log(series []labels.Labels, samples []RefSample) error {
+//return nil
+//}
+
+// LogSeries writes a batch of new series labels to the log.
+func (w *SegmentWAL) LogSeries(series []labels.Labels) error {
 	if err := w.encodeSeries(series); err != nil {
 		return err
 	}
+
+	if w.flushInterval <= 0 {
+		return w.Sync()
+	}
+	return nil
+}
+
+// LogSamples writes a batch of new samples to the log.
+func (w *SegmentWAL) LogSamples(samples []RefSample) error {
 	if err := w.encodeSamples(samples); err != nil {
 		return err
 	}
+
+	if w.flushInterval <= 0 {
+		return w.Sync()
+	}
+	return nil
+}
+
+// LogDeletes writes a batch of new deletes to the log.
+func (w *SegmentWAL) LogDeletes(stones []Stone) error {
+	if err := w.encodeDeletes(stones); err != nil {
+		return err
+	}
+
 	if w.flushInterval <= 0 {
 		return w.Sync()
 	}
@ -369,6 +406,7 @@ func (w *SegmentWAL) entry(et WALEntryType, flag byte, buf []byte) error {
 const (
 	walSeriesSimple  = 1
 	walSamplesSimple = 1
+	walDeletesSimple = 1
 )

 var walBuffers = sync.Pool{}
@ -445,6 +483,23 @@ func (w *SegmentWAL) encodeSamples(samples []RefSample) error {
 	return w.entry(WALEntrySamples, walSamplesSimple, buf)
 }

+// encodeDeletes serializes stones into one WAL entry of type WALEntryDeletes.
+// Every interval of every stone becomes its own record made up of the series
+// reference followed by the interval's mint and maxt.
+func (w *SegmentWAL) encodeDeletes(stones []Stone) error {
+	// scratch is reused to encode one record at a time.
+	scratch := &encbuf{b: make([]byte, 2*binary.MaxVarintLen64)}
+	out := getWALBuffer()
+
+	for i := range stones {
+		ref := stones[i].ref
+		for _, itv := range stones[i].intervals {
+			scratch.reset()
+			scratch.putUvarint32(ref)
+			scratch.putVarint64(itv.mint)
+			scratch.putVarint64(itv.maxt)
+			out = append(out, scratch.get()...)
+		}
+	}
+
+	return w.entry(WALEntryDeletes, walDeletesSimple, out)
+}
+
 // walReader decodes and emits write ahead log entries.
 type walReader struct {
 	logger log.Logger
@ -454,9 +509,11 @@ type walReader struct {
 	buf   []byte
 	crc32 hash.Hash32

-	err     error
-	labels  []labels.Labels
-	samples []RefSample
+	curType WALEntryType
+	curFlag byte
+	curBuf  []byte
+
+	err error
 }

 func newWALReader(w *SegmentWAL, l log.Logger) *walReader {
@ -471,18 +528,41 @@ func newWALReader(w *SegmentWAL, l log.Logger) *walReader {
 	}
 }

-// At returns the last decoded entry of labels or samples.
-// The returned slices are only valid until the next call to Next(). Their elements
-// have to be copied to preserve them.
-func (r *walReader) At() ([]labels.Labels, []RefSample) {
-	return r.labels, r.samples
-}
-
 // Err returns the last error the reader encountered.
 func (r *walReader) Err() error {
 	return r.err
 }

+// Read consumes the remaining WAL entries in order. Each entry is decoded
+// according to its type and the result is passed to the matching callback:
+// seriesf for series, samplesf for samples and deletesf for deletes.
+//
+// Reading stops at the first error, whether it comes from decoding an entry
+// or is returned by a callback, and that error is returned. Otherwise the
+// error state of the reader is returned once no entry is left.
+func (r *walReader) Read(seriesf SeriesCB, samplesf SamplesCB, deletesf DeletesCB) error {
+	for r.next() {
+		et, flag, b := r.at()
+		// In decoding below we never return a walCorruptionErr for now.
+		// Those should generally be caught by entry decoding before.
+		switch et {
+		case WALEntrySeries:
+			series, err := r.decodeSeries(flag, b)
+			if err != nil {
+				return err
+			}
+			if err = seriesf(series); err != nil {
+				return err
+			}
+		case WALEntrySamples:
+			samples, err := r.decodeSamples(flag, b)
+			if err != nil {
+				return err
+			}
+			if err = samplesf(samples); err != nil {
+				return err
+			}
+		case WALEntryDeletes:
+			stones, err := r.decodeDeletes(flag, b)
+			if err != nil {
+				return err
+			}
+			if err = deletesf(stones); err != nil {
+				return err
+			}
+		}
+	}
+
+	return r.Err()
+}
+
 // nextEntry retrieves the next entry. It is also used as a testing hook.
 func (r *walReader) nextEntry() (WALEntryType, byte, []byte, error) {
 	if r.cur >= len(r.wal.files) {
@ -505,12 +585,13 @@ func (r *walReader) nextEntry() (WALEntryType, byte, []byte, error) {
 	return et, flag, b, err
 }

-// Next returns decodes the next entry pair and returns true
-// if it was succesful.
-func (r *walReader) Next() bool {
-	r.labels = r.labels[:0]
-	r.samples = r.samples[:0]
+// at returns the type, flag and raw payload of the entry that was stored
+// by the last call to next.
+func (r *walReader) at() (WALEntryType, byte, []byte) {
+	return r.curType, r.curFlag, r.curBuf
+}

+// next reads the next entry and returns true
+// if it was successful.
+func (r *walReader) next() bool {
 	if r.cur >= len(r.wal.files) {
 		return false
 	}
@ -537,7 +618,7 @@ func (r *walReader) Next() bool {
 			return false
 		}
 		r.cur++
-		return r.Next()
+		return r.next()
 	}
 	if err != nil {
 		r.err = err
@ -548,19 +629,9 @@ func (r *walReader) Next() bool {
 		return false
 	}

-	// In decoding below we never return a walCorruptionErr for now.
-	// Those should generally be catched by entry decoding before.
-
-	switch et {
-	case WALEntrySamples:
-		if err := r.decodeSamples(flag, b); err != nil {
-			r.err = err
-		}
-	case WALEntrySeries:
-		if err := r.decodeSeries(flag, b); err != nil {
-			r.err = err
-		}
-	}
+	r.curType = et
+	r.curFlag = flag
+	r.curBuf = b
 	return r.err == nil
 }

@ -617,7 +688,7 @@ func (r *walReader) entry(cr io.Reader) (WALEntryType, byte, []byte, error) {
 	if etype == 0 {
 		return 0, 0, nil, io.EOF
 	}
-	if etype != WALEntrySeries && etype != WALEntrySamples {
+	if etype != WALEntrySeries && etype != WALEntrySamples && etype != WALEntryDeletes {
 		return 0, 0, nil, walCorruptionErrf("invalid entry type %d", etype)
 	}

@ -644,11 +715,12 @@ func (r *walReader) entry(cr io.Reader) (WALEntryType, byte, []byte, error) {
 	return etype, flag, buf, nil
 }

-func (r *walReader) decodeSeries(flag byte, b []byte) error {
+func (r *walReader) decodeSeries(flag byte, b []byte) ([]labels.Labels, error) {
+	series := []labels.Labels{}
 	for len(b) > 0 {
 		l, n := binary.Uvarint(b)
 		if n < 1 {
-			return errors.Wrap(errInvalidSize, "number of labels")
+			return nil, errors.Wrap(errInvalidSize, "number of labels")
 		}
 		b = b[n:]
 		lset := make(labels.Labels, l)
@ -656,27 +728,29 @@ func (r *walReader) decodeSeries(flag byte, b []byte) error {
 		for i := 0; i < int(l); i++ {
 			nl, n := binary.Uvarint(b)
 			if n < 1 || len(b) < n+int(nl) {
-				return errors.Wrap(errInvalidSize, "label name")
+				return nil, errors.Wrap(errInvalidSize, "label name")
 			}
 			lset[i].Name = string(b[n : n+int(nl)])
 			b = b[n+int(nl):]

 			vl, n := binary.Uvarint(b)
 			if n < 1 || len(b) < n+int(vl) {
-				return errors.Wrap(errInvalidSize, "label value")
+				return nil, errors.Wrap(errInvalidSize, "label value")
 			}
 			lset[i].Value = string(b[n : n+int(vl)])
 			b = b[n+int(vl):]
 		}

-		r.labels = append(r.labels, lset)
+		series = append(series, lset)
 	}
-	return nil
+	return series, nil
 }

-func (r *walReader) decodeSamples(flag byte, b []byte) error {
+func (r *walReader) decodeSamples(flag byte, b []byte) ([]RefSample, error) {
+	samples := []RefSample{}
+
 	if len(b) < 16 {
-		return errors.Wrap(errInvalidSize, "header length")
+		return nil, errors.Wrap(errInvalidSize, "header length")
 	}
 	var (
 		baseRef  = binary.BigEndian.Uint64(b)
@ -689,7 +763,7 @@ func (r *walReader) decodeSamples(flag byte, b []byte) error {

 		dref, n := binary.Varint(b)
 		if n < 1 {
-			return errors.Wrap(errInvalidSize, "sample ref delta")
+			return nil, errors.Wrap(errInvalidSize, "sample ref delta")
 		}
 		b = b[n:]

@ -697,18 +771,36 @@ func (r *walReader) decodeSamples(flag byte, b []byte) error {

 		dtime, n := binary.Varint(b)
 		if n < 1 {
-			return errors.Wrap(errInvalidSize, "sample timestamp delta")
+			return nil, errors.Wrap(errInvalidSize, "sample timestamp delta")
 		}
 		b = b[n:]
 		smpl.T = baseTime + dtime

 		if len(b) < 8 {
-			return errors.Wrapf(errInvalidSize, "sample value bits %d", len(b))
+			return nil, errors.Wrapf(errInvalidSize, "sample value bits %d", len(b))
 		}
 		smpl.V = float64(math.Float64frombits(binary.BigEndian.Uint64(b)))
 		b = b[8:]

-		r.samples = append(r.samples, smpl)
+		samples = append(samples, smpl)
 	}
-	return nil
+	return samples, nil
+}
+
+// decodeDeletes parses the payload b of a deletes entry. Each record (series
+// reference, mint, maxt) is returned as its own Stone that carries exactly
+// one interval. The flag is currently not inspected.
+func (r *walReader) decodeDeletes(flag byte, b []byte) ([]Stone, error) {
+	var (
+		dec    = &decbuf{b: b}
+		result = []Stone{}
+	)
+
+	for dec.len() > 0 {
+		ref := dec.uvarint32()
+		mint := dec.varint64()
+		maxt := dec.varint64()
+		if dec.err() != nil {
+			return nil, dec.err()
+		}
+
+		result = append(result, Stone{
+			ref:       ref,
+			intervals: intervals{{mint, maxt}},
+		})
+	}
+
+	return result, nil
+}
--- a/vendor/vendor.json
+++ b/vendor/vendor.json
@ -751,10 +751,10 @@
 			"revisionTime": "2016-04-11T19:08:41Z"
 		},
 		{
-			"checksumSHA1": "q2GxuO+ppV/gqBir/Z6ijx7aOOU=",
+			"checksumSHA1": "XXXDHMZe3Y3gosaF/1staHm3INc=",
 			"path": "github.com/prometheus/tsdb",
-			"revision": "4f2eb2057ee0a7f2b984503886bff970a9dab1a8",
-			"revisionTime": "2017-05-22T06:49:09Z"
+			"revision": "9963a4c7c3b2a742e00a63c54084b051e3174b06",
+			"revisionTime": "2017-06-12T09:17:49Z"
 		},
 		{
 			"checksumSHA1": "9EH3v+JdbikCUJAgD4VEOPIaWfs=",