prometheus/wal.go

596 lines
13 KiB
Go
Raw Normal View History

2016-12-22 11:05:24 +00:00
package tsdb
import (
2017-01-16 13:18:25 +00:00
"bufio"
2016-12-22 11:05:24 +00:00
"encoding/binary"
"hash"
2016-12-22 11:05:24 +00:00
"hash/crc32"
"io"
"math"
"os"
"path/filepath"
"sync"
"time"
2016-12-22 11:05:24 +00:00
"github.com/coreos/etcd/pkg/fileutil"
"github.com/fabxc/tsdb/labels"
"github.com/go-kit/kit/log"
2016-12-22 14:18:33 +00:00
"github.com/pkg/errors"
2016-12-22 11:05:24 +00:00
)
// WALEntryType indicates what data a WAL entry contains.
type WALEntryType byte
const (
2017-02-14 23:54:52 +00:00
// WALMagic is a 4 byte number every WAL segment file starts with.
WALMagic = uint32(0x43AF00EF)
2017-02-14 23:54:52 +00:00
// WALFormatDefault is the version flag for the default outer segment file format.
WALFormatDefault = byte(1)
)
2017-02-14 23:54:52 +00:00
// Entry types in a segment file.
const (
WALEntrySymbols WALEntryType = 1
WALEntrySeries WALEntryType = 2
WALEntrySamples WALEntryType = 3
2016-12-22 11:05:24 +00:00
)
// WAL is a write ahead log for series data. It can only be written to.
// Use WALReader to read back from a write ahead log.
type WAL struct {
mtx sync.Mutex
2017-02-14 07:53:19 +00:00
dirFile *os.File
files []*os.File
2017-02-14 07:53:19 +00:00
logger log.Logger
flushInterval time.Duration
2017-02-14 23:54:52 +00:00
segmentSize int64
crc32 hash.Hash32
cur *bufio.Writer
curN int64
2017-02-14 07:53:19 +00:00
stopc chan struct{}
donec chan struct{}
2016-12-22 11:05:24 +00:00
}
2017-02-14 07:53:19 +00:00
const (
walDirName = "wal"
walSegmentSizeBytes = 256 * 1024 * 1024 // 256 MB
2017-02-14 07:53:19 +00:00
)
2016-12-22 14:18:33 +00:00
// OpenWAL opens or creates a write ahead log in the given directory.
// The WAL must be read completely before new data is written.
func OpenWAL(dir string, l log.Logger, flushInterval time.Duration) (*WAL, error) {
2017-02-14 07:53:19 +00:00
dir = filepath.Join(dir, walDirName)
2016-12-22 11:05:24 +00:00
if err := os.MkdirAll(dir, 0777); err != nil {
return nil, err
}
2017-02-14 07:53:19 +00:00
df, err := fileutil.OpenDir(dir)
2016-12-22 15:14:34 +00:00
if err != nil {
return nil, err
}
2016-12-22 11:05:24 +00:00
w := &WAL{
2017-02-14 07:53:19 +00:00
dirFile: df,
logger: l,
flushInterval: flushInterval,
donec: make(chan struct{}),
stopc: make(chan struct{}),
2017-02-14 23:54:52 +00:00
segmentSize: walSegmentSizeBytes,
crc32: crc32.New(crc32.MakeTable(crc32.Castagnoli)),
2016-12-22 11:05:24 +00:00
}
2017-02-14 07:53:19 +00:00
if err := w.initSegments(); err != nil {
return nil, err
}
go w.run(flushInterval)
2016-12-22 11:05:24 +00:00
return w, nil
}
2017-02-15 05:54:59 +00:00
// Reader returns a new reader over the the write ahead log data.
// It must be completely consumed before writing to the WAL.
func (w *WAL) Reader() *WALReader {
var rs []io.ReadCloser
for _, f := range w.files {
rs = append(rs, f)
2016-12-22 14:18:33 +00:00
}
return NewWALReader(rs...)
2016-12-22 14:18:33 +00:00
}
2016-12-22 11:05:24 +00:00
// Log writes a batch of new series labels and samples to the log.
func (w *WAL) Log(series []labels.Labels, samples []refdSample) error {
2017-02-14 07:53:19 +00:00
if err := w.encodeSeries(series); err != nil {
2016-12-22 11:05:24 +00:00
return err
}
2017-02-14 07:53:19 +00:00
if err := w.encodeSamples(samples); err != nil {
2016-12-22 11:05:24 +00:00
return err
}
if w.flushInterval <= 0 {
2017-02-14 07:53:19 +00:00
return w.Sync()
}
2016-12-22 11:05:24 +00:00
return nil
}
2017-02-14 07:53:19 +00:00
// initSegments finds all existing segment files and opens them in the
// appropriate file modes.
func (w *WAL) initSegments() error {
fns, err := sequenceFiles(w.dirFile.Name(), "")
if err != nil {
return err
}
if len(fns) == 0 {
return nil
}
if len(fns) > 1 {
2017-02-14 23:54:52 +00:00
for _, fn := range fns[:len(fns)-1] {
f, err := os.Open(fn)
2017-02-14 07:53:19 +00:00
if err != nil {
return err
}
w.files = append(w.files, f)
2017-02-14 07:53:19 +00:00
}
}
// The most recent WAL file is the one we have to keep appending to.
f, err := os.OpenFile(fns[len(fns)-1], os.O_RDWR, 0666)
2017-02-14 07:53:19 +00:00
if err != nil {
return err
}
w.files = append(w.files, f)
2017-02-14 07:53:19 +00:00
// Consume and validate meta headers.
for _, f := range w.files {
metab := make([]byte, 8)
if n, err := f.Read(metab); err != nil {
return errors.Wrapf(err, "validate meta %q", f.Name())
} else if n != 8 {
return errors.Errorf("invalid header size %d in %q", n, f.Name())
}
if m := binary.BigEndian.Uint32(metab[:4]); m != WALMagic {
return errors.Errorf("invalid magic header %x in %q", m, f.Name())
}
if metab[4] != WALFormatDefault {
return errors.Errorf("unknown WAL segment format %d in %q", metab[4], f.Name())
}
}
2017-02-14 07:53:19 +00:00
return nil
}
// cut finishes the currently active segments and open the next one.
// The encoder is reset to point to the new segment.
func (w *WAL) cut() error {
2017-02-14 23:54:52 +00:00
// Sync current tail to disc and close.
2017-02-14 07:53:19 +00:00
if tf := w.tail(); tf != nil {
2017-02-14 23:54:52 +00:00
if err := w.sync(); err != nil {
2017-02-14 07:53:19 +00:00
return err
}
2017-02-15 05:54:59 +00:00
off, err := tf.Seek(0, os.SEEK_CUR)
if err != nil {
return err
}
if err := tf.Truncate(off); err != nil {
return err
}
2017-02-14 23:54:52 +00:00
if err := tf.Close(); err != nil {
2017-02-14 07:53:19 +00:00
return err
}
}
p, _, err := nextSequenceFile(w.dirFile.Name(), "")
if err != nil {
return err
}
f, err := os.Create(p)
2017-02-14 07:53:19 +00:00
if err != nil {
return err
}
if err = fileutil.Preallocate(f, w.segmentSize, true); err != nil {
2017-02-14 07:53:19 +00:00
return err
}
if err = w.dirFile.Sync(); err != nil {
return err
}
// Write header metadata for new file.
metab := make([]byte, 8)
binary.BigEndian.PutUint32(metab[:4], WALMagic)
metab[4] = WALFormatDefault
if _, err := f.Write(metab); err != nil {
return err
}
2017-02-14 07:53:19 +00:00
w.files = append(w.files, f)
w.cur = bufio.NewWriterSize(f, 4*1024*1024)
2017-02-14 23:54:52 +00:00
w.curN = 8
2017-02-14 07:53:19 +00:00
return nil
}
func (w *WAL) tail() *os.File {
2017-02-14 07:53:19 +00:00
if len(w.files) == 0 {
return nil
}
return w.files[len(w.files)-1]
}
func (w *WAL) Sync() error {
w.mtx.Lock()
defer w.mtx.Unlock()
return w.sync()
}
2016-12-22 11:05:24 +00:00
func (w *WAL) sync() error {
2017-02-15 05:54:59 +00:00
if w.cur == nil {
return nil
}
2017-02-14 07:53:19 +00:00
if err := w.cur.Flush(); err != nil {
2016-12-22 15:14:34 +00:00
return err
}
return fileutil.Fdatasync(w.tail())
2016-12-22 11:05:24 +00:00
}
func (w *WAL) run(interval time.Duration) {
var tick <-chan time.Time
if interval > 0 {
ticker := time.NewTicker(interval)
defer ticker.Stop()
tick = ticker.C
}
defer close(w.donec)
for {
select {
case <-w.stopc:
return
case <-tick:
2017-02-14 07:53:19 +00:00
if err := w.Sync(); err != nil {
w.logger.Log("msg", "sync failed", "err", err)
}
}
}
}
2016-12-22 11:05:24 +00:00
// Close sync all data and closes the underlying resources.
func (w *WAL) Close() error {
close(w.stopc)
<-w.donec
2017-02-14 07:53:19 +00:00
w.mtx.Lock()
defer w.mtx.Unlock()
2016-12-22 11:05:24 +00:00
if err := w.sync(); err != nil {
return err
}
2017-02-14 23:54:52 +00:00
// On opening, a WAL must be fully consumed once. Afterwards
// only the current segment will still be open.
2017-02-15 05:54:59 +00:00
if tf := w.tail(); tf != nil {
return tf.Close()
}
return nil
2016-12-22 11:05:24 +00:00
}
2016-12-22 19:00:24 +00:00
const (
minSectorSize = 512
// walPageBytes is the alignment for flushing records to the backing Writer.
// It should be a multiple of the minimum sector size so that WAL can safely
// distinguish between torn writes and ordinary data corruption.
2017-01-09 17:34:29 +00:00
walPageBytes = 16 * minSectorSize
2016-12-22 19:00:24 +00:00
)
2016-12-22 15:14:34 +00:00
2017-02-14 07:53:19 +00:00
func (w *WAL) entry(et WALEntryType, flag byte, buf []byte) error {
w.mtx.Lock()
defer w.mtx.Unlock()
2017-01-06 17:36:42 +00:00
2017-02-15 05:54:59 +00:00
// Cut to the next segment if exceeds the file size unless it would also
// exceed the size of a new segment.
var (
sz = int64(6 + 4 + len(buf))
newsz = w.curN + sz
)
if w.cur == nil || w.curN > w.segmentSize || newsz > w.segmentSize && sz <= w.segmentSize {
2017-02-14 07:53:19 +00:00
if err := w.cut(); err != nil {
return err
}
}
2017-01-06 17:36:42 +00:00
w.crc32.Reset()
wr := io.MultiWriter(w.crc32, w.cur)
2016-12-22 11:05:24 +00:00
b := make([]byte, 6)
b[0] = byte(et)
b[1] = flag
2017-01-06 17:36:42 +00:00
binary.BigEndian.PutUint32(b[2:], uint32(len(buf)))
2016-12-22 11:05:24 +00:00
2017-02-14 07:53:19 +00:00
if _, err := wr.Write(b); err != nil {
2016-12-22 11:05:24 +00:00
return err
}
2017-02-14 07:53:19 +00:00
if _, err := wr.Write(buf); err != nil {
2016-12-22 11:05:24 +00:00
return err
}
if _, err := w.cur.Write(w.crc32.Sum(nil)); err != nil {
2016-12-22 11:05:24 +00:00
return err
}
2017-02-14 07:53:19 +00:00
w.curN += sz
2017-01-06 17:36:42 +00:00
putWALBuffer(buf)
2016-12-22 11:05:24 +00:00
return nil
}
const (
walSeriesSimple = 1
walSamplesSimple = 1
)
2017-01-06 17:36:42 +00:00
var walBuffers = sync.Pool{}
func getWALBuffer() []byte {
b := walBuffers.Get()
if b == nil {
return make([]byte, 0, 64*1024)
}
return b.([]byte)
}
func putWALBuffer(b []byte) {
b = b[:0]
walBuffers.Put(b)
}
2017-02-14 07:53:19 +00:00
func (w *WAL) encodeSeries(series []labels.Labels) error {
2016-12-22 11:05:24 +00:00
if len(series) == 0 {
return nil
}
2016-12-22 14:18:33 +00:00
b := make([]byte, binary.MaxVarintLen32)
2017-01-06 17:36:42 +00:00
buf := getWALBuffer()
2016-12-22 11:05:24 +00:00
for _, lset := range series {
n := binary.PutUvarint(b, uint64(len(lset)))
2017-01-06 17:36:42 +00:00
buf = append(buf, b[:n]...)
2016-12-22 11:05:24 +00:00
for _, l := range lset {
n = binary.PutUvarint(b, uint64(len(l.Name)))
2017-01-06 17:36:42 +00:00
buf = append(buf, b[:n]...)
buf = append(buf, l.Name...)
2016-12-22 11:05:24 +00:00
n = binary.PutUvarint(b, uint64(len(l.Value)))
2017-01-06 17:36:42 +00:00
buf = append(buf, b[:n]...)
buf = append(buf, l.Value...)
2016-12-22 11:05:24 +00:00
}
}
2017-02-14 07:53:19 +00:00
return w.entry(WALEntrySeries, walSeriesSimple, buf)
2016-12-22 11:05:24 +00:00
}
2017-02-14 07:53:19 +00:00
func (w *WAL) encodeSamples(samples []refdSample) error {
2016-12-22 11:05:24 +00:00
if len(samples) == 0 {
return nil
}
2016-12-22 14:18:33 +00:00
b := make([]byte, binary.MaxVarintLen64)
2017-01-06 17:36:42 +00:00
buf := getWALBuffer()
2016-12-22 11:05:24 +00:00
// Store base timestamp and base reference number of first sample.
// All samples encode their timestamp and ref as delta to those.
//
// TODO(fabxc): optimize for all samples having the same timestamp.
first := samples[0]
binary.BigEndian.PutUint64(b, first.ref)
2017-01-17 07:40:31 +00:00
buf = append(buf, b[:8]...)
2016-12-22 11:05:24 +00:00
binary.BigEndian.PutUint64(b, uint64(first.t))
2017-01-06 17:36:42 +00:00
buf = append(buf, b[:8]...)
2016-12-22 11:05:24 +00:00
for _, s := range samples {
n := binary.PutVarint(b, int64(s.ref)-int64(first.ref))
2017-01-06 17:36:42 +00:00
buf = append(buf, b[:n]...)
2016-12-22 11:05:24 +00:00
n = binary.PutVarint(b, s.t-first.t)
2017-01-06 17:36:42 +00:00
buf = append(buf, b[:n]...)
2016-12-22 11:05:24 +00:00
binary.BigEndian.PutUint64(b, math.Float64bits(s.v))
2017-01-06 17:36:42 +00:00
buf = append(buf, b[:8]...)
2016-12-22 11:05:24 +00:00
}
2017-02-14 07:53:19 +00:00
return w.entry(WALEntrySamples, walSamplesSimple, buf)
2016-12-22 11:05:24 +00:00
}
2017-02-15 05:54:59 +00:00
// WALReader decodes and emits write ahead log entries.
type WALReader struct {
rs []io.ReadCloser
cur int
buf []byte
crc32 hash.Hash32
2017-02-15 05:54:59 +00:00
err error
labels []labels.Labels
samples []refdSample
2016-12-22 14:18:33 +00:00
}
2017-02-15 05:54:59 +00:00
// NewWALReader returns a new WALReader over the sequence of the given ReadClosers.
func NewWALReader(rs ...io.ReadCloser) *WALReader {
return &WALReader{
rs: rs,
buf: make([]byte, 0, 128*4096),
crc32: crc32.New(crc32.MakeTable(crc32.Castagnoli)),
2016-12-22 14:18:33 +00:00
}
}
2017-02-15 05:54:59 +00:00
// At returns the last decoded entry of labels or samples.
func (r *WALReader) At() ([]labels.Labels, []refdSample) {
return r.labels, r.samples
}
// Err returns the last error the reader encountered.
func (r *WALReader) Err() error {
return r.err
}
// nextEntry retrieves the next entry. It is also used as a testing hook.
func (r *WALReader) nextEntry() (WALEntryType, byte, []byte, error) {
if r.cur >= len(r.rs) {
return 0, 0, nil, io.EOF
}
cr := r.rs[r.cur]
et, flag, b, err := r.entry(cr)
if err == io.EOF {
// Current reader completed, close and move to the next one.
if err := cr.Close(); err != nil {
return 0, 0, nil, err
}
r.cur++
return r.nextEntry()
}
return et, flag, b, err
}
// Next returns decodes the next entry pair and returns true
// if it was succesful.
func (r *WALReader) Next() bool {
r.labels = r.labels[:0]
r.samples = r.samples[:0]
et, flag, b, err := r.nextEntry()
2017-02-14 23:54:52 +00:00
if err != nil {
2017-02-15 05:54:59 +00:00
if err != io.EOF {
r.err = err
}
return false
2017-02-14 23:54:52 +00:00
}
2017-02-15 05:54:59 +00:00
switch et {
2017-02-14 23:54:52 +00:00
case WALEntrySamples:
2017-02-15 05:54:59 +00:00
if err := r.decodeSamples(flag, b); err != nil {
r.err = err
}
2017-02-14 23:54:52 +00:00
case WALEntrySeries:
2017-02-15 05:54:59 +00:00
if err := r.decodeSeries(flag, b); err != nil {
r.err = err
}
default:
r.err = errors.Errorf("unknown WAL entry type %d", et)
}
return r.err == nil
}
func (r *WALReader) entry(cr io.Reader) (WALEntryType, byte, []byte, error) {
r.crc32.Reset()
tr := io.TeeReader(cr, r.crc32)
2017-02-15 05:54:59 +00:00
b := make([]byte, 6)
if _, err := tr.Read(b); err != nil {
return 0, 0, nil, err
}
var (
etype = WALEntryType(b[0])
flag = b[1]
length = int(binary.BigEndian.Uint32(b[2:]))
)
// Exit if we reached pre-allocated space.
if etype == 0 {
return 0, 0, nil, io.EOF
2017-02-14 23:54:52 +00:00
}
2017-02-15 05:54:59 +00:00
if length > len(r.buf) {
r.buf = make([]byte, length)
}
buf := r.buf[:length]
if _, err := tr.Read(buf); err != nil {
return 0, 0, nil, err
}
_, err := cr.Read(b[:4])
if err != nil {
return 0, 0, nil, err
}
if exp, has := binary.BigEndian.Uint32(b[:4]), r.crc32.Sum32(); has != exp {
2017-02-15 05:54:59 +00:00
return 0, 0, nil, errors.Errorf("unexpected CRC32 checksum %x, want %x", has, exp)
}
return etype, flag, buf, nil
2017-02-14 23:54:52 +00:00
}
2017-02-15 05:54:59 +00:00
func (r *WALReader) decodeSeries(flag byte, b []byte) error {
2016-12-22 14:18:33 +00:00
for len(b) > 0 {
l, n := binary.Uvarint(b)
if n < 1 {
return errors.Wrap(errInvalidSize, "number of labels")
}
b = b[n:]
lset := make(labels.Labels, l)
for i := 0; i < int(l); i++ {
nl, n := binary.Uvarint(b)
if n < 1 || len(b) < n+int(nl) {
return errors.Wrap(errInvalidSize, "label name")
}
lset[i].Name = string(b[n : n+int(nl)])
b = b[n+int(nl):]
vl, n := binary.Uvarint(b)
if n < 1 || len(b) < n+int(vl) {
return errors.Wrap(errInvalidSize, "label value")
}
lset[i].Value = string(b[n : n+int(vl)])
b = b[n+int(vl):]
}
2017-02-15 05:54:59 +00:00
r.labels = append(r.labels, lset)
2016-12-22 14:18:33 +00:00
}
return nil
}
2017-02-15 05:54:59 +00:00
func (r *WALReader) decodeSamples(flag byte, b []byte) error {
2017-01-17 07:40:31 +00:00
if len(b) < 16 {
2016-12-22 14:18:33 +00:00
return errors.Wrap(errInvalidSize, "header length")
}
var (
baseRef = binary.BigEndian.Uint64(b)
2017-01-17 07:40:31 +00:00
baseTime = int64(binary.BigEndian.Uint64(b[8:]))
2016-12-22 14:18:33 +00:00
)
2017-01-17 07:40:31 +00:00
b = b[16:]
2016-12-22 14:18:33 +00:00
for len(b) > 0 {
var smpl refdSample
2016-12-22 14:18:33 +00:00
dref, n := binary.Varint(b)
if n < 1 {
return errors.Wrap(errInvalidSize, "sample ref delta")
}
b = b[n:]
smpl.ref = uint64(int64(baseRef) + dref)
2016-12-22 14:18:33 +00:00
dtime, n := binary.Varint(b)
if n < 1 {
return errors.Wrap(errInvalidSize, "sample timestamp delta")
}
b = b[n:]
smpl.t = baseTime + dtime
if len(b) < 8 {
return errors.Wrapf(errInvalidSize, "sample value bits %d", len(b))
}
smpl.v = float64(math.Float64frombits(binary.BigEndian.Uint64(b)))
b = b[8:]
2017-02-15 05:54:59 +00:00
r.samples = append(r.samples, smpl)
2016-12-22 14:18:33 +00:00
}
return nil
}