628 lines
15 KiB
Go
628 lines
15 KiB
Go
|
// Copyright 2017 The Prometheus Authors
|
||
|
|
||
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
// you may not use this file except in compliance with the License.
|
||
|
// You may obtain a copy of the License at
|
||
|
//
|
||
|
// http://www.apache.org/licenses/LICENSE-2.0
|
||
|
//
|
||
|
// Unless required by applicable law or agreed to in writing, software
|
||
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
// See the License for the specific language governing permissions and
|
||
|
// limitations under the License.
|
||
|
|
||
|
package wal
|
||
|
|
||
|
import (
|
||
|
"bufio"
|
||
|
"encoding/binary"
|
||
|
"fmt"
|
||
|
"hash/crc32"
|
||
|
"io"
|
||
|
"os"
|
||
|
"path/filepath"
|
||
|
"strconv"
|
||
|
"sync"
|
||
|
"time"
|
||
|
|
||
|
"github.com/go-kit/kit/log"
|
||
|
"github.com/go-kit/kit/log/level"
|
||
|
"github.com/pkg/errors"
|
||
|
"github.com/prometheus/client_golang/prometheus"
|
||
|
"github.com/prometheus/tsdb/fileutil"
|
||
|
)
|
||
|
|
||
|
const (
	// version is the on-disk format version of the WAL.
	version = 1
	// defaultSegmentSize is the target size of one segment file.
	defaultSegmentSize = 128 * 1024 * 1024 // 128 MB
	// maxRecordSize bounds the size of a single record.
	// NOTE(review): not enforced by the writer in this file — confirm it is
	// checked by callers.
	maxRecordSize = 1 * 1024 * 1024 // 1MB
	// pageSize is the unit in which segment data is buffered and written.
	pageSize = 32 * 1024 // 32KB
	// recordHeaderSize is 1 byte type + 2 bytes length + 4 bytes CRC32.
	recordHeaderSize = 7
)
|
||
|
|
||
|
// The table gets initialized with sync.Once but may still cause a race
// with any other use of the crc32 package anywhere. Thus we initialize it
// before.
// Castagnoli (CRC-32C) is the polynomial used for all record checksums.
var castagnoliTable = crc32.MakeTable(crc32.Castagnoli)
|
||
|
|
||
|
// page is an in-memory buffer for one 32KB disk page of the active
// segment. Data is appended at offset alloc; bytes up to flushed have
// already been written to the segment file.
type page struct {
	alloc   int            // bytes appended into buf so far
	flushed int            // bytes of buf already written to disk
	buf     [pageSize]byte // page contents
}
|
||
|
|
||
|
func (p *page) remaining() int {
|
||
|
return pageSize - p.alloc
|
||
|
}
|
||
|
|
||
|
func (p *page) full() bool {
|
||
|
return pageSize-p.alloc < recordHeaderSize
|
||
|
}
|
||
|
|
||
|
// WAL is a write ahead log that stores records in segment files.
// Segments are written to in pages of 32KB, with records possibly split
// across page boundaries.
// Records are never split across segments to allow full segments to be
// safely truncated.
// Segments are terminated by one full zero page to allow tailing readers
// to detect segment boundaries.
type WAL struct {
	dir         string
	logger      log.Logger
	segmentSize int
	mtx         sync.RWMutex
	segment     *os.File // active segment
	donePages   int      // pages written to the segment
	page        *page    // active page
	// stopc receives a done channel; run() closes it on shutdown.
	stopc chan chan struct{}
	// actorc queues background work (e.g. sync/close of finished
	// segments) executed by run().
	actorc chan func()

	fsyncDuration   prometheus.Summary
	pageFlushes     prometheus.Counter
	pageCompletions prometheus.Counter
}
|
||
|
|
||
|
// New returns a new WAL over the given directory.
|
||
|
func New(logger log.Logger, reg prometheus.Registerer, dir string) (*WAL, error) {
|
||
|
return newWAL(logger, reg, dir, defaultSegmentSize)
|
||
|
}
|
||
|
|
||
|
// newWAL creates or reopens a WAL in dir with the given segment size,
// which must be a multiple of pageSize. Writing resumes on the highest
// numbered existing segment, or starts at segment 0 in a fresh directory.
// Metrics are registered with reg if it is non-nil, and the background
// run() goroutine is started before returning.
func newWAL(logger log.Logger, reg prometheus.Registerer, dir string, segmentSize int) (*WAL, error) {
	if segmentSize%pageSize != 0 {
		return nil, errors.New("invalid segment size")
	}
	if err := os.MkdirAll(dir, 0777); err != nil {
		return nil, errors.Wrap(err, "create dir")
	}
	if logger == nil {
		logger = log.NewNopLogger()
	}
	w := &WAL{
		dir:         dir,
		logger:      logger,
		segmentSize: segmentSize,
		page:        &page{},
		actorc:      make(chan func(), 100),
		stopc:       make(chan chan struct{}),
	}
	w.fsyncDuration = prometheus.NewSummary(prometheus.SummaryOpts{
		Name: "prometheus_tsdb_wal_fsync_duration_seconds",
		Help: "Duration of WAL fsync.",
	})
	w.pageFlushes = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_wal_page_flushes_total",
		Help: "Total number of page flushes.",
	})
	w.pageCompletions = prometheus.NewCounter(prometheus.CounterOpts{
		Name: "prometheus_tsdb_wal_completed_pages_total",
		Help: "Total number of completed pages.",
	})
	if reg != nil {
		reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions)
	}

	// Find the last existing segment so writes append to it.
	_, j, err := w.Segments()
	if err != nil {
		return nil, err
	}
	// Fresh dir, no segments yet.
	if j == -1 {
		w.segment, err = os.OpenFile(SegmentName(w.dir, 0), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	} else {
		// NOTE(review): donePages is not restored from the reopened
		// segment's size — confirm reopened segments are expected to be
		// page-aligned / freshly rotated.
		w.segment, err = os.OpenFile(SegmentName(w.dir, j), os.O_WRONLY|os.O_APPEND, 0666)
	}
	if err != nil {
		return nil, err
	}
	go w.run()

	return w, nil
}
|
||
|
|
||
|
// Dir returns the directory of the WAL.
|
||
|
func (w *WAL) Dir() string {
|
||
|
return w.dir
|
||
|
}
|
||
|
|
||
|
// run services the actorc queue of background functions until a stop
// request arrives on stopc. Started by newWAL; terminated by Close.
func (w *WAL) run() {
	for {
		// Processing all pending functions has precedence over shutdown.
		// This non-blocking receive drains one queued function first.
		select {
		case f := <-w.actorc:
			f()
		default:
		}
		// Block until either more work or a shutdown request arrives.
		// NOTE(review): if actorc and stopc are both ready, this select
		// may pick stopc and leave queued functions unexecuted — confirm
		// that is acceptable for shutdown.
		select {
		case f := <-w.actorc:
			f()
		case donec := <-w.stopc:
			// Acknowledge shutdown to the caller waiting in Close.
			close(donec)
			return
		}
	}
}
|
||
|
|
||
|
// SegmentName builds the file path for segment i inside dir. Segment
// files are named by their zero-padded, six-digit sequence number.
func SegmentName(dir string, i int) string {
	name := fmt.Sprintf("%06d", i)
	return filepath.Join(dir, name)
}
|
||
|
|
||
|
// nextSegment creates the next segment and closes the previous one.
// The active page is flushed and zero-padded first, then the new segment
// file (current number + 1) becomes the write target. Syncing and closing
// the previous segment is delegated to the run() goroutine so writes do
// not block on it.
func (w *WAL) nextSegment() error {
	// Terminate the active page so tailing readers can detect the
	// segment boundary.
	if err := w.flushPage(true); err != nil {
		return err
	}
	// The segment's file name is its sequence number.
	k, err := strconv.Atoi(filepath.Base(w.segment.Name()))
	if err != nil {
		return errors.Errorf("current segment %q not numerical", w.segment.Name())
	}
	// TODO(fabxc): write initialization page with meta info?
	next, err := os.OpenFile(SegmentName(w.dir, k+1), os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0666)
	if err != nil {
		return errors.Wrap(err, "create new segment file")
	}
	prev := w.segment
	w.segment = next
	w.donePages = 0

	// Don't block further writes by handling the last segment.
	// TODO(fabxc): write a termination page as a marker to detect torn segments?
	w.actorc <- func() {
		if err := w.fsync(prev); err != nil {
			level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
		}
		if err := prev.Close(); err != nil {
			level.Error(w.logger).Log("msg", "close previous segment", "err", err)
		}
	}
	return nil
}
|
||
|
|
||
|
// flushPage writes the new contents of the page to disk. If no more records will fit into
// the page, the remaining bytes will be set to zero and a new page will be started.
// If clear is true, this is enforced regardless of how many bytes are left in the page.
func (w *WAL) flushPage(clear bool) error {
	w.pageFlushes.Inc()

	p := w.page
	// A page too full to hold another record header must be completed
	// even if the caller did not request it.
	clear = clear || p.full()

	// No more data will fit into the page. Enqueue and clear it.
	if clear {
		p.alloc = pageSize // write till end of page
		w.pageCompletions.Inc()
	}
	// Write only the not-yet-flushed range; a page may be flushed
	// incrementally across several calls.
	n, err := w.segment.Write(p.buf[p.flushed:p.alloc])
	if err != nil {
		return err
	}
	p.flushed += n

	// The page is complete: zero the buffer for reuse and account for it.
	if clear {
		for i := range p.buf {
			p.buf[i] = 0
		}
		p.alloc = 0
		p.flushed = 0
		w.donePages++
	}
	return nil
}
|
||
|
|
||
|
// recType describes the role of a record fragment within a page.
type recType uint8
|
||
|
|
||
|
// Record fragment types as encoded in the first byte of each fragment
// header on disk.
const (
	recPageTerm recType = 0 // rest of page is empty
	recFull     recType = 1 // full record
	recFirst    recType = 2 // first fragment of a record
	recMiddle   recType = 3 // middle fragments of a record
	recLast     recType = 4 // final fragment of a record
)
|
||
|
|
||
|
func (t recType) String() string {
|
||
|
switch t {
|
||
|
case recPageTerm:
|
||
|
return "zero"
|
||
|
case recFull:
|
||
|
return "full"
|
||
|
case recFirst:
|
||
|
return "first"
|
||
|
case recMiddle:
|
||
|
return "middle"
|
||
|
case recLast:
|
||
|
return "last"
|
||
|
default:
|
||
|
return "<invalid>"
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (w *WAL) pagesPerSegment() int {
|
||
|
return w.segmentSize / pageSize
|
||
|
}
|
||
|
|
||
|
// Log writes the records into the log.
|
||
|
// Multiple records can be passed at once to reduce writes and increase throughput.
|
||
|
func (w *WAL) Log(recs ...[]byte) error {
|
||
|
// Callers could just implement their own list record format but adding
|
||
|
// a bit of extra logic here frees them from that overhead.
|
||
|
for i, r := range recs {
|
||
|
if err := w.log(r, i == len(recs)-1); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// log writes rec to the log and forces a flush of the current page if its
// the final record of a batch.
func (w *WAL) log(rec []byte, final bool) error {
	w.mtx.Lock()
	defer w.mtx.Unlock()

	// If the record is too big to fit within pages in the current
	// segment, terminate the active segment and advance to the next one.
	// This ensures that records do not cross segment boundaries.
	left := w.page.remaining() - recordHeaderSize                                   // active page
	left += (pageSize - recordHeaderSize) * (w.pagesPerSegment() - w.donePages - 1) // free pages

	if len(rec) > left {
		if err := w.nextSegment(); err != nil {
			return err
		}
	}

	// Populate as many pages as necessary to fit the record.
	// Be careful to always do one pass to ensure we write zero-length records.
	for i := 0; i == 0 || len(rec) > 0; i++ {
		p := w.page

		// Find how much of the record we can fit into the page.
		var (
			l    = min(len(rec), (pageSize-p.alloc)-recordHeaderSize)
			part = rec[:l]
			buf  = p.buf[p.alloc:]
			typ  recType
		)

		// Classify this fragment by its position within the record.
		switch {
		case i == 0 && len(part) == len(rec):
			typ = recFull
		case len(part) == len(rec):
			typ = recLast
		case i == 0:
			typ = recFirst
		default:
			typ = recMiddle
		}

		// Fragment header: 1 byte type, 2 bytes payload length (big
		// endian), 4 bytes CRC32 (Castagnoli) of the payload.
		buf[0] = byte(typ)
		crc := crc32.Checksum(part, castagnoliTable)
		binary.BigEndian.PutUint16(buf[1:], uint16(len(part)))
		binary.BigEndian.PutUint32(buf[3:], crc)

		copy(buf[7:], part)
		p.alloc += len(part) + recordHeaderSize

		// If we wrote a full record, we can fit more records of the batch
		// into the page before flushing it.
		if final || typ != recFull || w.page.full() {
			if err := w.flushPage(false); err != nil {
				return err
			}
		}
		rec = rec[l:]
	}
	return nil
}
|
||
|
|
||
|
// Segments returns the range [m, n] of currently existing segments.
|
||
|
// If no segments are found, m and n are -1.
|
||
|
func (w *WAL) Segments() (m, n int, err error) {
|
||
|
refs, err := listSegments(w.dir)
|
||
|
if err != nil {
|
||
|
return 0, 0, err
|
||
|
}
|
||
|
if len(refs) == 0 {
|
||
|
return -1, -1, nil
|
||
|
}
|
||
|
return refs[0].n, refs[len(refs)-1].n, nil
|
||
|
}
|
||
|
|
||
|
// Truncate drops all segments before i.
|
||
|
func (w *WAL) Truncate(i int) error {
|
||
|
refs, err := listSegments(w.dir)
|
||
|
if err != nil {
|
||
|
return err
|
||
|
}
|
||
|
for _, r := range refs {
|
||
|
if r.n >= i {
|
||
|
break
|
||
|
}
|
||
|
if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (w *WAL) fsync(f *os.File) error {
|
||
|
start := time.Now()
|
||
|
err := fileutil.Fsync(f)
|
||
|
w.fsyncDuration.Observe(time.Since(start).Seconds())
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
// Close flushes all writes and closes active segment.
|
||
|
func (w *WAL) Close() (err error) {
|
||
|
w.mtx.Lock()
|
||
|
defer w.mtx.Unlock()
|
||
|
|
||
|
// Flush the last page and zero out all its remaining size.
|
||
|
// We must not flush an empty page as it would falsely signal
|
||
|
// the segment is done if we start writing to it again after opening.
|
||
|
if w.page.alloc > 0 {
|
||
|
if err := w.flushPage(true); err != nil {
|
||
|
return err
|
||
|
}
|
||
|
}
|
||
|
|
||
|
donec := make(chan struct{})
|
||
|
w.stopc <- donec
|
||
|
<-donec
|
||
|
|
||
|
if err = w.fsync(w.segment); err != nil {
|
||
|
level.Error(w.logger).Log("msg", "sync previous segment", "err", err)
|
||
|
}
|
||
|
if err := w.segment.Close(); err != nil {
|
||
|
level.Error(w.logger).Log("msg", "close previous segment", "err", err)
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// segmentRef pairs a segment file name with its parsed sequence number.
type segmentRef struct {
	s string // file name within the WAL directory
	n int    // sequence number parsed from the name
}
|
||
|
|
||
|
func listSegments(dir string) (refs []segmentRef, err error) {
|
||
|
files, err := fileutil.ReadDir(dir)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
var last int
|
||
|
for _, fn := range files {
|
||
|
k, err := strconv.Atoi(fn)
|
||
|
if err != nil {
|
||
|
continue
|
||
|
}
|
||
|
if len(refs) > 0 && k > last+1 {
|
||
|
return nil, errors.New("segments are not sequential")
|
||
|
}
|
||
|
refs = append(refs, segmentRef{s: fn, n: k})
|
||
|
last = k
|
||
|
}
|
||
|
return refs, nil
|
||
|
}
|
||
|
|
||
|
// multiReadCloser chains reads over several segment files and closes all
// underlying files when Close is called.
type multiReadCloser struct {
	io.Reader            // concatenation of the segment file contents
	files     []*os.File // open files backing the reader
}
|
||
|
|
||
|
// NewSegmentsReader returns a new reader over all segments in the directory.
|
||
|
func NewSegmentsReader(dir string) (io.ReadCloser, error) {
|
||
|
refs, err := listSegments(dir)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
var rdrs []io.Reader
|
||
|
var files []*os.File
|
||
|
|
||
|
for _, r := range refs {
|
||
|
f, err := os.Open(filepath.Join(dir, r.s))
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
rdrs = append(rdrs, f)
|
||
|
files = append(files, f)
|
||
|
}
|
||
|
return &multiReadCloser{
|
||
|
Reader: io.MultiReader(rdrs...),
|
||
|
files: files,
|
||
|
}, nil
|
||
|
}
|
||
|
|
||
|
// NewSegmentsRangeReader returns a new reader over the given WAL segment range.
|
||
|
func NewSegmentsRangeReader(dir string, m, n int) (io.ReadCloser, error) {
|
||
|
refs, err := listSegments(dir)
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
var rdrs []io.Reader
|
||
|
var files []*os.File
|
||
|
|
||
|
for _, r := range refs {
|
||
|
if r.n < m {
|
||
|
continue
|
||
|
}
|
||
|
if r.n > n {
|
||
|
break
|
||
|
}
|
||
|
f, err := os.Open(filepath.Join(dir, r.s))
|
||
|
if err != nil {
|
||
|
return nil, err
|
||
|
}
|
||
|
rdrs = append(rdrs, f)
|
||
|
files = append(files, f)
|
||
|
}
|
||
|
return &multiReadCloser{
|
||
|
Reader: io.MultiReader(rdrs...),
|
||
|
files: files,
|
||
|
}, nil
|
||
|
}
|
||
|
|
||
|
func (r *multiReadCloser) Close() (err error) {
|
||
|
for _, s := range r.files {
|
||
|
if e := s.Close(); e != nil {
|
||
|
err = e
|
||
|
}
|
||
|
}
|
||
|
return err
|
||
|
}
|
||
|
|
||
|
// Reader reads WAL records from an io.Reader.
type Reader struct {
	rdr   *bufio.Reader
	err   error  // last error encountered by Next (nil on clean EOF)
	rec   []byte // current, fully reassembled record
	total int    // total bytes processed.
}
|
||
|
|
||
|
// NewReader returns a new reader.
|
||
|
func NewReader(r io.Reader) *Reader {
|
||
|
return &Reader{rdr: bufio.NewReader(r)}
|
||
|
}
|
||
|
|
||
|
// Next advances the reader to the next records and returns true if it exists.
|
||
|
// It must not be called once after it returned false.
|
||
|
func (r *Reader) Next() bool {
|
||
|
err := r.next()
|
||
|
if err == io.EOF {
|
||
|
return false
|
||
|
}
|
||
|
r.err = err
|
||
|
return r.err == nil
|
||
|
}
|
||
|
|
||
|
// next reads the next record into r.rec, reassembling it from on-disk
// fragments and skipping zero-padded page remainders. It returns io.EOF
// at a clean end of input.
func (r *Reader) next() (err error) {
	// hdr holds a fragment header: 1 byte type, 2 bytes length, 4 bytes CRC.
	var hdr [recordHeaderSize]byte
	var buf [pageSize]byte
	r.rec = r.rec[:0]

	// i counts content fragments consumed for the current record; it is
	// used to validate the first/middle/last sequence below.
	i := 0
	for {
		hdr[0], err = r.rdr.ReadByte()
		if err != nil {
			return err
		}
		r.total++
		typ := recType(hdr[0])

		// Gobble up zero bytes.
		if typ == recPageTerm {
			// We are pedantic and check whether the zeros are actually up
			// to a page boundary.
			// It's not strictly necessary but may catch sketchy state early.
			k := pageSize - (r.total % pageSize)
			if k == pageSize {
				continue // initial 0 byte was last page byte
			}
			n, err := io.ReadFull(r.rdr, buf[:k])
			if err != nil {
				return err
			}
			r.total += n

			for _, c := range buf[:k] {
				if c != 0 {
					return errors.New("unexpected non-zero byte in padded page")
				}
			}
			continue
		}
		// Read the remaining 6 header bytes (length and CRC).
		n, err := io.ReadFull(r.rdr, hdr[1:])
		if err != nil {
			return err
		}
		r.total += n

		var (
			length = binary.BigEndian.Uint16(hdr[1:])
			crc    = binary.BigEndian.Uint32(hdr[3:])
		)

		if length > pageSize {
			return errors.Errorf("invalid record size %d", length)
		}
		n, err = io.ReadFull(r.rdr, buf[:length])
		if err != nil {
			return err
		}
		r.total += n

		if n != int(length) {
			return errors.Errorf("invalid size: expected %d, got %d", length, n)
		}
		// Verify payload integrity before appending to the record.
		if c := crc32.Checksum(buf[:length], castagnoliTable); c != crc {
			return errors.Errorf("unexpected checksum %x, expected %x", c, crc)
		}
		r.rec = append(r.rec, buf[:length]...)

		// Validate the fragment sequence; return once the record is complete.
		switch typ {
		case recFull:
			if i != 0 {
				return errors.New("unexpected full record")
			}
			return nil
		case recFirst:
			if i != 0 {
				return errors.New("unexpected first record")
			}
		case recMiddle:
			if i == 0 {
				return errors.New("unexpected middle record")
			}
		case recLast:
			if i == 0 {
				return errors.New("unexpected last record")
			}
			return nil
		default:
			return errors.Errorf("unexpected record type %d", typ)
		}
		// Only increment i for non-zero records since we use it
		// to determine valid content record sequences.
		i++
	}
}
|
||
|
|
||
|
// Err returns the last encountered error.
|
||
|
func (r *Reader) Err() error {
|
||
|
return r.err
|
||
|
}
|
||
|
|
||
|
// Record returns the current record. The returned byte slice is only
|
||
|
// valid until the next call to Next.
|
||
|
func (r *Reader) Record() []byte {
|
||
|
return r.rec
|
||
|
}
|
||
|
|
||
|
// min returns the smaller of two integers.
func min(i, j int) int {
	if j < i {
		return j
	}
	return i
}
|