mirror of
https://github.com/prometheus/prometheus
synced 2025-01-12 01:29:43 +00:00
b5f6f8fb36
Signed-off-by: Levi Harrison <git@leviharrison.dev>
632 lines
18 KiB
Go
632 lines
18 KiB
Go
// Copyright 2018 The Prometheus Authors
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
// you may not use this file except in compliance with the License.
|
|
// You may obtain a copy of the License at
|
|
//
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
//
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
// See the License for the specific language governing permissions and
|
|
// limitations under the License.
|
|
|
|
package wal
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"io/ioutil"
|
|
"math"
|
|
"os"
|
|
"path"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/go-kit/log"
|
|
"github.com/go-kit/log/level"
|
|
"github.com/pkg/errors"
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
|
|
"github.com/prometheus/prometheus/pkg/timestamp"
|
|
"github.com/prometheus/prometheus/tsdb/record"
|
|
)
|
|
|
|
const (
|
|
readPeriod = 10 * time.Millisecond
|
|
checkpointPeriod = 5 * time.Second
|
|
segmentCheckPeriod = 100 * time.Millisecond
|
|
consumer = "consumer"
|
|
)
|
|
|
|
// WriteTo is an interface used by the Watcher to send the samples it's read
|
|
// from the WAL on to somewhere else. Functions will be called concurrently
|
|
// and it is left to the implementer to make sure they are safe.
|
|
type WriteTo interface {
|
|
Append([]record.RefSample) bool
|
|
AppendExemplars([]record.RefExemplar) bool
|
|
StoreSeries([]record.RefSeries, int)
|
|
// SeriesReset is called after reading a checkpoint to allow the deletion
|
|
// of all series created in a segment lower than the argument.
|
|
SeriesReset(int)
|
|
}
|
|
|
|
type WatcherMetrics struct {
|
|
recordsRead *prometheus.CounterVec
|
|
recordDecodeFails *prometheus.CounterVec
|
|
samplesSentPreTailing *prometheus.CounterVec
|
|
currentSegment *prometheus.GaugeVec
|
|
}
|
|
|
|
// Watcher watches the TSDB WAL for a given WriteTo.
|
|
type Watcher struct {
|
|
name string
|
|
writer WriteTo
|
|
logger log.Logger
|
|
walDir string
|
|
lastCheckpoint string
|
|
sendExemplars bool
|
|
metrics *WatcherMetrics
|
|
readerMetrics *LiveReaderMetrics
|
|
|
|
startTime time.Time
|
|
startTimestamp int64 // the start time as a Prometheus timestamp
|
|
sendSamples bool
|
|
|
|
recordsReadMetric *prometheus.CounterVec
|
|
recordDecodeFailsMetric prometheus.Counter
|
|
samplesSentPreTailing prometheus.Counter
|
|
currentSegmentMetric prometheus.Gauge
|
|
|
|
quit chan struct{}
|
|
done chan struct{}
|
|
|
|
// For testing, stop when we hit this segment.
|
|
MaxSegment int
|
|
}
|
|
|
|
func NewWatcherMetrics(reg prometheus.Registerer) *WatcherMetrics {
|
|
m := &WatcherMetrics{
|
|
recordsRead: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "prometheus",
|
|
Subsystem: "wal_watcher",
|
|
Name: "records_read_total",
|
|
Help: "Number of records read by the WAL watcher from the WAL.",
|
|
},
|
|
[]string{consumer, "type"},
|
|
),
|
|
recordDecodeFails: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "prometheus",
|
|
Subsystem: "wal_watcher",
|
|
Name: "record_decode_failures_total",
|
|
Help: "Number of records read by the WAL watcher that resulted in an error when decoding.",
|
|
},
|
|
[]string{consumer},
|
|
),
|
|
samplesSentPreTailing: prometheus.NewCounterVec(
|
|
prometheus.CounterOpts{
|
|
Namespace: "prometheus",
|
|
Subsystem: "wal_watcher",
|
|
Name: "samples_sent_pre_tailing_total",
|
|
Help: "Number of sample records read by the WAL watcher and sent to remote write during replay of existing WAL.",
|
|
},
|
|
[]string{consumer},
|
|
),
|
|
currentSegment: prometheus.NewGaugeVec(
|
|
prometheus.GaugeOpts{
|
|
Namespace: "prometheus",
|
|
Subsystem: "wal_watcher",
|
|
Name: "current_segment",
|
|
Help: "Current segment the WAL watcher is reading records from.",
|
|
},
|
|
[]string{consumer},
|
|
),
|
|
}
|
|
|
|
if reg != nil {
|
|
reg.MustRegister(m.recordsRead)
|
|
reg.MustRegister(m.recordDecodeFails)
|
|
reg.MustRegister(m.samplesSentPreTailing)
|
|
reg.MustRegister(m.currentSegment)
|
|
}
|
|
|
|
return m
|
|
}
|
|
|
|
// NewWatcher creates a new WAL watcher for a given WriteTo.
|
|
func NewWatcher(metrics *WatcherMetrics, readerMetrics *LiveReaderMetrics, logger log.Logger, name string, writer WriteTo, walDir string, sendExemplars bool) *Watcher {
|
|
if logger == nil {
|
|
logger = log.NewNopLogger()
|
|
}
|
|
return &Watcher{
|
|
logger: logger,
|
|
writer: writer,
|
|
metrics: metrics,
|
|
readerMetrics: readerMetrics,
|
|
walDir: path.Join(walDir, "wal"),
|
|
name: name,
|
|
sendExemplars: sendExemplars,
|
|
|
|
quit: make(chan struct{}),
|
|
done: make(chan struct{}),
|
|
|
|
MaxSegment: -1,
|
|
}
|
|
}
|
|
|
|
func (w *Watcher) setMetrics() {
|
|
// Setup the WAL Watchers metrics. We do this here rather than in the
|
|
// constructor because of the ordering of creating Queue Managers's,
|
|
// stopping them, and then starting new ones in storage/remote/storage.go ApplyConfig.
|
|
if w.metrics != nil {
|
|
w.recordsReadMetric = w.metrics.recordsRead.MustCurryWith(prometheus.Labels{consumer: w.name})
|
|
w.recordDecodeFailsMetric = w.metrics.recordDecodeFails.WithLabelValues(w.name)
|
|
w.samplesSentPreTailing = w.metrics.samplesSentPreTailing.WithLabelValues(w.name)
|
|
w.currentSegmentMetric = w.metrics.currentSegment.WithLabelValues(w.name)
|
|
}
|
|
}
|
|
|
|
// Start the Watcher.
|
|
func (w *Watcher) Start() {
|
|
w.setMetrics()
|
|
level.Info(w.logger).Log("msg", "Starting WAL watcher", "queue", w.name)
|
|
|
|
go w.loop()
|
|
}
|
|
|
|
// Stop the Watcher.
|
|
func (w *Watcher) Stop() {
|
|
close(w.quit)
|
|
<-w.done
|
|
|
|
// Records read metric has series and samples.
|
|
if w.metrics != nil {
|
|
w.metrics.recordsRead.DeleteLabelValues(w.name, "series")
|
|
w.metrics.recordsRead.DeleteLabelValues(w.name, "samples")
|
|
w.metrics.recordDecodeFails.DeleteLabelValues(w.name)
|
|
w.metrics.samplesSentPreTailing.DeleteLabelValues(w.name)
|
|
w.metrics.currentSegment.DeleteLabelValues(w.name)
|
|
}
|
|
|
|
level.Info(w.logger).Log("msg", "WAL watcher stopped", "queue", w.name)
|
|
}
|
|
|
|
func (w *Watcher) loop() {
|
|
defer close(w.done)
|
|
|
|
// We may encounter failures processing the WAL; we should wait and retry.
|
|
for !isClosed(w.quit) {
|
|
w.SetStartTime(time.Now())
|
|
if err := w.Run(); err != nil {
|
|
level.Error(w.logger).Log("msg", "error tailing WAL", "err", err)
|
|
}
|
|
|
|
select {
|
|
case <-w.quit:
|
|
return
|
|
case <-time.After(5 * time.Second):
|
|
}
|
|
}
|
|
}
|
|
|
|
// Run the watcher, which will tail the WAL until the quit channel is closed
|
|
// or an error case is hit.
|
|
func (w *Watcher) Run() error {
|
|
_, lastSegment, err := w.firstAndLast()
|
|
if err != nil {
|
|
return errors.Wrap(err, "wal.Segments")
|
|
}
|
|
|
|
// We want to ensure this is false across iterations since
|
|
// Run will be called again if there was a failure to read the WAL.
|
|
w.sendSamples = false
|
|
|
|
level.Info(w.logger).Log("msg", "Replaying WAL", "queue", w.name)
|
|
|
|
// Backfill from the checkpoint first if it exists.
|
|
lastCheckpoint, checkpointIndex, err := LastCheckpoint(w.walDir)
|
|
if err != nil && err != record.ErrNotFound {
|
|
return errors.Wrap(err, "tsdb.LastCheckpoint")
|
|
}
|
|
|
|
if err == nil {
|
|
if err = w.readCheckpoint(lastCheckpoint); err != nil {
|
|
return errors.Wrap(err, "readCheckpoint")
|
|
}
|
|
}
|
|
w.lastCheckpoint = lastCheckpoint
|
|
|
|
currentSegment, err := w.findSegmentForIndex(checkpointIndex)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
level.Debug(w.logger).Log("msg", "Tailing WAL", "lastCheckpoint", lastCheckpoint, "checkpointIndex", checkpointIndex, "currentSegment", currentSegment, "lastSegment", lastSegment)
|
|
for !isClosed(w.quit) {
|
|
w.currentSegmentMetric.Set(float64(currentSegment))
|
|
level.Debug(w.logger).Log("msg", "Processing segment", "currentSegment", currentSegment)
|
|
|
|
// On start, after reading the existing WAL for series records, we have a pointer to what is the latest segment.
|
|
// On subsequent calls to this function, currentSegment will have been incremented and we should open that segment.
|
|
if err := w.watch(currentSegment, currentSegment >= lastSegment); err != nil {
|
|
return err
|
|
}
|
|
|
|
// For testing: stop when you hit a specific segment.
|
|
if currentSegment == w.MaxSegment {
|
|
return nil
|
|
}
|
|
|
|
currentSegment++
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// findSegmentForIndex finds the first segment greater than or equal to index.
|
|
func (w *Watcher) findSegmentForIndex(index int) (int, error) {
|
|
refs, err := w.segments(w.walDir)
|
|
if err != nil {
|
|
return -1, err
|
|
}
|
|
|
|
for _, r := range refs {
|
|
if r >= index {
|
|
return r, nil
|
|
}
|
|
}
|
|
|
|
return -1, errors.New("failed to find segment for index")
|
|
}
|
|
|
|
func (w *Watcher) firstAndLast() (int, int, error) {
|
|
refs, err := w.segments(w.walDir)
|
|
if err != nil {
|
|
return -1, -1, err
|
|
}
|
|
|
|
if len(refs) == 0 {
|
|
return -1, -1, nil
|
|
}
|
|
return refs[0], refs[len(refs)-1], nil
|
|
}
|
|
|
|
// Copied from tsdb/wal/wal.go so we do not have to open a WAL.
|
|
// Plan is to move WAL watcher to TSDB and dedupe these implementations.
|
|
func (w *Watcher) segments(dir string) ([]int, error) {
|
|
files, err := ioutil.ReadDir(dir)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var refs []int
|
|
for _, f := range files {
|
|
k, err := strconv.Atoi(f.Name())
|
|
if err != nil {
|
|
continue
|
|
}
|
|
refs = append(refs, k)
|
|
}
|
|
sort.Ints(refs)
|
|
for i := 0; i < len(refs)-1; i++ {
|
|
if refs[i]+1 != refs[i+1] {
|
|
return nil, errors.New("segments are not sequential")
|
|
}
|
|
}
|
|
return refs, nil
|
|
}
|
|
|
|
// Use tail true to indicate that the reader is currently on a segment that is
|
|
// actively being written to. If false, assume it's a full segment and we're
|
|
// replaying it on start to cache the series records.
|
|
func (w *Watcher) watch(segmentNum int, tail bool) error {
|
|
segment, err := OpenReadSegment(SegmentName(w.walDir, segmentNum))
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer segment.Close()
|
|
|
|
reader := NewLiveReader(w.logger, w.readerMetrics, segment)
|
|
|
|
readTicker := time.NewTicker(readPeriod)
|
|
defer readTicker.Stop()
|
|
|
|
checkpointTicker := time.NewTicker(checkpointPeriod)
|
|
defer checkpointTicker.Stop()
|
|
|
|
segmentTicker := time.NewTicker(segmentCheckPeriod)
|
|
defer segmentTicker.Stop()
|
|
|
|
// If we're replaying the segment we need to know the size of the file to know
|
|
// when to return from watch and move on to the next segment.
|
|
size := int64(math.MaxInt64)
|
|
if !tail {
|
|
segmentTicker.Stop()
|
|
checkpointTicker.Stop()
|
|
var err error
|
|
size, err = getSegmentSize(w.walDir, segmentNum)
|
|
if err != nil {
|
|
return errors.Wrap(err, "getSegmentSize")
|
|
}
|
|
}
|
|
|
|
gcSem := make(chan struct{}, 1)
|
|
for {
|
|
select {
|
|
case <-w.quit:
|
|
return nil
|
|
|
|
case <-checkpointTicker.C:
|
|
// Periodically check if there is a new checkpoint so we can garbage
|
|
// collect labels. As this is considered an optimisation, we ignore
|
|
// errors during checkpoint processing. Doing the process asynchronously
|
|
// allows the current WAL segment to be processed while reading the
|
|
// checkpoint.
|
|
select {
|
|
case gcSem <- struct{}{}:
|
|
go func() {
|
|
defer func() {
|
|
<-gcSem
|
|
}()
|
|
if err := w.garbageCollectSeries(segmentNum); err != nil {
|
|
level.Warn(w.logger).Log("msg", "Error process checkpoint", "err", err)
|
|
}
|
|
}()
|
|
default:
|
|
// Currently doing a garbage collect, try again later.
|
|
}
|
|
|
|
case <-segmentTicker.C:
|
|
_, last, err := w.firstAndLast()
|
|
if err != nil {
|
|
return errors.Wrap(err, "segments")
|
|
}
|
|
|
|
// Check if new segments exists.
|
|
if last <= segmentNum {
|
|
continue
|
|
}
|
|
|
|
err = w.readSegment(reader, segmentNum, tail)
|
|
|
|
// Ignore errors reading to end of segment whilst replaying the WAL.
|
|
if !tail {
|
|
if err != nil && errors.Cause(err) != io.EOF {
|
|
level.Warn(w.logger).Log("msg", "Ignoring error reading to end of segment, may have dropped data", "err", err)
|
|
} else if reader.Offset() != size {
|
|
level.Warn(w.logger).Log("msg", "Expected to have read whole segment, may have dropped data", "segment", segmentNum, "read", reader.Offset(), "size", size)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Otherwise, when we are tailing, non-EOFs are fatal.
|
|
if errors.Cause(err) != io.EOF {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
|
|
case <-readTicker.C:
|
|
err = w.readSegment(reader, segmentNum, tail)
|
|
|
|
// Ignore all errors reading to end of segment whilst replaying the WAL.
|
|
if !tail {
|
|
if err != nil && errors.Cause(err) != io.EOF {
|
|
level.Warn(w.logger).Log("msg", "Ignoring error reading to end of segment, may have dropped data", "segment", segmentNum, "err", err)
|
|
} else if reader.Offset() != size {
|
|
level.Warn(w.logger).Log("msg", "Expected to have read whole segment, may have dropped data", "segment", segmentNum, "read", reader.Offset(), "size", size)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Otherwise, when we are tailing, non-EOFs are fatal.
|
|
if errors.Cause(err) != io.EOF {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
func (w *Watcher) garbageCollectSeries(segmentNum int) error {
|
|
dir, _, err := LastCheckpoint(w.walDir)
|
|
if err != nil && err != record.ErrNotFound {
|
|
return errors.Wrap(err, "tsdb.LastCheckpoint")
|
|
}
|
|
|
|
if dir == "" || dir == w.lastCheckpoint {
|
|
return nil
|
|
}
|
|
w.lastCheckpoint = dir
|
|
|
|
index, err := checkpointNum(dir)
|
|
if err != nil {
|
|
return errors.Wrap(err, "error parsing checkpoint filename")
|
|
}
|
|
|
|
if index >= segmentNum {
|
|
level.Debug(w.logger).Log("msg", "Current segment is behind the checkpoint, skipping reading of checkpoint", "current", fmt.Sprintf("%08d", segmentNum), "checkpoint", dir)
|
|
return nil
|
|
}
|
|
|
|
level.Debug(w.logger).Log("msg", "New checkpoint detected", "new", dir, "currentSegment", segmentNum)
|
|
|
|
if err = w.readCheckpoint(dir); err != nil {
|
|
return errors.Wrap(err, "readCheckpoint")
|
|
}
|
|
|
|
// Clear series with a checkpoint or segment index # lower than the checkpoint we just read.
|
|
w.writer.SeriesReset(index)
|
|
return nil
|
|
}
|
|
|
|
func (w *Watcher) readSegment(r *LiveReader, segmentNum int, tail bool) error {
|
|
var (
|
|
dec record.Decoder
|
|
series []record.RefSeries
|
|
samples []record.RefSample
|
|
send []record.RefSample
|
|
exemplars []record.RefExemplar
|
|
)
|
|
for r.Next() && !isClosed(w.quit) {
|
|
rec := r.Record()
|
|
w.recordsReadMetric.WithLabelValues(recordType(dec.Type(rec))).Inc()
|
|
|
|
switch dec.Type(rec) {
|
|
case record.Series:
|
|
series, err := dec.Series(rec, series[:0])
|
|
if err != nil {
|
|
w.recordDecodeFailsMetric.Inc()
|
|
return err
|
|
}
|
|
w.writer.StoreSeries(series, segmentNum)
|
|
|
|
case record.Samples:
|
|
// If we're not tailing a segment we can ignore any samples records we see.
|
|
// This speeds up replay of the WAL by > 10x.
|
|
if !tail {
|
|
break
|
|
}
|
|
samples, err := dec.Samples(rec, samples[:0])
|
|
if err != nil {
|
|
w.recordDecodeFailsMetric.Inc()
|
|
return err
|
|
}
|
|
for _, s := range samples {
|
|
if s.T > w.startTimestamp {
|
|
if !w.sendSamples {
|
|
w.sendSamples = true
|
|
duration := time.Since(w.startTime)
|
|
level.Info(w.logger).Log("msg", "Done replaying WAL", "duration", duration)
|
|
}
|
|
send = append(send, s)
|
|
}
|
|
}
|
|
if len(send) > 0 {
|
|
// Blocks until the sample is sent to all remote write endpoints or closed (because enqueue blocks).
|
|
w.writer.Append(send)
|
|
send = send[:0]
|
|
}
|
|
|
|
case record.Exemplars:
|
|
// Skip if experimental "exemplars over remote write" is not enabled.
|
|
if !w.sendExemplars {
|
|
break
|
|
}
|
|
// If we're not tailing a segment we can ignore any exemplars records we see.
|
|
// This speeds up replay of the WAL significantly.
|
|
if !tail {
|
|
break
|
|
}
|
|
exemplars, err := dec.Exemplars(rec, exemplars[:0])
|
|
if err != nil {
|
|
w.recordDecodeFailsMetric.Inc()
|
|
return err
|
|
}
|
|
w.writer.AppendExemplars(exemplars)
|
|
|
|
case record.Tombstones:
|
|
|
|
default:
|
|
// Could be corruption, or reading from a WAL from a newer Prometheus.
|
|
w.recordDecodeFailsMetric.Inc()
|
|
}
|
|
}
|
|
return errors.Wrapf(r.Err(), "segment %d: %v", segmentNum, r.Err())
|
|
}
|
|
|
|
func (w *Watcher) SetStartTime(t time.Time) {
|
|
w.startTime = t
|
|
w.startTimestamp = timestamp.FromTime(t)
|
|
}
|
|
|
|
func recordType(rt record.Type) string {
|
|
switch rt {
|
|
case record.Series:
|
|
return "series"
|
|
case record.Samples:
|
|
return "samples"
|
|
case record.Tombstones:
|
|
return "tombstones"
|
|
default:
|
|
return "unknown"
|
|
}
|
|
}
|
|
|
|
// Read all the series records from a Checkpoint directory.
|
|
func (w *Watcher) readCheckpoint(checkpointDir string) error {
|
|
level.Debug(w.logger).Log("msg", "Reading checkpoint", "dir", checkpointDir)
|
|
index, err := checkpointNum(checkpointDir)
|
|
if err != nil {
|
|
return errors.Wrap(err, "checkpointNum")
|
|
}
|
|
|
|
// Ensure we read the whole contents of every segment in the checkpoint dir.
|
|
segs, err := w.segments(checkpointDir)
|
|
if err != nil {
|
|
return errors.Wrap(err, "Unable to get segments checkpoint dir")
|
|
}
|
|
for _, seg := range segs {
|
|
size, err := getSegmentSize(checkpointDir, seg)
|
|
if err != nil {
|
|
return errors.Wrap(err, "getSegmentSize")
|
|
}
|
|
|
|
sr, err := OpenReadSegment(SegmentName(checkpointDir, seg))
|
|
if err != nil {
|
|
return errors.Wrap(err, "unable to open segment")
|
|
}
|
|
defer sr.Close()
|
|
|
|
r := NewLiveReader(w.logger, w.readerMetrics, sr)
|
|
if err := w.readSegment(r, index, false); errors.Cause(err) != io.EOF && err != nil {
|
|
return errors.Wrap(err, "readSegment")
|
|
}
|
|
|
|
if r.Offset() != size {
|
|
return fmt.Errorf("readCheckpoint wasn't able to read all data from the checkpoint %s/%08d, size: %d, totalRead: %d", checkpointDir, seg, size, r.Offset())
|
|
}
|
|
}
|
|
|
|
level.Debug(w.logger).Log("msg", "Read series references from checkpoint", "checkpoint", checkpointDir)
|
|
return nil
|
|
}
|
|
|
|
func checkpointNum(dir string) (int, error) {
|
|
// Checkpoint dir names are in the format checkpoint.000001
|
|
// dir may contain a hidden directory, so only check the base directory
|
|
chunks := strings.Split(path.Base(dir), ".")
|
|
if len(chunks) != 2 {
|
|
return 0, errors.Errorf("invalid checkpoint dir string: %s", dir)
|
|
}
|
|
|
|
result, err := strconv.Atoi(chunks[1])
|
|
if err != nil {
|
|
return 0, errors.Errorf("invalid checkpoint dir string: %s", dir)
|
|
}
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// Get size of segment.
|
|
func getSegmentSize(dir string, index int) (int64, error) {
|
|
i := int64(-1)
|
|
fi, err := os.Stat(SegmentName(dir, index))
|
|
if err == nil {
|
|
i = fi.Size()
|
|
}
|
|
return i, err
|
|
}
|
|
|
|
func isClosed(c chan struct{}) bool {
|
|
select {
|
|
case <-c:
|
|
return true
|
|
default:
|
|
return false
|
|
}
|
|
}
|