diff --git a/checkpoint.go b/checkpoint.go index f45f3791f..b8de5d14a 100644 --- a/checkpoint.go +++ b/checkpoint.go @@ -23,8 +23,6 @@ import ( "strconv" "strings" - "github.com/go-kit/kit/log" - "github.com/go-kit/kit/log/level" "github.com/pkg/errors" "github.com/prometheus/tsdb/fileutil" "github.com/prometheus/tsdb/wal" @@ -100,12 +98,7 @@ const checkpointPrefix = "checkpoint." // segmented format as the original WAL itself. // This makes it easy to read it through the WAL package and concatenate // it with the original WAL. -// -// Non-critical errors are logged and not returned. -func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) { - if logger == nil { - logger = log.NewNopLogger() - } +func Checkpoint(w *wal.WAL, m, n int, keep func(id uint64) bool, mint int64) (*CheckpointStats, error) { stats := &CheckpointStats{} var sr io.Reader @@ -272,17 +265,5 @@ func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bo if err := closeAll(closers...); err != nil { return stats, errors.Wrap(err, "close opened files") } - if err := w.Truncate(n + 1); err != nil { - // If truncating fails, we'll just try again at the next checkpoint. - // Leftover segments will just be ignored in the future if there's a checkpoint - // that supersedes them. - level.Error(logger).Log("msg", "truncating segments failed", "err", err) - } - if err := DeleteCheckpoints(w.Dir(), n); err != nil { - // Leftover old checkpoints do not cause problems down the line beyond - // occupying disk space. - // They will just be ignored since a higher checkpoint exists. - level.Error(logger).Log("msg", "delete old checkpoints", "err", err) - } return stats, nil } diff --git a/checkpoint_test.go b/checkpoint_test.go index daa54df19..60f99fd74 100644 --- a/checkpoint_test.go +++ b/checkpoint_test.go @@ -137,10 +137,12 @@ func TestCheckpoint(t *testing.T) { } testutil.Ok(t, w.Close()) - _, err = Checkpoint(nil, w, 100, 106, func(x uint64) bool { + _, err = Checkpoint(w, 100, 106, func(x uint64) bool { return x%2 == 0 }, last/2) testutil.Ok(t, err) + testutil.Ok(t, w.Truncate(107)) + testutil.Ok(t, DeleteCheckpoints(w.Dir(), 106)) // Only the new checkpoint should be left. files, err := fileutil.ReadDir(dir) diff --git a/head.go b/head.go index bc8cdfbe4..8d259fd66 100644 --- a/head.go +++ b/head.go @@ -76,19 +76,25 @@ type Head struct { } type headMetrics struct { - activeAppenders prometheus.Gauge - series prometheus.Gauge - seriesCreated prometheus.Counter - seriesRemoved prometheus.Counter - seriesNotFound prometheus.Counter - chunks prometheus.Gauge - chunksCreated prometheus.Counter - chunksRemoved prometheus.Counter - gcDuration prometheus.Summary - minTime prometheus.GaugeFunc - maxTime prometheus.GaugeFunc - samplesAppended prometheus.Counter - walTruncateDuration prometheus.Summary + activeAppenders prometheus.Gauge + series prometheus.Gauge + seriesCreated prometheus.Counter + seriesRemoved prometheus.Counter + seriesNotFound prometheus.Counter + chunks prometheus.Gauge + chunksCreated prometheus.Counter + chunksRemoved prometheus.Counter + gcDuration prometheus.Summary + minTime prometheus.GaugeFunc + maxTime prometheus.GaugeFunc + samplesAppended prometheus.Counter + walTruncateDuration prometheus.Summary + headTruncateFail prometheus.Counter + headTruncateTotal prometheus.Counter + checkpointDeleteFail prometheus.Counter + checkpointDeleteTotal prometheus.Counter + checkpointCreationFail prometheus.Counter + checkpointCreationTotal prometheus.Counter } func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { @@ -150,6 +156,30 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { Name: "prometheus_tsdb_head_samples_appended_total", Help: "Total number of appended samples.", }) + m.headTruncateFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_head_truncations_failed_total", + Help: "Total number of head truncations that failed.", + }) + m.headTruncateTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_head_truncations_total", + Help: "Total number of head truncations attempted.", + }) + m.checkpointDeleteFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_checkpoint_deletions_failed_total", + Help: "Total number of checkpoint deletions that failed.", + }) + m.checkpointDeleteTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_checkpoint_deletions_total", + Help: "Total number of checkpoint deletions attempted.", + }) + m.checkpointCreationFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_checkpoint_creations_failed_total", + Help: "Total number of checkpoint creations that failed.", + }) + m.checkpointCreationTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_checkpoint_creations_total", + Help: "Total number of checkpoint creations attempted.", + }) if r != nil { r.MustRegister( @@ -166,6 +196,12 @@ func newHeadMetrics(h *Head, r prometheus.Registerer) *headMetrics { m.gcDuration, m.walTruncateDuration, m.samplesAppended, + m.headTruncateFail, + m.headTruncateTotal, + m.checkpointDeleteFail, + m.checkpointDeleteTotal, + m.checkpointCreationFail, + m.checkpointCreationTotal, ) } return m @@ -421,7 +457,12 @@ func (h *Head) Init() error { } // Truncate removes old data before mint from the head. -func (h *Head) Truncate(mint int64) error { +func (h *Head) Truncate(mint int64) (err error) { + defer func() { + if err != nil { + h.metrics.headTruncateFail.Inc() + } + }() initialize := h.MinTime() == math.MaxInt64 if h.MinTime() >= mint && !initialize { @@ -440,6 +481,7 @@ func (h *Head) Truncate(mint int64) error { return nil } + h.metrics.headTruncateTotal.Inc() start := time.Now() h.gc() @@ -469,9 +511,25 @@ func (h *Head) Truncate(mint int64) error { keep := func(id uint64) bool { return h.series.getByID(id) != nil } - if _, err = Checkpoint(h.logger, h.wal, m, n, keep, mint); err != nil { + h.metrics.checkpointCreationTotal.Inc() + if _, err = Checkpoint(h.wal, m, n, keep, mint); err != nil { + h.metrics.checkpointCreationFail.Inc() return errors.Wrap(err, "create checkpoint") } + if err := h.wal.Truncate(n + 1); err != nil { + // If truncating fails, we'll just try again at the next checkpoint. + // Leftover segments will just be ignored in the future if there's a checkpoint + // that supersedes them. + level.Error(h.logger).Log("msg", "truncating segments failed", "err", err) + } + h.metrics.checkpointDeleteTotal.Inc() + if err := DeleteCheckpoints(h.wal.Dir(), n); err != nil { + // Leftover old checkpoints do not cause problems down the line beyond + // occupying disk space. + // They will just be ignored since a higher checkpoint exists. + level.Error(h.logger).Log("msg", "delete old checkpoints", "err", err) + h.metrics.checkpointDeleteFail.Inc() + } h.metrics.walTruncateDuration.Observe(time.Since(start).Seconds()) level.Info(h.logger).Log("msg", "WAL checkpoint complete", diff --git a/wal/wal.go b/wal/wal.go index aa52738fa..d9a59c005 100644 --- a/wal/wal.go +++ b/wal/wal.go @@ -162,6 +162,8 @@ type WAL struct { fsyncDuration prometheus.Summary pageFlushes prometheus.Counter pageCompletions prometheus.Counter + truncateFail prometheus.Counter + truncateTotal prometheus.Counter } // New returns a new WAL over the given directory. @@ -201,8 +203,16 @@ func NewSize(logger log.Logger, reg prometheus.Registerer, dir string, segmentSi Name: "prometheus_tsdb_wal_completed_pages_total", Help: "Total number of completed pages.", }) + w.truncateFail = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_truncations_failed_total", + Help: "Total number of WAL truncations that failed.", + }) + w.truncateTotal = prometheus.NewCounter(prometheus.CounterOpts{ + Name: "prometheus_tsdb_wal_truncations_total", + Help: "Total number of WAL truncations attempted.", + }) if reg != nil { - reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions) + reg.MustRegister(w.fsyncDuration, w.pageFlushes, w.pageCompletions, w.truncateFail, w.truncateTotal) } _, j, err := w.Segments() @@ -527,7 +537,13 @@ func (w *WAL) Segments() (m, n int, err error) { } // Truncate drops all segments before i. -func (w *WAL) Truncate(i int) error { +func (w *WAL) Truncate(i int) (err error) { + w.truncateTotal.Inc() + defer func() { + if err != nil { + w.truncateFail.Inc() + } + }() refs, err := listSegments(w.dir) if err != nil { return err @@ -536,7 +552,7 @@ func (w *WAL) Truncate(i int) error { if r.n >= i { break } - if err := os.Remove(filepath.Join(w.dir, r.s)); err != nil { + if err = os.Remove(filepath.Join(w.dir, r.s)); err != nil { return err } }