move the wal repair logic in db.Open (#633)

* move the wal repair logic in db.Open

This is to allow opening a WAL in read-only mode without triggering a
repair (a sketch of what this enables follows the change summary below).

Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
Krasi Georgiev authored 2019-06-14 17:39:22 +02:00, committed by GitHub
parent 4892da59eb
commit 69740485c1
3 changed files with 84 additions and 42 deletions
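As a concrete illustration of the read-only use case from the commit message: with the repair gone from Head.Init, a WAL corruption is returned to the caller instead of being fixed in place, so a consumer that must not modify the data can still detect it. A minimal sketch, assuming the same constructors the updated test uses (wal.New, NewHead, Head.Init); the checkWAL helper and the package name are hypothetical and not part of this commit, which only moves the repair call; a dedicated read-only open mode is a separate change.

// Hypothetical helper, sketched against the tsdb API used by the test below.
package walcheck

import (
	"fmt"
	"math"

	"github.com/pkg/errors"
	"github.com/prometheus/tsdb"
	"github.com/prometheus/tsdb/wal"
)

// checkWAL replays the WAL under walDir into a throwaway head and reports a
// corruption instead of repairing it; Head.Init no longer repairs on its own.
func checkWAL(walDir string) error {
	w, err := wal.New(nil, nil, walDir)
	if err != nil {
		return err
	}
	defer w.Close()

	// Chunk range of 1 mirrors the test; any positive value works for a check.
	h, err := tsdb.NewHead(nil, nil, w, 1)
	if err != nil {
		return err
	}

	if initErr := h.Init(math.MinInt64); initErr != nil {
		// Unwrap pkg/errors wrapping applied inside Init before type-asserting.
		if _, ok := errors.Cause(initErr).(*wal.CorruptionErr); ok {
			return fmt.Errorf("WAL in %s is corrupted: %v", walDir, initErr)
		}
		return initErr
	}
	return nil
}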

db.go (8 changed lines)

@@ -328,8 +328,12 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db
 		minValidTime = blocks[len(blocks)-1].Meta().MaxTime
 	}
 
-	if err := db.head.Init(minValidTime); err != nil {
-		return nil, errors.Wrap(err, "read WAL")
+	if initErr := db.head.Init(minValidTime); initErr != nil {
+		db.head.metrics.walCorruptionsTotal.Inc()
+		level.Warn(db.logger).Log("msg", "encountered WAL read error, attempting repair", "err", err)
+		if err := wlog.Repair(initErr); err != nil {
+			return nil, errors.Wrap(err, "repair corrupted WAL")
+		}
 	}
 
 	go db.run()
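Open creates the WAL handle (wlog) earlier in the function, so the repair decision now sits next to it: bump the head's walCorruptionsTotal counter, log a warning, and hand the original initErr to wlog.Repair. The snippet below is not what the commit does (Repair itself decides whether it can handle the error); it only sketches how a repairable corruption can be told apart from any other Init failure, using the same errors.Cause unwrapping the updated test relies on.

// Illustrative only: separating a repairable corruption from other errors.
if initErr := db.head.Init(minValidTime); initErr != nil {
	if _, ok := errors.Cause(initErr).(*wal.CorruptionErr); ok {
		// The WAL is torn at some record; Repair can truncate it there.
		if err := wlog.Repair(initErr); err != nil {
			return nil, errors.Wrap(err, "repair corrupted WAL")
		}
	} else {
		// Anything else (e.g. a checkpoint problem) cannot be repaired here.
		return nil, errors.Wrap(initErr, "read WAL")
	}
}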

head.go (45 changed lines)

@@ -315,7 +315,7 @@ func (h *Head) updateMinMaxTime(mint, maxt int64) {
 	}
 }
 
-func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
+func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) (err error) {
 	// Track number of samples that referenced a series we don't know about
 	// for error reporting.
 	var unknownRefs uint64
@@ -332,6 +332,18 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
 	)
 	wg.Add(n)
 
+	defer func() {
+		// For CorruptionErr ensure to terminate all workers before exiting.
+		if _, ok := err.(*wal.CorruptionErr); ok {
+			for i := 0; i < n; i++ {
+				close(inputs[i])
+				for range outputs[i] {
+				}
+			}
+			wg.Wait()
+		}
+	}()
+
 	for i := 0; i < n; i++ {
 		outputs[i] = make(chan []RefSample, 300)
 		inputs[i] = make(chan []RefSample, 300)
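loadWAL fans decoded samples out to n worker goroutines through the inputs/outputs channels; when a corruption aborts the read loop early, those workers would otherwise stay blocked on their channels while the caller goes on to repair (and possibly delete) the segment files. The added defer closes every input, drains every output, and waits on the WaitGroup before the CorruptionErr escapes. A standalone sketch of that shutdown pattern, with illustrative names rather than the tsdb ones:

package main

import (
	"fmt"
	"sync"
)

// Sketch of the early-termination pattern: close each worker's input, drain
// its output, then wait, so no goroutine is left blocked on a send or receive.
func main() {
	const n = 4
	var wg sync.WaitGroup
	inputs := make([]chan int, n)
	outputs := make([]chan int, n)

	wg.Add(n)
	for i := 0; i < n; i++ {
		inputs[i] = make(chan int, 8)
		outputs[i] = make(chan int, 8)
		go func(in <-chan int, out chan<- int) {
			defer wg.Done()
			for v := range in { // exits once the input channel is closed
				out <- v * v
			}
			close(out)
		}(inputs[i], outputs[i])
	}

	inputs[0] <- 3 // some work was queued before the (simulated) corruption

	// Early-exit path, mirroring the deferred block above.
	for i := 0; i < n; i++ {
		close(inputs[i])
		for range outputs[i] { // drain whatever the worker already produced
		}
	}
	wg.Wait()
	fmt.Println("all workers terminated")
}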
@@ -349,9 +361,12 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
 		samples   []RefSample
 		tstones   []Stone
 		allStones = newMemTombstones()
-		err       error
 	)
-	defer allStones.Close()
+	defer func() {
+		if err := allStones.Close(); err != nil {
+			level.Warn(h.logger).Log("msg", "closing memTombstones during wal read", "err", err)
+		}
+	}()
 	for r.Next() {
 		series, samples, tstones = series[:0], samples[:0], tstones[:0]
 		rec := r.Record()
@@ -450,9 +465,6 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
 			}
 		}
 	}
-	if r.Err() != nil {
-		return errors.Wrap(r.Err(), "read records")
-	}
 
 	// Signal termination to each worker and wait for it to close its output channel.
 	for i := 0; i < n; i++ {
@@ -462,6 +474,10 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
 	}
 	wg.Wait()
 
+	if r.Err() != nil {
+		return errors.Wrap(r.Err(), "read records")
+	}
+
 	if err := allStones.Iter(func(ref uint64, dranges Intervals) error {
 		return h.chunkRewrite(ref, dranges)
 	}); err != nil {
@@ -497,7 +513,11 @@ func (h *Head) Init(minValidTime int64) error {
 		if err != nil {
 			return errors.Wrap(err, "open checkpoint")
 		}
-		defer sr.Close()
+		defer func() {
+			if err := sr.Close(); err != nil {
+				level.Warn(h.logger).Log("msg", "error while closing the wal segments reader", "err", err)
+			}
+		}()
 
 		// A corrupted checkpoint is a hard error for now and requires user
 		// intervention. There's likely little data that can be recovered anyway.
@@ -522,14 +542,11 @@ func (h *Head) Init(minValidTime int64) error {
 		sr := wal.NewSegmentBufReader(s)
 		err = h.loadWAL(wal.NewReader(sr), multiRef)
-		sr.Close() // Close the reader so that if there was an error the repair can remove the corrupted file under Windows.
-		if err == nil {
-			continue
-		}
-		level.Warn(h.logger).Log("msg", "encountered WAL error, attempting repair", "err", err)
-		h.metrics.walCorruptionsTotal.Inc()
-		if err := h.wal.Repair(err); err != nil {
-			return errors.Wrap(err, "repair corrupted WAL")
+		if err := sr.Close(); err != nil {
+			level.Warn(h.logger).Log("msg", "error while closing the wal segments reader", "err", err)
+		}
+		if err != nil {
+			return err
 		}
 	}
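Head.Init now just propagates whatever loadWAL returns, and the bare defer sr.Close() calls become deferred closures that log the Close error instead of silently dropping it, without clobbering the function's real return value. A generic sketch of that pattern, with a hypothetical readSomething helper and the go-kit logger the package already uses:

package main

import (
	"os"

	"github.com/go-kit/kit/log"
	"github.com/go-kit/kit/log/level"
)

// readSomething demonstrates the deferred-Close logging pattern: the Close
// error is logged as a warning rather than discarded or returned.
func readSomething(logger log.Logger, path string) error {
	f, err := os.Open(path)
	if err != nil {
		return err
	}
	defer func() {
		if cerr := f.Close(); cerr != nil {
			level.Warn(logger).Log("msg", "error while closing file", "err", cerr)
		}
	}()

	// ... read from f ...
	return nil
}

func main() {
	logger := log.NewLogfmtLogger(os.Stderr)
	if err := readSomething(logger, "does-not-exist"); err != nil {
		level.Error(logger).Log("err", err)
	}
}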

(test file, name not shown)

@@ -19,9 +19,11 @@ import (
 	"math/rand"
 	"os"
 	"path"
+	"path/filepath"
 	"sort"
 	"testing"
 
+	"github.com/pkg/errors"
 	prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
 	"github.com/prometheus/tsdb/chunkenc"
 	"github.com/prometheus/tsdb/chunks"
@@ -1088,42 +1090,61 @@ func TestWalRepair_DecodingError(t *testing.T) {
 		},
 	} {
 		t.Run(name, func(t *testing.T) {
-			dir, err := ioutil.TempDir("", "wal_head_repair")
+			dir, err := ioutil.TempDir("", "wal_repair")
 			testutil.Ok(t, err)
 			defer func() {
 				testutil.Ok(t, os.RemoveAll(dir))
 			}()
 
-			w, err := wal.New(nil, nil, dir)
-			testutil.Ok(t, err)
-			defer w.Close()
-
-			for i := 1; i <= test.totalRecs; i++ {
-				// At this point insert a corrupted record.
-				if i-1 == test.expRecs {
-					testutil.Ok(t, w.Log(test.corrFunc(test.rec)))
-					continue
-				}
-				testutil.Ok(t, w.Log(test.rec))
-			}
-
-			h, err := NewHead(nil, nil, w, 1)
-			testutil.Ok(t, err)
-			testutil.Equals(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
-			testutil.Ok(t, h.Init(math.MinInt64))
-			testutil.Equals(t, 1.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
-
-			sr, err := wal.NewSegmentsReader(dir)
-			testutil.Ok(t, err)
-			defer sr.Close()
-
-			r := wal.NewReader(sr)
-			var actRec int
-			for r.Next() {
-				actRec++
-			}
-			testutil.Ok(t, r.Err())
-			testutil.Equals(t, test.expRecs, actRec, "Wrong number of intact records")
+			// Fill the wal and corrupt it.
+			{
+				w, err := wal.New(nil, nil, filepath.Join(dir, "wal"))
+				testutil.Ok(t, err)
+
+				for i := 1; i <= test.totalRecs; i++ {
+					// At this point insert a corrupted record.
+					if i-1 == test.expRecs {
+						testutil.Ok(t, w.Log(test.corrFunc(test.rec)))
+						continue
+					}
+					testutil.Ok(t, w.Log(test.rec))
+				}
+
+				h, err := NewHead(nil, nil, w, 1)
+				testutil.Ok(t, err)
+				testutil.Equals(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
+				initErr := h.Init(math.MinInt64)
+
+				err = errors.Cause(initErr) // So that we can pick up errors even if wrapped.
+				_, corrErr := err.(*wal.CorruptionErr)
+				testutil.Assert(t, corrErr, "reading the wal didn't return corruption error")
+				testutil.Ok(t, w.Close())
+			}
+
+			// Open the db to trigger a repair.
+			{
+				db, err := Open(dir, nil, nil, DefaultOptions)
+				testutil.Ok(t, err)
+				defer func() {
+					testutil.Ok(t, db.Close())
+				}()
+				testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal))
+			}
+
+			// Read the wal content after the repair.
+			{
+				sr, err := wal.NewSegmentsReader(filepath.Join(dir, "wal"))
+				testutil.Ok(t, err)
+				defer sr.Close()
+				r := wal.NewReader(sr)
+				var actRec int
+				for r.Next() {
+					actRec++
+				}
+				testutil.Ok(t, r.Err())
+				testutil.Equals(t, test.expRecs, actRec, "Wrong number of intact records")
+			}
 		})
 	}