move the wal repair logic in db.Open (#633)
* move the wal repair logic in db.Open This is to allow opening a wal in a read oly mode without triggering a repair. Signed-off-by: Krasi Georgiev <8903888+krasi-georgiev@users.noreply.github.com>
This commit is contained in:
parent
4892da59eb
commit
69740485c1
8
db.go
8
db.go
|
@ -328,8 +328,12 @@ func Open(dir string, l log.Logger, r prometheus.Registerer, opts *Options) (db
|
||||||
minValidTime = blocks[len(blocks)-1].Meta().MaxTime
|
minValidTime = blocks[len(blocks)-1].Meta().MaxTime
|
||||||
}
|
}
|
||||||
|
|
||||||
if err := db.head.Init(minValidTime); err != nil {
|
if initErr := db.head.Init(minValidTime); initErr != nil {
|
||||||
return nil, errors.Wrap(err, "read WAL")
|
db.head.metrics.walCorruptionsTotal.Inc()
|
||||||
|
level.Warn(db.logger).Log("msg", "encountered WAL read error, attempting repair", "err", err)
|
||||||
|
if err := wlog.Repair(initErr); err != nil {
|
||||||
|
return nil, errors.Wrap(err, "repair corrupted WAL")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
go db.run()
|
go db.run()
|
||||||
|
|
45
head.go
45
head.go
|
@ -315,7 +315,7 @@ func (h *Head) updateMinMaxTime(mint, maxt int64) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
|
func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) (err error) {
|
||||||
// Track number of samples that referenced a series we don't know about
|
// Track number of samples that referenced a series we don't know about
|
||||||
// for error reporting.
|
// for error reporting.
|
||||||
var unknownRefs uint64
|
var unknownRefs uint64
|
||||||
|
@ -332,6 +332,18 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
|
||||||
)
|
)
|
||||||
wg.Add(n)
|
wg.Add(n)
|
||||||
|
|
||||||
|
defer func() {
|
||||||
|
// For CorruptionErr ensure to terminate all workers before exiting.
|
||||||
|
if _, ok := err.(*wal.CorruptionErr); ok {
|
||||||
|
for i := 0; i < n; i++ {
|
||||||
|
close(inputs[i])
|
||||||
|
for range outputs[i] {
|
||||||
|
}
|
||||||
|
}
|
||||||
|
wg.Wait()
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
for i := 0; i < n; i++ {
|
for i := 0; i < n; i++ {
|
||||||
outputs[i] = make(chan []RefSample, 300)
|
outputs[i] = make(chan []RefSample, 300)
|
||||||
inputs[i] = make(chan []RefSample, 300)
|
inputs[i] = make(chan []RefSample, 300)
|
||||||
|
@ -349,9 +361,12 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
|
||||||
samples []RefSample
|
samples []RefSample
|
||||||
tstones []Stone
|
tstones []Stone
|
||||||
allStones = newMemTombstones()
|
allStones = newMemTombstones()
|
||||||
err error
|
|
||||||
)
|
)
|
||||||
defer allStones.Close()
|
defer func() {
|
||||||
|
if err := allStones.Close(); err != nil {
|
||||||
|
level.Warn(h.logger).Log("msg", "closing memTombstones during wal read", "err", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
for r.Next() {
|
for r.Next() {
|
||||||
series, samples, tstones = series[:0], samples[:0], tstones[:0]
|
series, samples, tstones = series[:0], samples[:0], tstones[:0]
|
||||||
rec := r.Record()
|
rec := r.Record()
|
||||||
|
@ -450,9 +465,6 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if r.Err() != nil {
|
|
||||||
return errors.Wrap(r.Err(), "read records")
|
|
||||||
}
|
|
||||||
|
|
||||||
// Signal termination to each worker and wait for it to close its output channel.
|
// Signal termination to each worker and wait for it to close its output channel.
|
||||||
for i := 0; i < n; i++ {
|
for i := 0; i < n; i++ {
|
||||||
|
@ -462,6 +474,10 @@ func (h *Head) loadWAL(r *wal.Reader, multiRef map[uint64]uint64) error {
|
||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
|
||||||
|
if r.Err() != nil {
|
||||||
|
return errors.Wrap(r.Err(), "read records")
|
||||||
|
}
|
||||||
|
|
||||||
if err := allStones.Iter(func(ref uint64, dranges Intervals) error {
|
if err := allStones.Iter(func(ref uint64, dranges Intervals) error {
|
||||||
return h.chunkRewrite(ref, dranges)
|
return h.chunkRewrite(ref, dranges)
|
||||||
}); err != nil {
|
}); err != nil {
|
||||||
|
@ -497,7 +513,11 @@ func (h *Head) Init(minValidTime int64) error {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return errors.Wrap(err, "open checkpoint")
|
return errors.Wrap(err, "open checkpoint")
|
||||||
}
|
}
|
||||||
defer sr.Close()
|
defer func() {
|
||||||
|
if err := sr.Close(); err != nil {
|
||||||
|
level.Warn(h.logger).Log("msg", "error while closing the wal segments reader", "err", err)
|
||||||
|
}
|
||||||
|
}()
|
||||||
|
|
||||||
// A corrupted checkpoint is a hard error for now and requires user
|
// A corrupted checkpoint is a hard error for now and requires user
|
||||||
// intervention. There's likely little data that can be recovered anyway.
|
// intervention. There's likely little data that can be recovered anyway.
|
||||||
|
@ -522,14 +542,11 @@ func (h *Head) Init(minValidTime int64) error {
|
||||||
|
|
||||||
sr := wal.NewSegmentBufReader(s)
|
sr := wal.NewSegmentBufReader(s)
|
||||||
err = h.loadWAL(wal.NewReader(sr), multiRef)
|
err = h.loadWAL(wal.NewReader(sr), multiRef)
|
||||||
sr.Close() // Close the reader so that if there was an error the repair can remove the corrupted file under Windows.
|
if err := sr.Close(); err != nil {
|
||||||
if err == nil {
|
level.Warn(h.logger).Log("msg", "error while closing the wal segments reader", "err", err)
|
||||||
continue
|
|
||||||
}
|
}
|
||||||
level.Warn(h.logger).Log("msg", "encountered WAL error, attempting repair", "err", err)
|
if err != nil {
|
||||||
h.metrics.walCorruptionsTotal.Inc()
|
return err
|
||||||
if err := h.wal.Repair(err); err != nil {
|
|
||||||
return errors.Wrap(err, "repair corrupted WAL")
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
73
head_test.go
73
head_test.go
|
@ -19,9 +19,11 @@ import (
|
||||||
"math/rand"
|
"math/rand"
|
||||||
"os"
|
"os"
|
||||||
"path"
|
"path"
|
||||||
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"github.com/pkg/errors"
|
||||||
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
|
prom_testutil "github.com/prometheus/client_golang/prometheus/testutil"
|
||||||
"github.com/prometheus/tsdb/chunkenc"
|
"github.com/prometheus/tsdb/chunkenc"
|
||||||
"github.com/prometheus/tsdb/chunks"
|
"github.com/prometheus/tsdb/chunks"
|
||||||
|
@ -1088,42 +1090,61 @@ func TestWalRepair_DecodingError(t *testing.T) {
|
||||||
},
|
},
|
||||||
} {
|
} {
|
||||||
t.Run(name, func(t *testing.T) {
|
t.Run(name, func(t *testing.T) {
|
||||||
dir, err := ioutil.TempDir("", "wal_head_repair")
|
dir, err := ioutil.TempDir("", "wal_repair")
|
||||||
testutil.Ok(t, err)
|
testutil.Ok(t, err)
|
||||||
defer func() {
|
defer func() {
|
||||||
testutil.Ok(t, os.RemoveAll(dir))
|
testutil.Ok(t, os.RemoveAll(dir))
|
||||||
}()
|
}()
|
||||||
|
|
||||||
w, err := wal.New(nil, nil, dir)
|
// Fill the wal and corrupt it.
|
||||||
testutil.Ok(t, err)
|
{
|
||||||
defer w.Close()
|
w, err := wal.New(nil, nil, filepath.Join(dir, "wal"))
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
|
||||||
for i := 1; i <= test.totalRecs; i++ {
|
for i := 1; i <= test.totalRecs; i++ {
|
||||||
// At this point insert a corrupted record.
|
// At this point insert a corrupted record.
|
||||||
if i-1 == test.expRecs {
|
if i-1 == test.expRecs {
|
||||||
testutil.Ok(t, w.Log(test.corrFunc(test.rec)))
|
testutil.Ok(t, w.Log(test.corrFunc(test.rec)))
|
||||||
continue
|
continue
|
||||||
|
}
|
||||||
|
testutil.Ok(t, w.Log(test.rec))
|
||||||
}
|
}
|
||||||
testutil.Ok(t, w.Log(test.rec))
|
|
||||||
|
h, err := NewHead(nil, nil, w, 1)
|
||||||
|
testutil.Ok(t, err)
|
||||||
|
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
|
||||||
|
initErr := h.Init(math.MinInt64)
|
||||||
|
|
||||||
|
err = errors.Cause(initErr) // So that we can pick up errors even if wrapped.
|
||||||
|
_, corrErr := err.(*wal.CorruptionErr)
|
||||||
|
testutil.Assert(t, corrErr, "reading the wal didn't return corruption error")
|
||||||
|
testutil.Ok(t, w.Close())
|
||||||
}
|
}
|
||||||
|
|
||||||
h, err := NewHead(nil, nil, w, 1)
|
// Open the db to trigger a repair.
|
||||||
testutil.Ok(t, err)
|
{
|
||||||
testutil.Equals(t, 0.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
|
db, err := Open(dir, nil, nil, DefaultOptions)
|
||||||
testutil.Ok(t, h.Init(math.MinInt64))
|
testutil.Ok(t, err)
|
||||||
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(h.metrics.walCorruptionsTotal))
|
defer func() {
|
||||||
|
testutil.Ok(t, db.Close())
|
||||||
sr, err := wal.NewSegmentsReader(dir)
|
}()
|
||||||
testutil.Ok(t, err)
|
testutil.Equals(t, 1.0, prom_testutil.ToFloat64(db.head.metrics.walCorruptionsTotal))
|
||||||
defer sr.Close()
|
}
|
||||||
r := wal.NewReader(sr)
|
|
||||||
|
// Read the wal content after the repair.
|
||||||
var actRec int
|
{
|
||||||
for r.Next() {
|
sr, err := wal.NewSegmentsReader(filepath.Join(dir, "wal"))
|
||||||
actRec++
|
testutil.Ok(t, err)
|
||||||
|
defer sr.Close()
|
||||||
|
r := wal.NewReader(sr)
|
||||||
|
|
||||||
|
var actRec int
|
||||||
|
for r.Next() {
|
||||||
|
actRec++
|
||||||
|
}
|
||||||
|
testutil.Ok(t, r.Err())
|
||||||
|
testutil.Equals(t, test.expRecs, actRec, "Wrong number of intact records")
|
||||||
}
|
}
|
||||||
testutil.Ok(t, r.Err())
|
|
||||||
testutil.Equals(t, test.expRecs, actRec, "Wrong number of intact records")
|
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue