Address comments
Signed-off-by: Fabian Reinartz <freinartz@google.com>
This commit is contained in:
parent
0ad2b8a349
commit
3e76f0163e
|
@ -35,9 +35,9 @@ type CheckpointStats struct {
|
|||
DroppedSeries int
|
||||
DroppedSamples int
|
||||
DroppedTombstones int
|
||||
TotalSeries int
|
||||
TotalSamples int
|
||||
TotalTombstones int
|
||||
TotalSeries int // Processed series including dropped ones.
|
||||
TotalSamples int // Processed samples including dropped ones.
|
||||
TotalTombstones int // Processed tombstones including dropped ones.
|
||||
}
|
||||
|
||||
// LastCheckpoint returns the directory name of the most recent checkpoint.
|
||||
|
@ -129,16 +129,16 @@ func Checkpoint(logger log.Logger, w *wal.WAL, m, n int, keep func(id uint64) bo
|
|||
sr = last
|
||||
}
|
||||
|
||||
segs, err := wal.NewSegmentsRangeReader(w.Dir(), m, n)
|
||||
segsr, err := wal.NewSegmentsRangeReader(w.Dir(), m, n)
|
||||
if err != nil {
|
||||
return nil, errors.Wrap(err, "create segment reader")
|
||||
}
|
||||
defer segs.Close()
|
||||
defer segsr.Close()
|
||||
|
||||
if sr != nil {
|
||||
sr = io.MultiReader(sr, segs)
|
||||
sr = io.MultiReader(sr, segsr)
|
||||
} else {
|
||||
sr = segs
|
||||
sr = segsr
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -5,14 +5,17 @@ e.g. `000000`, `000001`, `000002`, etc., and are limited to 128MB by default.
|
|||
A segment is written to in pages of 32KB. Only the last page of the most recent segment
|
||||
may be partial. A WAL record is an opaque byte slice that gets split up into sub-records
|
||||
should it exceed the remaining space of the current page. Records are never split across
|
||||
segment boundaries.
|
||||
The encoding of pages is largely borrowed from [LevelDB's/RocksDB's wirte ahead log.][1]
|
||||
segment boundaries. If a single record exceeds the default segment size, a segment with
|
||||
a larger size will be created.
|
||||
The encoding of pages is largely borrowed from [LevelDB's/RocksDB's write ahead log.][1]
|
||||
|
||||
Notable deviations are that the record fragment is encoded as:
|
||||
|
||||
```
|
||||
┌───────────┬──────────┬────────────┬──────────────┐
|
||||
│ type <1b> │ len <2b> │ CRC32 <4b> │ data <bytes> │
|
||||
└───────────┴──────────┴────────────┴──────────────┘
|
||||
```
|
||||
|
||||
## Record encoding
|
||||
|
||||
|
@ -22,6 +25,7 @@ The records written to the write ahead log are encoded as follows:
|
|||
|
||||
Series records encode the labels that identify a series and its unique ID.
|
||||
|
||||
```
|
||||
┌────────────────────────────────────────────┐
|
||||
│ type = 1 <1b> │
|
||||
├────────────────────────────────────────────┤
|
||||
|
@ -36,12 +40,14 @@ Series records encode the labels that identifier a series and its unique ID.
|
|||
│ └───────────────────────┴────────────────┘ │
|
||||
│ . . . │
|
||||
└────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Sample records
|
||||
|
||||
Sample records encode samples as a list of triples `(series_id, timestamp, value)`.
|
||||
Series reference and timestamp are encoded as deltas w.r.t. the first sample.
|
||||
|
||||
```
|
||||
┌──────────────────────────────────────────────────────────────────┐
|
||||
│ type = 2 <1b> │
|
||||
├──────────────────────────────────────────────────────────────────┤
|
||||
|
@ -53,13 +59,14 @@ Series reference and timestamp are encoded as deltas w.r.t the first sample.
|
|||
│ └────────────────────┴───────────────────────────┴─────────────┘ │
|
||||
│ . . . │
|
||||
└──────────────────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
### Tombstone records
|
||||
|
||||
Tombstone records encode tombstones as a list of triples `(series_id, min_time, max_time)`
|
||||
and specify an interval for which samples of a series got deleted.
|
||||
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────┐
|
||||
│ type = 3 <1b> │
|
||||
├─────────────────────────────────────────────────────┤
|
||||
|
@ -68,5 +75,6 @@ and specify an interval for which samples of a series got deleted.
|
|||
│ └─────────┴───────────────────┴───────────────────┘ │
|
||||
│ . . . │
|
||||
└─────────────────────────────────────────────────────┘
|
||||
```
|
||||
|
||||
[1]: https://github.com/facebook/rocksdb/wiki/Write-Ahead-Log-File-Format
|
2
head.go
2
head.go
|
@ -438,7 +438,7 @@ func (h *Head) Truncate(mint int64) error {
|
|||
return nil // no segments yet.
|
||||
}
|
||||
// The lower third of segments should contain mostly obsolete samples.
|
||||
// If we have too few segments, it's not worth checkpointing yet.
|
||||
// If we have fewer than three segments, it's not worth checkpointing yet.
|
||||
n = m + (n-m)/3
|
||||
if n <= m {
|
||||
return nil
|
||||
|
|
36
wal/wal.go
36
wal/wal.go
|
@ -23,6 +23,7 @@ import (
|
|||
"math"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"sync"
|
||||
"time"
|
||||
|
@ -35,9 +36,7 @@ import (
|
|||
)
|
||||
|
||||
const (
|
||||
version = 1
|
||||
defaultSegmentSize = 128 * 1024 * 1024 // 128 MB
|
||||
maxRecordSize = 1 * 1024 * 1024 // 1MB
|
||||
pageSize = 32 * 1024 // 32KB
|
||||
recordHeaderSize = 7
|
||||
)
|
||||
|
@ -94,7 +93,6 @@ func (e *CorruptionErr) Error() string {
|
|||
|
||||
// OpenWriteSegment opens segment k in dir. The returned segment is ready for new appends.
|
||||
func OpenWriteSegment(dir string, k int) (*Segment, error) {
|
||||
// Only .active segments are allowed to be opened for write.
|
||||
f, err := os.OpenFile(SegmentName(dir, k), os.O_WRONLY|os.O_APPEND, 0666)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
|
@ -127,7 +125,7 @@ func CreateSegment(dir string, k int) (*Segment, error) {
|
|||
return &Segment{File: f, i: k, dir: dir}, nil
|
||||
}
|
||||
|
||||
// OpenReadSegment opens the segment k in dir for reading.
|
||||
// OpenReadSegment opens the segment with the given filename.
|
||||
func OpenReadSegment(fn string) (*Segment, error) {
|
||||
k, err := strconv.Atoi(filepath.Base(fn))
|
||||
if err != nil {
|
||||
|
@ -142,7 +140,7 @@ func OpenReadSegment(fn string) (*Segment, error) {
|
|||
|
||||
// WAL is a write ahead log that stores records in segment files.
|
||||
// It must be read from start to end once before logging new data.
|
||||
// If an errore occurs during read, the repair procedure must be called
|
||||
// If an error occurs during read, the repair procedure must be called
|
||||
// before it's safe to do further writes.
|
||||
//
|
||||
// Segments are written to in pages of 32KB, with records possibly split
|
||||
|
@ -244,23 +242,19 @@ Loop:
|
|||
case f := <-w.actorc:
|
||||
f()
|
||||
case donec := <-w.stopc:
|
||||
close(w.actorc)
|
||||
defer close(donec)
|
||||
break Loop
|
||||
}
|
||||
}
|
||||
// Drain and process any remaining functions.
|
||||
for {
|
||||
select {
|
||||
case f := <-w.actorc:
|
||||
for f := range w.actorc {
|
||||
f()
|
||||
default:
|
||||
return
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Repair attempts to repair the WAL based on the error.
|
||||
// It discards all data behind the corruption
|
||||
// It discards all data after the corruption.
|
||||
func (w *WAL) Repair(err error) error {
|
||||
// We could probably have a mode that only discards torn records right around
|
||||
// the corruption to preserve as much data as possible.
|
||||
|
@ -333,7 +327,7 @@ func (w *WAL) Repair(err error) error {
|
|||
|
||||
// SegmentName builds a segment name for the directory.
|
||||
func SegmentName(dir string, i int) string {
|
||||
return filepath.Join(dir, fmt.Sprintf("%06d", i))
|
||||
return filepath.Join(dir, fmt.Sprintf("%08d", i))
|
||||
}
|
||||
|
||||
// nextSegment creates the next segment and closes the previous one.
|
||||
|
@ -384,6 +378,7 @@ func (w *WAL) flushPage(clear bool) error {
|
|||
}
|
||||
p.flushed += n
|
||||
|
||||
// We flushed an entire page, prepare a new one.
|
||||
if clear {
|
||||
for i := range p.buf {
|
||||
p.buf[i] = 0
|
||||
|
@ -485,7 +480,7 @@ func (w *WAL) log(rec []byte, final bool) error {
|
|||
binary.BigEndian.PutUint16(buf[1:], uint16(len(part)))
|
||||
binary.BigEndian.PutUint32(buf[3:], crc)
|
||||
|
||||
copy(buf[7:], part)
|
||||
copy(buf[recordHeaderSize:], part)
|
||||
p.alloc += len(part) + recordHeaderSize
|
||||
|
||||
// If we wrote a full record, we can fit more records of the batch
|
||||
|
@ -587,6 +582,9 @@ func listSegments(dir string) (refs []segmentRef, err error) {
|
|||
refs = append(refs, segmentRef{s: fn, n: k})
|
||||
last = k
|
||||
}
|
||||
sort.Slice(refs, func(i, j int) bool {
|
||||
return refs[i].n < refs[j].n
|
||||
})
|
||||
return refs, nil
|
||||
}
|
||||
|
||||
|
@ -667,10 +665,6 @@ func (r *segmentBufReader) Read(b []byte) (n int, err error) {
|
|||
// Only unset more so we don't invalidate the current segment and
|
||||
// offset before the next read.
|
||||
r.more = false
|
||||
// If no more segments are left, it's the end for the reader.
|
||||
if len(r.segs) == 0 {
|
||||
return n, io.EOF
|
||||
}
|
||||
return n, nil
|
||||
}
|
||||
|
||||
|
@ -689,7 +683,7 @@ func NewReader(r io.Reader) *Reader {
|
|||
}
|
||||
|
||||
// Next advances the reader to the next record and returns true if it exists.
|
||||
// It must not be called once after it returned false.
|
||||
// It must not be called again after it returned false.
|
||||
func (r *Reader) Next() bool {
|
||||
err := r.next()
|
||||
if errors.Cause(err) == io.EOF {
|
||||
|
@ -702,8 +696,8 @@ func (r *Reader) Next() bool {
|
|||
func (r *Reader) next() (err error) {
|
||||
// We have to use r.buf since allocating byte arrays here fails escape
|
||||
// analysis and ends up on the heap, even though it seemingly should not.
|
||||
hdr := r.buf[:7]
|
||||
buf := r.buf[7:]
|
||||
hdr := r.buf[:recordHeaderSize]
|
||||
buf := r.buf[recordHeaderSize:]
|
||||
|
||||
r.rec = r.rec[:0]
|
||||
|
||||
|
|
Loading…
Reference in New Issue