prometheus/model/labels/labels_dedupelabels.go
Bryan Boreham 2ced2f6aec [PERF] Labels: faster varint for dedupelabels
Including tests.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>
2024-06-21 11:57:09 +01:00

801 lines
22 KiB
Go

// Copyright 2024 The Prometheus Authors
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//go:build dedupelabels
package labels
import (
"bytes"
"slices"
"strings"
"sync"
"github.com/cespare/xxhash/v2"
)
// Labels is implemented by a SymbolTable and string holding name/value
// pairs encoded as indexes into the table in varint encoding.
// Names are in alphabetical order.
type Labels struct {
syms *nameTable
data string
}
// Split SymbolTable into the part used by Labels and the part used by Builder. Only the latter needs the map.
// This part is used by Labels. All fields are immutable after construction.
type nameTable struct {
byNum []string // This slice header is never changed, even while we are building the symbol table.
symbolTable *SymbolTable // If we need to use it in a Builder.
}
// SymbolTable is used to map strings into numbers so they can be packed together.
type SymbolTable struct {
mx sync.Mutex
*nameTable
nextNum int
byName map[string]int
}
const defaultSymbolTableSize = 1024
func NewSymbolTable() *SymbolTable {
t := &SymbolTable{
nameTable: &nameTable{byNum: make([]string, defaultSymbolTableSize)},
byName: make(map[string]int, defaultSymbolTableSize),
}
t.nameTable.symbolTable = t
return t
}
func (t *SymbolTable) Len() int {
t.mx.Lock()
defer t.mx.Unlock()
return len(t.byName)
}
// ToNum maps a string to an integer, adding the string to the table if it is not already there.
// Note: copies the string before adding, in case the caller passed part of
// a buffer that should not be kept alive by this SymbolTable.
func (t *SymbolTable) ToNum(name string) int {
t.mx.Lock()
defer t.mx.Unlock()
return t.toNumUnlocked(name)
}
func (t *SymbolTable) toNumUnlocked(name string) int {
if i, found := t.byName[name]; found {
return i
}
i := t.nextNum
if t.nextNum == cap(t.byNum) {
// Name table is full; copy to a new one. Don't touch the existing slice, as nameTable is immutable after construction.
newSlice := make([]string, cap(t.byNum)*2)
copy(newSlice, t.byNum)
t.nameTable = &nameTable{byNum: newSlice, symbolTable: t}
}
name = strings.Clone(name)
t.byNum[i] = name
t.byName[name] = i
t.nextNum++
return i
}
func (t *SymbolTable) checkNum(name string) (int, bool) {
t.mx.Lock()
defer t.mx.Unlock()
i, bool := t.byName[name]
return i, bool
}
// ToName maps an integer to a string.
func (t *nameTable) ToName(num int) string {
return t.byNum[num]
}
// "Varint" in this file is non-standard: we encode small numbers (up to 32767) in 2 bytes,
// because we expect most Prometheus to have more than 127 unique strings.
// And we don't encode numbers larger than 4 bytes because we don't expect more than 536,870,912 unique strings.
func decodeVarint(data string, index int) (int, int) {
b := int(data[index]) + int(data[index+1])<<8
index += 2
if b < 0x8000 {
return b, index
}
value := int(b & 0x7FFF)
b = int(data[index])
index++
if b < 0x80 {
return value | (b << 15), index
}
value |= (b & 0x7f) << 15
b = int(data[index])
index++
return value | (b << 22), index
}
func decodeString(t *nameTable, data string, index int) (string, int) {
var num int
num, index = decodeVarint(data, index)
return t.ToName(num), index
}
// Bytes returns ls as a byte slice.
// It uses non-printing characters and so should not be used for printing.
func (ls Labels) Bytes(buf []byte) []byte {
b := bytes.NewBuffer(buf[:0])
for i := 0; i < len(ls.data); {
if i > 0 {
b.WriteByte(seps[0])
}
var name, value string
name, i = decodeString(ls.syms, ls.data, i)
value, i = decodeString(ls.syms, ls.data, i)
b.WriteString(name)
b.WriteByte(seps[0])
b.WriteString(value)
}
return b.Bytes()
}
// IsZero implements yaml.IsZeroer - if we don't have this then 'omitempty' fields are always omitted.
func (ls Labels) IsZero() bool {
return len(ls.data) == 0
}
// MatchLabels returns a subset of Labels that matches/does not match with the provided label names based on the 'on' boolean.
// If on is set to true, it returns the subset of labels that match with the provided label names and its inverse when 'on' is set to false.
// TODO: This is only used in printing an error message
func (ls Labels) MatchLabels(on bool, names ...string) Labels {
b := NewBuilder(ls)
if on {
b.Keep(names...)
} else {
b.Del(MetricName)
b.Del(names...)
}
return b.Labels()
}
// Hash returns a hash value for the label set.
// Note: the result is not guaranteed to be consistent across different runs of Prometheus.
func (ls Labels) Hash() uint64 {
// Use xxhash.Sum64(b) for fast path as it's faster.
b := make([]byte, 0, 1024)
for pos := 0; pos < len(ls.data); {
name, newPos := decodeString(ls.syms, ls.data, pos)
value, newPos := decodeString(ls.syms, ls.data, newPos)
if len(b)+len(name)+len(value)+2 >= cap(b) {
// If labels entry is 1KB+, hash the rest of them via Write().
h := xxhash.New()
_, _ = h.Write(b)
for pos < len(ls.data) {
name, pos = decodeString(ls.syms, ls.data, pos)
value, pos = decodeString(ls.syms, ls.data, pos)
_, _ = h.WriteString(name)
_, _ = h.Write(seps)
_, _ = h.WriteString(value)
_, _ = h.Write(seps)
}
return h.Sum64()
}
b = append(b, name...)
b = append(b, seps[0])
b = append(b, value...)
b = append(b, seps[0])
pos = newPos
}
return xxhash.Sum64(b)
}
// HashForLabels returns a hash value for the labels matching the provided names.
// 'names' have to be sorted in ascending order.
func (ls Labels) HashForLabels(b []byte, names ...string) (uint64, []byte) {
b = b[:0]
j := 0
for i := 0; i < len(ls.data); {
var name, value string
name, i = decodeString(ls.syms, ls.data, i)
value, i = decodeString(ls.syms, ls.data, i)
for j < len(names) && names[j] < name {
j++
}
if j == len(names) {
break
}
if name == names[j] {
b = append(b, name...)
b = append(b, seps[0])
b = append(b, value...)
b = append(b, seps[0])
}
}
return xxhash.Sum64(b), b
}
// HashWithoutLabels returns a hash value for all labels except those matching
// the provided names.
// 'names' have to be sorted in ascending order.
func (ls Labels) HashWithoutLabels(b []byte, names ...string) (uint64, []byte) {
b = b[:0]
j := 0
for i := 0; i < len(ls.data); {
var name, value string
name, i = decodeString(ls.syms, ls.data, i)
value, i = decodeString(ls.syms, ls.data, i)
for j < len(names) && names[j] < name {
j++
}
if name == MetricName || (j < len(names) && name == names[j]) {
continue
}
b = append(b, name...)
b = append(b, seps[0])
b = append(b, value...)
b = append(b, seps[0])
}
return xxhash.Sum64(b), b
}
// BytesWithLabels is just as Bytes(), but only for labels matching names.
// 'names' have to be sorted in ascending order.
func (ls Labels) BytesWithLabels(buf []byte, names ...string) []byte {
b := bytes.NewBuffer(buf[:0])
j := 0
for pos := 0; pos < len(ls.data); {
lName, newPos := decodeString(ls.syms, ls.data, pos)
lValue, newPos := decodeString(ls.syms, ls.data, newPos)
for j < len(names) && names[j] < lName {
j++
}
if j == len(names) {
break
}
if lName == names[j] {
if b.Len() > 1 {
b.WriteByte(seps[0])
}
b.WriteString(lName)
b.WriteByte(seps[0])
b.WriteString(lValue)
}
pos = newPos
}
return b.Bytes()
}
// BytesWithoutLabels is just as Bytes(), but only for labels not matching names.
// 'names' have to be sorted in ascending order.
func (ls Labels) BytesWithoutLabels(buf []byte, names ...string) []byte {
b := bytes.NewBuffer(buf[:0])
j := 0
for pos := 0; pos < len(ls.data); {
lName, newPos := decodeString(ls.syms, ls.data, pos)
lValue, newPos := decodeString(ls.syms, ls.data, newPos)
for j < len(names) && names[j] < lName {
j++
}
if j == len(names) || lName != names[j] {
if b.Len() > 1 {
b.WriteByte(seps[0])
}
b.WriteString(lName)
b.WriteByte(seps[0])
b.WriteString(lValue)
}
pos = newPos
}
return b.Bytes()
}
// Copy returns a copy of the labels.
func (ls Labels) Copy() Labels {
return Labels{syms: ls.syms, data: strings.Clone(ls.data)}
}
// Get returns the value for the label with the given name.
// Returns an empty string if the label doesn't exist.
func (ls Labels) Get(name string) string {
if name == "" { // Avoid crash in loop if someone asks for "".
return "" // Prometheus does not store blank label names.
}
for i := 0; i < len(ls.data); {
var lName, lValue string
lName, i = decodeString(ls.syms, ls.data, i)
if lName == name {
lValue, _ = decodeString(ls.syms, ls.data, i)
return lValue
} else if lName[0] > name[0] { // Stop looking if we've gone past.
break
}
_, i = decodeVarint(ls.data, i)
}
return ""
}
// Has returns true if the label with the given name is present.
func (ls Labels) Has(name string) bool {
if name == "" { // Avoid crash in loop if someone asks for "".
return false // Prometheus does not store blank label names.
}
for i := 0; i < len(ls.data); {
var lName string
lName, i = decodeString(ls.syms, ls.data, i)
if lName == name {
return true
} else if lName[0] > name[0] { // Stop looking if we've gone past.
break
}
_, i = decodeVarint(ls.data, i)
}
return false
}
// HasDuplicateLabelNames returns whether ls has duplicate label names.
// It assumes that the labelset is sorted.
func (ls Labels) HasDuplicateLabelNames() (string, bool) {
prevNum := -1
for i := 0; i < len(ls.data); {
var lNum int
lNum, i = decodeVarint(ls.data, i)
_, i = decodeVarint(ls.data, i)
if lNum == prevNum {
return ls.syms.ToName(lNum), true
}
prevNum = lNum
}
return "", false
}
// WithoutEmpty returns the labelset without empty labels.
// May return the same labelset.
func (ls Labels) WithoutEmpty() Labels {
if ls.IsEmpty() {
return ls
}
// Idea: have a constant symbol for blank, then we don't have to look it up.
blank, ok := ls.syms.symbolTable.checkNum("")
if !ok { // Symbol table has no entry for blank - none of the values can be blank.
return ls
}
for pos := 0; pos < len(ls.data); {
_, newPos := decodeVarint(ls.data, pos)
lValue, newPos := decodeVarint(ls.data, newPos)
if lValue != blank {
pos = newPos
continue
}
// Do not copy the slice until it's necessary.
// TODO: could optimise the case where all blanks are at the end.
// Note: we size the new buffer on the assumption there is exactly one blank value.
buf := make([]byte, pos, pos+(len(ls.data)-newPos))
copy(buf, ls.data[:pos]) // copy the initial non-blank labels
pos = newPos // move past the first blank value
for pos < len(ls.data) {
var newPos int
_, newPos = decodeVarint(ls.data, pos)
lValue, newPos = decodeVarint(ls.data, newPos)
if lValue != blank {
buf = append(buf, ls.data[pos:newPos]...)
}
pos = newPos
}
return Labels{syms: ls.syms, data: yoloString(buf)}
}
return ls
}
// Equal returns whether the two label sets are equal.
func Equal(a, b Labels) bool {
if a.syms == b.syms {
return a.data == b.data
}
la, lb := len(a.data), len(b.data)
ia, ib := 0, 0
for ia < la && ib < lb {
var aValue, bValue string
aValue, ia = decodeString(a.syms, a.data, ia)
bValue, ib = decodeString(b.syms, b.data, ib)
if aValue != bValue {
return false
}
}
if ia != la || ib != lb {
return false
}
return true
}
// EmptyLabels returns an empty Labels value, for convenience.
func EmptyLabels() Labels {
return Labels{}
}
// New returns a sorted Labels from the given labels.
// The caller has to guarantee that all label names are unique.
// Note this function is not efficient; should not be used in performance-critical places.
func New(ls ...Label) Labels {
slices.SortFunc(ls, func(a, b Label) int { return strings.Compare(a.Name, b.Name) })
syms := NewSymbolTable()
var stackSpace [16]int
size, nums := mapLabelsToNumbers(syms, ls, stackSpace[:])
buf := make([]byte, size)
marshalNumbersToSizedBuffer(nums, buf)
return Labels{syms: syms.nameTable, data: yoloString(buf)}
}
// FromStrings creates new labels from pairs of strings.
func FromStrings(ss ...string) Labels {
if len(ss)%2 != 0 {
panic("invalid number of strings")
}
ls := make([]Label, 0, len(ss)/2)
for i := 0; i < len(ss); i += 2 {
ls = append(ls, Label{Name: ss[i], Value: ss[i+1]})
}
return New(ls...)
}
// Compare compares the two label sets.
// The result will be 0 if a==b, <0 if a < b, and >0 if a > b.
func Compare(a, b Labels) int {
la, lb := len(a.data), len(b.data)
ia, ib := 0, 0
for ia < la && ib < lb {
var aName, bName string
aName, ia = decodeString(a.syms, a.data, ia)
bName, ib = decodeString(b.syms, b.data, ib)
if aName != bName {
if aName < bName {
return -1
}
return 1
}
var aValue, bValue string
aValue, ia = decodeString(a.syms, a.data, ia)
bValue, ib = decodeString(b.syms, b.data, ib)
if aValue != bValue {
if aValue < bValue {
return -1
}
return 1
}
}
// If all labels so far were in common, the set with fewer labels comes first.
return (la - ia) - (lb - ib)
}
// Copy labels from b on top of whatever was in ls previously, reusing memory or expanding if needed.
func (ls *Labels) CopyFrom(b Labels) {
*ls = b // Straightforward memberwise copy is all we need.
}
// IsEmpty returns true if ls represents an empty set of labels.
func (ls Labels) IsEmpty() bool {
return len(ls.data) == 0
}
// Len returns the number of labels; it is relatively slow.
func (ls Labels) Len() int {
count := 0
for i := 0; i < len(ls.data); {
_, i = decodeVarint(ls.data, i)
_, i = decodeVarint(ls.data, i)
count++
}
return count
}
// Range calls f on each label.
func (ls Labels) Range(f func(l Label)) {
for i := 0; i < len(ls.data); {
var lName, lValue string
lName, i = decodeString(ls.syms, ls.data, i)
lValue, i = decodeString(ls.syms, ls.data, i)
f(Label{Name: lName, Value: lValue})
}
}
// Validate calls f on each label. If f returns a non-nil error, then it returns that error cancelling the iteration.
func (ls Labels) Validate(f func(l Label) error) error {
for i := 0; i < len(ls.data); {
var lName, lValue string
lName, i = decodeString(ls.syms, ls.data, i)
lValue, i = decodeString(ls.syms, ls.data, i)
err := f(Label{Name: lName, Value: lValue})
if err != nil {
return err
}
}
return nil
}
// InternStrings calls intern on every string value inside ls, replacing them with what it returns.
func (ls *Labels) InternStrings(intern func(string) string) {
// TODO: remove these calls as there is nothing to do.
}
// ReleaseStrings calls release on every string value inside ls.
func (ls Labels) ReleaseStrings(release func(string)) {
// TODO: remove these calls as there is nothing to do.
}
// DropMetricName returns Labels with "__name__" removed.
func (ls Labels) DropMetricName() Labels {
for i := 0; i < len(ls.data); {
lName, i2 := decodeString(ls.syms, ls.data, i)
_, i2 = decodeVarint(ls.data, i2)
if lName == MetricName {
if i == 0 { // Make common case fast with no allocations.
ls.data = ls.data[i2:]
} else {
ls.data = ls.data[:i] + ls.data[i2:]
}
break
} else if lName[0] > MetricName[0] { // Stop looking if we've gone past.
break
}
i = i2
}
return ls
}
// Builder allows modifying Labels.
type Builder struct {
syms *SymbolTable
nums []int
base Labels
del []string
add []Label
}
// NewBuilderWithSymbolTable returns a new LabelsBuilder not based on any labels, but with the SymbolTable.
func NewBuilderWithSymbolTable(s *SymbolTable) *Builder {
return &Builder{
syms: s,
}
}
// Reset clears all current state for the builder.
func (b *Builder) Reset(base Labels) {
if base.syms != nil { // If base has a symbol table, use that.
b.syms = base.syms.symbolTable
} else if b.syms == nil { // Or continue using previous symbol table in builder.
b.syms = NewSymbolTable() // Don't do this in performance-sensitive code.
}
b.base = base
b.del = b.del[:0]
b.add = b.add[:0]
base.Range(func(l Label) {
if l.Value == "" {
b.del = append(b.del, l.Name)
}
})
}
// Labels returns the labels from the builder.
// If no modifications were made, the original labels are returned.
func (b *Builder) Labels() Labels {
if len(b.del) == 0 && len(b.add) == 0 {
return b.base
}
slices.SortFunc(b.add, func(a, b Label) int { return strings.Compare(a.Name, b.Name) })
slices.Sort(b.del)
a, d, newSize := 0, 0, 0
newSize, b.nums = mapLabelsToNumbers(b.syms, b.add, b.nums)
bufSize := len(b.base.data) + newSize
buf := make([]byte, 0, bufSize)
for pos := 0; pos < len(b.base.data); {
oldPos := pos
var lName string
lName, pos = decodeString(b.base.syms, b.base.data, pos)
_, pos = decodeVarint(b.base.data, pos)
for d < len(b.del) && b.del[d] < lName {
d++
}
if d < len(b.del) && b.del[d] == lName {
continue // This label has been deleted.
}
for ; a < len(b.add) && b.add[a].Name < lName; a++ {
buf = appendLabelTo(b.nums[a*2], b.nums[a*2+1], buf) // Insert label that was not in the base set.
}
if a < len(b.add) && b.add[a].Name == lName {
buf = appendLabelTo(b.nums[a*2], b.nums[a*2+1], buf)
a++
continue // This label has been replaced.
}
buf = append(buf, b.base.data[oldPos:pos]...) // If base had a symbol-table we are using it, so we don't need to look up these symbols.
}
// We have come to the end of the base set; add any remaining labels.
for ; a < len(b.add); a++ {
buf = appendLabelTo(b.nums[a*2], b.nums[a*2+1], buf)
}
return Labels{syms: b.syms.nameTable, data: yoloString(buf)}
}
func marshalNumbersToSizedBuffer(nums []int, data []byte) int {
i := len(data)
for index := len(nums) - 1; index >= 0; index-- {
i = encodeVarint(data, i, nums[index])
}
return len(data) - i
}
func sizeVarint(x uint64) (n int) {
// Most common case first
if x < 1<<15 {
return 2
}
if x < 1<<22 {
return 3
}
if x >= 1<<29 {
panic("Number too large to represent")
}
return 4
}
func encodeVarintSlow(data []byte, offset int, v uint64) int {
offset -= sizeVarint(v)
base := offset
data[offset] = uint8(v)
v >>= 8
offset++
for v >= 1<<7 {
data[offset] = uint8(v&0x7f | 0x80)
v >>= 7
offset++
}
data[offset] = uint8(v)
return base
}
// Special code for the common case that a value is less than 32768
func encodeVarint(data []byte, offset, v int) int {
if v < 1<<15 {
offset -= 2
data[offset] = uint8(v)
data[offset+1] = uint8(v >> 8)
return offset
}
return encodeVarintSlow(data, offset, uint64(v))
}
// Map all the strings in lbls to the symbol table; return the total size required to hold them and all the individual mappings.
func mapLabelsToNumbers(t *SymbolTable, lbls []Label, buf []int) (totalSize int, nums []int) {
nums = buf[:0]
t.mx.Lock()
defer t.mx.Unlock()
// we just encode name/value/name/value, without any extra tags or length bytes
for _, m := range lbls {
// strings are encoded as a single varint, the index into the symbol table.
i := t.toNumUnlocked(m.Name)
nums = append(nums, i)
totalSize += sizeVarint(uint64(i))
i = t.toNumUnlocked(m.Value)
nums = append(nums, i)
totalSize += sizeVarint(uint64(i))
}
return totalSize, nums
}
func appendLabelTo(nameNum, valueNum int, buf []byte) []byte {
size := sizeVarint(uint64(nameNum)) + sizeVarint(uint64(valueNum))
sizeRequired := len(buf) + size
if cap(buf) >= sizeRequired {
buf = buf[:sizeRequired]
} else {
bufSize := cap(buf)
// Double size of buffer each time it needs to grow, to amortise copying cost.
for bufSize < sizeRequired {
bufSize = bufSize*2 + 1
}
newBuf := make([]byte, sizeRequired, bufSize)
copy(newBuf, buf)
buf = newBuf
}
i := sizeRequired
i = encodeVarint(buf, i, valueNum)
i = encodeVarint(buf, i, nameNum)
return buf
}
// ScratchBuilder allows efficient construction of a Labels from scratch.
type ScratchBuilder struct {
syms *SymbolTable
nums []int
add []Label
output Labels
overwriteBuffer []byte
}
// NewScratchBuilder creates a ScratchBuilder initialized for Labels with n entries.
// Warning: expensive; don't call in tight loops.
func NewScratchBuilder(n int) ScratchBuilder {
return ScratchBuilder{syms: NewSymbolTable(), add: make([]Label, 0, n)}
}
// NewScratchBuilderWithSymbolTable creates a ScratchBuilder initialized for Labels with n entries.
func NewScratchBuilderWithSymbolTable(s *SymbolTable, n int) ScratchBuilder {
return ScratchBuilder{syms: s, add: make([]Label, 0, n)}
}
func (b *ScratchBuilder) SetSymbolTable(s *SymbolTable) {
b.syms = s
}
func (b *ScratchBuilder) Reset() {
b.add = b.add[:0]
b.output = EmptyLabels()
}
// Add a name/value pair.
// Note if you Add the same name twice you will get a duplicate label, which is invalid.
func (b *ScratchBuilder) Add(name, value string) {
b.add = append(b.add, Label{Name: name, Value: value})
}
// Add a name/value pair, using []byte instead of string to reduce memory allocations.
// The values must remain live until Labels() is called.
func (b *ScratchBuilder) UnsafeAddBytes(name, value []byte) {
b.add = append(b.add, Label{Name: yoloString(name), Value: yoloString(value)})
}
// Sort the labels added so far by name.
func (b *ScratchBuilder) Sort() {
slices.SortFunc(b.add, func(a, b Label) int { return strings.Compare(a.Name, b.Name) })
}
// Assign is for when you already have a Labels which you want this ScratchBuilder to return.
func (b *ScratchBuilder) Assign(l Labels) {
b.output = l
}
// Labels returns the name/value pairs added as a Labels object. Calling Add() after Labels() has no effect.
// Note: if you want them sorted, call Sort() first.
func (b *ScratchBuilder) Labels() Labels {
if b.output.IsEmpty() {
var size int
size, b.nums = mapLabelsToNumbers(b.syms, b.add, b.nums)
buf := make([]byte, size)
marshalNumbersToSizedBuffer(b.nums, buf)
b.output = Labels{syms: b.syms.nameTable, data: yoloString(buf)}
}
return b.output
}
// Write the newly-built Labels out to ls, reusing an internal buffer.
// Callers must ensure that there are no other references to ls, or any strings fetched from it.
func (b *ScratchBuilder) Overwrite(ls *Labels) {
var size int
size, b.nums = mapLabelsToNumbers(b.syms, b.add, b.nums)
if size <= cap(b.overwriteBuffer) {
b.overwriteBuffer = b.overwriteBuffer[:size]
} else {
b.overwriteBuffer = make([]byte, size)
}
marshalNumbersToSizedBuffer(b.nums, b.overwriteBuffer)
ls.syms = b.syms.nameTable
ls.data = yoloString(b.overwriteBuffer)
}