From ac4f8a5e23e6c03a12e694313cb7eac4f58e4296 Mon Sep 17 00:00:00 2001
From: Bryan Boreham <bjboreham@gmail.com>
Date: Mon, 16 Dec 2024 09:42:52 +0000
Subject: [PATCH] [ENHANCEMENT] TSDB: Improve calculation of space used by
 labels (#13880)

* [ENHANCEMENT] TSDB: Improve calculation of space used by labels

The labels for each series in the Head take up some space in the
Postings index, but far more space in the `memSeries` structure.

Instead of having the Postings index calculate this overhead, which is
a layering violation, have the caller pass in a function to do it.

Provide three implementations of this function for the three Labels
versions.

Signed-off-by: Bryan Boreham <bjboreham@gmail.com>
---
 model/labels/labels.go              | 6 ++++++
 model/labels/labels_dedupelabels.go | 5 +++++
 model/labels/labels_stringlabels.go | 5 +++++
 tsdb/head.go                        | 2 +-
 tsdb/index/postings.go              | 5 +++--
 tsdb/index/postings_test.go         | 7 ++++---
 6 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/model/labels/labels.go b/model/labels/labels.go
index f4de7496ce..0747ab90d9 100644
--- a/model/labels/labels.go
+++ b/model/labels/labels.go
@@ -19,6 +19,7 @@ import (
 	"bytes"
 	"slices"
 	"strings"
+	"unsafe"
 
 	"github.com/cespare/xxhash/v2"
 )
@@ -488,3 +489,8 @@ func (b *ScratchBuilder) Labels() Labels {
 func (b *ScratchBuilder) Overwrite(ls *Labels) {
 	*ls = append((*ls)[:0], b.add...)
 }
+
+// SizeOfLabels returns the approximate space required for n copies of a label.
+func SizeOfLabels(name, value string, n uint64) uint64 {
+	return (uint64(len(name)) + uint64(unsafe.Sizeof(name)) + uint64(len(value)) + uint64(unsafe.Sizeof(value))) * n
+}
diff --git a/model/labels/labels_dedupelabels.go b/model/labels/labels_dedupelabels.go
index da8a88cc15..a0d83e0044 100644
--- a/model/labels/labels_dedupelabels.go
+++ b/model/labels/labels_dedupelabels.go
@@ -815,3 +815,8 @@ func (b *ScratchBuilder) Overwrite(ls *Labels) {
 	ls.syms = b.syms.nameTable
 	ls.data = yoloString(b.overwriteBuffer)
 }
+
+// SizeOfLabels returns the approximate space required for n copies of a label.
+func SizeOfLabels(name, value string, n uint64) uint64 {
+	return uint64(len(name)+len(value)) + n*4 // Assuming most symbol-table entries are 2 bytes long.
+}
diff --git a/model/labels/labels_stringlabels.go b/model/labels/labels_stringlabels.go
index c64bb990e0..f49ed96f65 100644
--- a/model/labels/labels_stringlabels.go
+++ b/model/labels/labels_stringlabels.go
@@ -691,3 +691,8 @@ func NewScratchBuilderWithSymbolTable(_ *SymbolTable, n int) ScratchBuilder {
 func (b *ScratchBuilder) SetSymbolTable(_ *SymbolTable) {
 	// no-op
 }
+
+// SizeOfLabels returns the approximate space required for n copies of a label.
+func SizeOfLabels(name, value string, n uint64) uint64 {
+	return uint64(labelSize(&Label{Name: name, Value: value})) * n
+}
diff --git a/tsdb/head.go b/tsdb/head.go
index c67c438e52..47f85d7713 100644
--- a/tsdb/head.go
+++ b/tsdb/head.go
@@ -1048,7 +1048,7 @@ func (h *Head) PostingsCardinalityStats(statsByLabelName string, limit int) *ind
 		return h.cardinalityCache
 	}
 	h.cardinalityCacheKey = cacheKey
-	h.cardinalityCache = h.postings.Stats(statsByLabelName, limit)
+	h.cardinalityCache = h.postings.Stats(statsByLabelName, limit, labels.SizeOfLabels)
 	h.lastPostingsStatsCall = time.Duration(time.Now().Unix()) * time.Second
 
 	return h.cardinalityCache
diff --git a/tsdb/index/postings.go b/tsdb/index/postings.go
index ea32ba5632..f9a284bc70 100644
--- a/tsdb/index/postings.go
+++ b/tsdb/index/postings.go
@@ -190,7 +190,8 @@ type PostingsStats struct {
 }
 
 // Stats calculates the cardinality statistics from postings.
-func (p *MemPostings) Stats(label string, limit int) *PostingsStats {
+// The caller must pass in a non-nil function which computes the space required for n series with a given label.
+func (p *MemPostings) Stats(label string, limit int, labelSizeFunc func(string, string, uint64) uint64) *PostingsStats {
 	var size uint64
 	p.mtx.RLock()
 
@@ -218,7 +219,7 @@ func (p *MemPostings) Stats(label string, limit int) *PostingsStats {
 			}
 			seriesCnt := uint64(len(values))
 			labelValuePairs.push(Stat{Name: n + "=" + name, Count: seriesCnt})
-			size += uint64(len(name)) * seriesCnt
+			size += labelSizeFunc(n, name, seriesCnt)
 		}
 		labelValueLength.push(Stat{Name: n, Count: size})
 	}
diff --git a/tsdb/index/postings_test.go b/tsdb/index/postings_test.go
index 6ff5b9c060..6dd9f25bc0 100644
--- a/tsdb/index/postings_test.go
+++ b/tsdb/index/postings_test.go
@@ -939,7 +939,7 @@ func BenchmarkPostings_Stats(b *testing.B) {
 	}
 	b.ResetTimer()
 	for n := 0; n < b.N; n++ {
-		p.Stats("__name__", 10)
+		p.Stats("__name__", 10, labels.SizeOfLabels)
 	}
 }
 
@@ -954,7 +954,8 @@ func TestMemPostingsStats(t *testing.T) {
 	p.Add(2, labels.FromStrings("label", "value1"))
 
 	// call the Stats method to calculate the cardinality statistics
-	stats := p.Stats("label", 10)
+	// passing a fake calculation so we get the same result regardless of compilation -tags.
+	stats := p.Stats("label", 10, func(name, value string, n uint64) uint64 { return uint64(len(name)+len(value)) * n })
 
 	// assert that the expected statistics were calculated
 	require.Equal(t, uint64(2), stats.CardinalityMetricsStats[0].Count)
@@ -963,7 +964,7 @@ func TestMemPostingsStats(t *testing.T) {
 	require.Equal(t, uint64(3), stats.CardinalityLabelStats[0].Count)
 	require.Equal(t, "label", stats.CardinalityLabelStats[0].Name)
 
-	require.Equal(t, uint64(24), stats.LabelValueStats[0].Count)
+	require.Equal(t, uint64(44), stats.LabelValueStats[0].Count)
 	require.Equal(t, "label", stats.LabelValueStats[0].Name)
 
 	require.Equal(t, uint64(2), stats.LabelValuePairsStats[0].Count)