From 38bb6ece254024d3b490056e92aa7b777c8b845f Mon Sep 17 00:00:00 2001
From: Neeraj Gartia <80708727+NeerajGartia21@users.noreply.github.com>
Date: Tue, 26 Nov 2024 23:42:36 +0530
Subject: [PATCH] [BUGFIX] PromQL: Fix behaviour of some aggregations with
 histograms (#15432)

promql: fix some aggregations for histograms

This PR fixes the behaviour of `topk`,`bottomk`, `limitk` and `limit_ratio` with histograms. The fixed behaviour are as follows:
- For `topk` and `bottomk` histograms are ignored and add info annotations added.
- For `limitk` and `limit_ratio` histograms are included in the results(if applicable).

Signed-off-by: Neeraj Gartia <neerajgartia211002@gmail.com>
---
 promql/engine.go                            |  53 ++++++++--
 promql/promqltest/testdata/aggregators.test |  43 ++++++++
 promql/promqltest/testdata/limit.test       | 111 +++++++++++++-------
 3 files changed, 161 insertions(+), 46 deletions(-)

diff --git a/promql/engine.go b/promql/engine.go
index fc8256dd79..88fa5e8141 100644
--- a/promql/engine.go
+++ b/promql/engine.go
@@ -3187,18 +3187,19 @@ func (ev *evaluator) aggregationK(e *parser.AggregateExpr, k int, r float64, inp
 
 seriesLoop:
 	for si := range inputMatrix {
-		f, _, ok := ev.nextValues(enh.Ts, &inputMatrix[si])
+		f, h, ok := ev.nextValues(enh.Ts, &inputMatrix[si])
 		if !ok {
 			continue
 		}
-		s = Sample{Metric: inputMatrix[si].Metric, F: f, DropName: inputMatrix[si].DropName}
+		s = Sample{Metric: inputMatrix[si].Metric, F: f, H: h, DropName: inputMatrix[si].DropName}
 
 		group := &groups[seriesToResult[si]]
 		// Initialize this group if it's the first time we've seen it.
 		if !group.seen {
 			// LIMIT_RATIO is a special case, as we may not add this very sample to the heap,
 			// while we also don't know the final size of it.
-			if op == parser.LIMIT_RATIO {
+			switch op {
+			case parser.LIMIT_RATIO:
 				*group = groupedAggregation{
 					seen: true,
 					heap: make(vectorByValueHeap, 0),
@@ -3206,12 +3207,34 @@ seriesLoop:
 				if ratiosampler.AddRatioSample(r, &s) {
 					heap.Push(&group.heap, &s)
 				}
-			} else {
+			case parser.LIMITK:
 				*group = groupedAggregation{
 					seen: true,
 					heap: make(vectorByValueHeap, 1, k),
 				}
 				group.heap[0] = s
+			case parser.TOPK:
+				*group = groupedAggregation{
+					seen: true,
+					heap: make(vectorByValueHeap, 0, k),
+				}
+				if s.H != nil {
+					group.seen = false
+					annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("topk", e.PosRange))
+				} else {
+					heap.Push(&group.heap, &s)
+				}
+			case parser.BOTTOMK:
+				*group = groupedAggregation{
+					seen: true,
+					heap: make(vectorByValueHeap, 0, k),
+				}
+				if s.H != nil {
+					group.seen = false
+					annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("bottomk", e.PosRange))
+				} else {
+					heap.Push(&group.heap, &s)
+				}
 			}
 			continue
 		}
@@ -3220,6 +3243,9 @@ seriesLoop:
 		case parser.TOPK:
 			// We build a heap of up to k elements, with the smallest element at heap[0].
 			switch {
+			case s.H != nil:
+				// Ignore histogram sample and add info annotation.
+				annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("topk", e.PosRange))
 			case len(group.heap) < k:
 				heap.Push(&group.heap, &s)
 			case group.heap[0].F < s.F || (math.IsNaN(group.heap[0].F) && !math.IsNaN(s.F)):
@@ -3233,6 +3259,9 @@ seriesLoop:
 		case parser.BOTTOMK:
 			// We build a heap of up to k elements, with the biggest element at heap[0].
 			switch {
+			case s.H != nil:
+				// Ignore histogram sample and add info annotation.
+				annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("bottomk", e.PosRange))
 			case len(group.heap) < k:
 				heap.Push((*vectorByReverseValueHeap)(&group.heap), &s)
 			case group.heap[0].F > s.F || (math.IsNaN(group.heap[0].F) && !math.IsNaN(s.F)):
@@ -3275,10 +3304,14 @@ seriesLoop:
 		mat = make(Matrix, 0, len(groups))
 	}
 
-	add := func(lbls labels.Labels, f float64, dropName bool) {
+	add := func(lbls labels.Labels, f float64, h *histogram.FloatHistogram, dropName bool) {
 		// If this could be an instant query, add directly to the matrix so the result is in consistent order.
 		if ev.endTimestamp == ev.startTimestamp {
-			mat = append(mat, Series{Metric: lbls, Floats: []FPoint{{T: enh.Ts, F: f}}, DropName: dropName})
+			if h != nil {
+				mat = append(mat, Series{Metric: lbls, Histograms: []HPoint{{T: enh.Ts, H: h}}, DropName: dropName})
+			} else {
+				mat = append(mat, Series{Metric: lbls, Floats: []FPoint{{T: enh.Ts, F: f}}, DropName: dropName})
+			}
 		} else {
 			// Otherwise the results are added into seriess elements.
 			hash := lbls.Hash()
@@ -3286,7 +3319,7 @@ seriesLoop:
 			if !ok {
 				ss = Series{Metric: lbls, DropName: dropName}
 			}
-			addToSeries(&ss, enh.Ts, f, nil, numSteps)
+			addToSeries(&ss, enh.Ts, f, h, numSteps)
 			seriess[hash] = ss
 		}
 	}
@@ -3301,7 +3334,7 @@ seriesLoop:
 				sort.Sort(sort.Reverse(aggr.heap))
 			}
 			for _, v := range aggr.heap {
-				add(v.Metric, v.F, v.DropName)
+				add(v.Metric, v.F, v.H, v.DropName)
 			}
 
 		case parser.BOTTOMK:
@@ -3310,12 +3343,12 @@ seriesLoop:
 				sort.Sort(sort.Reverse((*vectorByReverseValueHeap)(&aggr.heap)))
 			}
 			for _, v := range aggr.heap {
-				add(v.Metric, v.F, v.DropName)
+				add(v.Metric, v.F, v.H, v.DropName)
 			}
 
 		case parser.LIMITK, parser.LIMIT_RATIO:
 			for _, v := range aggr.heap {
-				add(v.Metric, v.F, v.DropName)
+				add(v.Metric, v.F, v.H, v.DropName)
 			}
 		}
 	}
diff --git a/promql/promqltest/testdata/aggregators.test b/promql/promqltest/testdata/aggregators.test
index 00f393a865..244e69273b 100644
--- a/promql/promqltest/testdata/aggregators.test
+++ b/promql/promqltest/testdata/aggregators.test
@@ -272,6 +272,8 @@ load 5m
 	http_requests{job="app-server", instance="1", group="production"}	0+60x10
 	http_requests{job="app-server", instance="0", group="canary"}		0+70x10
 	http_requests{job="app-server", instance="1", group="canary"}		0+80x10
+	http_requests_histogram{job="app-server", instance="2", group="canary"}		{{schema:0 sum:10 count:10}}x11
+	http_requests_histogram{job="api-server", instance="3", group="production"}	{{schema:0 sum:20 count:20}}x11
 	foo 3+0x10
 
 eval_ordered instant at 50m topk(3, http_requests)
@@ -338,6 +340,47 @@ eval_ordered instant at 50m topk(scalar(foo), http_requests)
 	http_requests{group="canary", instance="0", job="app-server"} 700
 	http_requests{group="production", instance="1", job="app-server"} 600
 
+# Tests for histogram: should ignore histograms.
+eval_info instant at 50m topk(100, http_requests_histogram)
+    #empty
+
+eval_info range from 0 to 50m step 5m topk(100, http_requests_histogram)
+    #empty
+
+eval_info instant at 50m topk(1, {__name__=~"http_requests(_histogram)?"})
+    {__name__="http_requests", group="canary", instance="1", job="app-server"} 800
+
+eval_info instant at 50m count(topk(1000, {__name__=~"http_requests(_histogram)?"}))
+    {} 9
+
+eval_info range from 0 to 50m step 5m count(topk(1000, {__name__=~"http_requests(_histogram)?"}))
+    {} 9x10
+
+eval_info instant at 50m topk by (instance) (1, {__name__=~"http_requests(_histogram)?"})
+    {__name__="http_requests", group="canary", instance="0", job="app-server"} 700
+    {__name__="http_requests", group="canary", instance="1", job="app-server"} 800
+    {__name__="http_requests", group="production", instance="2", job="api-server"} NaN
+
+eval_info instant at 50m bottomk(100, http_requests_histogram)
+    #empty
+
+eval_info range from 0 to 50m step 5m bottomk(100, http_requests_histogram)
+    #empty
+
+eval_info instant at 50m bottomk(1, {__name__=~"http_requests(_histogram)?"})
+    {__name__="http_requests", group="production", instance="0", job="api-server"} 100
+
+eval_info instant at 50m count(bottomk(1000, {__name__=~"http_requests(_histogram)?"}))
+    {} 9
+
+eval_info range from 0 to 50m step 5m count(bottomk(1000, {__name__=~"http_requests(_histogram)?"}))
+    {} 9x10
+
+eval_info instant at 50m bottomk by (instance) (1, {__name__=~"http_requests(_histogram)?"})
+    {__name__="http_requests", group="production", instance="0", job="api-server"} 100
+    {__name__="http_requests", group="production", instance="1", job="api-server"} 200
+    {__name__="http_requests", group="production", instance="2", job="api-server"} NaN
+
 clear
 
 # Tests for count_values.
diff --git a/promql/promqltest/testdata/limit.test b/promql/promqltest/testdata/limit.test
index 0ab363f9ae..e6dd007af4 100644
--- a/promql/promqltest/testdata/limit.test
+++ b/promql/promqltest/testdata/limit.test
@@ -9,6 +9,8 @@ load 5m
 	http_requests{job="api-server", instance="1", group="canary"}		0+40x10
 	http_requests{job="api-server", instance="2", group="canary"}		0+50x10
 	http_requests{job="api-server", instance="3", group="canary"}		0+60x10
+	http_requests{job="api-server", instance="histogram_1", group="canary"}       {{schema:0 sum:10 count:10}}x11
+	http_requests{job="api-server", instance="histogram_2", group="canary"}       {{schema:0 sum:20 count:20}}x11
 
 eval instant at 50m count(limitk by (group) (0, http_requests))
 # empty
@@ -16,25 +18,56 @@ eval instant at 50m count(limitk by (group) (0, http_requests))
 eval instant at 50m count(limitk by (group) (-1, http_requests))
 # empty
 
-# Exercise k==1 special case (as sample is added before the main series loop
+# Exercise k==1 special case (as sample is added before the main series loop).
 eval instant at 50m count(limitk by (group) (1, http_requests) and http_requests)
-        {} 2
+    {} 2
 
 eval instant at 50m count(limitk by (group) (2, http_requests) and http_requests)
-        {} 4
+    {} 4
 
 eval instant at 50m count(limitk(100, http_requests) and http_requests)
-        {} 6
+    {} 8
 
-# Exercise k==1 special case (as sample is added before the main series loop
+# Exercise k==1 special case (as sample is added before the main series loop).
 eval instant at 50m count(limitk by (group) (1, http_requests) and http_requests)
-        {} 2
+    {} 2
 
 eval instant at 50m count(limitk by (group) (2, http_requests) and http_requests)
-        {} 4
+    {} 4
 
 eval instant at 50m count(limitk(100, http_requests) and http_requests)
-        {} 6
+    {} 8
+
+# Test for histograms.
+# k==1: verify that histogram is included in the result.
+eval instant at 50m limitk(1, http_requests{instance="histogram_1"})
+    {__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}
+
+eval range from 0 to 50m step 5m limitk(1, http_requests{instance="histogram_1"})
+    {__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}x10
+
+# Histogram is included with mix of floats as well.
+eval instant at 50m limitk(8, http_requests{instance=~"(histogram_2|0)"})
+    {__name__="http_requests", group="canary", instance="histogram_2", job="api-server"} {{count:20 sum:20}}
+    {__name__="http_requests", group="production", instance="0", job="api-server"} 100
+    {__name__="http_requests", group="canary", instance="0", job="api-server"} 300
+
+eval range from 0 to 50m step 5m limitk(8, http_requests{instance=~"(histogram_2|0)"})
+    {__name__="http_requests", group="canary", instance="histogram_2", job="api-server"} {{count:20 sum:20}}x10
+    {__name__="http_requests", group="production", instance="0", job="api-server"} 0+10x10
+    {__name__="http_requests", group="canary", instance="0", job="api-server"} 0+30x10
+
+eval instant at 50m count(limitk(2, http_requests{instance=~"histogram_[0-9]"}))
+    {} 2
+
+eval range from 0 to 50m step 5m count(limitk(2, http_requests{instance=~"histogram_[0-9]"}))
+    {} 2+0x10
+
+eval instant at 50m count(limitk(1000, http_requests{instance=~"histogram_[0-9]"}))
+    {} 2
+
+eval range from 0 to 50m step 5m count(limitk(1000, http_requests{instance=~"histogram_[0-9]"}))
+    {} 2+0x10
 
 # limit_ratio
 eval range from 0 to 50m step 5m count(limit_ratio(0.0, http_requests))
@@ -42,9 +75,9 @@ eval range from 0 to 50m step 5m count(limit_ratio(0.0, http_requests))
 
 # limitk(2, ...) should always return a 2-count subset of the timeseries (hence the AND'ing)
 eval range from 0 to 50m step 5m count(limitk(2, http_requests) and http_requests)
-        {} 2+0x10
+    {} 2+0x10
 
-# Tests for limit_ratio
+# Tests for limit_ratio.
 #
 # NB: below 0.5 ratio will depend on some hashing "luck" (also there's no guarantee that
 # an integer comes from: total number of series * ratio), as it depends on:
@@ -56,50 +89,50 @@ eval range from 0 to 50m step 5m count(limitk(2, http_requests) and http_request
 #
 # See `AddRatioSample()` in promql/engine.go for more details.
 
-# Half~ish samples: verify we get "near" 3 (of 0.5 * 6)
-eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) <= bool (3+1)
-        {} 1+0x10
+# Half~ish samples: verify we get "near" 3 (of 0.5 * 6).
+eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) <= bool (4+1)
+    {} 1+0x10
 
-eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) >= bool (3-1)
-        {} 1+0x10
+eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) >= bool (4-1)
+    {} 1+0x10
 
-# All samples
+# All samples.
 eval range from 0 to 50m step 5m count(limit_ratio(1.0, http_requests) and http_requests)
-        {} 6+0x10
+    {} 8+0x10
 
-# All samples
+# All samples.
 eval range from 0 to 50m step 5m count(limit_ratio(-1.0, http_requests) and http_requests)
-        {} 6+0x10
+    {} 8+0x10
 
-# Capped to 1.0 -> all samples
+# Capped to 1.0 -> all samples.
 eval_warn range from 0 to 50m step 5m count(limit_ratio(1.1, http_requests) and http_requests)
-        {} 6+0x10
+    {} 8+0x10
 
-# Capped to -1.0 -> all samples
+# Capped to -1.0 -> all samples.
 eval_warn range from 0 to 50m step 5m count(limit_ratio(-1.1, http_requests) and http_requests)
-        {} 6+0x10
+    {} 8+0x10
 
-# Verify that limit_ratio(value) and limit_ratio(1.0-value) return the "complement" of each other
-# Complement below for [0.2, -0.8]
+# Verify that limit_ratio(value) and limit_ratio(1.0-value) return the "complement" of each other.
+# Complement below for [0.2, -0.8].
 #
-# Complement 1of2: `or` should return all samples
+# Complement 1of2: `or` should return all samples.
 eval range from 0 to 50m step 5m count(limit_ratio(0.2, http_requests) or limit_ratio(-0.8, http_requests))
-        {} 6+0x10
+    {} 8+0x10
 
-# Complement 2of2: `and` should return no samples
+# Complement 2of2: `and` should return no samples.
 eval range from 0 to 50m step 5m count(limit_ratio(0.2, http_requests) and limit_ratio(-0.8, http_requests))
 # empty
 
-# Complement below for [0.5, -0.5]
+# Complement below for [0.5, -0.5].
 eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) or limit_ratio(-0.5, http_requests))
-        {} 6+0x10
+    {} 8+0x10
 
 eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and limit_ratio(-0.5, http_requests))
 # empty
 
-# Complement below for [0.8, -0.2]
+# Complement below for [0.8, -0.2].
 eval range from 0 to 50m step 5m count(limit_ratio(0.8, http_requests) or limit_ratio(-0.2, http_requests))
-        {} 6+0x10
+    {} 8+0x10
 
 eval range from 0 to 50m step 5m count(limit_ratio(0.8, http_requests) and limit_ratio(-0.2, http_requests))
 # empty
@@ -107,13 +140,19 @@ eval range from 0 to 50m step 5m count(limit_ratio(0.8, http_requests) and limit
 # Complement below for [some_ratio, 1.0 - some_ratio], some_ratio derived from time(),
 # using a small prime number to avoid rounded ratio values, and a small set of them.
 eval range from 0 to 50m step 5m count(limit_ratio(time() % 17/17, http_requests) or limit_ratio(1.0 - (time() % 17/17), http_requests))
-        {} 6+0x10
+    {} 8+0x10
 
 eval range from 0 to 50m step 5m count(limit_ratio(time() % 17/17, http_requests) and limit_ratio(1.0 - (time() % 17/17), http_requests))
 # empty
 
-# Poor man's normality check: ok (loaded samples follow a nice linearity over labels and time)
-# The check giving: 1 (i.e. true)
-eval range from 0 to 50m step 5m abs(avg(limit_ratio(0.5, http_requests)) - avg(limit_ratio(-0.5, http_requests))) <= bool stddev(http_requests)
+# Poor man's normality check: ok (loaded samples follow a nice linearity over labels and time).
+# The check giving: 1 (i.e. true).
+eval range from 0 to 50m step 5m abs(avg(limit_ratio(0.5, http_requests{instance!~"histogram_[0-9]"})) - avg(limit_ratio(-0.5, http_requests{instance!~"histogram_[0-9]"}))) <= bool stddev(http_requests{instance!~"histogram_[0-9]"})
 	{} 1+0x10
 
+# All specified histograms are included for r=1.
+eval instant at 50m limit_ratio(1, http_requests{instance="histogram_1"})
+    {__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}
+
+eval range from 0 to 50m step 5m limit_ratio(1, http_requests{instance="histogram_1"})
+    {__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}x10