From 38bb6ece254024d3b490056e92aa7b777c8b845f Mon Sep 17 00:00:00 2001 From: Neeraj Gartia <80708727+NeerajGartia21@users.noreply.github.com> Date: Tue, 26 Nov 2024 23:42:36 +0530 Subject: [PATCH] [BUGFIX] PromQL: Fix behaviour of some aggregations with histograms (#15432) promql: fix some aggregations for histograms This PR fixes the behaviour of `topk`,`bottomk`, `limitk` and `limit_ratio` with histograms. The fixed behaviour are as follows: - For `topk` and `bottomk` histograms are ignored and add info annotations added. - For `limitk` and `limit_ratio` histograms are included in the results(if applicable). Signed-off-by: Neeraj Gartia --- promql/engine.go | 53 ++++++++-- promql/promqltest/testdata/aggregators.test | 43 ++++++++ promql/promqltest/testdata/limit.test | 111 +++++++++++++------- 3 files changed, 161 insertions(+), 46 deletions(-) diff --git a/promql/engine.go b/promql/engine.go index fc8256dd79..88fa5e8141 100644 --- a/promql/engine.go +++ b/promql/engine.go @@ -3187,18 +3187,19 @@ func (ev *evaluator) aggregationK(e *parser.AggregateExpr, k int, r float64, inp seriesLoop: for si := range inputMatrix { - f, _, ok := ev.nextValues(enh.Ts, &inputMatrix[si]) + f, h, ok := ev.nextValues(enh.Ts, &inputMatrix[si]) if !ok { continue } - s = Sample{Metric: inputMatrix[si].Metric, F: f, DropName: inputMatrix[si].DropName} + s = Sample{Metric: inputMatrix[si].Metric, F: f, H: h, DropName: inputMatrix[si].DropName} group := &groups[seriesToResult[si]] // Initialize this group if it's the first time we've seen it. if !group.seen { // LIMIT_RATIO is a special case, as we may not add this very sample to the heap, // while we also don't know the final size of it. - if op == parser.LIMIT_RATIO { + switch op { + case parser.LIMIT_RATIO: *group = groupedAggregation{ seen: true, heap: make(vectorByValueHeap, 0), @@ -3206,12 +3207,34 @@ seriesLoop: if ratiosampler.AddRatioSample(r, &s) { heap.Push(&group.heap, &s) } - } else { + case parser.LIMITK: *group = groupedAggregation{ seen: true, heap: make(vectorByValueHeap, 1, k), } group.heap[0] = s + case parser.TOPK: + *group = groupedAggregation{ + seen: true, + heap: make(vectorByValueHeap, 0, k), + } + if s.H != nil { + group.seen = false + annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("topk", e.PosRange)) + } else { + heap.Push(&group.heap, &s) + } + case parser.BOTTOMK: + *group = groupedAggregation{ + seen: true, + heap: make(vectorByValueHeap, 0, k), + } + if s.H != nil { + group.seen = false + annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("bottomk", e.PosRange)) + } else { + heap.Push(&group.heap, &s) + } } continue } @@ -3220,6 +3243,9 @@ seriesLoop: case parser.TOPK: // We build a heap of up to k elements, with the smallest element at heap[0]. switch { + case s.H != nil: + // Ignore histogram sample and add info annotation. + annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("topk", e.PosRange)) case len(group.heap) < k: heap.Push(&group.heap, &s) case group.heap[0].F < s.F || (math.IsNaN(group.heap[0].F) && !math.IsNaN(s.F)): @@ -3233,6 +3259,9 @@ seriesLoop: case parser.BOTTOMK: // We build a heap of up to k elements, with the biggest element at heap[0]. switch { + case s.H != nil: + // Ignore histogram sample and add info annotation. + annos.Add(annotations.NewHistogramIgnoredInAggregationInfo("bottomk", e.PosRange)) case len(group.heap) < k: heap.Push((*vectorByReverseValueHeap)(&group.heap), &s) case group.heap[0].F > s.F || (math.IsNaN(group.heap[0].F) && !math.IsNaN(s.F)): @@ -3275,10 +3304,14 @@ seriesLoop: mat = make(Matrix, 0, len(groups)) } - add := func(lbls labels.Labels, f float64, dropName bool) { + add := func(lbls labels.Labels, f float64, h *histogram.FloatHistogram, dropName bool) { // If this could be an instant query, add directly to the matrix so the result is in consistent order. if ev.endTimestamp == ev.startTimestamp { - mat = append(mat, Series{Metric: lbls, Floats: []FPoint{{T: enh.Ts, F: f}}, DropName: dropName}) + if h != nil { + mat = append(mat, Series{Metric: lbls, Histograms: []HPoint{{T: enh.Ts, H: h}}, DropName: dropName}) + } else { + mat = append(mat, Series{Metric: lbls, Floats: []FPoint{{T: enh.Ts, F: f}}, DropName: dropName}) + } } else { // Otherwise the results are added into seriess elements. hash := lbls.Hash() @@ -3286,7 +3319,7 @@ seriesLoop: if !ok { ss = Series{Metric: lbls, DropName: dropName} } - addToSeries(&ss, enh.Ts, f, nil, numSteps) + addToSeries(&ss, enh.Ts, f, h, numSteps) seriess[hash] = ss } } @@ -3301,7 +3334,7 @@ seriesLoop: sort.Sort(sort.Reverse(aggr.heap)) } for _, v := range aggr.heap { - add(v.Metric, v.F, v.DropName) + add(v.Metric, v.F, v.H, v.DropName) } case parser.BOTTOMK: @@ -3310,12 +3343,12 @@ seriesLoop: sort.Sort(sort.Reverse((*vectorByReverseValueHeap)(&aggr.heap))) } for _, v := range aggr.heap { - add(v.Metric, v.F, v.DropName) + add(v.Metric, v.F, v.H, v.DropName) } case parser.LIMITK, parser.LIMIT_RATIO: for _, v := range aggr.heap { - add(v.Metric, v.F, v.DropName) + add(v.Metric, v.F, v.H, v.DropName) } } } diff --git a/promql/promqltest/testdata/aggregators.test b/promql/promqltest/testdata/aggregators.test index 00f393a865..244e69273b 100644 --- a/promql/promqltest/testdata/aggregators.test +++ b/promql/promqltest/testdata/aggregators.test @@ -272,6 +272,8 @@ load 5m http_requests{job="app-server", instance="1", group="production"} 0+60x10 http_requests{job="app-server", instance="0", group="canary"} 0+70x10 http_requests{job="app-server", instance="1", group="canary"} 0+80x10 + http_requests_histogram{job="app-server", instance="2", group="canary"} {{schema:0 sum:10 count:10}}x11 + http_requests_histogram{job="api-server", instance="3", group="production"} {{schema:0 sum:20 count:20}}x11 foo 3+0x10 eval_ordered instant at 50m topk(3, http_requests) @@ -338,6 +340,47 @@ eval_ordered instant at 50m topk(scalar(foo), http_requests) http_requests{group="canary", instance="0", job="app-server"} 700 http_requests{group="production", instance="1", job="app-server"} 600 +# Tests for histogram: should ignore histograms. +eval_info instant at 50m topk(100, http_requests_histogram) + #empty + +eval_info range from 0 to 50m step 5m topk(100, http_requests_histogram) + #empty + +eval_info instant at 50m topk(1, {__name__=~"http_requests(_histogram)?"}) + {__name__="http_requests", group="canary", instance="1", job="app-server"} 800 + +eval_info instant at 50m count(topk(1000, {__name__=~"http_requests(_histogram)?"})) + {} 9 + +eval_info range from 0 to 50m step 5m count(topk(1000, {__name__=~"http_requests(_histogram)?"})) + {} 9x10 + +eval_info instant at 50m topk by (instance) (1, {__name__=~"http_requests(_histogram)?"}) + {__name__="http_requests", group="canary", instance="0", job="app-server"} 700 + {__name__="http_requests", group="canary", instance="1", job="app-server"} 800 + {__name__="http_requests", group="production", instance="2", job="api-server"} NaN + +eval_info instant at 50m bottomk(100, http_requests_histogram) + #empty + +eval_info range from 0 to 50m step 5m bottomk(100, http_requests_histogram) + #empty + +eval_info instant at 50m bottomk(1, {__name__=~"http_requests(_histogram)?"}) + {__name__="http_requests", group="production", instance="0", job="api-server"} 100 + +eval_info instant at 50m count(bottomk(1000, {__name__=~"http_requests(_histogram)?"})) + {} 9 + +eval_info range from 0 to 50m step 5m count(bottomk(1000, {__name__=~"http_requests(_histogram)?"})) + {} 9x10 + +eval_info instant at 50m bottomk by (instance) (1, {__name__=~"http_requests(_histogram)?"}) + {__name__="http_requests", group="production", instance="0", job="api-server"} 100 + {__name__="http_requests", group="production", instance="1", job="api-server"} 200 + {__name__="http_requests", group="production", instance="2", job="api-server"} NaN + clear # Tests for count_values. diff --git a/promql/promqltest/testdata/limit.test b/promql/promqltest/testdata/limit.test index 0ab363f9ae..e6dd007af4 100644 --- a/promql/promqltest/testdata/limit.test +++ b/promql/promqltest/testdata/limit.test @@ -9,6 +9,8 @@ load 5m http_requests{job="api-server", instance="1", group="canary"} 0+40x10 http_requests{job="api-server", instance="2", group="canary"} 0+50x10 http_requests{job="api-server", instance="3", group="canary"} 0+60x10 + http_requests{job="api-server", instance="histogram_1", group="canary"} {{schema:0 sum:10 count:10}}x11 + http_requests{job="api-server", instance="histogram_2", group="canary"} {{schema:0 sum:20 count:20}}x11 eval instant at 50m count(limitk by (group) (0, http_requests)) # empty @@ -16,25 +18,56 @@ eval instant at 50m count(limitk by (group) (0, http_requests)) eval instant at 50m count(limitk by (group) (-1, http_requests)) # empty -# Exercise k==1 special case (as sample is added before the main series loop +# Exercise k==1 special case (as sample is added before the main series loop). eval instant at 50m count(limitk by (group) (1, http_requests) and http_requests) - {} 2 + {} 2 eval instant at 50m count(limitk by (group) (2, http_requests) and http_requests) - {} 4 + {} 4 eval instant at 50m count(limitk(100, http_requests) and http_requests) - {} 6 + {} 8 -# Exercise k==1 special case (as sample is added before the main series loop +# Exercise k==1 special case (as sample is added before the main series loop). eval instant at 50m count(limitk by (group) (1, http_requests) and http_requests) - {} 2 + {} 2 eval instant at 50m count(limitk by (group) (2, http_requests) and http_requests) - {} 4 + {} 4 eval instant at 50m count(limitk(100, http_requests) and http_requests) - {} 6 + {} 8 + +# Test for histograms. +# k==1: verify that histogram is included in the result. +eval instant at 50m limitk(1, http_requests{instance="histogram_1"}) + {__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}} + +eval range from 0 to 50m step 5m limitk(1, http_requests{instance="histogram_1"}) + {__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}x10 + +# Histogram is included with mix of floats as well. +eval instant at 50m limitk(8, http_requests{instance=~"(histogram_2|0)"}) + {__name__="http_requests", group="canary", instance="histogram_2", job="api-server"} {{count:20 sum:20}} + {__name__="http_requests", group="production", instance="0", job="api-server"} 100 + {__name__="http_requests", group="canary", instance="0", job="api-server"} 300 + +eval range from 0 to 50m step 5m limitk(8, http_requests{instance=~"(histogram_2|0)"}) + {__name__="http_requests", group="canary", instance="histogram_2", job="api-server"} {{count:20 sum:20}}x10 + {__name__="http_requests", group="production", instance="0", job="api-server"} 0+10x10 + {__name__="http_requests", group="canary", instance="0", job="api-server"} 0+30x10 + +eval instant at 50m count(limitk(2, http_requests{instance=~"histogram_[0-9]"})) + {} 2 + +eval range from 0 to 50m step 5m count(limitk(2, http_requests{instance=~"histogram_[0-9]"})) + {} 2+0x10 + +eval instant at 50m count(limitk(1000, http_requests{instance=~"histogram_[0-9]"})) + {} 2 + +eval range from 0 to 50m step 5m count(limitk(1000, http_requests{instance=~"histogram_[0-9]"})) + {} 2+0x10 # limit_ratio eval range from 0 to 50m step 5m count(limit_ratio(0.0, http_requests)) @@ -42,9 +75,9 @@ eval range from 0 to 50m step 5m count(limit_ratio(0.0, http_requests)) # limitk(2, ...) should always return a 2-count subset of the timeseries (hence the AND'ing) eval range from 0 to 50m step 5m count(limitk(2, http_requests) and http_requests) - {} 2+0x10 + {} 2+0x10 -# Tests for limit_ratio +# Tests for limit_ratio. # # NB: below 0.5 ratio will depend on some hashing "luck" (also there's no guarantee that # an integer comes from: total number of series * ratio), as it depends on: @@ -56,50 +89,50 @@ eval range from 0 to 50m step 5m count(limitk(2, http_requests) and http_request # # See `AddRatioSample()` in promql/engine.go for more details. -# Half~ish samples: verify we get "near" 3 (of 0.5 * 6) -eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) <= bool (3+1) - {} 1+0x10 +# Half~ish samples: verify we get "near" 3 (of 0.5 * 6). +eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) <= bool (4+1) + {} 1+0x10 -eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) >= bool (3-1) - {} 1+0x10 +eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and http_requests) >= bool (4-1) + {} 1+0x10 -# All samples +# All samples. eval range from 0 to 50m step 5m count(limit_ratio(1.0, http_requests) and http_requests) - {} 6+0x10 + {} 8+0x10 -# All samples +# All samples. eval range from 0 to 50m step 5m count(limit_ratio(-1.0, http_requests) and http_requests) - {} 6+0x10 + {} 8+0x10 -# Capped to 1.0 -> all samples +# Capped to 1.0 -> all samples. eval_warn range from 0 to 50m step 5m count(limit_ratio(1.1, http_requests) and http_requests) - {} 6+0x10 + {} 8+0x10 -# Capped to -1.0 -> all samples +# Capped to -1.0 -> all samples. eval_warn range from 0 to 50m step 5m count(limit_ratio(-1.1, http_requests) and http_requests) - {} 6+0x10 + {} 8+0x10 -# Verify that limit_ratio(value) and limit_ratio(1.0-value) return the "complement" of each other -# Complement below for [0.2, -0.8] +# Verify that limit_ratio(value) and limit_ratio(1.0-value) return the "complement" of each other. +# Complement below for [0.2, -0.8]. # -# Complement 1of2: `or` should return all samples +# Complement 1of2: `or` should return all samples. eval range from 0 to 50m step 5m count(limit_ratio(0.2, http_requests) or limit_ratio(-0.8, http_requests)) - {} 6+0x10 + {} 8+0x10 -# Complement 2of2: `and` should return no samples +# Complement 2of2: `and` should return no samples. eval range from 0 to 50m step 5m count(limit_ratio(0.2, http_requests) and limit_ratio(-0.8, http_requests)) # empty -# Complement below for [0.5, -0.5] +# Complement below for [0.5, -0.5]. eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) or limit_ratio(-0.5, http_requests)) - {} 6+0x10 + {} 8+0x10 eval range from 0 to 50m step 5m count(limit_ratio(0.5, http_requests) and limit_ratio(-0.5, http_requests)) # empty -# Complement below for [0.8, -0.2] +# Complement below for [0.8, -0.2]. eval range from 0 to 50m step 5m count(limit_ratio(0.8, http_requests) or limit_ratio(-0.2, http_requests)) - {} 6+0x10 + {} 8+0x10 eval range from 0 to 50m step 5m count(limit_ratio(0.8, http_requests) and limit_ratio(-0.2, http_requests)) # empty @@ -107,13 +140,19 @@ eval range from 0 to 50m step 5m count(limit_ratio(0.8, http_requests) and limit # Complement below for [some_ratio, 1.0 - some_ratio], some_ratio derived from time(), # using a small prime number to avoid rounded ratio values, and a small set of them. eval range from 0 to 50m step 5m count(limit_ratio(time() % 17/17, http_requests) or limit_ratio(1.0 - (time() % 17/17), http_requests)) - {} 6+0x10 + {} 8+0x10 eval range from 0 to 50m step 5m count(limit_ratio(time() % 17/17, http_requests) and limit_ratio(1.0 - (time() % 17/17), http_requests)) # empty -# Poor man's normality check: ok (loaded samples follow a nice linearity over labels and time) -# The check giving: 1 (i.e. true) -eval range from 0 to 50m step 5m abs(avg(limit_ratio(0.5, http_requests)) - avg(limit_ratio(-0.5, http_requests))) <= bool stddev(http_requests) +# Poor man's normality check: ok (loaded samples follow a nice linearity over labels and time). +# The check giving: 1 (i.e. true). +eval range from 0 to 50m step 5m abs(avg(limit_ratio(0.5, http_requests{instance!~"histogram_[0-9]"})) - avg(limit_ratio(-0.5, http_requests{instance!~"histogram_[0-9]"}))) <= bool stddev(http_requests{instance!~"histogram_[0-9]"}) {} 1+0x10 +# All specified histograms are included for r=1. +eval instant at 50m limit_ratio(1, http_requests{instance="histogram_1"}) + {__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}} + +eval range from 0 to 50m step 5m limit_ratio(1, http_requests{instance="histogram_1"}) + {__name__="http_requests", group="canary", instance="histogram_1", job="api-server"} {{count:10 sum:10}}x10