use Welford/Knuth method to compute standard deviation and variance (#4533)
* use Welford/Knuth method to compute standard deviation and variance, avoids float precision issues
* use better method for calculating avg and avg_over_time

Signed-off-by: Dan Cech <dcech@grafana.com>
This commit is contained in:
parent 7d01ead689
commit 9f4cb06a37
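The fix replaces the textbook one-pass formula with Welford's online update (Knuth, TAOCP Vol. 2). For orientation, here is a minimal standalone Go sketch contrasting the two approaches; the function names and the driver are mine, not the patched Prometheus code, and the sample values come from the regression test added in this commit.

```go
// Minimal standalone sketch (my names, not the patched Prometheus code)
// contrasting the old and new ways of computing variance.
package main

import (
	"fmt"
	"math"
)

// naiveVariance is the pre-patch approach: accumulate the sum and the sum of
// squares, then take E[X^2] - E[X]^2. The final subtraction cancels two large,
// nearly equal numbers, so the result can land a few ulps below zero.
func naiveVariance(xs []float64) float64 {
	var sum, squaredSum float64
	for _, x := range xs {
		sum += x
		squaredSum += x * x
	}
	n := float64(len(xs))
	avg := sum / n
	return squaredSum/n - avg*avg
}

// welfordVariance is the post-patch approach: keep a running mean and the
// running sum of squared deviations from it (often called M2). Every update
// works with differences from the current mean, so there is no catastrophic
// cancellation.
func welfordVariance(xs []float64) float64 {
	var mean, m2 float64
	for i, x := range xs {
		delta := x - mean
		mean += delta / float64(i+1)
		m2 += delta * (x - mean)
	}
	return m2 / float64(len(xs)) // population variance, as PromQL's stdvar defines it
}

func main() {
	// Three identical samples, taken from the regression test added below:
	// the true variance is exactly 0.
	xs := []float64{1.5990505637277868, 1.5990505637277868, 1.5990505637277868}

	nv, wv := naiveVariance(xs), welfordVariance(xs)
	fmt.Printf("naive:   stdvar=%g stddev=%g\n", nv, math.Sqrt(nv)) // stddev may be NaN
	fmt.Printf("welford: stdvar=%g stddev=%g\n", wv, math.Sqrt(wv)) // exactly 0
}
```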
promql/engine.go
@@ -1464,12 +1464,12 @@ func intersection(ls1, ls2 labels.Labels) labels.Labels {
 }
 
 type groupedAggregation struct {
 	labels           labels.Labels
 	value            float64
-	valuesSquaredSum float64
+	mean             float64
 	groupCount       int
 	heap             vectorByValueHeap
 	reverseHeap      vectorByReverseValueHeap
 }
 
 // aggregation evaluates an aggregation operation on a Vector.
@@ -1540,17 +1540,19 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p
 				sort.Sort(m)
 			}
 			result[groupingKey] = &groupedAggregation{
 				labels:     m,
 				value:      s.V,
-				valuesSquaredSum: s.V * s.V,
+				mean:       s.V,
 				groupCount: 1,
 			}
 			inputVecLen := int64(len(vec))
 			resultSize := k
 			if k > inputVecLen {
 				resultSize = inputVecLen
 			}
-			if op == itemTopK || op == itemQuantile {
+			if op == itemStdvar || op == itemStddev {
+				result[groupingKey].value = 0.0
+			} else if op == itemTopK || op == itemQuantile {
 				result[groupingKey].heap = make(vectorByValueHeap, 0, resultSize)
 				heap.Push(&result[groupingKey].heap, &Sample{
 					Point: Point{V: s.V},
@@ -1571,8 +1573,8 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p
 			group.value += s.V
 
 		case itemAvg:
-			group.value += s.V
 			group.groupCount++
+			group.mean += (s.V - group.mean) / float64(group.groupCount)
 
 		case itemMax:
 			if group.value < s.V || math.IsNaN(group.value) {
@@ -1588,9 +1590,10 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p
 			group.groupCount++
 
 		case itemStdvar, itemStddev:
-			group.value += s.V
-			group.valuesSquaredSum += s.V * s.V
 			group.groupCount++
+			delta := s.V - group.mean
+			group.mean += delta / float64(group.groupCount)
+			group.value += delta * (s.V - group.mean)
 
 		case itemTopK:
 			if int64(len(group.heap)) < k || group.heap[0].V < s.V || math.IsNaN(group.heap[0].V) {
@@ -1626,18 +1629,16 @@ func (ev *evaluator) aggregation(op ItemType, grouping []string, without bool, p
 	for _, aggr := range result {
 		switch op {
 		case itemAvg:
-			aggr.value = aggr.value / float64(aggr.groupCount)
+			aggr.value = aggr.mean
 
 		case itemCount, itemCountValues:
 			aggr.value = float64(aggr.groupCount)
 
 		case itemStdvar:
-			avg := aggr.value / float64(aggr.groupCount)
-			aggr.value = aggr.valuesSquaredSum/float64(aggr.groupCount) - avg*avg
+			aggr.value = aggr.value / float64(aggr.groupCount)
 
 		case itemStddev:
-			avg := aggr.value / float64(aggr.groupCount)
-			aggr.value = math.Sqrt(aggr.valuesSquaredSum/float64(aggr.groupCount) - avg*avg)
+			aggr.value = math.Sqrt(aggr.value / float64(aggr.groupCount))
 
 		case itemTopK:
 			// The heap keeps the lowest value on top, so reverse it.
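Reading the accumulation and finalization hunks together: after the per-sample loop, `group.value` no longer holds the running sum but the Welford aggregate M2 = Σ(xᵢ − mean)², so the finalization reduces to M2/n for `stdvar` and √(M2/n) for `stddev` (population variance, as before). The subtraction of two large squares is gone entirely.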
promql/functions.go
@@ -371,11 +371,12 @@ func aggrOverTime(vals []Value, enh *EvalNodeHelper, aggrFn func([]Point) float6
 // === avg_over_time(Matrix ValueTypeMatrix) Vector ===
 func funcAvgOverTime(vals []Value, args Expressions, enh *EvalNodeHelper) Vector {
 	return aggrOverTime(vals, enh, func(values []Point) float64 {
-		var sum float64
+		var mean, count float64
 		for _, v := range values {
-			sum += v.V
+			count++
+			mean += (v.V - mean) / count
 		}
-		return sum / float64(len(values))
+		return mean
 	})
 }
 
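Beyond precision, the rewritten avg_over_time never materializes the full sum, so it also survives inputs whose sum overflows float64 even though the true mean is representable. A small illustrative sketch (the values here are mine, chosen to force the overflow):

```go
package main

import "fmt"

func main() {
	// Two finite samples whose sum exceeds math.MaxFloat64 (~1.8e308).
	values := []float64{1e308, 1e308}

	// Sum-then-divide: the intermediate sum overflows to +Inf.
	sum := 0.0
	for _, v := range values {
		sum += v
	}
	fmt.Println("sum/n:       ", sum/float64(len(values))) // +Inf

	// Running mean: every intermediate stays at the data's magnitude.
	var mean, count float64
	for _, v := range values {
		count++
		mean += (v - mean) / count
	}
	fmt.Println("running mean:", mean) // 1e308
}
```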
@@ -444,28 +445,28 @@ func funcQuantileOverTime(vals []Value, args Expressions, enh *EvalNodeHelper) V
 // === stddev_over_time(Matrix ValueTypeMatrix) Vector ===
 func funcStddevOverTime(vals []Value, args Expressions, enh *EvalNodeHelper) Vector {
 	return aggrOverTime(vals, enh, func(values []Point) float64 {
-		var sum, squaredSum, count float64
+		var aux, count, mean float64
 		for _, v := range values {
-			sum += v.V
-			squaredSum += v.V * v.V
 			count++
+			delta := v.V - mean
+			mean += delta / count
+			aux += delta * (v.V - mean)
 		}
-		avg := sum / count
-		return math.Sqrt(squaredSum/count - avg*avg)
+		return math.Sqrt(aux / count)
 	})
 }
 
 // === stdvar_over_time(Matrix ValueTypeMatrix) Vector ===
 func funcStdvarOverTime(vals []Value, args Expressions, enh *EvalNodeHelper) Vector {
 	return aggrOverTime(vals, enh, func(values []Point) float64 {
-		var sum, squaredSum, count float64
+		var aux, count, mean float64
 		for _, v := range values {
-			sum += v.V
-			squaredSum += v.V * v.V
 			count++
+			delta := v.V - mean
+			mean += delta / count
+			aux += delta * (v.V - mean)
 		}
-		avg := sum / count
-		return squaredSum/count - avg*avg
+		return aux / count
 	})
 }
 
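In these `_over_time` variants, `aux` plays the role that `group.value` plays in the aggregation code: it is the same M2 accumulator, and the two functions differ only in whether the final `aux / count` is passed through `math.Sqrt`.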
13 promql/testdata/aggregators.test vendored
@@ -90,6 +90,19 @@ eval instant at 50m stdvar by (instance)(http_requests)
 	{instance="0"} 50000
 	{instance="1"} 50000
 
+# Float precision test for standard deviation and variance
+clear
+load 5m
+  http_requests{job="api-server", instance="0", group="production"} 0+1.33x10
+  http_requests{job="api-server", instance="1", group="production"} 0+1.33x10
+  http_requests{job="api-server", instance="0", group="canary"} 0+1.33x10
+
+eval instant at 50m stddev(http_requests)
+  {} 0.0
+
+eval instant at 50m stdvar(http_requests)
+  {} 0.0
+
 # Regression test for missing separator byte in labelsToGroupingKey.
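The three series here are identical by construction, so the true spread across them at any instant is exactly 0. Under the old formula, Σx²/n and mean² agree only to within rounding, and the difference could land a few ulps below zero, in which case `stdvar` reported a tiny negative number and `stddev` (the square root of a negative) came out NaN. With Welford's method every deviation from the running mean is exactly 0, so the test can assert 0.0 directly.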
19 promql/testdata/functions.test vendored
@@ -374,6 +374,14 @@ eval instant at 8000s holt_winters(http_requests[1m], 0.01, 0.1)
 	{job="api-server", instance="0", group="canary"} 24000
 	{job="api-server", instance="1", group="canary"} -32000
 
+# Tests for avg_over_time
+clear
+load 10s
+  metric 1 2 3 4 5
+
+eval instant at 1m avg_over_time(metric[1m])
+  {} 3
+
 # Tests for stddev_over_time and stdvar_over_time.
 clear
 load 10s
@@ -385,6 +393,17 @@ eval instant at 1m stdvar_over_time(metric[1m])
 eval instant at 1m stddev_over_time(metric[1m])
 	{} 3.249615
 
+# Tests for stddev_over_time and stdvar_over_time #4927.
+clear
+load 10s
+  metric 1.5990505637277868 1.5990505637277868 1.5990505637277868
+
+eval instant at 1m stdvar_over_time(metric[1m])
+  {} 0
+
+eval instant at 1m stddev_over_time(metric[1m])
+  {} 0
+
 # Tests for quantile_over_time
 clear
 
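This is the regression case from issue #4927: a constant series whose value squares inexactly in float64. The old `squaredSum/count - avg*avg` could return a result marginally below zero instead of 0, so stdvar_over_time went slightly negative and stddev_over_time went NaN. The Welford form returns exactly 0, because after the first sample the running mean equals the value exactly and every subsequent deviation is 0.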