From e399395b01d3d384fb7e806b5a64d685fb038e09 Mon Sep 17 00:00:00 2001 From: George Krajcsovits Date: Wed, 1 Nov 2023 18:30:34 +0100 Subject: [PATCH] Native histograms vs labels (#13005) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Document le and quantile label transition due to native histograms Fixes: #12984 For full explanation see the related issue. The le and quantile labels are formatted as float with trailing .0 for whole number values when native histograms is enabled, e.g. 10.0. This changes the resulting series in Prometheus if previously we scraped the whole number itself, e.g. 10 over the text format. Signed-off-by: György Krajcsovits --------- Signed-off-by: György Krajcsovits Signed-off-by: George Krajcsovits --- docs/feature_flags.md | 56 ++++++++++++++++++- scrape/scrape_test.go | 125 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+), 1 deletion(-) diff --git a/docs/feature_flags.md b/docs/feature_flags.md index 1cf54c47f..f580c959f 100644 --- a/docs/feature_flags.md +++ b/docs/feature_flags.md @@ -125,7 +125,61 @@ histogram (albeit via the text format). With this flag enabled, Prometheus will still ingest those conventional histograms that do not come with a corresponding native histogram. However, if a native histogram is present, Prometheus will ignore the corresponding conventional histogram, with the -notable exception of exemplars, which are always ingested. +notable exception of exemplars, which are always ingested. To keep the +conventional histograms as well, enable `scrape_classic_histograms` in the +scrape job. + +_Note about the format of `le` and `quantile` label values:_ + +In certain situations, the protobuf parsing changes the number formatting of +the `le` labels of conventional histograms and the `quantile` labels of +summaries. Typically, this happens if the scraped target is instrumented with +[client_golang](https://github.com/prometheus/client_golang) provided that +[promhttp.HandlerOpts.EnableOpenMetrics](https://pkg.go.dev/github.com/prometheus/client_golang/prometheus/promhttp#HandlerOpts) +is set to `false`. In such a case, integer label values are represented in the +text format as such, e.g. `quantile="1"` or `le="2"`. However, the protobuf parsing +changes the representation to float-like (following the OpenMetrics +specification), so the examples above become `quantile="1.0"` and `le="2.0"` after +ingestion into Prometheus, which changes the identity of the metric compared to +what was ingested before via the text format. + +The effect of this change is that alerts, recording rules and dashboards that +directly reference label values as whole numbers such as `le="1"` will stop +working. + +Aggregation by the `le` and `quantile` labels for vectors that contain the old and +new formatting will lead to unexpected results, and range vectors that span the +transition between the different formatting will contain additional series. +The most common use case for both is the quantile calculation via +`histogram_quantile`, e.g. +`histogram_quantile(0.95, sum by (le) (rate(histogram_bucket[10m])))`. +The `histogram_quantile` function already tries to mitigate the effects to some +extent, but there will be inaccuracies, in particular for shorter ranges that +cover only a few samples. + +Ways to deal with this change either globally or on a per metric basis: + +- Fix references to integer `le`, `quantile` label values, but otherwise do +nothing and accept that some queries that span the transition time will produce +inaccurate or unexpected results. +_This is the recommended solution, to get consistently normalized label values._ +Also Prometheus 3.0 is expected to enforce normalization of these label values. +- Use `metric_relabel_config` to retain the old labels when scraping targets. +This should **only** be applied to metrics that currently produce such labels. + + +```yaml + metric_relabel_configs: + - source_labels: + - quantile + target_label: quantile + regex: (\d+)\.0+ + - source_labels: + - le + - __name__ + target_label: le + regex: (\d+)\.0+;.*_bucket +``` ## OTLP Receiver diff --git a/scrape/scrape_test.go b/scrape/scrape_test.go index 7be3f3461..08cf37ede 100644 --- a/scrape/scrape_test.go +++ b/scrape/scrape_test.go @@ -3679,6 +3679,131 @@ func TestTargetScrapeIntervalAndTimeoutRelabel(t *testing.T) { require.Equal(t, "750ms", sp.ActiveTargets()[0].labels.Get(model.ScrapeTimeoutLabel)) } +// Testing whether we can remove trailing .0 from histogram 'le' and summary 'quantile' labels. +func TestLeQuantileReLabel(t *testing.T) { + simpleStorage := teststorage.New(t) + defer simpleStorage.Close() + + config := &config.ScrapeConfig{ + JobName: "test", + MetricRelabelConfigs: []*relabel.Config{ + { + SourceLabels: model.LabelNames{"le", "__name__"}, + Regex: relabel.MustNewRegexp("(\\d+)\\.0+;.*_bucket"), + Replacement: relabel.DefaultRelabelConfig.Replacement, + Separator: relabel.DefaultRelabelConfig.Separator, + TargetLabel: "le", + Action: relabel.Replace, + }, + { + SourceLabels: model.LabelNames{"quantile"}, + Regex: relabel.MustNewRegexp("(\\d+)\\.0+"), + Replacement: relabel.DefaultRelabelConfig.Replacement, + Separator: relabel.DefaultRelabelConfig.Separator, + TargetLabel: "quantile", + Action: relabel.Replace, + }, + }, + SampleLimit: 100, + Scheme: "http", + ScrapeInterval: model.Duration(100 * time.Millisecond), + ScrapeTimeout: model.Duration(100 * time.Millisecond), + } + + metricsText := ` +# HELP test_histogram This is a histogram with default buckets +# TYPE test_histogram histogram +test_histogram_bucket{address="0.0.0.0",port="5001",le="0.005"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="0.01"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="0.025"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="0.05"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="0.1"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="0.25"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="0.5"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="1.0"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="2.5"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="5.0"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="10.0"} 0 +test_histogram_bucket{address="0.0.0.0",port="5001",le="+Inf"} 0 +test_histogram_sum{address="0.0.0.0",port="5001"} 0 +test_histogram_count{address="0.0.0.0",port="5001"} 0 +# HELP test_summary Number of inflight requests sampled at a regular interval. Quantile buckets keep track of inflight requests over the last 60s. +# TYPE test_summary summary +test_summary{quantile="0.5"} 0 +test_summary{quantile="0.9"} 0 +test_summary{quantile="0.95"} 0 +test_summary{quantile="0.99"} 0 +test_summary{quantile="1.0"} 1 +test_summary_sum 1 +test_summary_count 199 +` + + // The expected "le" values do not have the trailing ".0". + expectedLeValues := []string{"0.005", "0.01", "0.025", "0.05", "0.1", "0.25", "0.5", "1", "2.5", "5", "10", "+Inf"} + + // The expected "quantile" values do not have the trailing ".0". + expectedQuantileValues := []string{"0.5", "0.9", "0.95", "0.99", "1"} + + scrapeCount := 0 + scraped := make(chan bool) + + ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + fmt.Fprint(w, metricsText) + scrapeCount++ + if scrapeCount > 2 { + close(scraped) + } + })) + defer ts.Close() + + sp, err := newScrapePool(config, simpleStorage, 0, nil, &Options{}, newTestScrapeMetrics(t)) + require.NoError(t, err) + defer sp.stop() + + testURL, err := url.Parse(ts.URL) + require.NoError(t, err) + sp.Sync([]*targetgroup.Group{ + { + Targets: []model.LabelSet{{model.AddressLabel: model.LabelValue(testURL.Host)}}, + }, + }) + require.Equal(t, 1, len(sp.ActiveTargets())) + + select { + case <-time.After(5 * time.Second): + t.Fatalf("target was not scraped") + case <-scraped: + } + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + q, err := simpleStorage.Querier(time.Time{}.UnixNano(), time.Now().UnixNano()) + require.NoError(t, err) + defer q.Close() + + checkValues := func(labelName string, expectedValues []string, series storage.SeriesSet) { + foundLeValues := map[string]bool{} + + for series.Next() { + s := series.At() + v := s.Labels().Get(labelName) + require.NotContains(t, foundLeValues, v, "duplicate label value found") + foundLeValues[v] = true + } + + require.Equal(t, len(expectedValues), len(foundLeValues), "number of label values not as expected") + for _, v := range expectedValues { + require.Contains(t, foundLeValues, v, "label value not found") + } + } + + series := q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchRegexp, "__name__", "test_histogram_bucket")) + checkValues("le", expectedLeValues, series) + + series = q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchRegexp, "__name__", "test_summary")) + checkValues("quantile", expectedQuantileValues, series) +} + func TestScrapeLoopRunCreatesStaleMarkersOnFailedScrapeForTimestampedMetrics(t *testing.T) { appender := &collectResultAppender{} var (