Native histograms vs labels (#13005)

* Document le and quantile label transition due to native histograms

Fixes: #12984

For a full explanation, see the related issue. The le and quantile labels
are formatted as floats with a trailing .0 for whole-number values when
native histograms are enabled, e.g. 10.0. This changes the resulting series
in Prometheus if we previously scraped the whole number itself, e.g. 10,
over the text format.

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>

---------

Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com>
Signed-off-by: George Krajcsovits <krajorama@users.noreply.github.com>
This commit is contained in:
George Krajcsovits 2023-11-01 18:30:34 +01:00 committed by GitHub
parent a43669e611
commit e399395b01
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 180 additions and 1 deletions

View File

@ -125,7 +125,61 @@ histogram (albeit via the text format). With this flag enabled, Prometheus will
still ingest those conventional histograms that do not come with a still ingest those conventional histograms that do not come with a
corresponding native histogram. However, if a native histogram is present, corresponding native histogram. However, if a native histogram is present,
Prometheus will ignore the corresponding conventional histogram, with the Prometheus will ignore the corresponding conventional histogram, with the
notable exception of exemplars, which are always ingested. notable exception of exemplars, which are always ingested. To keep the
conventional histograms as well, enable `scrape_classic_histograms` in the
scrape job.
_Note about the format of `le` and `quantile` label values:_
In certain situations, the protobuf parsing changes the number formatting of
the `le` labels of conventional histograms and the `quantile` labels of
summaries. Typically, this happens if the scraped target is instrumented with
[client_golang](https://github.com/prometheus/client_golang) provided that
[promhttp.HandlerOpts.EnableOpenMetrics](https://pkg.go.dev/github.com/prometheus/client_golang/prometheus/promhttp#HandlerOpts)
is set to `false`. In such a case, integer label values are represented in the
text format as such, e.g. `quantile="1"` or `le="2"`. However, the protobuf parsing
changes the representation to float-like (following the OpenMetrics
specification), so the examples above become `quantile="1.0"` and `le="2.0"` after
ingestion into Prometheus, which changes the identity of the metric compared to
what was ingested before via the text format.
The effect of this change is that alerts, recording rules and dashboards that
directly reference label values as whole numbers such as `le="1"` will stop
working.
Aggregation by the `le` and `quantile` labels for vectors that contain the old and
new formatting will lead to unexpected results, and range vectors that span the
transition between the different formatting will contain additional series.
The most common use case for both is the quantile calculation via
`histogram_quantile`, e.g.
`histogram_quantile(0.95, sum by (le) (rate(histogram_bucket[10m])))`.
The `histogram_quantile` function already tries to mitigate the effects to some
extent, but there will be inaccuracies, in particular for shorter ranges that
cover only a few samples.
Ways to deal with this change either globally or on a per metric basis:
- Fix references to integer `le` and `quantile` label values, but otherwise do
nothing and accept that some queries that span the transition time will produce
inaccurate or unexpected results.
_This is the recommended solution, to get consistently normalized label values._
Also, Prometheus 3.0 is expected to enforce normalization of these label values.
- Use `metric_relabel_configs` to retain the old labels when scraping targets.
This should **only** be applied to metrics that currently produce such labels.
<!-- The following config snippet is unit tested in scrape/scrape_test.go. -->
```yaml
metric_relabel_configs:
  - source_labels:
      - quantile
    target_label: quantile
    regex: (\d+)\.0+
  - source_labels:
      - le
      - __name__
    target_label: le
    regex: (\d+)\.0+;.*_bucket
```
## OTLP Receiver ## OTLP Receiver

View File

@ -3679,6 +3679,131 @@ func TestTargetScrapeIntervalAndTimeoutRelabel(t *testing.T) {
require.Equal(t, "750ms", sp.ActiveTargets()[0].labels.Get(model.ScrapeTimeoutLabel)) require.Equal(t, "750ms", sp.ActiveTargets()[0].labels.Get(model.ScrapeTimeoutLabel))
} }
// TestLeQuantileReLabel tests whether metric relabeling can remove the
// trailing ".0" from histogram 'le' and summary 'quantile' label values.
//
// It serves a fixed text exposition over HTTP, scrapes it through a scrape
// pool configured with the two relabel rules, and then checks that the stored
// series carry the label values without the trailing ".0".
func TestLeQuantileReLabel(t *testing.T) {
	simpleStorage := teststorage.New(t)
	defer simpleStorage.Close()

	// Named cfg rather than config so the imported config package is not shadowed.
	cfg := &config.ScrapeConfig{
		JobName: "test",
		MetricRelabelConfigs: []*relabel.Config{
			{
				// Only rewrite 'le' on series whose name ends in _bucket.
				SourceLabels: model.LabelNames{"le", "__name__"},
				Regex:        relabel.MustNewRegexp("(\\d+)\\.0+;.*_bucket"),
				Replacement:  relabel.DefaultRelabelConfig.Replacement,
				Separator:    relabel.DefaultRelabelConfig.Separator,
				TargetLabel:  "le",
				Action:       relabel.Replace,
			},
			{
				SourceLabels: model.LabelNames{"quantile"},
				Regex:        relabel.MustNewRegexp("(\\d+)\\.0+"),
				Replacement:  relabel.DefaultRelabelConfig.Replacement,
				Separator:    relabel.DefaultRelabelConfig.Separator,
				TargetLabel:  "quantile",
				Action:       relabel.Replace,
			},
		},
		SampleLimit:    100,
		Scheme:         "http",
		ScrapeInterval: model.Duration(100 * time.Millisecond),
		ScrapeTimeout:  model.Duration(100 * time.Millisecond),
	}

	metricsText := `
# HELP test_histogram This is a histogram with default buckets
# TYPE test_histogram histogram
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.005"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.01"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.025"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.05"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.1"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.25"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.5"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="1.0"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="2.5"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="5.0"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="10.0"} 0
test_histogram_bucket{address="0.0.0.0",port="5001",le="+Inf"} 0
test_histogram_sum{address="0.0.0.0",port="5001"} 0
test_histogram_count{address="0.0.0.0",port="5001"} 0
# HELP test_summary Number of inflight requests sampled at a regular interval. Quantile buckets keep track of inflight requests over the last 60s.
# TYPE test_summary summary
test_summary{quantile="0.5"} 0
test_summary{quantile="0.9"} 0
test_summary{quantile="0.95"} 0
test_summary{quantile="0.99"} 0
test_summary{quantile="1.0"} 1
test_summary_sum 1
test_summary_count 199
`

	// The expected "le" values do not have the trailing ".0".
	expectedLeValues := []string{"0.005", "0.01", "0.025", "0.05", "0.1", "0.25", "0.5", "1", "2.5", "5", "10", "+Inf"}

	// The expected "quantile" values do not have the trailing ".0".
	expectedQuantileValues := []string{"0.5", "0.9", "0.95", "0.99", "1"}

	scrapeCount := 0
	scraped := make(chan bool)

	ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
		fmt.Fprint(w, metricsText)
		scrapeCount++
		// Signal exactly once, on the third scrape. The target keeps being
		// scraped until the pool is stopped, and closing an already closed
		// channel would panic inside the handler.
		if scrapeCount == 3 {
			close(scraped)
		}
	}))
	defer ts.Close()

	sp, err := newScrapePool(cfg, simpleStorage, 0, nil, &Options{}, newTestScrapeMetrics(t))
	require.NoError(t, err)
	defer sp.stop()

	testURL, err := url.Parse(ts.URL)
	require.NoError(t, err)
	sp.Sync([]*targetgroup.Group{
		{
			Targets: []model.LabelSet{{model.AddressLabel: model.LabelValue(testURL.Host)}},
		},
	})
	require.Len(t, sp.ActiveTargets(), 1)

	select {
	case <-time.After(5 * time.Second):
		t.Fatalf("target was not scraped")
	case <-scraped:
	}

	ctx, cancel := context.WithCancel(context.Background())
	defer cancel()

	// Use 0 as the lower bound: time.Time{}.UnixNano() is documented as
	// undefined because year 1 cannot be represented in int64 nanoseconds.
	// NOTE(review): the upper bound is kept as in the original; presumably
	// storage timestamps are milliseconds, which would make a nanosecond value
	// just a very generous upper bound - confirm.
	q, err := simpleStorage.Querier(0, time.Now().UnixNano())
	require.NoError(t, err)
	defer q.Close()

	// checkValues asserts that the series in the set carry exactly the
	// expected values for labelName, each of them exactly once.
	checkValues := func(labelName string, expectedValues []string, series storage.SeriesSet) {
		foundValues := map[string]bool{}
		for series.Next() {
			v := series.At().Labels().Get(labelName)
			require.NotContains(t, foundValues, v, "duplicate label value found")
			foundValues[v] = true
		}
		// A failed select must not be mistaken for an empty result.
		require.NoError(t, series.Err())

		require.Len(t, foundValues, len(expectedValues), "number of label values not as expected")
		for _, v := range expectedValues {
			require.Contains(t, foundValues, v, "label value not found")
		}
	}

	series := q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchRegexp, "__name__", "test_histogram_bucket"))
	checkValues("le", expectedLeValues, series)

	series = q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchRegexp, "__name__", "test_summary"))
	checkValues("quantile", expectedQuantileValues, series)
}
func TestScrapeLoopRunCreatesStaleMarkersOnFailedScrapeForTimestampedMetrics(t *testing.T) { func TestScrapeLoopRunCreatesStaleMarkersOnFailedScrapeForTimestampedMetrics(t *testing.T) {
appender := &collectResultAppender{} appender := &collectResultAppender{}
var ( var (