Native histograms vs labels (#13005)
* Document le and quantile label transition due to native histograms Fixes: #12984 For full explanation see the related issue. The le and quantile labels are formatted as float with trailing .0 for whole number values when native histograms is enabled, e.g. 10.0. This changes the resulting series in Prometheus if previously we scraped the whole number itself, e.g. 10 over the text format. Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com> --------- Signed-off-by: György Krajcsovits <gyorgy.krajcsovits@grafana.com> Signed-off-by: George Krajcsovits <krajorama@users.noreply.github.com>
This commit is contained in:
parent
a43669e611
commit
e399395b01
|
@ -125,7 +125,61 @@ histogram (albeit via the text format). With this flag enabled, Prometheus will
|
|||
still ingest those conventional histograms that do not come with a
|
||||
corresponding native histogram. However, if a native histogram is present,
|
||||
Prometheus will ignore the corresponding conventional histogram, with the
|
||||
notable exception of exemplars, which are always ingested.
|
||||
notable exception of exemplars, which are always ingested. To keep the
|
||||
conventional histograms as well, enable `scrape_classic_histograms` in the
|
||||
scrape job.
|
||||
|
||||
_Note about the format of `le` and `quantile` label values:_
|
||||
|
||||
In certain situations, the protobuf parsing changes the number formatting of
|
||||
the `le` labels of conventional histograms and the `quantile` labels of
|
||||
summaries. Typically, this happens if the scraped target is instrumented with
|
||||
[client_golang](https://github.com/prometheus/client_golang) provided that
|
||||
[promhttp.HandlerOpts.EnableOpenMetrics](https://pkg.go.dev/github.com/prometheus/client_golang/prometheus/promhttp#HandlerOpts)
|
||||
is set to `false`. In such a case, integer label values are represented in the
|
||||
text format as such, e.g. `quantile="1"` or `le="2"`. However, the protobuf parsing
|
||||
changes the representation to float-like (following the OpenMetrics
|
||||
specification), so the examples above become `quantile="1.0"` and `le="2.0"` after
|
||||
ingestion into Prometheus, which changes the identity of the metric compared to
|
||||
what was ingested before via the text format.
|
||||
|
||||
The effect of this change is that alerts, recording rules and dashboards that
|
||||
directly reference label values as whole numbers such as `le="1"` will stop
|
||||
working.
|
||||
|
||||
Aggregation by the `le` and `quantile` labels for vectors that contain the old and
|
||||
new formatting will lead to unexpected results, and range vectors that span the
|
||||
transition between the different formatting will contain additional series.
|
||||
The most common use case for both is the quantile calculation via
|
||||
`histogram_quantile`, e.g.
|
||||
`histogram_quantile(0.95, sum by (le) (rate(histogram_bucket[10m])))`.
|
||||
The `histogram_quantile` function already tries to mitigate the effects to some
|
||||
extent, but there will be inaccuracies, in particular for shorter ranges that
|
||||
cover only a few samples.
|
||||
|
||||
Ways to deal with this change, either globally or on a per-metric basis:
|
||||
|
||||
- Fix references to integer `le`, `quantile` label values, but otherwise do
|
||||
nothing and accept that some queries that span the transition time will produce
|
||||
inaccurate or unexpected results.
|
||||
_This is the recommended solution, to get consistently normalized label values._
|
||||
Also Prometheus 3.0 is expected to enforce normalization of these label values.
|
||||
- Use `metric_relabel_configs` to retain the old labels when scraping targets.
|
||||
This should **only** be applied to metrics that currently produce such labels.
|
||||
|
||||
<!-- The following config snippet is unit tested in scrape/scrape_test.go. -->
|
||||
```yaml
metric_relabel_configs:
  - source_labels:
      - quantile
    target_label: quantile
    regex: (\d+)\.0+
  - source_labels:
      - le
      - __name__
    target_label: le
    regex: (\d+)\.0+;.*_bucket
```
|
||||
|
||||
## OTLP Receiver
|
||||
|
||||
|
|
|
@ -3679,6 +3679,131 @@ func TestTargetScrapeIntervalAndTimeoutRelabel(t *testing.T) {
|
|||
require.Equal(t, "750ms", sp.ActiveTargets()[0].labels.Get(model.ScrapeTimeoutLabel))
|
||||
}
|
||||
|
||||
// Testing whether we can remove trailing .0 from histogram 'le' and summary 'quantile' labels.
|
||||
func TestLeQuantileReLabel(t *testing.T) {
|
||||
simpleStorage := teststorage.New(t)
|
||||
defer simpleStorage.Close()
|
||||
|
||||
config := &config.ScrapeConfig{
|
||||
JobName: "test",
|
||||
MetricRelabelConfigs: []*relabel.Config{
|
||||
{
|
||||
SourceLabels: model.LabelNames{"le", "__name__"},
|
||||
Regex: relabel.MustNewRegexp("(\\d+)\\.0+;.*_bucket"),
|
||||
Replacement: relabel.DefaultRelabelConfig.Replacement,
|
||||
Separator: relabel.DefaultRelabelConfig.Separator,
|
||||
TargetLabel: "le",
|
||||
Action: relabel.Replace,
|
||||
},
|
||||
{
|
||||
SourceLabels: model.LabelNames{"quantile"},
|
||||
Regex: relabel.MustNewRegexp("(\\d+)\\.0+"),
|
||||
Replacement: relabel.DefaultRelabelConfig.Replacement,
|
||||
Separator: relabel.DefaultRelabelConfig.Separator,
|
||||
TargetLabel: "quantile",
|
||||
Action: relabel.Replace,
|
||||
},
|
||||
},
|
||||
SampleLimit: 100,
|
||||
Scheme: "http",
|
||||
ScrapeInterval: model.Duration(100 * time.Millisecond),
|
||||
ScrapeTimeout: model.Duration(100 * time.Millisecond),
|
||||
}
|
||||
|
||||
metricsText := `
|
||||
# HELP test_histogram This is a histogram with default buckets
|
||||
# TYPE test_histogram histogram
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.005"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.01"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.025"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.05"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.1"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.25"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="0.5"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="1.0"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="2.5"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="5.0"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="10.0"} 0
|
||||
test_histogram_bucket{address="0.0.0.0",port="5001",le="+Inf"} 0
|
||||
test_histogram_sum{address="0.0.0.0",port="5001"} 0
|
||||
test_histogram_count{address="0.0.0.0",port="5001"} 0
|
||||
# HELP test_summary Number of inflight requests sampled at a regular interval. Quantile buckets keep track of inflight requests over the last 60s.
|
||||
# TYPE test_summary summary
|
||||
test_summary{quantile="0.5"} 0
|
||||
test_summary{quantile="0.9"} 0
|
||||
test_summary{quantile="0.95"} 0
|
||||
test_summary{quantile="0.99"} 0
|
||||
test_summary{quantile="1.0"} 1
|
||||
test_summary_sum 1
|
||||
test_summary_count 199
|
||||
`
|
||||
|
||||
// The expected "le" values do not have the trailing ".0".
|
||||
expectedLeValues := []string{"0.005", "0.01", "0.025", "0.05", "0.1", "0.25", "0.5", "1", "2.5", "5", "10", "+Inf"}
|
||||
|
||||
// The expected "quantile" values do not have the trailing ".0".
|
||||
expectedQuantileValues := []string{"0.5", "0.9", "0.95", "0.99", "1"}
|
||||
|
||||
scrapeCount := 0
|
||||
scraped := make(chan bool)
|
||||
|
||||
ts := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
fmt.Fprint(w, metricsText)
|
||||
scrapeCount++
|
||||
if scrapeCount > 2 {
|
||||
close(scraped)
|
||||
}
|
||||
}))
|
||||
defer ts.Close()
|
||||
|
||||
sp, err := newScrapePool(config, simpleStorage, 0, nil, &Options{}, newTestScrapeMetrics(t))
|
||||
require.NoError(t, err)
|
||||
defer sp.stop()
|
||||
|
||||
testURL, err := url.Parse(ts.URL)
|
||||
require.NoError(t, err)
|
||||
sp.Sync([]*targetgroup.Group{
|
||||
{
|
||||
Targets: []model.LabelSet{{model.AddressLabel: model.LabelValue(testURL.Host)}},
|
||||
},
|
||||
})
|
||||
require.Equal(t, 1, len(sp.ActiveTargets()))
|
||||
|
||||
select {
|
||||
case <-time.After(5 * time.Second):
|
||||
t.Fatalf("target was not scraped")
|
||||
case <-scraped:
|
||||
}
|
||||
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
defer cancel()
|
||||
q, err := simpleStorage.Querier(time.Time{}.UnixNano(), time.Now().UnixNano())
|
||||
require.NoError(t, err)
|
||||
defer q.Close()
|
||||
|
||||
checkValues := func(labelName string, expectedValues []string, series storage.SeriesSet) {
|
||||
foundLeValues := map[string]bool{}
|
||||
|
||||
for series.Next() {
|
||||
s := series.At()
|
||||
v := s.Labels().Get(labelName)
|
||||
require.NotContains(t, foundLeValues, v, "duplicate label value found")
|
||||
foundLeValues[v] = true
|
||||
}
|
||||
|
||||
require.Equal(t, len(expectedValues), len(foundLeValues), "number of label values not as expected")
|
||||
for _, v := range expectedValues {
|
||||
require.Contains(t, foundLeValues, v, "label value not found")
|
||||
}
|
||||
}
|
||||
|
||||
series := q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchRegexp, "__name__", "test_histogram_bucket"))
|
||||
checkValues("le", expectedLeValues, series)
|
||||
|
||||
series = q.Select(ctx, false, nil, labels.MustNewMatcher(labels.MatchRegexp, "__name__", "test_summary"))
|
||||
checkValues("quantile", expectedQuantileValues, series)
|
||||
}
|
||||
|
||||
func TestScrapeLoopRunCreatesStaleMarkersOnFailedScrapeForTimestampedMetrics(t *testing.T) {
|
||||
appender := &collectResultAppender{}
|
||||
var (
|
||||
|
|
Loading…
Reference in New Issue