mirror of
https://github.com/prometheus/prometheus
synced 2024-12-24 15:32:48 +00:00
chore: add an alert based on the metric prometheus_sd_kubernetes_failures_total
that was introduced in https://github.com/prometheus/prometheus/pull/13554. The same motivation for adding the metric applies: to avoid silent SD failures, as existing logs may not be regularly checked and can be missed. Signed-off-by: machine424 <ayoubmrini424@gmail.com> Co-authored-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
parent
5c417684f8
commit
f9ca6c4ae6
@ -34,6 +34,20 @@
|
||||
description: 'Prometheus %(prometheusName)s has failed to refresh SD with mechanism {{$labels.mechanism}}.' % $._config,
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusKubernetesListWatchFailures',
|
||||
expr: |||
|
||||
increase(prometheus_sd_kubernetes_failures_total{%(prometheusSelector)s}[5m]) > 0
|
||||
||| % $._config,
|
||||
'for': '15m',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'Requests in Kubernetes SD are failing.',
|
||||
description: 'Kubernetes service discovery of Prometheus %(prometheusName)s is experiencing {{ printf "%%.0f" $value }} failures with LIST/WATCH requests to the Kubernetes API in the last 5 minutes.' % $._config,
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'PrometheusNotificationQueueRunningFull',
|
||||
expr: |||
|
||||
|
Loading…
Reference in New Issue
Block a user