Protect gauge-based alerts against failed scrapes

Signed-off-by: beorn7 <beorn@grafana.com>
This commit is contained in:
beorn7 2019-06-28 16:46:19 +02:00
parent 52707535b8
commit 9a2177949d
1 changed file with 14 additions and 6 deletions

View File

@ -7,9 +7,11 @@
{
alert: 'PrometheusBadConfig',
expr: |||
prometheus_config_last_reload_successful{%(prometheusSelector)s} == 0
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0
||| % $._config,
'for': '15m',
'for': '10m',
labels: {
severity: 'critical',
},
@ -21,10 +23,12 @@
{
alert: 'PrometheusNotificationQueueRunningFull',
expr: |||
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
>
prometheus_notifications_queue_capacity{%(prometheusSelector)s}
min_over_time(prometheus_notifications_queue_capacity{%(prometheusSelector)s}[5m])
)
||| % $._config,
'for': '15m',
@ -79,7 +83,9 @@
{
alert: 'PrometheusNotConnectedToAlertmanagers',
expr: |||
prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s} < 1
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s}[5m]) < 1
||| % $._config,
'for': '10m',
labels: {
@ -201,10 +207,12 @@
{
alert: 'PrometheusRemoteWriteBehind',
expr: |||
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}[5m])
- on(job, instance) group_right
prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}[5m])
)
> 120
||| % $._config,