From 371ca9ff46c11c2a81e09a5219f9ed25c803cb77 Mon Sep 17 00:00:00 2001
From: beorn7
Date: Wed, 11 Nov 2020 00:14:54 +0100
Subject: [PATCH] prometheus-mixin: add HA-group aware alerts

There is certainly a potential to add more of these. This is mostly
meant to introduce the concept and cover a few critical parts.

Signed-off-by: beorn7
---
 .../prometheus-mixin/alerts.libsonnet | 137 +++++++++++++++---
 .../prometheus-mixin/config.libsonnet |  15 ++
 2 files changed, 132 insertions(+), 20 deletions(-)

diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet
index ffb7971bc..e07616ef7 100644
--- a/documentation/prometheus-mixin/alerts.libsonnet
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@@ -60,26 +60,6 @@
               description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
             },
           },
-          {
-            alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
-            expr: |||
-              min without(alertmanager) (
-                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
-              /
-                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
-              )
-              * 100
-              > 3
-            ||| % $._config,
-            'for': '15m',
-            labels: {
-              severity: 'critical',
-            },
-            annotations: {
-              summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
-              description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
-            },
-          },
           {
             alert: 'PrometheusNotConnectedToAlertmanagers',
             expr: |||
@@ -281,6 +261,123 @@
               description: 'Prometheus %(prometheusName)s has dropped {{ printf "%%.0f" $value }} targets because the number of targets exceeded the configured target_limit.' % $._config,
             },
           },
+        ] + if $._config.prometheusHAGroupLabels == '' then self.rulesWithoutHA else self.rulesWithHA,
+        rulesWithoutHA:: [
+          {
+            alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
+            expr: |||
+              min without (alertmanager) (
+                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
+              /
+                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
+              )
+              * 100
+              > 3
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
+              description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
+            },
+          },
+        ],
+        rulesWithHA:: [
+          {
+            alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
+            expr: |||
+              min by (%(prometheusHAGroupLabels)s) (
+                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
+              /
+                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
+              )
+              * 100
+              > 3
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Each Prometheus server in an HA group encounters more than 3% errors sending alerts to any Alertmanager.',
+              description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from any Prometheus server in HA group %(prometheusHAGroupName)s to any Alertmanager.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusHAGroupNotIngestingSamples',
+            expr: |||
+              max by (%(prometheusHAGroupLabels)s) (
+                rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m])
+              and
+                (
+                  sum without(scrape_job) (prometheus_target_metadata_cache_entries{%(prometheusSelector)s}) > 0
+                or
+                  sum without(rule_group) (prometheus_rule_group_rules{%(prometheusSelector)s}) > 0
+                )
+              )
+              <= 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'A whole Prometheus HA group is not ingesting samples.',
+              description: 'None of the Prometheus instances in HA group %(prometheusHAGroupName)s is ingesting any samples.' % $._config,
+            },
+          },
+          // Both the following critical alerts, PrometheusHAGroupDown and
+          // PrometheusHAGroupCrashlooping, fire if a whole HA group is
+          // unhealthy. It is implied that a generic warning alert is in place
+          // for individual instances being down or crashlooping.
+          {
+            alert: 'PrometheusHAGroupDown',
+            expr: |||
+              (
+                count by (%(prometheusHAGroupLabels)s) (
+                  avg_over_time(up{%(prometheusSelector)s}[5m]) < 0.5
+                )
+              /
+                count by (%(prometheusHAGroupLabels)s) (
+                  up{%(prometheusSelector)s}
+                )
+              )
+              > 0.5
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'More than half of the Prometheus instances within the same HA group are down.',
+              description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have been up for less than half of the last 5m.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusHAGroupCrashlooping',
+            expr: |||
+              (
+                count by (%(prometheusHAGroupLabels)s) (
+                  changes(process_start_time_seconds{%(prometheusSelector)s}[30m]) > 4
+                )
+              /
+                count by (%(prometheusHAGroupLabels)s) (
+                  up{%(prometheusSelector)s}
+                )
+              )
+              > 0.5
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'More than half of the Prometheus instances within the same HA group are crashlooping.',
+              description: '{{ $value | humanizePercentage }} of Prometheus instances within the %(prometheusHAGroupName)s HA group have restarted at least 5 times in the last 30m.' % $._config,
+            },
+          },
         ],
       },
     ],
diff --git a/documentation/prometheus-mixin/config.libsonnet b/documentation/prometheus-mixin/config.libsonnet
index 27614289a..5c4d0123d 100644
--- a/documentation/prometheus-mixin/config.libsonnet
+++ b/documentation/prometheus-mixin/config.libsonnet
@@ -5,6 +5,16 @@
     // servers.
     prometheusSelector: 'job="prometheus"',
 
+    // prometheusHAGroupLabels is a string with comma-separated labels
+    // that are common labels of instances belonging to the same
+    // high-availability group of Prometheus servers, i.e. identically
+    // configured Prometheus servers. Include not only enough labels
+    // to identify the members of the HA group, but also all common
+    // labels you want to keep for resulting HA-group-level alerts.
+    //
+    // If this is set to an empty string, no HA-related alerts are applied.
+    prometheusHAGroupLabels: '',
+
     // prometheusName is inserted into annotations to name the Prometheus
     // instance affected by the alert.
     prometheusName: '{{$labels.instance}}',
@@ -12,5 +22,10 @@
     // Operator, you can make use of the configured target labels for
     // nicer naming:
     // prometheusNameTemplate: '{{$labels.namespace}}/{{$labels.pod}}'
+
+    // prometheusHAGroupName is inserted into annotations to name an
+    // HA group. All labels used here must also be present in
+    // prometheusHAGroupLabels above.
+    prometheusHAGroupName: '{{$labels.job}}',
   },
 }
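
As an illustration of the new configuration knobs, here is a minimal sketch of how a consumer of this mixin might opt into the HA-group aware alerts. The import path and the 'job'/'cluster' grouping labels are assumptions, not part of the patch; adapt them to how the mixin is vendored and to the labels that actually identify an HA group in your setup.

// Hypothetical consumer-side override; the import path and the grouping
// labels below are assumptions and must match your own deployment.
local prometheus = import 'prometheus-mixin/mixin.libsonnet';

prometheus {
  _config+:: {
    // Prometheus servers sharing the same 'job' and 'cluster' labels are
    // treated as one identically configured HA group.
    prometheusHAGroupLabels: 'job,cluster',
    // Name the HA group in alert annotations using the same labels.
    prometheusHAGroupName: '{{$labels.job}} in cluster {{$labels.cluster}}',
  },
}

With prometheusHAGroupLabels left at its default empty string, rendering prometheusAlerts from this object yields the rulesWithoutHA set; once it is set as above, the rulesWithHA variants (including PrometheusHAGroupDown and PrometheusHAGroupCrashlooping) are emitted instead.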