From ce108378d4c8d580804452dcc3a31222067793b2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Rabenstein?= Date: Wed, 23 Dec 2020 15:15:38 +0100 Subject: [PATCH] Fix and improve AlertmanagerClusterFailedToSendAlerts (#2437) The alert was just looking at the minimum across integrations. So a complete failure of one integration would be masked by a still working other integration. With this fix, the `integration` label is retained (as it was already expected by the `description`), and thus any failing integration will trigger the alert. In addition, an `alertmanagerCriticalIntegrationsRegEx` is provided that allows marking integrations as critical. Integrations that are not used to deliver critical alerts, or those that are just there for auditing and logging purposes can now be configured to only trigger a warning alert if they fail. Signed-off-by: beorn7 --- doc/alertmanager-mixin/alerts.libsonnet | 27 +++++++++++++++++++++---- doc/alertmanager-mixin/config.libsonnet | 8 ++++++++ 2 files changed, 31 insertions(+), 4 deletions(-) diff --git a/doc/alertmanager-mixin/alerts.libsonnet b/doc/alertmanager-mixin/alerts.libsonnet index eb57ace4..a60428a1 100644 --- a/doc/alertmanager-mixin/alerts.libsonnet +++ b/doc/alertmanager-mixin/alerts.libsonnet @@ -60,10 +60,10 @@ { alert: 'AlertmanagerClusterFailedToSendAlerts', expr: ||| - min by (%(alertmanagerClusterLabels)s) ( - rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m]) + min by (%(alertmanagerClusterLabels)s, integration) ( + rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m]) / - rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m]) + rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m]) ) > 0.01 ||| % $._config, @@ -72,7 +72,26 @@ severity: 'critical', }, annotations: { - summary: 'All Alertmanager instances in a 
cluster failed to send notifications.', + summary: 'All Alertmanager instances in a cluster failed to send notifications to a critical integration.', + description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config, + }, + }, + { + alert: 'AlertmanagerClusterFailedToSendAlerts', + expr: ||| + min by (%(alertmanagerClusterLabels)s, integration) ( + rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m]) + / + rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m]) + ) + > 0.01 + ||| % $._config, + 'for': '5m', + labels: { + severity: 'warning', + }, + annotations: { + summary: 'All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.', description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config, }, }, diff --git a/doc/alertmanager-mixin/config.libsonnet b/doc/alertmanager-mixin/config.libsonnet index 001886ca..a9d8b558 100644 --- a/doc/alertmanager-mixin/config.libsonnet +++ b/doc/alertmanager-mixin/config.libsonnet @@ -24,5 +24,13 @@ // Alertmanager cluster. All labels used here must also be present // in alertmanagerClusterLabels above. alertmanagerClusterName: '{{$labels.job}}', + + // alertmanagerCriticalIntegrationsRegEx is matched against the + // value of the `integration` label to determine if the + // AlertmanagerClusterFailedToSendAlerts is critical or merely a + // warning. This can be used to avoid paging about a failed + // integration that is itself not used for critical alerts. + // Example: @'pagerduty|webhook' + alertmanagerCriticalIntegrationsRegEx: @'.*', }, }