Fix and improve AlertmanagerClusterFailedToSendAlerts (#2437)

The alert was just looking at the minimum across integrations. So a
complete failure of one integration would be masked by a still working
other integration. With this fix, the `integration` label is retained
(as it was already expected by the `description`), and thus any
failing integration will trigger the alert.

In addition, an `alertmanagerCriticalIntegrationsRegEx` is provided
that allows marking integrations as critical. Integrations that are
not used to deliver critical alerts, or those that are just there for
auditing and logging purposes can now be configured to only trigger a
warning alert if they fail.

Signed-off-by: beorn7 <beorn@grafana.com>
This commit is contained in:
Björn Rabenstein 2020-12-23 15:15:38 +01:00 committed by GitHub
parent 9e1e4fa420
commit ce108378d4
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 31 additions and 4 deletions

View File

@ -60,10 +60,10 @@
{ {
alert: 'AlertmanagerClusterFailedToSendAlerts', alert: 'AlertmanagerClusterFailedToSendAlerts',
expr: ||| expr: |||
min by (%(alertmanagerClusterLabels)s) ( min by (%(alertmanagerClusterLabels)s, integration) (
rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m]) rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
/ /
rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m]) rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
) )
> 0.01 > 0.01
||| % $._config, ||| % $._config,
@ -72,7 +72,26 @@
severity: 'critical', severity: 'critical',
}, },
annotations: { annotations: {
summary: 'All Alertmanager instances in a cluster failed to send notifications.', summary: 'All Alertmanager instances in a cluster failed to send notifications to a critical integration.',
description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
},
},
{
alert: 'AlertmanagerClusterFailedToSendAlerts',
expr: |||
min by (%(alertmanagerClusterLabels)s, integration) (
rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
/
rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
)
> 0.01
||| % $._config,
'for': '5m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.',
description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config, description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
}, },
}, },

View File

@ -24,5 +24,13 @@
// Alertmanager cluster. All labels used here must also be present // Alertmanager cluster. All labels used here must also be present
// in alertmanagerClusterLabels above. // in alertmanagerClusterLabels above.
alertmanagerClusterName: '{{$labels.job}}', alertmanagerClusterName: '{{$labels.job}}',
// alertmanagerCriticalIntegrationsRegEx is matched against the
// value of the `integration` label to determine if the
// AlertmanagerClusterFailedToSendAlerts alert is critical or merely a
// warning. This can be used to avoid paging about a failed
// integration that is itself not used for critical alerts.
// Example: @'pagerduty|webhook'
alertmanagerCriticalIntegrationsRegEx: @'.*',
}, },
} }