alertmanager-mixin: split AlertmanagerClusterFailedToSendAlerts by integration

The existing rule is narrowed to integrations whose `integration` label
matches alertmanagerCriticalIntegrationsRegEx and keeps severity "critical".
A second rule with the same alert name covers the integrations that do NOT
match the regex and fires with severity "warning". Both rules now aggregate
by `integration` in addition to the cluster labels, and both carry a
description annotation that names the failing integration.

config.libsonnet gains alertmanagerCriticalIntegrationsRegEx, defaulting to
'.*', so out of the box every integration is handled by the critical rule.

NOTE(review): leading indentation of the payload lines below was rebuilt
from the nesting visible in the hunks; confirm it against the target files
(or apply with whitespace-tolerant options) before relying on this patch.

diff --git a/doc/alertmanager-mixin/alerts.libsonnet b/doc/alertmanager-mixin/alerts.libsonnet
index eb57ace4..a60428a1 100644
--- a/doc/alertmanager-mixin/alerts.libsonnet
+++ b/doc/alertmanager-mixin/alerts.libsonnet
@@ -60,10 +60,10 @@
           {
             alert: 'AlertmanagerClusterFailedToSendAlerts',
             expr: |||
-              min by (%(alertmanagerClusterLabels)s) (
-                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
+              min by (%(alertmanagerClusterLabels)s, integration) (
+                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
               /
-                rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
+                rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
               )
               > 0.01
             ||| % $._config,
@@ -72,7 +72,26 @@
               severity: 'critical',
             },
             annotations: {
-              summary: 'All Alertmanager instances in a cluster failed to send notifications.',
+              summary: 'All Alertmanager instances in a cluster failed to send notifications to a critical integration.',
+              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
+            },
+          },
+          {
+            alert: 'AlertmanagerClusterFailedToSendAlerts',
+            expr: |||
+              min by (%(alertmanagerClusterLabels)s, integration) (
+                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
+              /
+                rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
+              )
+              > 0.01
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.',
               description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
             },
           },
diff --git a/doc/alertmanager-mixin/config.libsonnet b/doc/alertmanager-mixin/config.libsonnet
index 001886ca..a9d8b558 100644
--- a/doc/alertmanager-mixin/config.libsonnet
+++ b/doc/alertmanager-mixin/config.libsonnet
@@ -24,5 +24,13 @@
     // Alertmanager cluster. All labels used here must also be present
     // in alertmanagerClusterLabels above.
     alertmanagerClusterName: '{{$labels.job}}',
+
+    // alertmanagerCriticalIntegrationsRegEx is matched against the
+    // value of the `integration` label to determine if the
+    // AlertmanagerClusterFailedToSendAlerts is critical or merely a
+    // warning. This can be used to avoid paging about a failed
+    // integration that is itself not used for critical alerts.
+    // Example: @'pagerduty|webhook'
+    alertmanagerCriticalIntegrationsRegEx: @'.*',
   },
 }