Fix and improve AlertmanagerClusterFailedToSendAlerts (#2437)
The alert was just looking at the minimum across integrations, so a complete failure of one integration would be masked by another integration that is still working. With this fix, the `integration` label is retained (as it was already expected by the `description`), and thus any failing integration will trigger the alert. In addition, an `alertmanagerCriticalIntegrationsRegEx` is provided that allows marking integrations as critical. Integrations that are not used to deliver critical alerts, or those that are just there for auditing and logging purposes, can now be configured to only trigger a warning alert if they fail. Signed-off-by: beorn7 <beorn@grafana.com>
This commit is contained in:
parent
9e1e4fa420
commit
ce108378d4
|
@ -60,10 +60,10 @@
|
|||
{
|
||||
alert: 'AlertmanagerClusterFailedToSendAlerts',
|
||||
expr: |||
|
||||
min by (%(alertmanagerClusterLabels)s) (
|
||||
rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
|
||||
min by (%(alertmanagerClusterLabels)s, integration) (
|
||||
rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
|
||||
rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
||| % $._config,
|
||||
|
@ -72,7 +72,26 @@
|
|||
severity: 'critical',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'All Alertmanager instances in a cluster failed to send notifications.',
|
||||
summary: 'All Alertmanager instances in a cluster failed to send notifications to a critical integration.',
|
||||
description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'AlertmanagerClusterFailedToSendAlerts',
|
||||
expr: |||
|
||||
min by (%(alertmanagerClusterLabels)s, integration) (
|
||||
rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
|
||||
/
|
||||
rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
|
||||
)
|
||||
> 0.01
|
||||
||| % $._config,
|
||||
'for': '5m',
|
||||
labels: {
|
||||
severity: 'warning',
|
||||
},
|
||||
annotations: {
|
||||
summary: 'All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.',
|
||||
description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
|
||||
},
|
||||
},
|
||||
|
|
|
@ -24,5 +24,13 @@
|
|||
// Alertmanager cluster. All labels used here must also be present
|
||||
// in alertmanagerClusterLabels above.
|
||||
alertmanagerClusterName: '{{$labels.job}}',
|
||||
|
||||
// alertmanagerCriticalIntegrationsRegEx is matched against the
|
||||
// value of the `integration` label to determine if the
|
||||
// AlertmanagerClusterFailedToSendAlerts is critical or merely a
|
||||
// warning. This can be used to avoid paging about a failed
|
||||
// integration that is itself not used for critical alerts.
|
||||
// Example: @'pagerduty|webhook'
|
||||
alertmanagerCriticalIntegrationsRegEx: @'.*',
|
||||
},
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue