alertmanager/doc/alertmanager-mixin/alerts.libsonnet

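// Prometheus alerting rules for monitoring Alertmanager itself. All metric
// selectors, cluster labels, and display names are templated from $._config,
// which the surrounding mixin is expected to provide.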
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager.rules',
        rules: [
          {
            alert: 'AlertmanagerFailedReload',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{%(alertmanagerSelector)s}[5m]) == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Reloading an Alertmanager configuration has failed.',
              description: 'Configuration has failed to load for %(alertmanagerName)s.' % $._config,
            },
          },
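          // Each instance reports how many cluster members it can see; the alert
          // fires when that number stays below the count of instances sharing the
          // same cluster labels, i.e. the instance's view of the cluster is incomplete.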
          {
            alert: 'AlertmanagerMembersInconsistent',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m])
                < on (%(alertmanagerClusterLabels)s) group_left
              count by (%(alertmanagerClusterLabels)s) (max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m]))
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'A member of an Alertmanager cluster has not found all other cluster members.',
              description: 'Alertmanager %(alertmanagerName)s has only found {{ $value }} members of the %(alertmanagerClusterName)s cluster.' % $._config,
            },
          },
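          // Ratio of failed to attempted notifications per instance and integration.
          // The failed counter carries an extra `reason` label that the total counter
          // lacks, so the match drops it via `ignoring (reason) group_left` before dividing.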
          {
            alert: 'AlertmanagerFailedToSendAlerts',
            expr: |||
              (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'An Alertmanager instance failed to send notifications.',
              description: 'Alertmanager %(alertmanagerName)s failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.' % $._config,
            },
          },
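          // The next two rules take the minimum failure ratio across all instances of a
          // cluster: they only fire when every instance is failing, i.e. the cluster as
          // a whole cannot deliver to the integration. This one covers integrations
          // matched by the critical-integrations regex and is therefore critical.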
          {
            alert: 'AlertmanagerClusterFailedToSendAlerts',
            expr: |||
              min by (%(alertmanagerClusterLabels)s, integration) (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'All Alertmanager instances in a cluster failed to send notifications to a critical integration.',
              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
            },
          },
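          // Same expression as above, but for integrations not matched by the
          // critical-integrations regex, so it only fires at warning severity.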
          {
            alert: 'AlertmanagerClusterFailedToSendAlerts',
            expr: |||
              min by (%(alertmanagerClusterLabels)s, integration) (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.',
              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
            },
          },
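          // count_values produces one series per distinct config hash and cluster;
          // counting those series yields the number of distinct configurations in the
          // cluster, which must be exactly 1 when all instances agree.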
          {
            alert: 'AlertmanagerConfigInconsistent',
            expr: |||
              count by (%(alertmanagerClusterLabels)s) (
                count_values by (%(alertmanagerClusterLabels)s) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})
              )
              != 1
            ||| % $._config,
            'for': '20m',  // A config change across an Alertmanager cluster can take its time. But it's really bad if it persists for too long.
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Alertmanager instances within the same cluster have different configurations.',
              description: 'Alertmanager instances within the %(alertmanagerClusterName)s cluster have different configurations.' % $._config,
            },
          },
          // Both the following critical alerts, AlertmanagerClusterDown and
          // AlertmanagerClusterCrashlooping, fire if a whole cluster is
          // unhealthy. It is implied that a generic warning alert is in place
          // for individual instances being down or crashlooping.
          {
            alert: 'AlertmanagerClusterDown',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  avg_over_time(up{%(alertmanagerSelector)s}[5m]) < 0.5
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are down.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have been up for less than half of the last 5m.' % $._config,
            },
          },
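          // changes() > 4 means at least 5 process starts within 10m; the ratio against
          // the number of scraped instances mirrors AlertmanagerClusterDown above.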
          {
            alert: 'AlertmanagerClusterCrashlooping',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  changes(process_start_time_seconds{%(alertmanagerSelector)s}[10m]) > 4
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are crashlooping.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have restarted at least 5 times in the last 10m.' % $._config,
            },
          },
        ],
      },
    ],
  },
}
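// The expressions and annotations above are templated from $._config, which must
// supply alertmanagerSelector, alertmanagerClusterLabels, alertmanagerName,
// alertmanagerClusterName, and alertmanagerCriticalIntegrationsRegEx. A minimal,
// illustrative sketch of consuming this file follows; the concrete values and the
// file name `example.jsonnet` are assumptions, not the mixin's actual defaults:
//
//   // example.jsonnet
//   (import 'alerts.libsonnet') + {
//     _config+:: {
//       alertmanagerSelector: 'job="alertmanager"',
//       alertmanagerClusterLabels: 'job',
//       alertmanagerName: '{{ $labels.instance }}',
//       alertmanagerClusterName: '{{ $labels.job }}',
//       alertmanagerCriticalIntegrationsRegEx: 'pagerduty|email',
//     },
//   }
//
// The resulting rule group can then be rendered to YAML for Prometheus with the
// jsonnet CLI and std.manifestYamlDoc, for example:
//
//   jsonnet -S -e 'std.manifestYamlDoc((import "example.jsonnet").prometheusAlerts)' > alertmanager_alerts.yaml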