summary: 'All Alertmanager instances in a cluster failed to send notifications to a critical integration.',
description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
},
},
{
alert: 'AlertmanagerClusterFailedToSendAlerts',
expr: |||
min by (%(alertmanagerClusterLabels)s, integration) (
description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
},
},
{
// Critical alert for a cluster whose Alertmanager instances do not all report
// the same configuration hash.
alert: 'AlertmanagerConfigInconsistent',
// count_values emits one series per distinct alertmanager_config_hash value
// within each %(alertmanagerClusterLabels)s grouping; the outer count is
// therefore the number of distinct configurations in that grouping, and any
// result other than 1 means the instances disagree.
// The %(...)s placeholders are filled in from $._config by the `%` operator.
expr: |||
count by (%(alertmanagerClusterLabels)s) (
count_values by (%(alertmanagerClusterLabels)s) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})
)
!= 1
||| % $._config,
// The key is quoted because `for` is a reserved word in Jsonnet.
'for': '20m', // A config change across an Alertmanager cluster can take its time. But it's really bad if it persists for too long.
labels: {
severity: 'critical',
},
annotations: {
// The description interpolates %(alertmanagerClusterName)s from $._config
// when the Jsonnet is evaluated; the summary is a fixed string.
summary: 'Alertmanager instances within the same cluster have different configurations.',
description: 'Alertmanager instances within the %(alertmanagerClusterName)s cluster have different configurations.' % $._config,
},
},
// Both of the following critical alerts, AlertmanagerClusterDown and
// AlertmanagerClusterCrashlooping, fire if half or more of the instances
// in a cluster are unhealthy. It is implied that a generic warning alert
// is in place for individual instances being down or crashlooping.
summary: 'Half or more of the Alertmanager instances within the same cluster are down.',
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have been up for less than half of the last 5m.' % $._config,
summary: 'Half or more of the Alertmanager instances within the same cluster are crashlooping.',
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have restarted at least 5 times in the last 10m.' % $._config,