alertmanager/doc/alertmanager-mixin/alerts.libsonnet

{
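  // Every expression and annotation below is templated from $._config
  // (alertmanagerSelector, alertmanagerClusterLabels, alertmanagerName,
  // alertmanagerClusterName, alertmanagerCriticalIntegrationsRegEx),
  // which the mixin's config (typically config.libsonnet) is assumed to provide.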
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager.rules',
        rules: [
          {
            alert: 'AlertmanagerFailedReload',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{%(alertmanagerSelector)s}[5m]) == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Reloading an Alertmanager configuration has failed.',
              description: 'Configuration has failed to load for %(alertmanagerName)s.' % $._config,
            },
          },
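          // Each instance exports how many cluster members it currently sees
          // (alertmanager_cluster_members); comparing that against the number
          // of instances carrying the same cluster labels flags any member
          // that has lost sight of part of the cluster.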
          {
            alert: 'AlertmanagerMembersInconsistent',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m])
                < on (%(alertmanagerClusterLabels)s) group_left
                  count by (%(alertmanagerClusterLabels)s) (max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m]))
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'A member of an Alertmanager cluster has not found all other cluster members.',
              description: 'Alertmanager %(alertmanagerName)s has only found {{ $value }} members of the %(alertmanagerClusterName)s cluster.' % $._config,
            },
          },
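          // Ratio of failed to attempted notifications per integration.
          // alertmanager_notifications_failed_total carries an extra `reason`
          // label, hence the `ignoring (reason) group_left` when dividing by
          // the total counter.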
          {
            alert: 'AlertmanagerFailedToSendAlerts',
            expr: |||
              (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'An Alertmanager instance failed to send notifications.',
              description: 'Alertmanager %(alertmanagerName)s failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.' % $._config,
            },
          },
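          // Taking the minimum failure rate across all instances of a cluster
          // means this only fires when every instance is failing, i.e. the
          // critical integration is effectively unreachable for the whole
          // cluster.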
          {
            alert: 'AlertmanagerClusterFailedToSendAlerts',
            expr: |||
              min by (%(alertmanagerClusterLabels)s, integration) (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration=~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'All Alertmanager instances in a cluster failed to send notifications to a critical integration.',
              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
            },
          },
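          // Same expression as above, but for integrations that do not match
          // the critical regex, so it fires at warning severity only.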
          {
            alert: 'AlertmanagerClusterFailedToSendAlerts',
            expr: |||
              min by (%(alertmanagerClusterLabels)s, integration) (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              /
                ignoring (reason) group_left rate(alertmanager_notifications_total{%(alertmanagerSelector)s, integration!~`%(alertmanagerCriticalIntegrationsRegEx)s`}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'All Alertmanager instances in a cluster failed to send notifications to a non-critical integration.',
              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
            },
          },
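          // count_values groups instances by their config hash; more than one
          // distinct hash within a cluster means the members are running
          // different configurations.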
          {
            alert: 'AlertmanagerConfigInconsistent',
            expr: |||
              count by (%(alertmanagerClusterLabels)s) (
                count_values by (%(alertmanagerClusterLabels)s) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})
              )
              != 1
            ||| % $._config,
            'for': '20m',  // A config change across an Alertmanager cluster can take its time. But it's really bad if it persists for too long.
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Alertmanager instances within the same cluster have different configurations.',
              description: 'Alertmanager instances within the %(alertmanagerClusterName)s cluster have different configurations.' % $._config,
            },
          },
          // Both the following critical alerts, AlertmanagerClusterDown and
          // AlertmanagerClusterCrashlooping, fire if a whole cluster is
          // unhealthy. It is implied that a generic warning alert is in place
          // for individual instances being down or crashlooping.
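          // avg_over_time(up[5m]) < 0.5 marks an instance that was down for
          // more than half of the last five minutes; the alert fires when
          // half or more of the cluster's instances are in that state.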
          {
            alert: 'AlertmanagerClusterDown',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  avg_over_time(up{%(alertmanagerSelector)s}[5m]) < 0.5
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are down.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have been up for less than half of the last 5m.' % $._config,
            },
          },
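          // More than 4 changes of process_start_time_seconds in 10m means an
          // instance has restarted at least 5 times, i.e. it is crashlooping;
          // the alert fires when half or more of the cluster is affected.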
          {
            alert: 'AlertmanagerClusterCrashlooping',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  changes(process_start_time_seconds{%(alertmanagerSelector)s}[10m]) > 4
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are crashlooping.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have restarted at least 5 times in the last 10m.' % $._config,
            },
          },
        ],
      },
    ],
  },
}