Beginnings of an Alertmanager mixin. (#1629)

Add an Alertmanager mixin

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>
Co-authored-by: beorn7 <beorn@grafana.com>
Co-authored-by: Simon Pasquier <spasquie@redhat.com>
Tom Wilkie 2020-12-03 14:57:42 +00:00 committed by GitHub
parent 042d7b34f0
commit 6c5dee008f
8 changed files with 258 additions and 0 deletions


@@ -99,6 +99,17 @@ jobs:
      - store_test_results:
          path: test-results

  mixin:
    docker:
      # Whenever the Go version is updated here, .promu.yml should also be updated.
      - image: circleci/golang:1.14
    steps:
      - checkout
      - run: cd doc/alertmanager-mixin; go install github.com/monitoring-mixins/mixtool/cmd/mixtool
      - run: cd doc/alertmanager-mixin; go install github.com/google/go-jsonnet/cmd/jsonnetfmt
      - run: cd doc/alertmanager-mixin; make lint

workflows:
  version: 2
  alertmanager:
@@ -116,6 +127,10 @@ workflows:
          filters:
            tags:
              only: /.*/
      - mixin:
          filters:
            tags:
              only: /.*/
      - prometheus/publish_master:
          context: org-context
          requires:


@@ -0,0 +1,24 @@
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s

ALERTMANAGER_ALERTS := alertmanager_alerts.yaml

default: build

all: fmt build

fmt:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

lint: build
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		while read f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
		done
	mixtool lint mixin.libsonnet

build:
	mixtool generate alerts mixin.libsonnet > $(ALERTMANAGER_ALERTS)

clean:
	rm -rf $(ALERTMANAGER_ALERTS)


@@ -0,0 +1,30 @@
# Alertmanager Mixin

The Alertmanager Mixin is a set of configurable, reusable, and extensible
alerts (and eventually dashboards) for Alertmanager.

The alerts are designed to monitor a cluster of Alertmanager instances. To make
them work as expected, the Prometheus server the alerts are evaluated on has to
scrape all Alertmanager instances of the cluster, even if those instances are
distributed over different locations. All Alertmanager instances in the same
Alertmanager cluster must have the same `job` label. In turn, if monitoring
multiple different Alertmanager clusters, instances from different clusters
must have a different `job` label.

The most basic use of the Alertmanager Mixin is to create a YAML file with the
alerts from it. To do so, you need to have `jsonnetfmt` and `mixtool` installed.
If you have a working Go development environment, it's easiest to run the
following:

```bash
$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool
$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt
```

Edit `config.libsonnet` to match your environment and then build
`alertmanager_alerts.yaml` with the alerts by running:

```bash
$ make build
```

For instructions on more advanced uses of mixins, see
https://github.com/monitoring-mixins/docs.
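
Because `config.libsonnet` declares everything under `_config+::`, site-specific settings can also be layered on top of the mixin in a separate file instead of being edited in place. The following is a minimal sketch, not part of this commit: the file name `custom.libsonnet` and the selector value are assumptions. It adjusts `alertmanagerSelector` for the case where the Alertmanager cluster is scraped under a `job` label other than the default `job="alertmanager"`, and it could be fed to `mixtool generate alerts` in the same way the Makefile uses `mixin.libsonnet`.

```jsonnet
// custom.libsonnet (hypothetical file name): overlay site-specific settings
// on top of the mixin instead of editing config.libsonnet in place.
(import 'mixin.libsonnet') + {
  _config+:: {
    // Assumed value; must match the job label your Prometheus server uses
    // when scraping all members of the Alertmanager cluster.
    alertmanagerSelector: 'job="alertmanager-main"',
  },
}
```

Since both layers use `+::`, the override merges into the defaults from `config.libsonnet` rather than replacing the whole `_config` object.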


@@ -0,0 +1 @@
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)


@@ -0,0 +1,150 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager.rules',
        rules: [
          {
            alert: 'AlertmanagerFailedReload',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{%(alertmanagerSelector)s}[5m]) == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Reloading an Alertmanager configuration has failed.',
              description: 'Configuration has failed to load for %(alertmanagerName)s.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerMembersInconsistent',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m])
                < on (%(alertmanagerClusterLabels)s) group_left
              count by (%(alertmanagerClusterLabels)s) (max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m]))
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'A member of an Alertmanager cluster has not found all other cluster members.',
              description: 'Alertmanager %(alertmanagerName)s has only found {{ $value }} members of the %(alertmanagerClusterName)s cluster.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerFailedToSendAlerts',
            expr: |||
              (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
              /
                rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'An Alertmanager instance failed to send notifications.',
              description: 'Alertmanager %(alertmanagerName)s failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerClusterFailedToSendAlerts',
            expr: |||
              min by (%(alertmanagerClusterLabels)s) (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
              /
                rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'All Alertmanager instances in a cluster failed to send notifications.',
              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerConfigInconsistent',
            expr: |||
              count by (%(alertmanagerClusterLabels)s) (
                count_values by (%(alertmanagerClusterLabels)s) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})
              )
              != 1
            ||| % $._config,
            'for': '20m',  // A config change across an Alertmanager cluster can take its time. But it's really bad if it persists for too long.
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Alertmanager instances within the same cluster have different configurations.',
              description: 'Alertmanager instances within the %(alertmanagerClusterName)s cluster have different configurations.' % $._config,
            },
          },
          // Both the following critical alerts, AlertmanagerClusterDown and
          // AlertmanagerClusterCrashlooping, fire if a whole cluster is
          // unhealthy. It is implied that a generic warning alert is in place
          // for individual instances being down or crashlooping.
          {
            alert: 'AlertmanagerClusterDown',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  avg_over_time(up{%(alertmanagerSelector)s}[5m]) < 0.5
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are down.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have been up for less than half of the last 5m.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerClusterCrashlooping',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  changes(process_start_time_seconds{%(alertmanagerSelector)s}[10m]) > 4
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are crashlooping.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have restarted at least 5 times in the last 10m.' % $._config,
            },
          },
        ],
      },
    ],
  },
}
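
The comment before AlertmanagerClusterDown notes that a generic warning alert for individual instances being down or crashlooping is assumed to exist elsewhere. Purely as an illustration, and not part of this commit, such a per-instance alert could be layered on top of the mixin with the same `prometheusAlerts+:: { groups+: [...] }` pattern and `% $._config` templating used throughout this file; the group name, alert name, and thresholds below are assumptions.

```jsonnet
// Illustrative sketch only: a per-instance "down" alert layered on top of
// the mixin, reusing its _config selectors and naming templates.
(import 'mixin.libsonnet') + {
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager-instance.rules',  // hypothetical group name
        rules: [
          {
            alert: 'AlertmanagerInstanceDown',  // hypothetical alert name
            expr: 'up{%(alertmanagerSelector)s} == 0' % $._config,
            'for': '5m',
            labels: { severity: 'warning' },
            annotations: {
              summary: 'An Alertmanager instance is down.',
              description: 'Alertmanager %(alertmanagerName)s has been unreachable for more than 5 minutes.' % $._config,
            },
          },
        ],
      },
    ],
  },
}
```

Rendering such a file with `mixtool generate alerts` (as the Makefile does for `mixin.libsonnet`) would be expected to emit the extra group alongside the mixin's own `alertmanager.rules` group.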


@@ -0,0 +1,28 @@
{
  _config+:: {
    // alertmanagerSelector is inserted as part of the label selector in
    // PromQL queries to identify metrics collected from Alertmanager
    // servers.
    alertmanagerSelector: 'job="alertmanager"',

    // alertmanagerClusterLabels is a string with comma-separated
    // labels that are common labels of instances belonging to the
    // same Alertmanager cluster. Include not only enough labels to
    // identify cluster members, but also all common labels you want
    // to keep for resulting cluster-level alerts.
    alertmanagerClusterLabels: 'job',

    // alertmanagerName is inserted into annotations to name the Alertmanager
    // instance affected by the alert.
    alertmanagerName: '{{$labels.instance}}',
    // If you run Alertmanager on Kubernetes with the Prometheus
    // Operator, you can make use of the configured target labels for
    // nicer naming:
    // alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}'

    // alertmanagerClusterName is inserted into annotations to name an
    // Alertmanager cluster. All labels used here must also be present
    // in alertmanagerClusterLabels above.
    alertmanagerClusterName: '{{$labels.job}}',
  },
}
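
To make the Kubernetes / Prometheus Operator hint in the comments above concrete, the naming options could be overridden roughly as follows. This is a sketch under assumed target labels: `namespace` and `pod` come from the comment above, while the selector value and the choice of cluster labels are assumptions about a particular setup, not something prescribed by this commit.

```jsonnet
// Sketch of config overrides for a Prometheus Operator style setup; adjust
// the label names and values to the target labels your setup actually has.
(import 'mixin.libsonnet') + {
  _config+:: {
    alertmanagerSelector: 'job="alertmanager-main",namespace="monitoring"',  // assumed values
    alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}',
    // Every label referenced in alertmanagerClusterName must be listed here.
    alertmanagerClusterLabels: 'job, namespace',
    alertmanagerClusterName: '{{$labels.job}} in {{$labels.namespace}}',
  },
}
```

The requirement stated in the comments still holds here: both `job` and `namespace` appear in `alertmanagerClusterLabels`, so `alertmanagerClusterName` may reference them.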


@@ -0,0 +1,8 @@
module github.com/prometheus/alertmanager/doc/alertmanager-mixin

go 1.14

require (
	github.com/google/go-jsonnet v0.17.0 // indirect
	github.com/monitoring-mixins/mixtool v0.0.0-20201127170310-63dca667103c // indirect
)


@@ -0,0 +1,2 @@
(import 'config.libsonnet') +
(import 'alerts.libsonnet')