diff --git a/.circleci/config.yml b/.circleci/config.yml
index 8a626c5e..aa6b45a6 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -99,6 +99,17 @@ jobs:
     - store_test_results:
         path: test-results
 
+  mixin:
+    docker:
+      # Whenever the Go version is updated here, .promu.yml should also be updated.
+      - image: circleci/golang:1.14
+
+    steps:
+      - checkout
+      - run: cd doc/alertmanager-mixin; go install github.com/monitoring-mixins/mixtool/cmd/mixtool
+      - run: cd doc/alertmanager-mixin; go install github.com/google/go-jsonnet/cmd/jsonnetfmt
+      - run: cd doc/alertmanager-mixin; make lint
+
 workflows:
   version: 2
   alertmanager:
@@ -116,6 +127,10 @@
       filters:
         tags:
           only: /.*/
+    - mixin:
+        filters:
+          tags:
+            only: /.*/
     - prometheus/publish_master:
         context: org-context
         requires:
diff --git a/doc/alertmanager-mixin/Makefile b/doc/alertmanager-mixin/Makefile
new file mode 100644
index 00000000..360ddf2d
--- /dev/null
+++ b/doc/alertmanager-mixin/Makefile
@@ -0,0 +1,24 @@
+JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
+ALERTMANAGER_ALERTS := alertmanager_alerts.yaml
+
+default: build
+
+all: fmt build
+
+fmt:
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		xargs -n 1 -- $(JSONNET_FMT) -i
+
+lint: build
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		while read f; do \
+			$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
+		done
+
+	mixtool lint mixin.libsonnet
+
+build:
+	mixtool generate alerts mixin.libsonnet > $(ALERTMANAGER_ALERTS)
+
+clean:
+	rm -rf $(ALERTMANAGER_ALERTS)
diff --git a/doc/alertmanager-mixin/README.md b/doc/alertmanager-mixin/README.md
new file mode 100644
index 00000000..d3190d75
--- /dev/null
+++ b/doc/alertmanager-mixin/README.md
@@ -0,0 +1,30 @@
+# Alertmanager Mixin
+
+The Alertmanager Mixin is a set of configurable, reusable, and extensible
+alerts (and eventually dashboards) for Alertmanager.
+
+The alerts are designed to monitor a cluster of Alertmanager instances. To make
+them work as expected, the Prometheus server the alerts are evaluated on has to
+scrape all Alertmanager instances of the cluster, even if those instances are
+distributed over different locations. All Alertmanager instances in the same
+Alertmanager cluster must have the same `job` label. In turn, if monitoring
+multiple different Alertmanager clusters, instances from different clusters
+must have a different `job` label.
+
+The most basic use of the Alertmanager Mixin is to create a YAML file with the
+alerts from it. To do so, you need to have `jsonnetfmt` and `mixtool` installed.
+If you have a working Go development environment, it's easiest to run the following:
+
+```bash
+$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool
+$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt
+```
+
+Edit `config.libsonnet` to match your environment and then build
+`alertmanager_alerts.yaml` with the alerts by running:
+
+```bash
+$ make build
+```
+
+For instructions on more advanced uses of mixins, see https://github.com/monitoring-mixins/docs.
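To make the README's "Edit `config.libsonnet` to match your environment" step concrete, the sketch below shows what the `_config` fields could look like for a Kubernetes / Prometheus Operator setup. The selector and label values are assumptions for illustration only; the `{{$labels.namespace}}/{{$labels.pod}}` naming is the one suggested in the comments of `config.libsonnet` further down in this diff.

```jsonnet
// Hypothetical edit of config.libsonnet for a Prometheus Operator environment.
// The concrete label names and values are assumptions; adapt them to the labels
// your Prometheus actually attaches to Alertmanager targets.
{
  _config+:: {
    // Selector matching the metrics scraped from the Alertmanager pods.
    alertmanagerSelector: 'job="alertmanager-main",namespace="monitoring"',
    // Labels shared by all members of the same Alertmanager cluster.
    alertmanagerClusterLabels: 'job,namespace',
    // Per-instance naming suggested in config.libsonnet for the Prometheus Operator.
    alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}',
    // Must only use labels listed in alertmanagerClusterLabels.
    alertmanagerClusterName: '{{$labels.namespace}}/{{$labels.job}}',
  },
}
```

Running `make build` afterwards regenerates `alertmanager_alerts.yaml` with these values substituted into the alert expressions and annotations.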
diff --git a/doc/alertmanager-mixin/alerts.jsonnet b/doc/alertmanager-mixin/alerts.jsonnet
new file mode 100644
index 00000000..75e7c1b2
--- /dev/null
+++ b/doc/alertmanager-mixin/alerts.jsonnet
@@ -0,0 +1 @@
+std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
diff --git a/doc/alertmanager-mixin/alerts.libsonnet b/doc/alertmanager-mixin/alerts.libsonnet
new file mode 100644
index 00000000..eb57ace4
--- /dev/null
+++ b/doc/alertmanager-mixin/alerts.libsonnet
@@ -0,0 +1,150 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'alertmanager.rules',
+        rules: [
+          {
+            alert: 'AlertmanagerFailedReload',
+            expr: |||
+              # Without max_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+              max_over_time(alertmanager_config_last_reload_successful{%(alertmanagerSelector)s}[5m]) == 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Reloading an Alertmanager configuration has failed.',
+              description: 'Configuration has failed to load for %(alertmanagerName)s.' % $._config,
+            },
+          },
+          {
+            alert: 'AlertmanagerMembersInconsistent',
+            expr: |||
+              # Without max_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+                max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m])
+              < on (%(alertmanagerClusterLabels)s) group_left
+                count by (%(alertmanagerClusterLabels)s) (max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m]))
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'A member of an Alertmanager cluster has not found all other cluster members.',
+              description: 'Alertmanager %(alertmanagerName)s has only found {{ $value }} members of the %(alertmanagerClusterName)s cluster.' % $._config,
+            },
+          },
+          {
+            alert: 'AlertmanagerFailedToSendAlerts',
+            expr: |||
+              (
+                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
+              /
+                rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
+              )
+              > 0.01
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'An Alertmanager instance failed to send notifications.',
+              description: 'Alertmanager %(alertmanagerName)s failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.' % $._config,
+            },
+          },
+          {
+            alert: 'AlertmanagerClusterFailedToSendAlerts',
+            expr: |||
+              min by (%(alertmanagerClusterLabels)s) (
+                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
+              /
+                rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
+              )
+              > 0.01
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'All Alertmanager instances in a cluster failed to send notifications.',
+              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
+            },
+          },
+          {
+            alert: 'AlertmanagerConfigInconsistent',
+            expr: |||
+              count by (%(alertmanagerClusterLabels)s) (
+                count_values by (%(alertmanagerClusterLabels)s) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})
+              )
+              != 1
+            ||| % $._config,
+            'for': '20m',  // A config change across an Alertmanager cluster can take its time. But it's really bad if it persists for too long.
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Alertmanager instances within the same cluster have different configurations.',
+              description: 'Alertmanager instances within the %(alertmanagerClusterName)s cluster have different configurations.' % $._config,
+            },
+          },
+          // Both the following critical alerts, AlertmanagerClusterDown and
+          // AlertmanagerClusterCrashlooping, fire if a whole cluster is
+          // unhealthy. It is implied that a generic warning alert is in place
+          // for individual instances being down or crashlooping.
+          {
+            alert: 'AlertmanagerClusterDown',
+            expr: |||
+              (
+                count by (%(alertmanagerClusterLabels)s) (
+                  avg_over_time(up{%(alertmanagerSelector)s}[5m]) < 0.5
+                )
+              /
+                count by (%(alertmanagerClusterLabels)s) (
+                  up{%(alertmanagerSelector)s}
+                )
+              )
+              >= 0.5
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Half or more of the Alertmanager instances within the same cluster are down.',
+              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have been up for less than half of the last 5m.' % $._config,
+            },
+          },
+          {
+            alert: 'AlertmanagerClusterCrashlooping',
+            expr: |||
+              (
+                count by (%(alertmanagerClusterLabels)s) (
+                  changes(process_start_time_seconds{%(alertmanagerSelector)s}[10m]) > 4
+                )
+              /
+                count by (%(alertmanagerClusterLabels)s) (
+                  up{%(alertmanagerSelector)s}
+                )
+              )
+              >= 0.5
+            ||| % $._config,
+            'for': '5m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Half or more of the Alertmanager instances within the same cluster are crashlooping.',
+              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have restarted at least 5 times in the last 10m.' % $._config,
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
diff --git a/doc/alertmanager-mixin/config.libsonnet b/doc/alertmanager-mixin/config.libsonnet
new file mode 100644
index 00000000..001886ca
--- /dev/null
+++ b/doc/alertmanager-mixin/config.libsonnet
@@ -0,0 +1,28 @@
+{
+  _config+:: {
+    // alertmanagerSelector is inserted as part of the label selector in
+    // PromQL queries to identify metrics collected from Alertmanager
+    // servers.
+    alertmanagerSelector: 'job="alertmanager"',
+
+    // alertmanagerClusterLabels is a string with comma-separated
+    // labels that are common labels of instances belonging to the
+    // same Alertmanager cluster. Include not only enough labels to
+    // identify cluster members, but also all common labels you want
+    // to keep for resulting cluster-level alerts.
+    alertmanagerClusterLabels: 'job',
+
+    // alertmanagerName is inserted into annotations to name the Alertmanager
+    // instance affected by the alert.
+    alertmanagerName: '{{$labels.instance}}',
+    // If you run Alertmanager on Kubernetes with the Prometheus
+    // Operator, you can make use of the configured target labels for
+    // nicer naming:
+    // alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}'
+
+    // alertmanagerClusterName is inserted into annotations to name an
+    // Alertmanager cluster. All labels used here must also be present
+    // in alertmanagerClusterLabels above.
+    alertmanagerClusterName: '{{$labels.job}}',
+  },
+}
diff --git a/doc/alertmanager-mixin/go.mod b/doc/alertmanager-mixin/go.mod
new file mode 100644
index 00000000..e4d814ac
--- /dev/null
+++ b/doc/alertmanager-mixin/go.mod
@@ -0,0 +1,8 @@
+module github.com/prometheus/alertmanager/doc/alertmanager-mixin
+
+go 1.14
+
+require (
+	github.com/google/go-jsonnet v0.17.0 // indirect
+	github.com/monitoring-mixins/mixtool v0.0.0-20201127170310-63dca667103c // indirect
+)
diff --git a/doc/alertmanager-mixin/mixin.libsonnet b/doc/alertmanager-mixin/mixin.libsonnet
new file mode 100644
index 00000000..95efe331
--- /dev/null
+++ b/doc/alertmanager-mixin/mixin.libsonnet
@@ -0,0 +1,2 @@
+(import 'config.libsonnet') +
+(import 'alerts.libsonnet')
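Because `mixin.libsonnet` is simply the sum of `config.libsonnet` and `alerts.libsonnet`, downstream users can extend the mixin with the same `+` composition. The sketch below is not part of this change: the file name, group name, and `AlertmanagerInstanceDown` alert are hypothetical, and merely illustrate the kind of per-instance warning alert that the comment in `alerts.libsonnet` assumes is in place elsewhere.

```jsonnet
// Hypothetical my-mixin.libsonnet: extends the mixin with an extra rule group.
// Everything below the import is illustrative and not part of this diff.
(import 'mixin.libsonnet') + {
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager-extras.rules',
        rules: [
          {
            // Per-instance counterpart to the cluster-level AlertmanagerClusterDown alert.
            alert: 'AlertmanagerInstanceDown',
            expr: 'up{%(alertmanagerSelector)s} == 0' % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'An Alertmanager instance is down.',
              description: 'Alertmanager %(alertmanagerName)s has been down for more than 5 minutes.' % $._config,
            },
          },
        ],
      },
    ],
  },
}
```

Generating alerts from such a file with `mixtool generate alerts my-mixin.libsonnet` would then emit both the upstream `alertmanager.rules` group and the extra group in one YAML document.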