mirror of
https://github.com/prometheus/alertmanager
synced 2025-02-16 10:37:09 +00:00
Beginnings of an Alertmanager mixin. (#1629)
Add an Alertmanager mixin

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>
Co-authored-by: beorn7 <beorn@grafana.com>
Co-authored-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
parent
042d7b34f0
commit
6c5dee008f
@ -99,6 +99,17 @@ jobs:
|
|||||||
- store_test_results:
|
- store_test_results:
|
||||||
path: test-results
|
path: test-results
|
||||||
|
|
||||||
|
mixin:
|
||||||
|
docker:
|
||||||
|
# Whenever the Go version is updated here, .promu.yml should also be updated.
|
||||||
|
- image: circleci/golang:1.14
|
||||||
|
|
||||||
|
steps:
|
||||||
|
- checkout
|
||||||
|
- run: cd doc/alertmanager-mixin; go install github.com/monitoring-mixins/mixtool/cmd/mixtool
|
||||||
|
- run: cd doc/alertmanager-mixin; go install github.com/google/go-jsonnet/cmd/jsonnetfmt
|
||||||
|
- run: cd doc/alertmanager-mixin; make lint
|
||||||
|
|
||||||
workflows:
|
workflows:
|
||||||
version: 2
|
version: 2
|
||||||
alertmanager:
|
alertmanager:
|
||||||
@ -116,6 +127,10 @@ workflows:
|
|||||||
filters:
|
filters:
|
||||||
tags:
|
tags:
|
||||||
only: /.*/
|
only: /.*/
|
||||||
|
- mixin:
|
||||||
|
filters:
|
||||||
|
tags:
|
||||||
|
only: /.*/
|
||||||
- prometheus/publish_master:
|
- prometheus/publish_master:
|
||||||
context: org-context
|
context: org-context
|
||||||
requires:
|
requires:
|
||||||
|
24
doc/alertmanager-mixin/Makefile
Normal file
24
doc/alertmanager-mixin/Makefile
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
|
||||||
|
ALERTMANAGER_ALERTS := alertmanager_alerts.yaml
|
||||||
|
|
||||||
|
default: build
|
||||||
|
|
||||||
|
all: fmt build
|
||||||
|
|
||||||
|
fmt:
|
||||||
|
find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
|
||||||
|
xargs -n 1 -- $(JSONNET_FMT) -i
|
||||||
|
|
||||||
|
# lint verifies that every Jsonnet source (outside vendor/) is already
# formatted the way `make fmt` would format it, then runs mixtool's own linter.
#
# The exit status of a shell `while` loop is that of its LAST iteration, so a
# plain `while ...; do diff ...; done` only fails when the last file listed by
# find is badly formatted. We therefore remember any diff failure in `rc` and
# exit with it once all files have been checked, so every offending file is
# still printed and the target reliably fails.
lint: build
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
	{ \
		rc=0; \
		while read -r f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" - || rc=1; \
		done; \
		exit $$rc; \
	}

	mixtool lint mixin.libsonnet
|
||||||
|
|
||||||
|
build:
|
||||||
|
mixtool generate alerts mixin.libsonnet > $(ALERTMANAGER_ALERTS)
|
||||||
|
|
||||||
|
clean:
|
||||||
|
rm -rf $(ALERTMANAGER_ALERTS)
|
30
doc/alertmanager-mixin/README.md
Normal file
30
doc/alertmanager-mixin/README.md
Normal file
@ -0,0 +1,30 @@
|
|||||||
|
# Alertmanager Mixin
|
||||||
|
|
||||||
|
The Alertmanager Mixin is a set of configurable, reusable, and extensible
|
||||||
|
alerts (and eventually dashboards) for Alertmanager.
|
||||||
|
|
||||||
|
The alerts are designed to monitor a cluster of Alertmanager instances. To make
|
||||||
|
them work as expected, the Prometheus server the alerts are evaluated on has to
|
||||||
|
scrape all Alertmanager instances of the cluster, even if those instances are
|
||||||
|
distributed over different locations. All Alertmanager instances in the same
|
||||||
|
Alertmanager cluster must have the same `job` label. In turn, if monitoring
|
||||||
|
multiple different Alertmanager clusters, instances from different clusters
|
||||||
|
must have a different `job` label.
|
||||||
|
|
||||||
|
The most basic use of the Alertmanager Mixin is to create a YAML file with the
|
||||||
|
alerts from it. To do so, you need to have `jsonnetfmt` and `mixtool` installed. If you have a working Go development environment, it's
|
||||||
|
easiest to run the following:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool
|
||||||
|
$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt
|
||||||
|
```
|
||||||
|
|
||||||
|
Edit `config.libsonnet` to match your environment and then build
|
||||||
|
`alertmanager_alerts.yaml` with the alerts by running:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ make build
|
||||||
|
```
|
||||||
|
|
||||||
|
For instructions on more advanced uses of mixins, see https://github.com/monitoring-mixins/docs.
|
1
doc/alertmanager-mixin/alerts.jsonnet
Normal file
1
doc/alertmanager-mixin/alerts.jsonnet
Normal file
@ -0,0 +1 @@
|
|||||||
|
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
|
150
doc/alertmanager-mixin/alerts.libsonnet
Normal file
150
doc/alertmanager-mixin/alerts.libsonnet
Normal file
@ -0,0 +1,150 @@
|
|||||||
|
{
|
||||||
|
prometheusAlerts+:: {
|
||||||
|
groups+: [
|
||||||
|
{
|
||||||
|
name: 'alertmanager.rules',
|
||||||
|
rules: [
|
||||||
|
{
|
||||||
|
alert: 'AlertmanagerFailedReload',
|
||||||
|
expr: |||
|
||||||
|
# Without max_over_time, failed scrapes could create false negatives, see
|
||||||
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||||
|
max_over_time(alertmanager_config_last_reload_successful{%(alertmanagerSelector)s}[5m]) == 0
|
||||||
|
||| % $._config,
|
||||||
|
'for': '10m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Reloading an Alertmanager configuration has failed.',
|
||||||
|
description: 'Configuration has failed to load for %(alertmanagerName)s.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'AlertmanagerMembersInconsistent',
|
||||||
|
expr: |||
|
||||||
|
# Without max_over_time, failed scrapes could create false negatives, see
|
||||||
|
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
|
||||||
|
max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m])
|
||||||
|
< on (%(alertmanagerClusterLabels)s) group_left
|
||||||
|
count by (%(alertmanagerClusterLabels)s) (max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m]))
|
||||||
|
||| % $._config,
|
||||||
|
'for': '10m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'A member of an Alertmanager cluster has not found all other cluster members.',
|
||||||
|
description: 'Alertmanager %(alertmanagerName)s has only found {{ $value }} members of the %(alertmanagerClusterName)s cluster.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'AlertmanagerFailedToSendAlerts',
|
||||||
|
expr: |||
|
||||||
|
(
|
||||||
|
rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
|
||||||
|
/
|
||||||
|
rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
|
||||||
|
)
|
||||||
|
> 0.01
|
||||||
|
||| % $._config,
|
||||||
|
'for': '5m',
|
||||||
|
labels: {
|
||||||
|
severity: 'warning',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'An Alertmanager instance failed to send notifications.',
|
||||||
|
description: 'Alertmanager %(alertmanagerName)s failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
  // Cluster-level counterpart of AlertmanagerFailedToSendAlerts: fires when
  // even the best-performing instance of a cluster fails more than 1% of its
  // notifications to an integration.
  //
  // `integration` must be part of the grouping labels: aggregating only by
  // the cluster labels would drop it, leaving {{ $labels.integration }} in
  // the description empty and mixing the failure ratios of unrelated
  // integrations into a single minimum.
  alert: 'AlertmanagerClusterFailedToSendAlerts',
  expr: |||
    min by (%(alertmanagerClusterLabels)s, integration) (
      rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
    /
      rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
    )
    > 0.01
  ||| % $._config,
  'for': '5m',
  labels: {
    severity: 'critical',
  },
  annotations: {
    summary: 'All Alertmanager instances in a cluster failed to send notifications.',
    description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
  },
},
|
||||||
|
{
|
||||||
|
alert: 'AlertmanagerConfigInconsistent',
|
||||||
|
expr: |||
|
||||||
|
count by (%(alertmanagerClusterLabels)s) (
|
||||||
|
count_values by (%(alertmanagerClusterLabels)s) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})
|
||||||
|
)
|
||||||
|
!= 1
|
||||||
|
||| % $._config,
|
||||||
|
'for': '20m', // A config change across an Alertmanager cluster can take its time. But it's really bad if it persists for too long.
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Alertmanager instances within the same cluster have different configurations.',
|
||||||
|
description: 'Alertmanager instances within the %(alertmanagerClusterName)s cluster have different configurations.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
// Both the following critical alerts, AlertmanagerClusterDown and
|
||||||
|
// AlertmanagerClusterCrashlooping, fire if a whole cluster is
|
||||||
|
// unhealthy. It is implied that a generic warning alert is in place
|
||||||
|
// for individual instances being down or crashlooping.
|
||||||
|
{
|
||||||
|
alert: 'AlertmanagerClusterDown',
|
||||||
|
expr: |||
|
||||||
|
(
|
||||||
|
count by (%(alertmanagerClusterLabels)s) (
|
||||||
|
avg_over_time(up{%(alertmanagerSelector)s}[5m]) < 0.5
|
||||||
|
)
|
||||||
|
/
|
||||||
|
count by (%(alertmanagerClusterLabels)s) (
|
||||||
|
up{%(alertmanagerSelector)s}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
>= 0.5
|
||||||
|
||| % $._config,
|
||||||
|
'for': '5m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Half or more of the Alertmanager instances within the same cluster are down.',
|
||||||
|
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have been up for less than half of the last 5m.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
{
|
||||||
|
alert: 'AlertmanagerClusterCrashlooping',
|
||||||
|
expr: |||
|
||||||
|
(
|
||||||
|
count by (%(alertmanagerClusterLabels)s) (
|
||||||
|
changes(process_start_time_seconds{%(alertmanagerSelector)s}[10m]) > 4
|
||||||
|
)
|
||||||
|
/
|
||||||
|
count by (%(alertmanagerClusterLabels)s) (
|
||||||
|
up{%(alertmanagerSelector)s}
|
||||||
|
)
|
||||||
|
)
|
||||||
|
>= 0.5
|
||||||
|
||| % $._config,
|
||||||
|
'for': '5m',
|
||||||
|
labels: {
|
||||||
|
severity: 'critical',
|
||||||
|
},
|
||||||
|
annotations: {
|
||||||
|
summary: 'Half or more of the Alertmanager instances within the same cluster are crashlooping.',
|
||||||
|
description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have restarted at least 5 times in the last 10m.' % $._config,
|
||||||
|
},
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
],
|
||||||
|
},
|
||||||
|
}
|
28
doc/alertmanager-mixin/config.libsonnet
Normal file
28
doc/alertmanager-mixin/config.libsonnet
Normal file
@ -0,0 +1,28 @@
|
|||||||
|
{
|
||||||
|
_config+:: {
|
||||||
|
// alertmanagerSelector is inserted as part of the label selector in
|
||||||
|
// PromQL queries to identify metrics collected from Alertmanager
|
||||||
|
// servers.
|
||||||
|
alertmanagerSelector: 'job="alertmanager"',
|
||||||
|
|
||||||
|
// alertmanagerClusterLabels is a string with comma-separated
|
||||||
|
// labels that are common labels of instances belonging to the
|
||||||
|
// same Alertmanager cluster. Include not only enough labels to
|
||||||
|
// identify cluster members, but also all common labels you want
|
||||||
|
// to keep for resulting cluster-level alerts.
|
||||||
|
alertmanagerClusterLabels: 'job',
|
||||||
|
|
||||||
|
// alertmanagerName is inserted into annotations to name the Alertmanager
|
||||||
|
// instance affected by the alert.
|
||||||
|
alertmanagerName: '{{$labels.instance}}',
|
||||||
|
// If you run Alertmanager on Kubernetes with the Prometheus
|
||||||
|
// Operator, you can make use of the configured target labels for
|
||||||
|
// nicer naming:
|
||||||
|
// alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}'
|
||||||
|
|
||||||
|
// alertmanagerClusterName is inserted into annotations to name an
|
||||||
|
// Alertmanager cluster. All labels used here must also be present
|
||||||
|
// in alertmanagerClusterLabels above.
|
||||||
|
alertmanagerClusterName: '{{$labels.job}}',
|
||||||
|
},
|
||||||
|
}
|
8
doc/alertmanager-mixin/go.mod
Normal file
8
doc/alertmanager-mixin/go.mod
Normal file
@ -0,0 +1,8 @@
|
|||||||
|
module github.com/prometheus/alertmanager/doc/alertmanager-mixin
|
||||||
|
|
||||||
|
go 1.14
|
||||||
|
|
||||||
|
require (
|
||||||
|
github.com/google/go-jsonnet v0.17.0 // indirect
|
||||||
|
github.com/monitoring-mixins/mixtool v0.0.0-20201127170310-63dca667103c // indirect
|
||||||
|
)
|
2
doc/alertmanager-mixin/mixin.libsonnet
Normal file
2
doc/alertmanager-mixin/mixin.libsonnet
Normal file
@ -0,0 +1,2 @@
|
|||||||
|
(import 'config.libsonnet') +
|
||||||
|
(import 'alerts.libsonnet')
|
Loading…
Reference in New Issue
Block a user