Beginnings of an Alertmanager mixin. (#1629)
Add an Alertmanager mixin

Signed-off-by: beorn7 <beorn@grafana.com>
Co-authored-by: Tom Wilkie <tom.wilkie@gmail.com>
Co-authored-by: beorn7 <beorn@grafana.com>
Co-authored-by: Simon Pasquier <spasquie@redhat.com>
This commit is contained in:
parent 042d7b34f0
commit 6c5dee008f
@@ -99,6 +99,17 @@ jobs:
      - store_test_results:
          path: test-results

  mixin:
    docker:
      # Whenever the Go version is updated here, .promu.yml should also be updated.
      - image: circleci/golang:1.14

    steps:
      - checkout
      - run: cd doc/alertmanager-mixin; go install github.com/monitoring-mixins/mixtool/cmd/mixtool
      - run: cd doc/alertmanager-mixin; go install github.com/google/go-jsonnet/cmd/jsonnetfmt
      - run: cd doc/alertmanager-mixin; make lint

workflows:
  version: 2
  alertmanager:
@@ -116,6 +127,10 @@ workflows:
        filters:
          tags:
            only: /.*/
    - mixin:
        filters:
          tags:
            only: /.*/
    - prometheus/publish_master:
        context: org-context
        requires:
@@ -0,0 +1,24 @@
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
ALERTMANAGER_ALERTS := alertmanager_alerts.yaml

default: build

all: fmt build

fmt:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

lint: build
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		while read f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
		done

	mixtool lint mixin.libsonnet

build:
	mixtool generate alerts mixin.libsonnet > $(ALERTMANAGER_ALERTS)

clean:
	rm -rf $(ALERTMANAGER_ALERTS)
@@ -0,0 +1,30 @@
# Alertmanager Mixin

The Alertmanager Mixin is a set of configurable, reusable, and extensible
alerts (and eventually dashboards) for Alertmanager.

The alerts are designed to monitor a cluster of Alertmanager instances. To make
them work as expected, the Prometheus server the alerts are evaluated on has to
scrape all Alertmanager instances of the cluster, even if those instances are
distributed over different locations. All Alertmanager instances in the same
Alertmanager cluster must have the same `job` label. Conversely, when
monitoring multiple Alertmanager clusters, instances from different clusters
must have different `job` labels.

The most basic use of the Alertmanager Mixin is to render the alerts it defines
into a YAML file. To do so, you need `jsonnetfmt` and `mixtool` installed. If
you have a working Go development environment, it is easiest to run the
following:

```bash
$ go get github.com/monitoring-mixins/mixtool/cmd/mixtool
$ go get github.com/google/go-jsonnet/cmd/jsonnetfmt
```

Edit `config.libsonnet` to match your environment and then build
`alertmanager_alerts.yaml` with the alerts by running:

```bash
$ make build
```

For instructions on more advanced uses of mixins, see https://github.com/monitoring-mixins/docs.
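For illustration only (this block is not part of the commit): a sketch of what an edited `config.libsonnet` could look like for Alertmanager running on Kubernetes behind the Prometheus Operator. Every selector and label value below is an assumption about the local environment and must be adjusted to match the actual scrape configuration.

```jsonnet
// Hypothetical example values — adjust to your own scrape configuration.
{
  _config+:: {
    // Assumed job/namespace labels; not prescribed by the mixin.
    alertmanagerSelector: 'job="alertmanager-main", namespace="monitoring"',
    alertmanagerClusterLabels: 'job, namespace',
    alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}',
    alertmanagerClusterName: '{{$labels.namespace}}/{{$labels.job}}',
  },
}
```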
@@ -0,0 +1 @@
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
@@ -0,0 +1,150 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager.rules',
        rules: [
          {
            alert: 'AlertmanagerFailedReload',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_config_last_reload_successful{%(alertmanagerSelector)s}[5m]) == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Reloading an Alertmanager configuration has failed.',
              description: 'Configuration has failed to load for %(alertmanagerName)s.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerMembersInconsistent',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m])
                < on (%(alertmanagerClusterLabels)s) group_left
                  count by (%(alertmanagerClusterLabels)s) (max_over_time(alertmanager_cluster_members{%(alertmanagerSelector)s}[5m]))
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'A member of an Alertmanager cluster has not found all other cluster members.',
              description: 'Alertmanager %(alertmanagerName)s has only found {{ $value }} members of the %(alertmanagerClusterName)s cluster.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerFailedToSendAlerts',
            expr: |||
              (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
              /
                rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'An Alertmanager instance failed to send notifications.',
              description: 'Alertmanager %(alertmanagerName)s failed to send {{ $value | humanizePercentage }} of notifications to {{ $labels.integration }}.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerClusterFailedToSendAlerts',
            expr: |||
              min by (%(alertmanagerClusterLabels)s) (
                rate(alertmanager_notifications_failed_total{%(alertmanagerSelector)s}[5m])
              /
                rate(alertmanager_notifications_total{%(alertmanagerSelector)s}[5m])
              )
              > 0.01
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'All Alertmanager instances in a cluster failed to send notifications.',
              description: 'The minimum notification failure rate to {{ $labels.integration }} sent from any instance in the %(alertmanagerClusterName)s cluster is {{ $value | humanizePercentage }}.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerConfigInconsistent',
            expr: |||
              count by (%(alertmanagerClusterLabels)s) (
                count_values by (%(alertmanagerClusterLabels)s) ("config_hash", alertmanager_config_hash{%(alertmanagerSelector)s})
              )
              != 1
            ||| % $._config,
            'for': '20m',  // A config change across an Alertmanager cluster can take its time. But it's really bad if it persists for too long.
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Alertmanager instances within the same cluster have different configurations.',
              description: 'Alertmanager instances within the %(alertmanagerClusterName)s cluster have different configurations.' % $._config,
            },
          },
          // Both the following critical alerts, AlertmanagerClusterDown and
          // AlertmanagerClusterCrashlooping, fire if a whole cluster is
          // unhealthy. It is implied that a generic warning alert is in place
          // for individual instances being down or crashlooping.
          {
            alert: 'AlertmanagerClusterDown',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  avg_over_time(up{%(alertmanagerSelector)s}[5m]) < 0.5
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are down.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have been up for less than half of the last 5m.' % $._config,
            },
          },
          {
            alert: 'AlertmanagerClusterCrashlooping',
            expr: |||
              (
                count by (%(alertmanagerClusterLabels)s) (
                  changes(process_start_time_seconds{%(alertmanagerSelector)s}[10m]) > 4
                )
              /
                count by (%(alertmanagerClusterLabels)s) (
                  up{%(alertmanagerSelector)s}
                )
              )
              >= 0.5
            ||| % $._config,
            'for': '5m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Half or more of the Alertmanager instances within the same cluster are crashlooping.',
              description: '{{ $value | humanizePercentage }} of Alertmanager instances within the %(alertmanagerClusterName)s cluster have restarted at least 5 times in the last 10m.' % $._config,
            },
          },
        ],
      },
    ],
  },
}
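Because everything above is composed with `prometheusAlerts+::` and `groups+:`, downstream users can layer additional alert groups on top of the mixin without modifying it. A minimal, hypothetical sketch (not part of this commit): the group name, alert name, and threshold are made up for illustration.

```jsonnet
// Hypothetical downstream extension, layered on top of mixin.libsonnet.
// The group name, alert name, and threshold below are illustrative only.
(import 'mixin.libsonnet') + {
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'alertmanager-extras.rules',  // hypothetical group name
        rules: [
          {
            alert: 'AlertmanagerManySilences',  // hypothetical example alert
            expr: |||
              alertmanager_silences{state="active", %(alertmanagerSelector)s} > 100
            ||| % $._config,
            'for': '1h',
            labels: { severity: 'info' },
            annotations: {
              summary: 'An Alertmanager instance has an unusually high number of active silences.',
            },
          },
        ],
      },
    ],
  },
}
```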
@@ -0,0 +1,28 @@
{
  _config+:: {
    // alertmanagerSelector is inserted as part of the label selector in
    // PromQL queries to identify metrics collected from Alertmanager
    // servers.
    alertmanagerSelector: 'job="alertmanager"',

    // alertmanagerClusterLabels is a string with comma-separated
    // labels that are common labels of instances belonging to the
    // same Alertmanager cluster. Include not only enough labels to
    // identify cluster members, but also all common labels you want
    // to keep for resulting cluster-level alerts.
    alertmanagerClusterLabels: 'job',

    // alertmanagerName is inserted into annotations to name the Alertmanager
    // instance affected by the alert.
    alertmanagerName: '{{$labels.instance}}',
    // If you run Alertmanager on Kubernetes with the Prometheus
    // Operator, you can make use of the configured target labels for
    // nicer naming:
    // alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}'

    // alertmanagerClusterName is inserted into annotations to name an
    // Alertmanager cluster. All labels used here must also be present
    // in alertmanagerClusterLabels above.
    alertmanagerClusterName: '{{$labels.job}}',
  },
}
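To make the `%(...)s` templating concrete: the alert expressions in the alerts file are plain strings formatted with this `_config` object, so with the default `alertmanagerSelector` the `AlertmanagerFailedReload` expression renders to `max_over_time(alertmanager_config_last_reload_successful{job="alertmanager"}[5m]) == 0`. A small standalone sketch (not part of the commit) demonstrating the same substitution mechanism:

```jsonnet
// Evaluating this file with `jsonnet` prints the rendered PromQL string,
// showing how the %(alertmanagerSelector)s placeholder is filled from _config.
local config = (import 'config.libsonnet')._config;

'max_over_time(alertmanager_config_last_reload_successful{%(alertmanagerSelector)s}[5m]) == 0' % config
```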
@@ -0,0 +1,8 @@
module github.com/prometheus/alertmanager/doc/alertmanager-mixin

go 1.14

require (
	github.com/google/go-jsonnet v0.17.0 // indirect
	github.com/monitoring-mixins/mixtool v0.0.0-20201127170310-63dca667103c // indirect
)
@@ -0,0 +1,2 @@
(import 'config.libsonnet') +
(import 'alerts.libsonnet')