[mixins] Alertmanager Overview dashboard (#2540)

* Adds a Grafana dashboard to the mixin.

The dashboard aims to show an overview of the overall health of Alertmanager.

Signed-off-by: ArthurSens <arthursens2005@gmail.com>
This commit is contained in:
Arthur Silva Sens 2021-06-07 14:54:22 -03:00 committed by GitHub
parent 8b584eb226
commit 8598683b24
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
10 changed files with 218 additions and 11 deletions

View File

@ -106,9 +106,10 @@ jobs:
steps:
- checkout
- run: go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest
- run: go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest
- run: make -C doc/alertmanager-mixin lint
- run: cd doc/alertmanager-mixin; go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest
- run: cd doc/alertmanager-mixin; go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest
- run: cd doc/alertmanager-mixin; go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
- run: cd doc/alertmanager-mixin; make lint
workflows:
version: 2

2
doc/alertmanager-mixin/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
vendor
dashboards_out

View File

@ -1,10 +1,13 @@
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 1 --string-style s --comment-style s
ALERTMANAGER_ALERTS := alertmanager_alerts.yaml
default: build
default: vendor build dashboards_out
all: fmt build
vendor:
jb install
fmt:
find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
xargs -n 1 -- $(JSONNET_FMT) -i
@ -17,7 +20,12 @@ lint: build
mixtool lint mixin.libsonnet
build:
dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*)
@mkdir -p dashboards_out
jsonnet -J vendor -m dashboards_out dashboards.jsonnet
build: vendor
mixtool generate alerts mixin.libsonnet > $(ALERTMANAGER_ALERTS)
clean:

View File

@ -1,5 +1,6 @@
{
_config+:: {
local c = self,
// alertmanagerSelector is inserted as part of the label selector in
// PromQL queries to identify metrics collected from Alertmanager
// servers.
@ -12,13 +13,17 @@
// to keep for resulting cluster-level alerts.
alertmanagerClusterLabels: 'job',
// alertmanagerName is inserted into annotations to name the Alertmanager
// instance affected by the alert.
alertmanagerName: '{{$labels.instance}}',
// alertmanagerNameLabels is a string with comma-separated
// labels used to identify different alertmanagers within the same
// Alertmanager HA cluster.
// If you run Alertmanager on Kubernetes with the Prometheus
// Operator, you can make use of the configured target labels for
// nicer naming:
// alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}'
// alertmanagerNameLabels: 'namespace,pod'
alertmanagerNameLabels: 'instance',
// alertmanagerName is an identifier for alerts. By default, it is built from 'alertmanagerNameLabels'.
alertmanagerName: std.join('/', ['{{$labels.%s}}' % [label] for label in std.split(c.alertmanagerNameLabels, ',')]),
// alertmanagerClusterName is inserted into annotations to name an
// Alertmanager cluster. All labels used here must also be present
@ -32,5 +37,8 @@
// integration that is itself not used for critical alerts.
// Example: @'pagerduty|webhook'
alertmanagerCriticalIntegrationsRegEx: @'.*',
dashboardNamePrefix: 'Alertmanager / ',
dashboardTags: ['alertmanager-mixin'],
},
}

View File

@ -0,0 +1,6 @@
// Re-exports every visible field of the mixin's `grafanaDashboards` object as a
// top-level field of this file, keyed by the same name.
local mixin = import 'mixin.libsonnet';
local dashboardNames = std.objectFields(mixin.grafanaDashboards);

{
  [file]: mixin.grafanaDashboards[file]
  for file in dashboardNames
}

View File

@ -0,0 +1 @@
// Entry point for the mixin's dashboards: evaluates to whatever the overview
// dashboard file evaluates to.
import './dashboards/overview.libsonnet'

View File

@ -0,0 +1,149 @@
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
local dashboard = grafana.dashboard;
local row = grafana.row;
local prometheus = grafana.prometheus;
local template = grafana.template;
local graphPanel = grafana.graphPanel;
{
  // Dashboards contributed by this file, keyed by output file name.
  grafanaDashboards+:: {
    // Label names taken from the mixin configuration (comma-separated strings).
    local clusterLabels = std.split($._config.alertmanagerClusterLabels, ','),
    local nameLabels = std.split($._config.alertmanagerNameLabels, ','),

    // Label matchers tying each cluster label to the dashboard variable of the
    // same name, e.g. `job="$job"` for the cluster label `job`.
    local amQuerySelector = std.join(',', std.map(function(l) '%s="$%s"' % [l, l], clusterLabels)),

    // Legend template built from the name labels, e.g. `{{instance}}` for the
    // name label `instance`.
    local amNameDashboardLegend = std.join('/', std.map(function(l) '{{%s}}' % l, nameLabels)),

    // Values available to the %(...)s placeholders used in queries and legends:
    // everything in `_config` plus the two strings computed above.
    local fmtVars = $._config {
      amQuerySelector: amQuerySelector,
      amNameDashboardLegend: amNameDashboardLegend,
    },

    // Builds a Prometheus target, expanding the placeholders in both the
    // expression and the legend format.
    local amTarget(expr, legend) =
      prometheus.target(expr % fmtVars, legendFormat=legend % fmtVars),

    // One template variable per cluster label, populated from the label values
    // of `alertmanager_alerts`.
    local clusterTemplates = std.map(
      function(clusterLabel)
        template.new(
          name=clusterLabel,
          datasource='$datasource',
          query='label_values(alertmanager_alerts, %s)' % clusterLabel,
          current='',
          refresh=2,
          includeAll=false,
          sort=1
        ),
      clusterLabels
    ),

    // Variable listing the integrations matched by
    // `alertmanagerCriticalIntegrationsRegEx`; the notification panels repeat
    // over it.
    local integrationTemplate =
      template.new(
        name='integration',
        datasource='$datasource',
        query='label_values(alertmanager_notifications_total{integration=~"%s"}, integration)' % $._config.alertmanagerCriticalIntegrationsRegEx,
        current='all',
        hide='2',  // Always hide
        refresh=2,
        includeAll=true,
        sort=1
      ),

    'alertmanager-overview.json':
      // Number of alerts per Alertmanager instance.
      local alertsPanel =
        graphPanel.new(
          'Alerts',
          datasource='$datasource',
          format='none',
          span=6,
          fill=1,
          stack=true,
          legend_show=false,
        )
        .addTarget(amTarget(
          'sum(alertmanager_alerts{%(amQuerySelector)s}) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)',
          '%(amNameDashboardLegend)s'
        ));

      // Per-second rate of received and invalid alerts.
      local alertsRatePanel =
        graphPanel.new(
          'Alerts receive rate',
          datasource='$datasource',
          format='ops',
          span=6,
          fill=1,
          stack=true,
          legend_show=false,
        )
        .addTarget(amTarget(
          'sum(rate(alertmanager_alerts_received_total{%(amQuerySelector)s}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)',
          '%(amNameDashboardLegend)s Received'
        ))
        .addTarget(amTarget(
          'sum(rate(alertmanager_alerts_invalid_total{%(amQuerySelector)s}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)',
          '%(amNameDashboardLegend)s Invalid'
        ));

      // Per-integration rate of total and failed notifications.
      local notificationsPanel =
        graphPanel.new(
          '$integration: Notifications Send Rate',
          datasource='$datasource',
          format='ops',
          fill=1,
          stack=true,
          legend_show=false,
          repeat='integration'
        )
        .addTarget(amTarget(
          'sum(rate(alertmanager_notifications_total{%(amQuerySelector)s, integration="$integration"}[5m])) by (integration,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)',
          '%(amNameDashboardLegend)s Total'
        ))
        .addTarget(amTarget(
          'sum(rate(alertmanager_notifications_failed_total{%(amQuerySelector)s, integration="$integration"}[5m])) by (integration,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)',
          '%(amNameDashboardLegend)s Failed'
        ));

      // Per-integration notification latency: 99th percentile, median and
      // average (sum of latencies divided by notification count).
      local notificationDurationPanel =
        graphPanel.new(
          '$integration: Notification Duration',
          datasource='$datasource',
          format='s',
          fill=1,
          stack=false,
          legend_show=false,
          repeat='integration'
        )
        .addTarget(amTarget(
          |||
            histogram_quantile(0.99,
            sum(rate(alertmanager_notification_latency_seconds_bucket{%(amQuerySelector)s, integration="$integration"}[5m])) by (le,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
            )
          |||,
          '%(amNameDashboardLegend)s 99th Percentile'
        ))
        .addTarget(amTarget(
          |||
            histogram_quantile(0.50,
            sum(rate(alertmanager_notification_latency_seconds_bucket{%(amQuerySelector)s, integration="$integration"}[5m])) by (le,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
            )
          |||,
          '%(amNameDashboardLegend)s Median'
        ))
        .addTarget(amTarget(
          |||
            sum(rate(alertmanager_notification_latency_seconds_sum{%(amQuerySelector)s, integration="$integration"}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
            /
            sum(rate(alertmanager_notification_latency_seconds_count{%(amQuerySelector)s, integration="$integration"}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
          |||,
          '%(amNameDashboardLegend)s Average'
        ));

      // Hand-written template object for the `datasource` variable that the
      // panels and the other templates reference as `$datasource`.
      local datasourceTemplate = {
        name: 'datasource',
        type: 'datasource',
        query: 'prometheus',
        label: null,
        hide: 0,
        refresh: 1,
        regex: '',
        options: [],
        current: {
          text: 'Prometheus',
          value: 'Prometheus',
        },
      };

      local alertsRow =
        row.new('Alerts')
        .addPanel(alertsPanel)
        .addPanel(alertsRatePanel);

      local notificationsRow =
        row.new('Notifications')
        .addPanel(notificationsPanel)
        .addPanel(notificationDurationPanel);

      // Templates are added in the order: datasource, cluster labels,
      // integration.
      dashboard.new(
        '%sOverview' % $._config.dashboardNamePrefix,
        uid='alertmanager-overview',
        tags=$._config.dashboardTags,
        time_from='now-1h',
        timezone='utc',
        refresh='30s',
        graphTooltip='shared_crosshair'
      )
      .addTemplate(datasourceTemplate)
      .addTemplates(clusterTemplates)
      .addTemplate(integrationTemplate)
      .addRow(alertsRow)
      .addRow(notificationsRow),
  },
}

View File

@ -0,0 +1,15 @@
{
"version": 1,
"dependencies": [
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet-lib.git",
"subdir": "grafonnet"
}
},
"version": "master"
}
],
"legacyImports": false
}

View File

@ -0,0 +1,16 @@
{
"version": 1,
"dependencies": [
{
"source": {
"git": {
"remote": "https://github.com/grafana/grafonnet-lib.git",
"subdir": "grafonnet"
}
},
"version": "55cf4ee53ced2b6d3ce96ecce9fb813b4465be98",
"sum": "4/sUV0Kk+o8I+wlYxL9R6EPhL/NiLfYHk+NXlU64RUk="
}
],
"legacyImports": false
}

View File

@ -1,2 +1,3 @@
(import 'config.libsonnet') +
(import 'alerts.libsonnet')
(import 'alerts.libsonnet') +
(import 'dashboards.libsonnet')