[mixins] Alertmanager Overview dashboard (#2540)
* Adds a Grafana dashboard to the mixin. The dashboard gives an overview of the overall health of Alertmanager. Signed-off-by: ArthurSens <arthursens2005@gmail.com>
This commit is contained in:
parent
8b584eb226
commit
8598683b24
|
@ -106,9 +106,10 @@ jobs:
|
|||
|
||||
steps:
|
||||
- checkout
|
||||
- run: go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest
|
||||
- run: go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest
|
||||
- run: make -C doc/alertmanager-mixin lint
|
||||
- run: cd doc/alertmanager-mixin; go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest
|
||||
- run: cd doc/alertmanager-mixin; go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest
|
||||
- run: cd doc/alertmanager-mixin; go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest
|
||||
- run: cd doc/alertmanager-mixin; make lint
|
||||
|
||||
workflows:
|
||||
version: 2
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
vendor
|
||||
dashboards_out
|
|
@ -1,10 +1,13 @@
|
|||
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
|
||||
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 1 --string-style s --comment-style s
|
||||
ALERTMANAGER_ALERTS := alertmanager_alerts.yaml
|
||||
|
||||
default: build
|
||||
default: vendor build dashboards_out
|
||||
|
||||
all: fmt build
|
||||
|
||||
vendor:
|
||||
jb install
|
||||
|
||||
fmt:
|
||||
find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
|
||||
xargs -n 1 -- $(JSONNET_FMT) -i
|
||||
|
@ -17,7 +20,12 @@ lint: build
|
|||
|
||||
mixtool lint mixin.libsonnet
|
||||
|
||||
build:
|
||||
|
||||
dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*)
|
||||
@mkdir -p dashboards_out
|
||||
jsonnet -J vendor -m dashboards_out dashboards.jsonnet
|
||||
|
||||
build: vendor
|
||||
mixtool generate alerts mixin.libsonnet > $(ALERTMANAGER_ALERTS)
|
||||
|
||||
clean:
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
_config+:: {
|
||||
local c = self,
|
||||
// alertmanagerSelector is inserted as part of the label selector in
|
||||
// PromQL queries to identify metrics collected from Alertmanager
|
||||
// servers.
|
||||
|
@ -12,13 +13,17 @@
|
|||
// to keep for resulting cluster-level alerts.
|
||||
alertmanagerClusterLabels: 'job',
|
||||
|
||||
// alertmanagerName is inserted into annotations to name the Alertmanager
|
||||
// instance affected by the alert.
|
||||
alertmanagerName: '{{$labels.instance}}',
|
||||
// alertmanagerNameLabels is a string with comma-separated
|
||||
// labels used to identify different alertmanagers within the same
|
||||
// Alertmanager HA cluster.
|
||||
// If you run Alertmanager on Kubernetes with the Prometheus
|
||||
// Operator, you can make use of the configured target labels for
|
||||
// nicer naming:
|
||||
// alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}'
|
||||
// alertmanagerNameLabels: 'namespace,pod'
|
||||
alertmanagerNameLabels: 'instance',
|
||||
|
||||
// alertmanagerName is inserted into annotations to name the Alertmanager instance affected by the alert. By default, it is built from 'alertmanagerNameLabels'.
|
||||
alertmanagerName: std.join('/', ['{{$labels.%s}}' % [label] for label in std.split(c.alertmanagerNameLabels, ',')]),
|
||||
|
||||
// alertmanagerClusterName is inserted into annotations to name an
|
||||
// Alertmanager cluster. All labels used here must also be present
|
||||
|
@ -32,5 +37,8 @@
|
|||
// integration that is itself not used for critical alerts.
|
||||
// Example: @'pagerduty|webhook'
|
||||
alertmanagerCriticalIntegrationsRegEx: @'.*',
|
||||
|
||||
dashboardNamePrefix: 'Alertmanager / ',
|
||||
dashboardTags: ['alertmanager-mixin'],
|
||||
},
|
||||
}
|
||||
|
|
|
@ -0,0 +1,6 @@
|
|||
// Rendering entry point: take the grafanaDashboards object exposed by the
// mixin so each dashboard can be re-exported below.
local dashboards = (import 'mixin.libsonnet').grafanaDashboards;
|
||||
|
||||
// Emit every dashboard as a top-level field keyed by its name
// (e.g. for multi-file output, one file per field).
{
|
||||
[name]: dashboards[name]
|
||||
for name in std.objectFields(dashboards)
|
||||
}
|
|
@ -0,0 +1 @@
|
|||
(import './dashboards/overview.libsonnet')
|
|
@ -0,0 +1,149 @@
|
|||
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
|
||||
local dashboard = grafana.dashboard;
|
||||
local row = grafana.row;
|
||||
local prometheus = grafana.prometheus;
|
||||
local template = grafana.template;
|
||||
local graphPanel = grafana.graphPanel;
|
||||
|
||||
{
|
||||
grafanaDashboards+:: {
|
||||
|
||||
// PromQL label matchers of the form label="$label", one per comma-separated
// entry in alertmanagerClusterLabels and joined with ','. "$label" refers to
// the dashboard template variable of the same name.
local amQuerySelector = std.join(',', ['%s="$%s"' % [label, label] for label in std.split($._config.alertmanagerClusterLabels, ',')]),
|
||||
// Legend format string: a {{label}} placeholder for each comma-separated
// entry in alertmanagerNameLabels, joined with '/'.
local amNameDashboardLegend = std.join('/', ['{{%s}}' % [label] for label in std.split($._config.alertmanagerNameLabels, ',')]),
|
||||
|
||||
// One dashboard template variable per entry in alertmanagerClusterLabels.
// Each variable is named after its label and is populated with the values
// that label takes on the alertmanager_alerts metric. "All" is not offered
// (includeAll=false), so a single value is selected per label.
local alertmanagerClusterSelectorTemplates =
|
||||
[
|
||||
template.new(
|
||||
name=label,
|
||||
datasource='$datasource',
|
||||
query='label_values(alertmanager_alerts, %s)' % label,
|
||||
current='',
|
||||
refresh=2,
|
||||
includeAll=false,
|
||||
sort=1
|
||||
)
|
||||
for label in std.split($._config.alertmanagerClusterLabels, ',')
|
||||
],
|
||||
|
||||
// Hidden 'integration' template variable. Its values are the integration
// labels found on alertmanager_notifications_total that match
// alertmanagerCriticalIntegrationsRegEx. The notification panels use
// repeat='integration' to produce one panel per value.
local integrationTemplate =
|
||||
template.new(
|
||||
name='integration',
|
||||
datasource='$datasource',
|
||||
query='label_values(alertmanager_notifications_total{integration=~"%s"}, integration)' % $._config.alertmanagerCriticalIntegrationsRegEx,
|
||||
current='all',
|
||||
hide='2', // Always hide
|
||||
refresh=2,
|
||||
includeAll=true,
|
||||
sort=1
|
||||
),
|
||||
|
||||
'alertmanager-overview.json':
|
||||
// 'Alerts' panel: stacked graph of alertmanager_alerts summed by the
// cluster labels and the name labels, filtered by the selected template
// variable values.
local alerts =
|
||||
graphPanel.new(
|
||||
'Alerts',
|
||||
datasource='$datasource',
|
||||
span=6,
|
||||
format='none',
|
||||
stack=true,
|
||||
fill=1,
|
||||
legend_show=false,
|
||||
)
|
||||
// $._config is extended with the local helper values so they can be
// referenced by name in the %(...)s format strings.
.addTarget(prometheus.target('sum(alertmanager_alerts{%(amQuerySelector)s}) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s' % $._config { amNameDashboardLegend: amNameDashboardLegend }));
|
||||
|
||||
// 'Alerts receive rate' panel: stacked graph with two series per
// Alertmanager, the 5m rate of received alerts and the 5m rate of invalid
// alerts, each summed by the cluster labels and the name labels.
local alertsRate =
|
||||
graphPanel.new(
|
||||
'Alerts receive rate',
|
||||
datasource='$datasource',
|
||||
span=6,
|
||||
format='ops',
|
||||
stack=true,
|
||||
fill=1,
|
||||
legend_show=false,
|
||||
)
|
||||
// Received alerts.
.addTarget(prometheus.target('sum(rate(alertmanager_alerts_received_total{%(amQuerySelector)s}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Received' % $._config { amNameDashboardLegend: amNameDashboardLegend }))
|
||||
// Invalid alerts.
.addTarget(prometheus.target('sum(rate(alertmanager_alerts_invalid_total{%(amQuerySelector)s}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Invalid' % $._config { amNameDashboardLegend: amNameDashboardLegend }));
|
||||
|
||||
// '$integration: Notifications Send Rate' panel, repeated for each value of
// the 'integration' template variable. Shows the 5m rate of total and of
// failed notifications for that integration, summed by integration, the
// cluster labels and the name labels.
local notifications =
|
||||
graphPanel.new(
|
||||
'$integration: Notifications Send Rate',
|
||||
datasource='$datasource',
|
||||
format='ops',
|
||||
stack=true,
|
||||
fill=1,
|
||||
legend_show=false,
|
||||
repeat='integration'
|
||||
)
|
||||
// All notification attempts.
.addTarget(prometheus.target('sum(rate(alertmanager_notifications_total{%(amQuerySelector)s, integration="$integration"}[5m])) by (integration,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Total' % $._config { amNameDashboardLegend: amNameDashboardLegend }))
|
||||
// Failed notification attempts.
.addTarget(prometheus.target('sum(rate(alertmanager_notifications_failed_total{%(amQuerySelector)s, integration="$integration"}[5m])) by (integration,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Failed' % $._config { amNameDashboardLegend: amNameDashboardLegend }));
|
||||
|
||||
// '$integration: Notification Duration' panel, repeated for each value of
// the 'integration' template variable. Plots three unstacked series derived
// from the alertmanager_notification_latency_seconds histogram over 5m:
// the 0.99 quantile, the 0.50 quantile, and the average (sum / count).
local notificationDuration =
|
||||
graphPanel.new(
|
||||
'$integration: Notification Duration',
|
||||
datasource='$datasource',
|
||||
format='s',
|
||||
stack=false,
|
||||
fill=1,
|
||||
legend_show=false,
|
||||
repeat='integration'
|
||||
)
|
||||
// 99th percentile, computed from the histogram buckets.
.addTarget(prometheus.target(
|
||||
|||
|
||||
histogram_quantile(0.99,
|
||||
sum(rate(alertmanager_notification_latency_seconds_bucket{%(amQuerySelector)s, integration="$integration"}[5m])) by (le,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
|
||||
)
|
||||
||| % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s 99th Percentile' % $._config { amNameDashboardLegend: amNameDashboardLegend }
|
||||
))
|
||||
// Median, computed from the histogram buckets.
.addTarget(prometheus.target(
|
||||
|||
|
||||
histogram_quantile(0.50,
|
||||
sum(rate(alertmanager_notification_latency_seconds_bucket{%(amQuerySelector)s, integration="$integration"}[5m])) by (le,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
|
||||
)
|
||||
||| % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Median' % $._config { amNameDashboardLegend: amNameDashboardLegend }
|
||||
))
|
||||
// Average: rate of the latency sum divided by rate of the observation count.
.addTarget(prometheus.target(
|
||||
|||
|
||||
sum(rate(alertmanager_notification_latency_seconds_sum{%(amQuerySelector)s, integration="$integration"}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
|
||||
/
|
||||
sum(rate(alertmanager_notification_latency_seconds_count{%(amQuerySelector)s, integration="$integration"}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
|
||||
||| % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Average' % $._config { amNameDashboardLegend: amNameDashboardLegend }
|
||||
));
|
||||
|
||||
// Assemble the overview dashboard. The title is the configured
// dashboardNamePrefix followed by 'Overview', and the tags come from
// dashboardTags. Templates are added in this order: datasource, the
// per-cluster-label selectors, then the hidden integration variable.
// Panels are laid out in two rows: 'Alerts' and 'Notifications'.
dashboard.new(
|
||||
'%sOverview' % $._config.dashboardNamePrefix,
|
||||
time_from='now-1h',
|
||||
tags=($._config.dashboardTags),
|
||||
timezone='utc',
|
||||
refresh='30s',
|
||||
graphTooltip='shared_crosshair',
|
||||
uid='alertmanager-overview'
|
||||
)
|
||||
// Datasource picker, declared as a raw template object; it defines the
// '$datasource' variable referenced by the panels and other templates.
.addTemplate(
|
||||
{
|
||||
current: {
|
||||
text: 'Prometheus',
|
||||
value: 'Prometheus',
|
||||
},
|
||||
hide: 0,
|
||||
label: null,
|
||||
name: 'datasource',
|
||||
options: [],
|
||||
query: 'prometheus',
|
||||
refresh: 1,
|
||||
regex: '',
|
||||
type: 'datasource',
|
||||
},
|
||||
)
|
||||
.addTemplates(alertmanagerClusterSelectorTemplates)
|
||||
.addTemplate(integrationTemplate)
|
||||
.addRow(
|
||||
row.new('Alerts')
|
||||
.addPanel(alerts)
|
||||
.addPanel(alertsRate)
|
||||
)
|
||||
.addRow(
|
||||
row.new('Notifications')
|
||||
.addPanel(notifications)
|
||||
.addPanel(notificationDuration)
|
||||
),
|
||||
},
|
||||
}
|
|
@ -0,0 +1,15 @@
|
|||
{
|
||||
"version": 1,
|
||||
"dependencies": [
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet-lib.git",
|
||||
"subdir": "grafonnet"
|
||||
}
|
||||
},
|
||||
"version": "master"
|
||||
}
|
||||
],
|
||||
"legacyImports": false
|
||||
}
|
|
@ -0,0 +1,16 @@
|
|||
{
|
||||
"version": 1,
|
||||
"dependencies": [
|
||||
{
|
||||
"source": {
|
||||
"git": {
|
||||
"remote": "https://github.com/grafana/grafonnet-lib.git",
|
||||
"subdir": "grafonnet"
|
||||
}
|
||||
},
|
||||
"version": "55cf4ee53ced2b6d3ce96ecce9fb813b4465be98",
|
||||
"sum": "4/sUV0Kk+o8I+wlYxL9R6EPhL/NiLfYHk+NXlU64RUk="
|
||||
}
|
||||
],
|
||||
"legacyImports": false
|
||||
}
|
|
@ -1,2 +1,3 @@
|
|||
(import 'config.libsonnet') +
|
||||
(import 'alerts.libsonnet')
|
||||
(import 'alerts.libsonnet') +
|
||||
(import 'dashboards.libsonnet')
|
||||
|
|
Loading…
Reference in New Issue