diff --git a/.circleci/config.yml b/.circleci/config.yml index 1011b5c8..68f5f3f5 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -106,9 +106,10 @@ jobs: steps: - checkout - - run: go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest - - run: go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest - - run: make -C doc/alertmanager-mixin lint + - run: cd doc/alertmanager-mixin; go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest + - run: cd doc/alertmanager-mixin; go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest + - run: cd doc/alertmanager-mixin; go install github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb@latest + - run: cd doc/alertmanager-mixin; make lint workflows: version: 2 diff --git a/doc/alertmanager-mixin/.gitignore b/doc/alertmanager-mixin/.gitignore new file mode 100644 index 00000000..09527a31 --- /dev/null +++ b/doc/alertmanager-mixin/.gitignore @@ -0,0 +1,2 @@ +vendor +dashboards_out diff --git a/doc/alertmanager-mixin/Makefile b/doc/alertmanager-mixin/Makefile index 360ddf2d..314e4b38 100644 --- a/doc/alertmanager-mixin/Makefile +++ b/doc/alertmanager-mixin/Makefile @@ -1,10 +1,13 @@ -JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s +JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 1 --string-style s --comment-style s ALERTMANAGER_ALERTS := alertmanager_alerts.yaml -default: build +default: vendor build dashboards_out all: fmt build +vendor: + jb install + fmt: find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \ xargs -n 1 -- $(JSONNET_FMT) -i @@ -17,7 +20,12 @@ lint: build mixtool lint mixin.libsonnet -build: + +dashboards_out: mixin.libsonnet config.libsonnet $(wildcard dashboards/*) + @mkdir -p dashboards_out + jsonnet -J vendor -m dashboards_out dashboards.jsonnet + +build: vendor mixtool generate alerts mixin.libsonnet > $(ALERTMANAGER_ALERTS) clean: diff --git a/doc/alertmanager-mixin/config.libsonnet b/doc/alertmanager-mixin/config.libsonnet index a9d8b558..d0a28923 100644 --- a/doc/alertmanager-mixin/config.libsonnet +++ b/doc/alertmanager-mixin/config.libsonnet @@ -1,5 +1,6 @@ { _config+:: { + local c = self, // alertmanagerSelector is inserted as part of the label selector in // PromQL queries to identify metrics collected from Alertmanager // servers. @@ -12,13 +13,17 @@ // to keep for resulting cluster-level alerts. alertmanagerClusterLabels: 'job', - // alertmanagerName is inserted into annotations to name the Alertmanager - // instance affected by the alert. - alertmanagerName: '{{$labels.instance}}', + // alertmanagerNameLabels is a string with comma-separated + // labels used to identify different alertmanagers within the same + // Alertmanager HA cluster. // If you run Alertmanager on Kubernetes with the Prometheus // Operator, you can make use of the configured target labels for // nicer naming: - // alertmanagerName: '{{$labels.namespace}}/{{$labels.pod}}' + // alertmanagerNameLabels: 'namespace,pod' + alertmanagerNameLabels: 'instance', + + // alertmanagerName is an identifier for alerts. By default, it is built from 'alertmanagerNameLabels'. + alertmanagerName: std.join('/', ['{{$labels.%s}}' % [label] for label in std.split(c.alertmanagerNameLabels, ',')]), // alertmanagerClusterName is inserted into annotations to name an // Alertmanager cluster. All labels used here must also be present @@ -32,5 +37,8 @@ // integration that is itself not used for critical alerts. // Example: @'pagerduty|webhook' alertmanagerCriticalIntegrationsRegEx: @'.*', + + dashboardNamePrefix: 'Alertmanager / ', + dashboardTags: ['alertmanager-mixin'], }, } diff --git a/doc/alertmanager-mixin/dashboards.jsonnet b/doc/alertmanager-mixin/dashboards.jsonnet new file mode 100644 index 00000000..9d913ed3 --- /dev/null +++ b/doc/alertmanager-mixin/dashboards.jsonnet @@ -0,0 +1,6 @@ +local dashboards = (import 'mixin.libsonnet').grafanaDashboards; + +{ + [name]: dashboards[name] + for name in std.objectFields(dashboards) +} diff --git a/doc/alertmanager-mixin/dashboards.libsonnet b/doc/alertmanager-mixin/dashboards.libsonnet new file mode 100644 index 00000000..e4cc41ca --- /dev/null +++ b/doc/alertmanager-mixin/dashboards.libsonnet @@ -0,0 +1 @@ +(import './dashboards/overview.libsonnet') diff --git a/doc/alertmanager-mixin/dashboards/overview.libsonnet b/doc/alertmanager-mixin/dashboards/overview.libsonnet new file mode 100644 index 00000000..fe98d637 --- /dev/null +++ b/doc/alertmanager-mixin/dashboards/overview.libsonnet @@ -0,0 +1,149 @@ +local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet'; +local dashboard = grafana.dashboard; +local row = grafana.row; +local prometheus = grafana.prometheus; +local template = grafana.template; +local graphPanel = grafana.graphPanel; + +{ + grafanaDashboards+:: { + + local amQuerySelector = std.join(',', ['%s="$%s"' % [label, label] for label in std.split($._config.alertmanagerClusterLabels, ',')]), + local amNameDashboardLegend = std.join('/', ['{{%s}}' % [label] for label in std.split($._config.alertmanagerNameLabels, ',')]), + + local alertmanagerClusterSelectorTemplates = + [ + template.new( + name=label, + datasource='$datasource', + query='label_values(alertmanager_alerts, %s)' % label, + current='', + refresh=2, + includeAll=false, + sort=1 + ) + for label in std.split($._config.alertmanagerClusterLabels, ',') + ], + + local integrationTemplate = + template.new( + name='integration', + datasource='$datasource', + query='label_values(alertmanager_notifications_total{integration=~"%s"}, integration)' % $._config.alertmanagerCriticalIntegrationsRegEx, + current='all', + hide='2', // Always hide + refresh=2, + includeAll=true, + sort=1 + ), + + 'alertmanager-overview.json': + local alerts = + graphPanel.new( + 'Alerts', + datasource='$datasource', + span=6, + format='none', + stack=true, + fill=1, + legend_show=false, + ) + .addTarget(prometheus.target('sum(alertmanager_alerts{%(amQuerySelector)s}) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s' % $._config { amNameDashboardLegend: amNameDashboardLegend })); + + local alertsRate = + graphPanel.new( + 'Alerts receive rate', + datasource='$datasource', + span=6, + format='ops', + stack=true, + fill=1, + legend_show=false, + ) + .addTarget(prometheus.target('sum(rate(alertmanager_alerts_received_total{%(amQuerySelector)s}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Received' % $._config { amNameDashboardLegend: amNameDashboardLegend })) + .addTarget(prometheus.target('sum(rate(alertmanager_alerts_invalid_total{%(amQuerySelector)s}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Invalid' % $._config { amNameDashboardLegend: amNameDashboardLegend })); + + local notifications = + graphPanel.new( + '$integration: Notifications Send Rate', + datasource='$datasource', + format='ops', + stack=true, + fill=1, + legend_show=false, + repeat='integration' + ) + .addTarget(prometheus.target('sum(rate(alertmanager_notifications_total{%(amQuerySelector)s, integration="$integration"}[5m])) by (integration,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Total' % $._config { amNameDashboardLegend: amNameDashboardLegend })) + .addTarget(prometheus.target('sum(rate(alertmanager_notifications_failed_total{%(amQuerySelector)s, integration="$integration"}[5m])) by (integration,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Failed' % $._config { amNameDashboardLegend: amNameDashboardLegend })); + + local notificationDuration = + graphPanel.new( + '$integration: Notification Duration', + datasource='$datasource', + format='s', + stack=false, + fill=1, + legend_show=false, + repeat='integration' + ) + .addTarget(prometheus.target( + ||| + histogram_quantile(0.99, + sum(rate(alertmanager_notification_latency_seconds_bucket{%(amQuerySelector)s, integration="$integration"}[5m])) by (le,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s) + ) + ||| % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s 99th Percentile' % $._config { amNameDashboardLegend: amNameDashboardLegend } + )) + .addTarget(prometheus.target( + ||| + histogram_quantile(0.50, + sum(rate(alertmanager_notification_latency_seconds_bucket{%(amQuerySelector)s, integration="$integration"}[5m])) by (le,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s) + ) + ||| % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Median' % $._config { amNameDashboardLegend: amNameDashboardLegend } + )) + .addTarget(prometheus.target( + ||| + sum(rate(alertmanager_notification_latency_seconds_sum{%(amQuerySelector)s, integration="$integration"}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s) + / + sum(rate(alertmanager_notification_latency_seconds_count{%(amQuerySelector)s, integration="$integration"}[5m])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s) + ||| % $._config { amQuerySelector: amQuerySelector }, legendFormat='%(amNameDashboardLegend)s Average' % $._config { amNameDashboardLegend: amNameDashboardLegend } + )); + + dashboard.new( + '%sOverview' % $._config.dashboardNamePrefix, + time_from='now-1h', + tags=($._config.dashboardTags), + timezone='utc', + refresh='30s', + graphTooltip='shared_crosshair', + uid='alertmanager-overview' + ) + .addTemplate( + { + current: { + text: 'Prometheus', + value: 'Prometheus', + }, + hide: 0, + label: null, + name: 'datasource', + options: [], + query: 'prometheus', + refresh: 1, + regex: '', + type: 'datasource', + }, + ) + .addTemplates(alertmanagerClusterSelectorTemplates) + .addTemplate(integrationTemplate) + .addRow( + row.new('Alerts') + .addPanel(alerts) + .addPanel(alertsRate) + ) + .addRow( + row.new('Notifications') + .addPanel(notifications) + .addPanel(notificationDuration) + ), + }, +} diff --git a/doc/alertmanager-mixin/jsonnetfile.json b/doc/alertmanager-mixin/jsonnetfile.json new file mode 100644 index 00000000..650733a0 --- /dev/null +++ b/doc/alertmanager-mixin/jsonnetfile.json @@ -0,0 +1,15 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "master" + } + ], + "legacyImports": false +} diff --git a/doc/alertmanager-mixin/jsonnetfile.lock.json b/doc/alertmanager-mixin/jsonnetfile.lock.json new file mode 100644 index 00000000..803febc8 --- /dev/null +++ b/doc/alertmanager-mixin/jsonnetfile.lock.json @@ -0,0 +1,16 @@ +{ + "version": 1, + "dependencies": [ + { + "source": { + "git": { + "remote": "https://github.com/grafana/grafonnet-lib.git", + "subdir": "grafonnet" + } + }, + "version": "55cf4ee53ced2b6d3ce96ecce9fb813b4465be98", + "sum": "4/sUV0Kk+o8I+wlYxL9R6EPhL/NiLfYHk+NXlU64RUk=" + } + ], + "legacyImports": false +} diff --git a/doc/alertmanager-mixin/mixin.libsonnet b/doc/alertmanager-mixin/mixin.libsonnet index 95efe331..22db15c9 100644 --- a/doc/alertmanager-mixin/mixin.libsonnet +++ b/doc/alertmanager-mixin/mixin.libsonnet @@ -1,2 +1,3 @@ (import 'config.libsonnet') + -(import 'alerts.libsonnet') +(import 'alerts.libsonnet') + +(import 'dashboards.libsonnet')