174 lines
9.3 KiB
Plaintext
174 lines
9.3 KiB
Plaintext
local grafana = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet';
|
|
local dashboard = grafana.dashboard;
|
|
local prometheus = grafana.query.prometheus;
|
|
local variable = dashboard.variable;
|
|
local panel = grafana.panel;
|
|
local row = panel.row;
|
|
|
|
{
|
|
grafanaDashboards+:: {
|
|
// Comma-separated PromQL label matchers, one `<label>=~"$<label>"` per entry
// of $._config.alertmanagerClusterLabels (itself a comma-separated string).
// Each matcher compares the label against the dashboard variable of the same
// name.
local amQuerySelector = std.join(
  ',',
  std.map(
    function(clusterLabel) '%s=~"$%s"' % [clusterLabel, clusterLabel],
    std.split($._config.alertmanagerClusterLabels, ',')
  )
),
|
|
// Legend template made of one `{{<label>}}` placeholder per entry of
// $._config.alertmanagerNameLabels (a comma-separated string), joined by '/'.
local amNameDashboardLegend = std.join(
  '/',
  std.map(
    function(nameLabel) '{{%s}}' % [nameLabel],
    std.split($._config.alertmanagerNameLabels, ',')
  )
),
|
|
|
|
// Dashboard variable named 'datasource' of type 'prometheus'. It is labelled
// 'Data Source', has 'Prometheus' as its current value, and is displayed with
// both its label and its value.
local datasource =
  local dsVariable = variable.datasource;
  local general = dsVariable.generalOptions;
  dsVariable.new('datasource', 'prometheus')
  + general.withLabel('Data Source')
  + general.withCurrent('Prometheus')
  + general.showOnDashboard.withLabelAndValue(),
|
|
|
|
// One query variable per entry of $._config.alertmanagerClusterLabels.
// Each variable is named and labelled after its cluster label, reads from the
// `datasource` variable, and is populated with that label's values on the
// `alertmanager_alerts` metric. It starts with an empty current value, has no
// "All" option, and is sorted alphabetically.
local alertmanagerClusterSelectorVariables =
  local queryVariable = variable.query;
  local clusterLabels = std.split($._config.alertmanagerClusterLabels, ',');
  // Builds the selector variable for a single cluster label.
  local selectorFor(label) =
    queryVariable.new(label)
    + queryVariable.generalOptions.withLabel(label)
    + queryVariable.withDatasourceFromVariable(datasource)
    + queryVariable.queryTypes.withLabelValues(label, metric='alertmanager_alerts')
    + queryVariable.generalOptions.withCurrent('')
    + queryVariable.refresh.onTime()
    + queryVariable.selectionOptions.withIncludeAll(false)
    + queryVariable.withSort(type='alphabetical');
  std.map(selectorFor, clusterLabels),
|
|
|
|
// Query variable 'integration', used by the notification panels as their
// repeat variable. Its values are the `integration` label values of
// `alertmanager_notifications_total`, restricted to integrations matching
// $._config.alertmanagerCriticalIntegrationsRegEx. It defaults to '$__all',
// offers an "All" option, is sorted alphabetically, and is configured with
// showOnDashboard.withNothing().
local integrationVariable =
  variable.query.new('integration')
  + variable.query.withDatasourceFromVariable(datasource)
  + variable.query.queryTypes.withLabelValues(
    'integration',
    metric='alertmanager_notifications_total{integration=~"%s"}' % $._config.alertmanagerCriticalIntegrationsRegEx
  )
  + variable.query.generalOptions.withCurrent('$__all')
  // Use the query variable's own visibility helper, consistent with every
  // other mixin in this chain, instead of borrowing the datasource variable's.
  + variable.query.generalOptions.showOnDashboard.withNothing()
  + variable.query.refresh.onTime()
  + variable.query.selectionOptions.withIncludeAll(true)
  + variable.query.withSort(type='alphabetical'),
|
|
|
|
// Options mixed into every time-series panel of this dashboard: 'normal'
// stacking, fill opacity 10, points never shown, legend hidden, 'multi'
// tooltip, and the '$datasource' variable as the prometheus datasource.
local panelTimeSeriesStdOptions =
  local ts = panel.timeSeries;
  local custom = ts.fieldConfig.defaults.custom;
  custom.stacking.withMode('normal')
  + custom.withFillOpacity(10)
  + custom.withShowPoints('never')
  + ts.options.legend.withShowLegend(false)
  + ts.options.tooltip.withMode('multi')
  + ts.queryOptions.withDatasource('prometheus', '$datasource'),
|
|
|
|
'alertmanager-overview.json':
|
|
// 'Alerts' panel: sum of `alertmanager_alerts` for the selected clusters,
// grouped by the configured cluster and name labels. Unit 'none'.
local alerts = (
  local ts = panel.timeSeries;
  // $._config extended with the two locally computed format arguments.
  local fmtArgs = $._config {
    amQuerySelector: amQuerySelector,
    amNameDashboardLegend: amNameDashboardLegend,
  };
  local alertCount =
    prometheus.new(
      '$datasource',
      'sum(alertmanager_alerts{%(amQuerySelector)s}) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % fmtArgs,
    )
    + prometheus.withIntervalFactor(2)
    + prometheus.withLegendFormat('%(amNameDashboardLegend)s' % fmtArgs);
  ts.new('Alerts')
  + ts.panelOptions.withDescription('current set of alerts stored in the Alertmanager')
  + ts.standardOptions.withUnit('none')
  + panelTimeSeriesStdOptions
  + ts.queryOptions.withTargets([alertCount])
);
|
|
|
|
// 'Alerts receive rate' panel: per-second rates of
// `alertmanager_alerts_received_total` ("Received") and
// `alertmanager_alerts_invalid_total` ("Invalid") over $__rate_interval,
// grouped by the configured cluster and name labels. Unit 'ops'.
local alertsRate = (
  local ts = panel.timeSeries;
  // $._config extended with the two locally computed format arguments.
  local fmtArgs = $._config {
    amQuerySelector: amQuerySelector,
    amNameDashboardLegend: amNameDashboardLegend,
  };
  local received =
    prometheus.new(
      '$datasource',
      'sum(rate(alertmanager_alerts_received_total{%(amQuerySelector)s}[$__rate_interval])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % fmtArgs,
    )
    + prometheus.withIntervalFactor(2)
    + prometheus.withLegendFormat('%(amNameDashboardLegend)s Received' % fmtArgs);
  local invalid =
    prometheus.new(
      '$datasource',
      'sum(rate(alertmanager_alerts_invalid_total{%(amQuerySelector)s}[$__rate_interval])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % fmtArgs,
    )
    + prometheus.withIntervalFactor(2)
    + prometheus.withLegendFormat('%(amNameDashboardLegend)s Invalid' % fmtArgs);
  ts.new('Alerts receive rate')
  + ts.panelOptions.withDescription('rate of successful and invalid alerts received by the Alertmanager')
  + ts.standardOptions.withUnit('ops')
  + panelTimeSeriesStdOptions
  + ts.queryOptions.withTargets([received, invalid])
);
|
|
|
|
// 'Notifications Send Rate' panel, repeated for each value of the
// 'integration' variable: per-second rates of
// `alertmanager_notifications_total` ("Total") and
// `alertmanager_notifications_failed_total` ("Failed") over $__rate_interval
// for the current integration. Unit 'ops'.
local notifications = (
  local ts = panel.timeSeries;
  // $._config extended with the two locally computed format arguments.
  local fmtArgs = $._config {
    amQuerySelector: amQuerySelector,
    amNameDashboardLegend: amNameDashboardLegend,
  };
  local total =
    prometheus.new(
      '$datasource',
      'sum(rate(alertmanager_notifications_total{%(amQuerySelector)s, integration="$integration"}[$__rate_interval])) by (integration,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % fmtArgs,
    )
    + prometheus.withIntervalFactor(2)
    + prometheus.withLegendFormat('%(amNameDashboardLegend)s Total' % fmtArgs);
  local failed =
    prometheus.new(
      '$datasource',
      'sum(rate(alertmanager_notifications_failed_total{%(amQuerySelector)s, integration="$integration"}[$__rate_interval])) by (integration,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)' % fmtArgs,
    )
    + prometheus.withIntervalFactor(2)
    + prometheus.withLegendFormat('%(amNameDashboardLegend)s Failed' % fmtArgs);
  ts.new('$integration: Notifications Send Rate')
  + ts.panelOptions.withDescription('rate of successful and invalid notifications sent by the Alertmanager')
  + ts.standardOptions.withUnit('ops')
  + panelTimeSeriesStdOptions
  + ts.panelOptions.withRepeat('integration')
  + ts.queryOptions.withTargets([total, failed])
);
|
|
|
|
// 'Notification Duration' panel, repeated for each value of the 'integration'
// variable. Plots three series derived from
// `alertmanager_notification_latency_seconds` for the current integration:
// the 0.99 quantile, the 0.50 quantile (median), and the average
// (rate of _sum divided by rate of _count). Unit 's'.
local notificationDuration = (
  local ts = panel.timeSeries;
  // $._config extended with the two locally computed format arguments.
  local fmtArgs = $._config {
    amQuerySelector: amQuerySelector,
    amNameDashboardLegend: amNameDashboardLegend,
  };
  local p99 =
    prometheus.new(
      '$datasource',
      |||
        histogram_quantile(0.99,
        sum(rate(alertmanager_notification_latency_seconds_bucket{%(amQuerySelector)s, integration="$integration"}[$__rate_interval])) by (le,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
        )
      ||| % fmtArgs,
    )
    + prometheus.withIntervalFactor(2)
    + prometheus.withLegendFormat('%(amNameDashboardLegend)s 99th Percentile' % fmtArgs);
  local median =
    prometheus.new(
      '$datasource',
      |||
        histogram_quantile(0.50,
        sum(rate(alertmanager_notification_latency_seconds_bucket{%(amQuerySelector)s, integration="$integration"}[$__rate_interval])) by (le,%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
        )
      ||| % fmtArgs,
    )
    + prometheus.withIntervalFactor(2)
    + prometheus.withLegendFormat('%(amNameDashboardLegend)s Median' % fmtArgs);
  local average =
    prometheus.new(
      '$datasource',
      |||
        sum(rate(alertmanager_notification_latency_seconds_sum{%(amQuerySelector)s, integration="$integration"}[$__rate_interval])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
        /
        sum(rate(alertmanager_notification_latency_seconds_count{%(amQuerySelector)s, integration="$integration"}[$__rate_interval])) by (%(alertmanagerClusterLabels)s,%(alertmanagerNameLabels)s)
      ||| % fmtArgs,
    )
    + prometheus.withIntervalFactor(2)
    + prometheus.withLegendFormat('%(amNameDashboardLegend)s Average' % fmtArgs);
  ts.new('$integration: Notification Duration')
  + ts.panelOptions.withDescription('latency of notifications sent by the Alertmanager')
  + ts.standardOptions.withUnit('s')
  + panelTimeSeriesStdOptions
  + ts.panelOptions.withRepeat('integration')
  + ts.queryOptions.withTargets([p99, median, average])
);
|
|
|
|
// Assemble the overview dashboard. The title is
// $._config.dashboardNamePrefix followed by 'Overview'; the default time
// range starts at now-1h, the timezone is UTC, the tooltip uses a shared
// crosshair, and the uid is fixed. Variables are the datasource, the
// cluster selectors and the integration variable, in that order. Panels are
// laid out by makeGrid in two rows ('Alerts', 'Notifications') with panels
// 12 wide and 7 high.
local dashboardVariables =
  [datasource]
  + alertmanagerClusterSelectorVariables
  + [integrationVariable];
local panelRows = [
  row.new('Alerts') + row.withPanels([alerts, alertsRate]),
  row.new('Notifications') + row.withPanels([notifications, notificationDuration]),
];
dashboard.new('%sOverview' % $._config.dashboardNamePrefix)
+ dashboard.time.withFrom('now-1h')
+ dashboard.withTags($._config.dashboardTags)
+ dashboard.withTimezone('utc')
// NOTE(review): withRefreshIntervals configures the choices offered by the
// refresh picker and is documented as taking an array. If a 30s auto-refresh
// was intended, dashboard.withRefresh('30s') is presumably the right call.
// Confirm before changing; behavior is kept as-is here.
+ dashboard.timepicker.withRefreshIntervals('30s')
+ dashboard.graphTooltip.withSharedCrosshair()
+ dashboard.withUid('alertmanager-overview')
+ dashboard.withVariables(dashboardVariables)
+ dashboard.withPanels(
  grafana.util.grid.makeGrid(panelRows, panelWidth=12, panelHeight=7)
)
|
|
},
|
|
}
|