Merge pull request #4474 from prometheus/mixin

Prometheus Monitoring Mixin for Prometheus itself.
Björn Rabenstein, 2019-06-29 00:14:00 +02:00 (committed via GitHub)
commit 32c3a1beef
10 changed files with 513 additions and 0 deletions

.gitignore
@@ -0,0 +1,4 @@
*.yaml
dashboards_out
vendor
jsonnetfile.lock.json

Makefile
@@ -0,0 +1,25 @@
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s

all: fmt prometheus_alerts.yaml dashboards_out lint

fmt:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

prometheus_alerts.yaml: mixin.libsonnet config.libsonnet alerts.libsonnet
	jsonnet -S alerts.jsonnet > $@

dashboards_out: mixin.libsonnet config.libsonnet dashboards.libsonnet
	@mkdir -p dashboards_out
	jsonnet -J vendor -m dashboards_out dashboards.jsonnet

lint: prometheus_alerts.yaml
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		while read f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
		done

	promtool check rules prometheus_alerts.yaml

clean:
	rm -rf dashboards_out prometheus_alerts.yaml

README.md
@@ -0,0 +1,36 @@
# Prometheus Mixin

_This is a work in progress. We aim for it to become a good role model for
alerts and dashboards eventually, but it is not quite there yet._

The Prometheus Mixin is a set of configurable, reusable, and extensible alerts
and dashboards for Prometheus.

To use them, you need to have `jsonnet` (v0.13+) and `jb` installed. If you
have a working Go development environment, it's easiest to run the following:

```bash
$ go get github.com/google/go-jsonnet/cmd/jsonnet
$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
```

_Note: The make targets `lint` and `fmt` need the `jsonnetfmt` binary, which is
currently not included in the Go implementation of `jsonnet`. For the time
being, you have to install the [C++ version of
jsonnetfmt](https://github.com/google/jsonnet) if you want to use `make lint`
or `make fmt`._

Next, install the dependencies by running the following command in this
directory:

```bash
$ jb install
```

You can then build a `prometheus_alerts.yaml` with the alerts and a directory
`dashboards_out` with the Grafana dashboard JSON files:

```bash
$ make prometheus_alerts.yaml
$ make dashboards_out
```

For more advanced uses of mixins, see https://github.com/monitoring-mixins/docs.
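
Like other monitoring mixins, this one is meant to be customized without editing the files in this directory: import `mixin.libsonnet` from your own Jsonnet file and override the hidden `_config` object. The sketch below is illustrative only; the file name and the `job` selector are assumptions, while the Kubernetes-style `prometheusName` value is the one suggested in `config.libsonnet`.

```jsonnet
// my_mixin.jsonnet (hypothetical file name). Run from this directory so the
// imports resolve, and render with the same flag the Makefile uses:
//   jsonnet -S my_mixin.jsonnet > prometheus_alerts.yaml
local mixin = (import 'mixin.libsonnet') {
  _config+:: {
    prometheusSelector: 'job="prometheus-k8s"',  // assumed job label, adjust to your setup
    prometheusName: '{{$labels.namespace}}/{{$labels.pod}}',  // as suggested in config.libsonnet
  },
};

// Emit the alert rules as a single YAML document, analogous to alerts.jsonnet.
std.manifestYamlDoc(mixin.prometheusAlerts)
```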

alerts.jsonnet
@@ -0,0 +1 @@
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
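
`std.manifestYamlDoc` serializes the merged `prometheusAlerts` object into a single YAML document (as a string), which is why the Makefile renders this file with `jsonnet -S` rather than the default JSON output. A tiny stand-alone illustration of the same call, with a made-up input object:

```jsonnet
// Rendered with `jsonnet -S`, this prints a small YAML document containing
// one empty rule group; the object is invented purely to show the mechanism.
std.manifestYamlDoc({ groups: [{ name: 'example', rules: [] }] })
```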

alerts.libsonnet
@@ -0,0 +1,260 @@
{
prometheusAlerts+:: {
groups+: [
{
name: 'prometheus',
rules: [
{
alert: 'PrometheusBadConfig',
expr: |||
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0
||| % $._config,
'for': '10m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Failed Prometheus configuration reload.',
description: 'Prometheus %(prometheusName)s has failed to reload its configuration.' % $._config,
},
},
{
alert: 'PrometheusNotificationQueueRunningFull',
expr: |||
# Without min_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
>
min_over_time(prometheus_notifications_queue_capacity{%(prometheusSelector)s}[5m])
)
||| % $._config,
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus alert notification queue predicted to run full in less than 30m.',
description: 'Alert notification queue of Prometheus %(prometheusName)s is running full.' % $._config,
},
},
{
alert: 'PrometheusErrorSendingAlertsToSomeAlertmanagers',
expr: |||
(
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
/
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
)
* 100
> 1
||| % $._config,
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.',
description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
},
},
{
alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
expr: |||
min without(alertmanager) (
rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
/
rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
)
* 100
> 3
||| % $._config,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
},
},
{
alert: 'PrometheusNotConnectedToAlertmanagers',
expr: |||
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
max_over_time(prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s}[5m]) < 1
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus is not connected to any Alertmanagers.',
description: 'Prometheus %(prometheusName)s is not connected to any Alertmanagers.' % $._config,
},
},
{
alert: 'PrometheusTSDBReloadsFailing',
expr: |||
increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[3h]) > 0
||| % $._config,
'for': '4h',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus has issues reloading blocks from disk.',
description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} reload failures over the last 3h.' % $._config,
},
},
{
alert: 'PrometheusTSDBCompactionsFailing',
expr: |||
increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[3h]) > 0
||| % $._config,
'for': '4h',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus has issues compacting blocks.',
description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} compaction failures over the last 3h.' % $._config,
},
},
{
alert: 'PrometheusTSDBWALCorruptions',
expr: |||
increase(tsdb_wal_corruptions_total{%(prometheusSelector)s}[3h]) > 0
||| % $._config,
'for': '4h',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus is detecting WAL corruptions.',
description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h.' % $._config,
},
},
{
alert: 'PrometheusNotIngestingSamples',
expr: |||
rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus is not ingesting samples.',
description: 'Prometheus %(prometheusName)s is not ingesting samples.' % $._config,
},
},
{
alert: 'PrometheusDuplicateTimestamps',
expr: |||
rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus is dropping samples with duplicate timestamps.',
description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp.' % $._config,
},
},
{
alert: 'PrometheusOutOfOrderTimestamps',
expr: |||
rate(prometheus_target_scrapes_sample_out_of_order_total{%(prometheusSelector)s}[5m]) > 0
||| % $._config,
'for': '10m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus drops samples with out-of-order timestamps.',
description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with timestamps arriving out of order.' % $._config,
},
},
{
alert: 'PrometheusRemoteStorageFailures',
expr: |||
(
rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
/
(
rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
+
rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[5m])
)
)
* 100
> 1
||| % $._config,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Prometheus fails to send samples to remote storage.',
description: 'Prometheus %(prometheusName)s failed to send {{ printf "%%.1f" $value }}%% of the samples to queue {{$labels.queue}}.' % $._config,
},
},
{
alert: 'PrometheusRemoteWriteBehind',
expr: |||
# Without max_over_time, failed scrapes could create false negatives, see
# https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
(
max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}[5m])
- on(job, instance) group_right
max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}[5m])
)
> 120
||| % $._config,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Prometheus remote write is behind.',
description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config,
},
},
{
alert: 'PrometheusRuleFailures',
expr: |||
increase(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[5m]) > 0
||| % $._config,
'for': '15m',
labels: {
severity: 'critical',
},
annotations: {
summary: 'Prometheus is failing rule evaluations.',
description: 'Prometheus %(prometheusName)s has failed to evaluate {{ printf "%%.0f" $value }} rules in the last 5m.' % $._config,
},
},
{
alert: 'PrometheusMissingRuleEvaluations',
expr: |||
increase(prometheus_rule_group_iterations_missed_total{%(prometheusSelector)s}[5m]) > 0
||| % $._config,
'for': '15m',
labels: {
severity: 'warning',
},
annotations: {
summary: 'Prometheus is missing rule evaluations due to slow rule group evaluation.',
description: 'Prometheus %(prometheusName)s has missed {{ printf "%%.0f" $value }} rule group evaluations in the last 5m.' % $._config,
},
},
],
},
],
},
}
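
Each `expr` above is a Jsonnet text block (`|||`) combined with the `%` formatting operator, so placeholders such as `%(prometheusSelector)s` are filled in from `$._config` when the alerts are rendered. A stand-alone illustration of that substitution, using the mixin's default selector:

```jsonnet
// The % operator fills named placeholders from an object, just as the alert
// expressions above pull %(prometheusSelector)s out of $._config.
local config = { prometheusSelector: 'job="prometheus"' };

// Evaluates to:
//   max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
'max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0' % config
```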

config.libsonnet
@@ -0,0 +1,16 @@
{
  _config+:: {
    // prometheusSelector is inserted as part of the label selector in
    // PromQL queries to identify metrics collected from Prometheus
    // servers.
    prometheusSelector: 'job="prometheus"',

    // prometheusName is inserted into annotations to name the Prometheus
    // instance affected by the alert.
    prometheusName: '{{$labels.instance}}',
    // If you run Prometheus on Kubernetes with the Prometheus
    // Operator, you can make use of the configured target labels for
    // nicer naming:
    // prometheusName: '{{$labels.namespace}}/{{$labels.pod}}'
  },
}
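
The alert and dashboard definitions reference this object through `$._config`, and `$` resolves against the final composed object, so a `_config+::` override merged in later changes strings that were written earlier. A minimal, generic illustration of that late binding (the field names are invented):

```jsonnet
// Late binding through $: the override merged in below is picked up by a
// field that was already defined in `base`.
local base = {
  _config+:: { who: 'world' },
  greeting:: 'hello %(who)s' % $._config,
};

(base { _config+:: { who: 'prometheus' } }).greeting  // evaluates to "hello prometheus"
```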

dashboards.jsonnet
@@ -0,0 +1,6 @@
local dashboards = (import 'mixin.libsonnet').dashboards;

{
  [name]: dashboards[name]
  for name in std.objectFields(dashboards)
}
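
The object comprehension promotes every entry of the hidden `dashboards` object to a visible top-level field, which is what the Makefile's `jsonnet -m dashboards_out dashboards.jsonnet` relies on: in multi-file mode, each top-level field is written to a file named after the field. A stand-alone sketch of the same pattern with invented file names:

```jsonnet
// With `jsonnet -m out/ example.jsonnet`, this writes out/a.json and
// out/b.json; the real dashboards end up as e.g. dashboards_out/prometheus.json.
local dashboards = {
  'a.json': { title: 'A' },
  'b.json': { title: 'B' },
};

{
  [name]: dashboards[name]
  for name in std.objectFields(dashboards)
}
```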

dashboards.libsonnet
@@ -0,0 +1,148 @@
local g = import 'grafana-builder/grafana.libsonnet';
{
dashboards+: {
'prometheus.json':
g.dashboard('Prometheus')
.addMultiTemplate('job', 'prometheus_build_info', 'job')
.addMultiTemplate('instance', 'prometheus_build_info', 'instance')
.addRow(
g.row('Prometheus Stats')
.addPanel(
g.panel('Prometheus Stats') +
g.tablePanel([
'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
], {
job: { alias: 'Job' },
instance: { alias: 'Instance' },
version: { alias: 'Version' },
'Value #A': { alias: 'Count', type: 'hidden' },
'Value #B': { alias: 'Uptime' },
})
)
)
.addRow(
g.row('Discovery')
.addPanel(
g.panel('Target Sync') +
g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
{ yaxes: g.yaxes('ms') }
)
.addPanel(
g.panel('Targets') +
g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') +
g.stack
)
)
.addRow(
g.row('Retrieval')
.addPanel(
g.panel('Average Scrape Interval Duration') +
g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') +
{ yaxes: g.yaxes('ms') }
)
.addPanel(
g.panel('Scrape failures') +
g.queryPanel([
'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))',
'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))',
'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))',
'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))',
], [
'exceeded sample limit: {{job}}',
'duplicate timestamp: {{job}}',
'out of bounds: {{job}}',
'out of order: {{job}}',
]) +
g.stack
)
.addPanel(
g.panel('Appended Samples') +
g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') +
g.stack
)
)
.addRow(
g.row('Storage')
.addPanel(
g.panel('Head Series') +
g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
g.stack
)
.addPanel(
g.panel('Head Chunks') +
g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
g.stack
)
)
.addRow(
g.row('Query')
.addPanel(
g.panel('Query Rate') +
g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') +
g.stack,
)
.addPanel(
g.panel('Stage Duration') +
g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
{ yaxes: g.yaxes('ms') } +
g.stack,
)
),
// Remote write specific dashboard.
'prometheus-remote-write.json':
g.dashboard('Prometheus Remote Write')
.addMultiTemplate('instance', 'prometheus_build_info', 'instance')
.addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*prometheus.*"}', 'cluster')
.addRow(
g.row('Timestamps')
.addPanel(
g.panel('Highest Timestamp In vs. Highest Timestamp Sent') +
g.queryPanel('prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}') +
{ yaxes: g.yaxes('s') }
)
.addPanel(
g.panel('Rate[5m]') +
g.queryPanel('rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) - ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
)
)
.addRow(
g.row('Samples')
.addPanel(
g.panel('Rate, in vs. succeeded or dropped [5m]') +
g.queryPanel('rate(prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])- ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
)
)
.addRow(
g.row('Shards')
.addPanel(
g.panel('Num. Shards') +
g.queryPanel('prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}')
)
.addPanel(
g.panel('Capacity') +
g.queryPanel('prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}')
)
)
.addRow(
g.row('Misc Rates.')
.addPanel(
g.panel('Dropped Samples') +
g.queryPanel('rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
)
.addPanel(
g.panel('Failed Samples') +
g.queryPanel('rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
)
.addPanel(
g.panel('Retried Samples') +
g.queryPanel('rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
)
.addPanel(
g.panel('Enqueue Retries') +
g.queryPanel('rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
)
),
},
}
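
`dashboards+:` uses the same additive merge as `prometheusAlerts+::`, so a downstream file can contribute extra panels or whole dashboards with the `grafana-builder` helpers already used above. The sketch below is hypothetical; the dashboard name, panel, and query are illustrative and not part of this mixin:

```jsonnet
// A downstream layer merged on top of mixin.libsonnet; it only uses
// grafana-builder helpers that also appear above (dashboard, addMultiTemplate,
// row, panel, queryPanel, yaxes).
local g = import 'grafana-builder/grafana.libsonnet';

{
  dashboards+: {
    'prometheus-rules.json':  // illustrative extra dashboard
      g.dashboard('Prometheus Rule Evaluation')
      .addMultiTemplate('job', 'prometheus_build_info', 'job')
      .addRow(
        g.row('Rules')
        .addPanel(
          g.panel('Rule Group Duration') +
          g.queryPanel('max by (rule_group) (prometheus_rule_group_last_duration_seconds{job=~"$job"})', '{{rule_group}}') +
          { yaxes: g.yaxes('s') }
        )
      ),
  },
}
```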

jsonnetfile.json
@@ -0,0 +1,14 @@
{
    "dependencies": [
        {
            "name": "grafana-builder",
            "source": {
                "git": {
                    "remote": "https://github.com/grafana/jsonnet-libs",
                    "subdir": "grafana-builder"
                }
            },
            "version": "master"
        }
    ]
}

mixin.libsonnet
@@ -0,0 +1,3 @@
(import 'config.libsonnet') +
(import 'dashboards.libsonnet') +
(import 'alerts.libsonnet')
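
The mixin is just an object merge of the three files: `config.libsonnet` contributes `_config`, and the other two contribute `prometheusAlerts` and `dashboards`, each declared with `+::`/`+:` so further layers add to rather than replace them. A minimal sketch of appending one more layer with an extra alert group (file name, group, and alert are illustrative):

```jsonnet
// extended-mixin.libsonnet (hypothetical): the same merge, one layer deeper.
(import 'mixin.libsonnet') +
{
  prometheusAlerts+:: {
    groups+: [{
      name: 'prometheus-extras',  // illustrative group name
      rules: [{
        alert: 'PrometheusAllInstancesDown',  // illustrative alert
        expr: 'count(up{%(prometheusSelector)s} == 1) == 0' % $._config,
        'for': '10m',
        labels: { severity: 'warning' },
        annotations: {
          summary: 'No Prometheus instance matching the selector is up.',
        },
      }],
    }],
  },
}
```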