diff --git a/documentation/prometheus-mixin/.gitignore b/documentation/prometheus-mixin/.gitignore
new file mode 100644
index 000000000..b23a75c9b
--- /dev/null
+++ b/documentation/prometheus-mixin/.gitignore
@@ -0,0 +1,4 @@
+*.yaml
+dashboards_out
+vendor
+jsonnetfile.lock.json
diff --git a/documentation/prometheus-mixin/Makefile b/documentation/prometheus-mixin/Makefile
new file mode 100644
index 000000000..9ade5aa2b
--- /dev/null
+++ b/documentation/prometheus-mixin/Makefile
@@ -0,0 +1,25 @@
+JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s
+
+all: fmt prometheus_alerts.yaml dashboards_out lint
+
+fmt:
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		xargs -n 1 -- $(JSONNET_FMT) -i
+
+prometheus_alerts.yaml: mixin.libsonnet config.libsonnet alerts.libsonnet
+	jsonnet -S alerts.jsonnet > $@
+
+dashboards_out: mixin.libsonnet config.libsonnet dashboards.libsonnet
+	@mkdir -p dashboards_out
+	jsonnet -J vendor -m dashboards_out dashboards.jsonnet
+
+lint: prometheus_alerts.yaml
+	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
+		while read f; do \
+			$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
+		done
+
+	promtool check rules prometheus_alerts.yaml
+
+clean:
+	rm -rf dashboards_out prometheus_alerts.yaml
diff --git a/documentation/prometheus-mixin/README.md b/documentation/prometheus-mixin/README.md
new file mode 100644
index 000000000..c44e70ca0
--- /dev/null
+++ b/documentation/prometheus-mixin/README.md
@@ -0,0 +1,36 @@
+# Prometheus Mixin
+
+_This is work in progress. We aim for it to become a good role model for alerts
+and dashboards eventually, but it is not quite there yet._
+
+The Prometheus Mixin is a set of configurable, reusable, and extensible alerts
+and dashboards for Prometheus.
+
+To use them, you need to have `jsonnet` (v0.13+) and `jb` installed. If you
+have a working Go development environment, it's easiest to run the following:
+```bash
+$ go get github.com/google/go-jsonnet/cmd/jsonnet
+$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
+```
+
+_Note: The make targets `lint` and `fmt` need the `jsonnetfmt` binary, which is
+currently not included in the Go implementation of `jsonnet`. For the time
+being, you have to install the [C++ version of
+jsonnetfmt](https://github.com/google/jsonnet) if you want to use `make lint`
+or `make fmt`._
+
+Next, install the dependencies by running the following command in this
+directory:
+```bash
+$ jb install
+```
+
+You can then build a `prometheus_alerts.yaml` with the alerts and a directory
+`dashboards_out` with the Grafana dashboard JSON files:
+```bash
+$ make prometheus_alerts.yaml
+$ make dashboards_out
+```
+
+For more advanced uses of mixins, see https://github.com/monitoring-mixins/docs.
+
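The README above describes the mixin as configurable; concretely, the alerts and dashboards defined in the files below read their label selector and instance naming from the `_config` object in `config.libsonnet`. As a rough sketch of how a user of this mixin might override that before rendering, assuming a hypothetical `example.jsonnet` placed next to `mixin.libsonnet` (the `prometheus-k8s` job value is purely illustrative):

```jsonnet
// example.jsonnet (hypothetical): render the mixin's alerts with a custom selector.
// Run with `jsonnet -S example.jsonnet`, mirroring the prometheus_alerts.yaml Makefile target.
local mixin = (import 'mixin.libsonnet') + {
  _config+:: {
    // Illustrative value; match this to the job label under which your Prometheus servers are scraped.
    prometheusSelector: 'job="prometheus-k8s"',
  },
};

std.manifestYamlDoc(mixin.prometheusAlerts)
```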
diff --git a/documentation/prometheus-mixin/alerts.jsonnet b/documentation/prometheus-mixin/alerts.jsonnet
new file mode 100644
index 000000000..75e7c1b29
--- /dev/null
+++ b/documentation/prometheus-mixin/alerts.jsonnet
@@ -0,0 +1 @@
+std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
diff --git a/documentation/prometheus-mixin/alerts.libsonnet b/documentation/prometheus-mixin/alerts.libsonnet
new file mode 100644
index 000000000..06c527457
--- /dev/null
+++ b/documentation/prometheus-mixin/alerts.libsonnet
@@ -0,0 +1,260 @@
+{
+  prometheusAlerts+:: {
+    groups+: [
+      {
+        name: 'prometheus',
+        rules: [
+          {
+            alert: 'PrometheusBadConfig',
+            expr: |||
+              # Without max_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+              max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Failed Prometheus configuration reload.',
+              description: 'Prometheus %(prometheusName)s has failed to reload its configuration.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusNotificationQueueRunningFull',
+            expr: |||
+              # Without min_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+              (
+                predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
+              >
+                min_over_time(prometheus_notifications_queue_capacity{%(prometheusSelector)s}[5m])
+              )
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus alert notification queue predicted to run full in less than 30m.',
+              description: 'Alert notification queue of Prometheus %(prometheusName)s is running full.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusErrorSendingAlertsToSomeAlertmanagers',
+            expr: |||
+              (
+                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
+              /
+                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
+              )
+              * 100
+              > 1
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.',
+              description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
+            expr: |||
+              min without(alertmanager) (
+                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
+              /
+                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
+              )
+              * 100
+              > 3
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
+              description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusNotConnectedToAlertmanagers',
+            expr: |||
+              # Without max_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+              max_over_time(prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s}[5m]) < 1
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is not connected to any Alertmanagers.',
+              description: 'Prometheus %(prometheusName)s is not connected to any Alertmanagers.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusTSDBReloadsFailing',
+            expr: |||
+              increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[3h]) > 0
+            ||| % $._config,
+            'for': '4h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus has issues reloading blocks from disk.',
+              description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} reload failures over the last 3h.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusTSDBCompactionsFailing',
+            expr: |||
+              increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[3h]) > 0
+            ||| % $._config,
+            'for': '4h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus has issues compacting blocks.',
+              description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} compaction failures over the last 3h.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusTSDBWALCorruptions',
+            expr: |||
+              increase(tsdb_wal_corruptions_total{%(prometheusSelector)s}[3h]) > 0
+            ||| % $._config,
+            'for': '4h',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is detecting WAL corruptions.',
+              description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusNotIngestingSamples',
+            expr: |||
+              rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is not ingesting samples.',
+              description: 'Prometheus %(prometheusName)s is not ingesting samples.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusDuplicateTimestamps',
+            expr: |||
+              rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is dropping samples with duplicate timestamps.',
+              description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusOutOfOrderTimestamps',
+            expr: |||
+              rate(prometheus_target_scrapes_sample_out_of_order_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,
+            'for': '10m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus drops samples with out-of-order timestamps.',
+              description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with timestamps arriving out of order.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusRemoteStorageFailures',
+            expr: |||
+              (
+                rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
+              /
+                (
+                  rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
+                +
+                  rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[5m])
+                )
+              )
+              * 100
+              > 1
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Prometheus fails to send samples to remote storage.',
+              description: 'Prometheus %(prometheusName)s failed to send {{ printf "%%.1f" $value }}%% of the samples to queue {{$labels.queue}}.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusRemoteWriteBehind',
+            expr: |||
+              # Without max_over_time, failed scrapes could create false negatives, see
+              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
+              (
+                max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}[5m])
+              - on(job, instance) group_right
+                max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}[5m])
+              )
+              > 120
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Prometheus remote write is behind.',
+              description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusRuleFailures',
+            expr: |||
+              increase(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'critical',
+            },
+            annotations: {
+              summary: 'Prometheus is failing rule evaluations.',
+              description: 'Prometheus %(prometheusName)s has failed to evaluate {{ printf "%%.0f" $value }} rules in the last 5m.' % $._config,
+            },
+          },
+          {
+            alert: 'PrometheusMissingRuleEvaluations',
+            expr: |||
+              increase(prometheus_rule_group_iterations_missed_total{%(prometheusSelector)s}[5m]) > 0
+            ||| % $._config,
+            'for': '15m',
+            labels: {
+              severity: 'warning',
+            },
+            annotations: {
+              summary: 'Prometheus is missing rule evaluations due to slow rule group evaluation.',
+              description: 'Prometheus %(prometheusName)s has missed {{ printf "%%.0f" $value }} rule group evaluations in the last 5m.' % $._config,
+            },
+          },
+        ],
+      },
+    ],
+  },
+}
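Each `expr` above is a plain jsonnet string formatted with `% $._config`, so the `%(prometheusSelector)s` placeholders are replaced with the selector from `config.libsonnet` (the next file) before the YAML is rendered. A minimal, self-contained sketch of that substitution, using the default values defined below:

```jsonnet
// Standalone illustration of the string templating used by the alert expressions above;
// `config` mirrors the defaults from config.libsonnet.
local config = {
  prometheusSelector: 'job="prometheus"',
  prometheusName: '{{$labels.instance}}',
};

{
  // Evaluates to: max_over_time(prometheus_config_last_reload_successful{job="prometheus"}[5m]) == 0
  expr: 'max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0' % config,
}
```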
diff --git a/documentation/prometheus-mixin/config.libsonnet b/documentation/prometheus-mixin/config.libsonnet
new file mode 100644
index 000000000..27614289a
--- /dev/null
+++ b/documentation/prometheus-mixin/config.libsonnet
@@ -0,0 +1,16 @@
+{
+  _config+:: {
+    // prometheusSelector is inserted as part of the label selector in
+    // PromQL queries to identify metrics collected from Prometheus
+    // servers.
+    prometheusSelector: 'job="prometheus"',
+
+    // prometheusName is inserted into annotations to name the Prometheus
+    // instance affected by the alert.
+    prometheusName: '{{$labels.instance}}',
+    // If you run Prometheus on Kubernetes with the Prometheus
+    // Operator, you can make use of the configured target labels for
+    // nicer naming:
+    // prometheusName: '{{$labels.namespace}}/{{$labels.pod}}'
+  },
+}
diff --git a/documentation/prometheus-mixin/dashboards.jsonnet b/documentation/prometheus-mixin/dashboards.jsonnet
new file mode 100644
index 000000000..fb102817c
--- /dev/null
+++ b/documentation/prometheus-mixin/dashboards.jsonnet
@@ -0,0 +1,6 @@
+local dashboards = (import 'mixin.libsonnet').dashboards;
+
+{
+  [name]: dashboards[name]
+  for name in std.objectFields(dashboards)
+}
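`dashboards.jsonnet` re-exports every entry of the `dashboards` object as a top-level field; combined with `jsonnet -J vendor -m dashboards_out dashboards.jsonnet` from the Makefile, each field name becomes one file in `dashboards_out`. A self-contained sketch of that multi-output pattern, with made-up file names:

```jsonnet
// multi_out.jsonnet (hypothetical): when rendered with `jsonnet -m <dir> multi_out.jsonnet`,
// each top-level key becomes one JSON file named after the key.
local docs = {
  'a.json': { hello: 'world' },
  'b.json': { answer: 42 },
};

{
  [name]: docs[name]
  for name in std.objectFields(docs)
}
```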
diff --git a/documentation/prometheus-mixin/dashboards.libsonnet b/documentation/prometheus-mixin/dashboards.libsonnet
new file mode 100644
index 000000000..cc4dba2b8
--- /dev/null
+++ b/documentation/prometheus-mixin/dashboards.libsonnet
@@ -0,0 +1,148 @@
+local g = import 'grafana-builder/grafana.libsonnet';
+
+{
+  dashboards+: {
+    'prometheus.json':
+      g.dashboard('Prometheus')
+      .addMultiTemplate('job', 'prometheus_build_info', 'job')
+      .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
+      .addRow(
+        g.row('Prometheus Stats')
+        .addPanel(
+          g.panel('Prometheus Stats') +
+          g.tablePanel([
+            'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
+            'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
+          ], {
+            job: { alias: 'Job' },
+            instance: { alias: 'Instance' },
+            version: { alias: 'Version' },
+            'Value #A': { alias: 'Count', type: 'hidden' },
+            'Value #B': { alias: 'Uptime' },
+          })
+        )
+      )
+      .addRow(
+        g.row('Discovery')
+        .addPanel(
+          g.panel('Target Sync') +
+          g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
+          { yaxes: g.yaxes('ms') }
+        )
+        .addPanel(
+          g.panel('Targets') +
+          g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') +
+          g.stack
+        )
+      )
+      .addRow(
+        g.row('Retrieval')
+        .addPanel(
+          g.panel('Average Scrape Interval Duration') +
+          g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') +
+          { yaxes: g.yaxes('ms') }
+        )
+        .addPanel(
+          g.panel('Scrape failures') +
+          g.queryPanel([
+            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))',
+            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))',
+            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))',
+            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))',
+          ], [
+            'exceeded sample limit: {{job}}',
+            'duplicate timestamp: {{job}}',
+            'out of bounds: {{job}}',
+            'out of order: {{job}}',
+          ]) +
+          g.stack
+        )
+        .addPanel(
+          g.panel('Appended Samples') +
+          g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') +
+          g.stack
+        )
+      )
+      .addRow(
+        g.row('Storage')
+        .addPanel(
+          g.panel('Head Series') +
+          g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
+          g.stack
+        )
+        .addPanel(
+          g.panel('Head Chunks') +
+          g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
+          g.stack
+        )
+      )
+      .addRow(
+        g.row('Query')
+        .addPanel(
+          g.panel('Query Rate') +
+          g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') +
+          g.stack,
+        )
+        .addPanel(
+          g.panel('Stage Duration') +
+          g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
+          { yaxes: g.yaxes('ms') } +
+          g.stack,
+        )
+      ),
+    // Remote write specific dashboard.
+    'prometheus-remote-write.json':
+      g.dashboard('Prometheus Remote Write')
+      .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
+      .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*prometheus.*"}', 'cluster')
+      .addRow(
+        g.row('Timestamps')
+        .addPanel(
+          g.panel('Highest Timestamp In vs. Highest Timestamp Sent') +
+          g.queryPanel('prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}') +
+          { yaxes: g.yaxes('s') }
+        )
+        .addPanel(
+          g.panel('Rate[5m]') +
+          g.queryPanel('rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) - ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+      )
+      .addRow(
+        g.row('Samples')
+        .addPanel(
+          g.panel('Rate, in vs. succeeded or dropped [5m]') +
+          g.queryPanel('rate(prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+      )
+      .addRow(
+        g.row('Shards')
+        .addPanel(
+          g.panel('Num. Shards') +
+          g.queryPanel('prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+        .addPanel(
+          g.panel('Capacity') +
+          g.queryPanel('prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+      )
+      .addRow(
+        g.row('Misc. Rates')
+        .addPanel(
+          g.panel('Dropped Samples') +
+          g.queryPanel('rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+        .addPanel(
+          g.panel('Failed Samples') +
+          g.queryPanel('rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+        .addPanel(
+          g.panel('Retried Samples') +
+          g.queryPanel('rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+        .addPanel(
+          g.panel('Enqueue Retries') +
+          g.queryPanel('rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
+        )
+      ),
+  },
+}
diff --git a/documentation/prometheus-mixin/jsonnetfile.json b/documentation/prometheus-mixin/jsonnetfile.json
new file mode 100644
index 000000000..b5d0ad347
--- /dev/null
+++ b/documentation/prometheus-mixin/jsonnetfile.json
@@ -0,0 +1,14 @@
+{
+    "dependencies": [
+        {
+            "name": "grafana-builder",
+            "source": {
+                "git": {
+                    "remote": "https://github.com/grafana/jsonnet-libs",
+                    "subdir": "grafana-builder"
+                }
+            },
+            "version": "master"
+        }
+    ]
+}
diff --git a/documentation/prometheus-mixin/mixin.libsonnet b/documentation/prometheus-mixin/mixin.libsonnet
new file mode 100644
index 000000000..3c983a300
--- /dev/null
+++ b/documentation/prometheus-mixin/mixin.libsonnet
@@ -0,0 +1,3 @@
+(import 'config.libsonnet') +
+(import 'dashboards.libsonnet') +
+(import 'alerts.libsonnet')
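Because `mixin.libsonnet` is plain object concatenation, downstream users can layer additional rules on top with the same `prometheusAlerts` / `groups+:` convention used in `alerts.libsonnet`. A minimal sketch, assuming a hypothetical `my_mixin.libsonnet` placed next to the files above; the extra alert, its group name, and its threshold are illustrative and not part of this change:

```jsonnet
// my_mixin.libsonnet (hypothetical): extend the mixin with a site-specific alert group.
(import 'mixin.libsonnet') + {
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus-extras',  // illustrative group name
        rules: [
          {
            alert: 'PrometheusDown',  // illustrative alert, not shipped by the mixin
            expr: 'up{%(prometheusSelector)s} == 0' % $._config,
            'for': '5m',
            labels: { severity: 'critical' },
            annotations: {
              summary: 'Prometheus target is down.',
              description: 'Prometheus %(prometheusName)s has disappeared from service discovery.' % $._config,
            },
          },
        ],
      },
    ],
  },
}
```

Rendering works exactly as for `alerts.jsonnet`, e.g. `jsonnet -S -e "std.manifestYamlDoc((import 'my_mixin.libsonnet').prometheusAlerts)"`.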