Merge pull request #4474 from prometheus/mixin

Prometheus Monitoring Mixin for Prometheus itself.

commit 32c3a1beef
.gitignore
@@ -0,0 +1,4 @@
*.yaml
dashboards_out
vendor
jsonnetfile.lock.json
Makefile
@@ -0,0 +1,25 @@
JSONNET_FMT := jsonnetfmt -n 2 --max-blank-lines 2 --string-style s --comment-style s

all: fmt prometheus_alerts.yaml dashboards_out lint

fmt:
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		xargs -n 1 -- $(JSONNET_FMT) -i

prometheus_alerts.yaml: mixin.libsonnet config.libsonnet alerts.libsonnet
	jsonnet -S alerts.jsonnet > $@

dashboards_out: mixin.libsonnet config.libsonnet dashboards.libsonnet
	@mkdir -p dashboards_out
	jsonnet -J vendor -m dashboards_out dashboards.jsonnet

lint: prometheus_alerts.yaml
	find . -name 'vendor' -prune -o -name '*.libsonnet' -print -o -name '*.jsonnet' -print | \
		while read f; do \
			$(JSONNET_FMT) "$$f" | diff -u "$$f" -; \
		done

	promtool check rules prometheus_alerts.yaml

clean:
	rm -rf dashboards_out prometheus_alerts.yaml
README.md
@@ -0,0 +1,36 @@
# Prometheus Mixin

_This is work in progress. We aim for it to become a good role model for alerts
and dashboards eventually, but it is not quite there yet._

The Prometheus Mixin is a set of configurable, reusable, and extensible alerts
and dashboards for Prometheus.

To use them, you need to have `jsonnet` (v0.13+) and `jb` installed. If you
have a working Go development environment, it's easiest to run the following:
```bash
$ go get github.com/google/go-jsonnet/cmd/jsonnet
$ go get github.com/jsonnet-bundler/jsonnet-bundler/cmd/jb
```

_Note: The make targets `lint` and `fmt` need the `jsonnetfmt` binary, which is
currently not included in the Go implementation of `jsonnet`. For the time
being, you have to install the [C++ version of
jsonnetfmt](https://github.com/google/jsonnet) if you want to use `make lint`
or `make fmt`._

Next, install the dependencies by running the following command in this
directory:
```bash
$ jb install
```

You can then build a `prometheus_alerts.yaml` with the alerts and a directory
`dashboards_out` with the Grafana dashboard JSON files:
```bash
$ make prometheus_alerts.yaml
$ make dashboards_out
```

For more advanced uses of mixins, see https://github.com/monitoring-mixins/docs.
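A note on customization, as a complement to the README above: since the mixin is built from object merges, a downstream user can override `_config` in their own Jsonnet file before rendering. The following is only a sketch, not part of this commit; the file name `my_alerts.jsonnet` and the `job` value are assumptions.

```jsonnet
// my_alerts.jsonnet -- hypothetical consumer file, not part of this commit.
// Render with: jsonnet -S my_alerts.jsonnet > prometheus_alerts.yaml
std.manifestYamlDoc(
  (
    (import 'mixin.libsonnet') + {
      _config+:: {
        // Assumed example value; the mixin defaults to 'job="prometheus"'.
        prometheusSelector: 'job="prometheus-k8s"',
      },
    }
  ).prometheusAlerts
)
```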
alerts.jsonnet
@@ -0,0 +1 @@
std.manifestYamlDoc((import 'mixin.libsonnet').prometheusAlerts)
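`std.manifestYamlDoc` serializes a Jsonnet value into a YAML document as a single string, which is why the Makefile renders this file with `jsonnet -S` (string output mode). A minimal illustration with an assumed toy input, not taken from the mixin:

```jsonnet
// Toy input, for illustration only:
std.manifestYamlDoc({ groups: [{ name: 'example', rules: [] }] })
// evaluates to roughly the string (keys and strings come out quoted):
//   "groups":
//   - "name": "example"
//     "rules": []
```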
alerts.libsonnet
@@ -0,0 +1,260 @@
{
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus',
        rules: [
          {
            alert: 'PrometheusBadConfig',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(prometheus_config_last_reload_successful{%(prometheusSelector)s}[5m]) == 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Failed Prometheus configuration reload.',
              description: 'Prometheus %(prometheusName)s has failed to reload its configuration.' % $._config,
            },
          },
          {
            alert: 'PrometheusNotificationQueueRunningFull',
            expr: |||
              # Without min_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              (
                predict_linear(prometheus_notifications_queue_length{%(prometheusSelector)s}[5m], 60 * 30)
              >
                min_over_time(prometheus_notifications_queue_capacity{%(prometheusSelector)s}[5m])
              )
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus alert notification queue predicted to run full in less than 30m.',
              description: 'Alert notification queue of Prometheus %(prometheusName)s is running full.' % $._config,
            },
          },
          {
            alert: 'PrometheusErrorSendingAlertsToSomeAlertmanagers',
            expr: |||
              (
                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
              /
                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
              )
              * 100
              > 1
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus has encountered more than 1% errors sending alerts to a specific Alertmanager.',
              description: '{{ printf "%%.1f" $value }}%% errors while sending alerts from Prometheus %(prometheusName)s to Alertmanager {{$labels.alertmanager}}.' % $._config,
            },
          },
          {
            alert: 'PrometheusErrorSendingAlertsToAnyAlertmanager',
            expr: |||
              min without(alertmanager) (
                rate(prometheus_notifications_errors_total{%(prometheusSelector)s}[5m])
              /
                rate(prometheus_notifications_sent_total{%(prometheusSelector)s}[5m])
              )
              * 100
              > 3
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Prometheus encounters more than 3% errors sending alerts to any Alertmanager.',
              description: '{{ printf "%%.1f" $value }}%% minimum errors while sending alerts from Prometheus %(prometheusName)s to any Alertmanager.' % $._config,
            },
          },
          {
            alert: 'PrometheusNotConnectedToAlertmanagers',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              max_over_time(prometheus_notifications_alertmanagers_discovered{%(prometheusSelector)s}[5m]) < 1
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus is not connected to any Alertmanagers.',
              description: 'Prometheus %(prometheusName)s is not connected to any Alertmanagers.' % $._config,
            },
          },
          {
            alert: 'PrometheusTSDBReloadsFailing',
            expr: |||
              increase(prometheus_tsdb_reloads_failures_total{%(prometheusSelector)s}[3h]) > 0
            ||| % $._config,
            'for': '4h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus has issues reloading blocks from disk.',
              description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} reload failures over the last 3h.' % $._config,
            },
          },
          {
            alert: 'PrometheusTSDBCompactionsFailing',
            expr: |||
              increase(prometheus_tsdb_compactions_failed_total{%(prometheusSelector)s}[3h]) > 0
            ||| % $._config,
            'for': '4h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus has issues compacting blocks.',
              description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} compaction failures over the last 3h.' % $._config,
            },
          },
          {
            alert: 'PrometheusTSDBWALCorruptions',
            expr: |||
              increase(tsdb_wal_corruptions_total{%(prometheusSelector)s}[3h]) > 0
            ||| % $._config,
            'for': '4h',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus is detecting WAL corruptions.',
              description: 'Prometheus %(prometheusName)s has detected {{$value | humanize}} corruptions of the write-ahead log (WAL) over the last 3h.' % $._config,
            },
          },
          {
            alert: 'PrometheusNotIngestingSamples',
            expr: |||
              rate(prometheus_tsdb_head_samples_appended_total{%(prometheusSelector)s}[5m]) <= 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus is not ingesting samples.',
              description: 'Prometheus %(prometheusName)s is not ingesting samples.' % $._config,
            },
          },
          {
            alert: 'PrometheusDuplicateTimestamps',
            expr: |||
              rate(prometheus_target_scrapes_sample_duplicate_timestamp_total{%(prometheusSelector)s}[5m]) > 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus is dropping samples with duplicate timestamps.',
              description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with different values but duplicated timestamp.' % $._config,
            },
          },
          {
            alert: 'PrometheusOutOfOrderTimestamps',
            expr: |||
              rate(prometheus_target_scrapes_sample_out_of_order_total{%(prometheusSelector)s}[5m]) > 0
            ||| % $._config,
            'for': '10m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus drops samples with out-of-order timestamps.',
              description: 'Prometheus %(prometheusName)s is dropping {{$value | humanize}} samples/s with timestamps arriving out of order.' % $._config,
            },
          },
          {
            alert: 'PrometheusRemoteStorageFailures',
            expr: |||
              (
                rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
              /
                (
                  rate(prometheus_remote_storage_failed_samples_total{%(prometheusSelector)s}[5m])
                +
                  rate(prometheus_remote_storage_succeeded_samples_total{%(prometheusSelector)s}[5m])
                )
              )
              * 100
              > 1
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Prometheus fails to send samples to remote storage.',
              description: 'Prometheus %(prometheusName)s failed to send {{ printf "%%.1f" $value }}%% of the samples to queue {{$labels.queue}}.' % $._config,
            },
          },
          {
            alert: 'PrometheusRemoteWriteBehind',
            expr: |||
              # Without max_over_time, failed scrapes could create false negatives, see
              # https://www.robustperception.io/alerting-on-gauges-in-prometheus-2-0 for details.
              (
                max_over_time(prometheus_remote_storage_highest_timestamp_in_seconds{%(prometheusSelector)s}[5m])
              - on(job, instance) group_right
                max_over_time(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{%(prometheusSelector)s}[5m])
              )
              > 120
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Prometheus remote write is behind.',
              description: 'Prometheus %(prometheusName)s remote write is {{ printf "%%.1f" $value }}s behind for queue {{$labels.queue}}.' % $._config,
            },
          },
          {
            alert: 'PrometheusRuleFailures',
            expr: |||
              increase(prometheus_rule_evaluation_failures_total{%(prometheusSelector)s}[5m]) > 0
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'critical',
            },
            annotations: {
              summary: 'Prometheus is failing rule evaluations.',
              description: 'Prometheus %(prometheusName)s has failed to evaluate {{ printf "%%.0f" $value }} rules in the last 5m.' % $._config,
            },
          },
          {
            alert: 'PrometheusMissingRuleEvaluations',
            expr: |||
              increase(prometheus_rule_group_iterations_missed_total{%(prometheusSelector)s}[5m]) > 0
            ||| % $._config,
            'for': '15m',
            labels: {
              severity: 'warning',
            },
            annotations: {
              summary: 'Prometheus is missing rule evaluations due to slow rule group evaluation.',
              description: 'Prometheus %(prometheusName)s has missed {{ printf "%%.0f" $value }} rule group evaluations in the last 5m.' % $._config,
            },
          },
        ],
      },
    ],
  },
}
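Because the file declares `prometheusAlerts+::` and `groups+:`, a consumer can merge in additional alert groups without touching this file. A sketch of that extension mechanism follows; it is not part of this commit, and the group name, alert name, and expression are invented for illustration.

```jsonnet
// Hypothetical extension file, not part of this commit.
(import 'mixin.libsonnet') + {
  prometheusAlerts+:: {
    groups+: [
      {
        name: 'prometheus-extras',  // assumed group name
        rules: [
          {
            alert: 'PrometheusDown',  // invented example alert
            // $._config resolves against the merged object, so the
            // mixin's prometheusSelector and prometheusName apply here too.
            expr: 'up{%(prometheusSelector)s} == 0' % $._config,
            'for': '10m',
            labels: { severity: 'warning' },
            annotations: {
              description: 'Prometheus %(prometheusName)s is down.' % $._config,
            },
          },
        ],
      },
    ],
  },
}
```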
config.libsonnet
@@ -0,0 +1,16 @@
{
  _config+:: {
    // prometheusSelector is inserted as part of the label selector in
    // PromQL queries to identify metrics collected from Prometheus
    // servers.
    prometheusSelector: 'job="prometheus"',

    // prometheusName is inserted into annotations to name the Prometheus
    // instance affected by the alert.
    prometheusName: '{{$labels.instance}}',
    // If you run Prometheus on Kubernetes with the Prometheus
    // Operator, you can make use of the configured target labels for
    // nicer naming:
    // prometheusName: '{{$labels.namespace}}/{{$labels.pod}}'
  },
}
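The alert expressions in alerts.libsonnet interpolate these fields with Jsonnet's `%` formatting operator, which accepts Python-style `%(name)s` placeholders when given an object. A minimal illustration using the default value above:

```jsonnet
// Illustration only: how '%(prometheusSelector)s' expands.
'prometheus_config_last_reload_successful{%(prometheusSelector)s}'
% { prometheusSelector: 'job="prometheus"' }
// evaluates to:
//   'prometheus_config_last_reload_successful{job="prometheus"}'
```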
dashboards.jsonnet
@@ -0,0 +1,6 @@
local dashboards = (import 'mixin.libsonnet').dashboards;

{
  [name]: dashboards[name]
  for name in std.objectFields(dashboards)
}
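The Makefile's `dashboards_out` target evaluates this file with `jsonnet -m`, which writes one output file per top-level field, so the object comprehension above turns each dashboard into its own JSON file named after its key. A toy illustration of the comprehension form, with assumed inputs:

```jsonnet
// Toy object comprehension, not from the mixin:
{ [k]: std.asciiUpper(k) for k in ['a', 'b'] }
// evaluates to { a: 'A', b: 'B' }
```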
dashboards.libsonnet
@@ -0,0 +1,148 @@
local g = import 'grafana-builder/grafana.libsonnet';

{
  dashboards+: {
    'prometheus.json':
      g.dashboard('Prometheus')
      .addMultiTemplate('job', 'prometheus_build_info', 'job')
      .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
      .addRow(
        g.row('Prometheus Stats')
        .addPanel(
          g.panel('Prometheus Stats') +
          g.tablePanel([
            'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
            'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
          ], {
            job: { alias: 'Job' },
            instance: { alias: 'Instance' },
            version: { alias: 'Version' },
            'Value #A': { alias: 'Count', type: 'hidden' },
            'Value #B': { alias: 'Uptime' },
          })
        )
      )
      .addRow(
        g.row('Discovery')
        .addPanel(
          g.panel('Target Sync') +
          g.queryPanel('sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3', '{{scrape_job}}') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Targets') +
          g.queryPanel('sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})', 'Targets') +
          g.stack
        )
      )
      .addRow(
        g.row('Retrieval')
        .addPanel(
          g.panel('Average Scrape Interval Duration') +
          g.queryPanel('rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3', '{{interval}} configured') +
          { yaxes: g.yaxes('ms') }
        )
        .addPanel(
          g.panel('Scrape failures') +
          g.queryPanel([
            'sum by (job) (rate(prometheus_target_scrapes_exceeded_sample_limit_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_duplicate_timestamp_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_bounds_total[1m]))',
            'sum by (job) (rate(prometheus_target_scrapes_sample_out_of_order_total[1m]))',
          ], [
            'exceeded sample limit: {{job}}',
            'duplicate timestamp: {{job}}',
            'out of bounds: {{job}}',
            'out of order: {{job}}',
          ]) +
          g.stack
        )
        .addPanel(
          g.panel('Appended Samples') +
          g.queryPanel('rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])', '{{job}} {{instance}}') +
          g.stack
        )
      )
      .addRow(
        g.row('Storage')
        .addPanel(
          g.panel('Head Series') +
          g.queryPanel('prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head series') +
          g.stack
        )
        .addPanel(
          g.panel('Head Chunks') +
          g.queryPanel('prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}', '{{job}} {{instance}} head chunks') +
          g.stack
        )
      )
      .addRow(
        g.row('Query')
        .addPanel(
          g.panel('Query Rate') +
          g.queryPanel('rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])', '{{job}} {{instance}}') +
          g.stack,
        )
        .addPanel(
          g.panel('Stage Duration') +
          g.queryPanel('max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3', '{{slice}}') +
          { yaxes: g.yaxes('ms') } +
          g.stack,
        )
      ),
    // Remote write specific dashboard.
    'prometheus-remote-write.json':
      g.dashboard('Prometheus Remote Write')
      .addMultiTemplate('instance', 'prometheus_build_info', 'instance')
      .addMultiTemplate('cluster', 'kube_pod_container_info{image=~".*prometheus.*"}', 'cluster')
      .addRow(
        g.row('Timestamps')
        .addPanel(
          g.panel('Highest Timestamp In vs. Highest Timestamp Sent') +
          g.queryPanel('prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"} - ignoring(queue) group_right(instance) prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}') +
          { yaxes: g.yaxes('s') }
        )
        .addPanel(
          g.panel('Rate[5m]') +
          g.queryPanel('rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m]) - ignoring (queue) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
        )
      )
      .addRow(
        g.row('Samples')
        .addPanel(
          g.panel('Rate, in vs. succeeded or dropped [5m]') +
          g.queryPanel('rate(prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - ignoring(queue) group_right(instance) rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m]) - rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
        )
      )
      .addRow(
        g.row('Shards')
        .addPanel(
          g.panel('Num. Shards') +
          g.queryPanel('prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}')
        )
        .addPanel(
          g.panel('Capacity') +
          g.queryPanel('prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance"}', '{{cluster}}:{{instance}}-{{queue}}')
        )
      )
      .addRow(
        g.row('Misc Rates.')
        .addPanel(
          g.panel('Dropped Samples') +
          g.queryPanel('rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
        )
        .addPanel(
          g.panel('Failed Samples') +
          g.queryPanel('rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
        )
        .addPanel(
          g.panel('Retried Samples') +
          g.queryPanel('rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
        )
        .addPanel(
          g.panel('Enqueue Retries') +
          g.queryPanel('rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance"}[5m])', '{{cluster}}:{{instance}}-{{queue}}')
        )
      ),
  },
}
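Since `dashboards` is declared with `+:`, a consumer can merge in further dashboards built with the same grafana-builder helpers used above. The following sketch is not part of this commit; the dashboard key, row and panel titles, and the query are assumptions.

```jsonnet
// Hypothetical addition, not part of this commit.
local g = import 'grafana-builder/grafana.libsonnet';

(import 'mixin.libsonnet') + {
  dashboards+: {
    'prometheus-extras.json':  // assumed output file name
      g.dashboard('Prometheus Extras')
      .addRow(
        g.row('GC')
        .addPanel(
          g.panel('Go GC Duration') +
          g.queryPanel('rate(go_gc_duration_seconds_sum{job="prometheus"}[5m])', '{{instance}}')
        )
      ),
  },
}
```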
jsonnetfile.json
@@ -0,0 +1,14 @@
{
  "dependencies": [
    {
      "name": "grafana-builder",
      "source": {
        "git": {
          "remote": "https://github.com/grafana/jsonnet-libs",
          "subdir": "grafana-builder"
        }
      },
      "version": "master"
    }
  ]
}
mixin.libsonnet
@@ -0,0 +1,3 @@
(import 'config.libsonnet') +
(import 'dashboards.libsonnet') +
(import 'alerts.libsonnet')