// Source listing metadata: 439 lines, 19 KiB, plaintext.
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
|
|
local g = import 'github.com/grafana/jsonnet-libs/grafana-builder/grafana.libsonnet';
|
|
local dashboard = grafana.dashboard;
|
|
local row = grafana.row;
|
|
local singlestat = grafana.singlestat;
|
|
local prometheus = grafana.prometheus;
|
|
local graphPanel = grafana.graphPanel;
|
|
local tablePanel = grafana.tablePanel;
|
|
local template = grafana.template;
|
|
{
|
|
grafanaDashboards+:: {
|
|
'prometheus.json':
|
|
// Dashboard-wide switch: when true, selectors and legends also carry the cluster dimension.
local showMultiCluster = $._config.showMultiCluster;
// Named baseDashboard so it does not shadow the file-level `dashboard` alias.
local baseDashboard = g.dashboard(
  '%(prefix)sOverview' % $._config.grafanaPrometheus
);
// Template variables cascade: cluster (multi-cluster only) -> job -> instance.
local templatedDashboard =
  if showMultiCluster then
    baseDashboard
    // The `cluster` variable is built with $._config.clusterLabel as its label argument ...
    .addMultiTemplate('cluster', 'prometheus_build_info{%(prometheusSelector)s}' % $._config, $._config.clusterLabel)
    // ... so the dependent variables filter on that same configured label instead of a
    // hard-coded `cluster`, which only matched when clusterLabel happened to be 'cluster'.
    .addMultiTemplate('job', 'prometheus_build_info{%(clusterLabel)s=~"$cluster"}' % $._config, 'job')
    .addMultiTemplate('instance', 'prometheus_build_info{%(clusterLabel)s=~"$cluster", job=~"$job"}' % $._config, 'instance')
  else
    baseDashboard
    .addMultiTemplate('job', 'prometheus_build_info{%(prometheusSelector)s}' % $._config, 'job')
    .addMultiTemplate('instance', 'prometheus_build_info{job=~"$job"}', 'instance');
|
|
templatedDashboard
.addRow(
  // Table of build-info and uptime series per instance. Query A (count) is a hidden
  // column; query B is shown as 'Uptime' in seconds.
  local statsQueries =
    if showMultiCluster then [
      'count by (cluster, job, instance, version) (prometheus_build_info{cluster=~"$cluster", job=~"$job", instance=~"$instance"})',
      'max by (cluster, job, instance) (time() - process_start_time_seconds{cluster=~"$cluster", job=~"$job", instance=~"$instance"})',
    ] else [
      'count by (job, instance, version) (prometheus_build_info{job=~"$job", instance=~"$instance"})',
      'max by (job, instance) (time() - process_start_time_seconds{job=~"$job", instance=~"$instance"})',
    ];
  // Column styling keyed by label name / query result column.
  local statsColumns = {
    cluster: { alias: if showMultiCluster then 'Cluster' else '' },
    job: { alias: 'Job' },
    instance: { alias: 'Instance' },
    version: { alias: 'Version' },
    'Value #A': { alias: 'Count', type: 'hidden' },
    'Value #B': { alias: 'Uptime', type: 'number', unit: 's' },
  };
  g.row('Prometheus Stats')
  .addPanel(g.panel('Prometheus Stats') + g.tablePanel(statsQueries, statsColumns))
)
|
|
.addRow(
  // Discovery row: target-sync time (seconds scaled by 1e3 for the 'ms' axis) and
  // the number of discovered targets.
  local syncQuery =
    if showMultiCluster then
      'sum(rate(prometheus_target_sync_length_seconds_sum{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[5m])) by (cluster, job, scrape_job, instance) * 1e3'
    else
      'sum(rate(prometheus_target_sync_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m])) by (scrape_job) * 1e3';
  local syncLegend =
    if showMultiCluster then '{{cluster}}:{{job}}:{{instance}}:{{scrape_job}}' else '{{scrape_job}}';
  local targetsQuery =
    if showMultiCluster then
      'sum by (cluster, job, instance) (prometheus_sd_discovered_targets{cluster=~"$cluster", job=~"$job",instance=~"$instance"})'
    else
      'sum(prometheus_sd_discovered_targets{job=~"$job",instance=~"$instance"})';
  local targetsLegend =
    if showMultiCluster then '{{cluster}}:{{job}}:{{instance}}' else 'Targets';
  g.row('Discovery')
  .addPanel(g.panel('Target Sync') + g.queryPanel(syncQuery, syncLegend) + { yaxes: g.yaxes('ms') })
  .addPanel(g.panel('Targets') + g.queryPanel(targetsQuery, targetsLegend) + g.stack)
)
|
|
.addRow(
  // Retrieval row: average scrape interval, scrape failure rates, and appended samples.

  // Average interval = rate(sum) / rate(count), scaled by 1e3 for the 'ms' axis.
  local intervalQuery =
    if showMultiCluster then
      'rate(prometheus_target_interval_length_seconds_sum{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m]) * 1e3'
    else
      'rate(prometheus_target_interval_length_seconds_sum{job=~"$job",instance=~"$instance"}[5m]) / rate(prometheus_target_interval_length_seconds_count{job=~"$job",instance=~"$instance"}[5m]) * 1e3';
  local intervalLegend =
    if showMultiCluster then '{{cluster}}:{{job}}:{{instance}} {{interval}} configured' else '{{interval}} configured';

  // One entry per scrape-failure counter; queries and legends are derived from this
  // table so the two lists cannot drift out of step.
  local scrapeFailures = [
    { metric: 'prometheus_target_scrapes_exceeded_body_size_limit_total', what: 'exceeded body size limit' },
    { metric: 'prometheus_target_scrapes_exceeded_sample_limit_total', what: 'exceeded sample limit' },
    { metric: 'prometheus_target_scrapes_sample_duplicate_timestamp_total', what: 'duplicate timestamp' },
    { metric: 'prometheus_target_scrapes_sample_out_of_bounds_total', what: 'out of bounds' },
    { metric: 'prometheus_target_scrapes_sample_out_of_order_total', what: 'out of order' },
  ];
  // The single-cluster variant now honours the $job/$instance variables like every
  // other panel; previously it had no selector and ignored the dashboard filters.
  local failureQueries = [
    if showMultiCluster then
      'sum by (cluster, job, instance) (rate(%s{cluster=~"$cluster",job=~"$job",instance=~"$instance"}[1m]))' % failure.metric
    else
      'sum by (job) (rate(%s{job=~"$job",instance=~"$instance"}[1m]))' % failure.metric
    for failure in scrapeFailures
  ];
  local failureLegends = [
    failure.what + (if showMultiCluster then ': {{cluster}} {{job}} {{instance}}' else ': {{job}}')
    for failure in scrapeFailures
  ];

  local appendedQuery =
    if showMultiCluster then
      'rate(prometheus_tsdb_head_samples_appended_total{cluster=~"$cluster", job=~"$job",instance=~"$instance"}[5m])'
    else
      'rate(prometheus_tsdb_head_samples_appended_total{job=~"$job",instance=~"$instance"}[5m])';
  local appendedLegend =
    if showMultiCluster then '{{cluster}} {{job}} {{instance}}' else '{{job}} {{instance}}';

  g.row('Retrieval')
  .addPanel(
    g.panel('Average Scrape Interval Duration') +
    g.queryPanel(intervalQuery, intervalLegend) +
    { yaxes: g.yaxes('ms') }
  )
  .addPanel(
    g.panel('Scrape failures') +
    g.queryPanel(failureQueries, failureLegends) +
    g.stack
  )
  .addPanel(
    g.panel('Appended Samples') +
    g.queryPanel(appendedQuery, appendedLegend) +
    g.stack
  )
)
|
|
.addRow(
  // Storage row: stacked head series and head chunks gauges.
  local seriesQuery =
    if showMultiCluster then
      'prometheus_tsdb_head_series{cluster=~"$cluster",job=~"$job",instance=~"$instance"}'
    else
      'prometheus_tsdb_head_series{job=~"$job",instance=~"$instance"}';
  local seriesLegend =
    if showMultiCluster then '{{cluster}} {{job}} {{instance}} head series' else '{{job}} {{instance}} head series';
  local chunksQuery =
    if showMultiCluster then
      'prometheus_tsdb_head_chunks{cluster=~"$cluster",job=~"$job",instance=~"$instance"}'
    else
      'prometheus_tsdb_head_chunks{job=~"$job",instance=~"$instance"}';
  local chunksLegend =
    if showMultiCluster then '{{cluster}} {{job}} {{instance}} head chunks' else '{{job}} {{instance}} head chunks';
  g.row('Storage')
  .addPanel(g.panel('Head Series') + g.queryPanel(seriesQuery, seriesLegend) + g.stack)
  .addPanel(g.panel('Head Chunks') + g.queryPanel(chunksQuery, chunksLegend) + g.stack)
)
|
|
.addRow(
  // Query row: rate of inner_eval queries and the 0.9-quantile duration per slice.
  local queryRateExpr =
    if showMultiCluster then
      'rate(prometheus_engine_query_duration_seconds_count{cluster=~"$cluster",job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])'
    else
      'rate(prometheus_engine_query_duration_seconds_count{job=~"$job",instance=~"$instance",slice="inner_eval"}[5m])';
  local queryRateLegend =
    if showMultiCluster then '{{cluster}} {{job}} {{instance}}' else '{{job}} {{instance}}';
  local stageDurationExpr =
    if showMultiCluster then
      'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",cluster=~"$cluster", job=~"$job",instance=~"$instance"}) * 1e3'
    else
      'max by (slice) (prometheus_engine_query_duration_seconds{quantile="0.9",job=~"$job",instance=~"$instance"}) * 1e3';
  // Both modes aggregate by slice only, so the legend is the same either way.
  local stageDurationLegend = '{{slice}}';
  g.row('Query')
  .addPanel(g.panel('Query Rate') + g.queryPanel(queryRateExpr, queryRateLegend) + g.stack)
  .addPanel(
    g.panel('Stage Duration') +
    g.queryPanel(stageDurationExpr, stageDurationLegend) +
    { yaxes: g.yaxes('ms') } +
    g.stack
  )
) + {
  // Overlay the tags and refresh settings taken from $._config.grafanaPrometheus.
  refresh: $._config.grafanaPrometheus.refresh,
  tags: $._config.grafanaPrometheus.tags,
},
|
|
// Remote write specific dashboard.
|
|
'prometheus-remote-write.json':
|
|
// Highest ingested timestamp minus highest sent timestamp, per remote-write queue.
// The `!= 0` filter removes queue series whose sent-timestamp value is 0.
local timestampComparison = (
  local lagTarget = prometheus.target(
    |||
      (
      prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}
      -
      ignoring(remote_name, url) group_right(instance) (prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"} != 0)
      )
    |||,
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
  );
  graphPanel.new('Highest Timestamp In vs. Highest Timestamp Sent', datasource='$datasource', span=6)
  .addTarget(lagTarget)
);
|
|
|
|
// 5m rate of the ingested timestamp minus 5m rate of the sent timestamp, clamped at 0
// so negative differences are not plotted.
local timestampComparisonRate = (
  local lagRateTarget = prometheus.target(
    |||
      clamp_min(
      rate(prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~"$cluster", instance=~"$instance"}[5m])
      -
      ignoring (remote_name, url) group_right(instance) rate(prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])
      , 0)
    |||,
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}',
  );
  graphPanel.new('Rate[5m]', datasource='$datasource', span=6)
  .addTarget(lagRateTarget)
);
|
|
|
|
// Samples ingested minus samples sent minus samples dropped, all as 5m rates.
// Each `or` pairs two spellings of the same counter — presumably to cover a metric
// rename across Prometheus versions; confirm against the metrics actually exported.
local samplesRate = (
  local backlogTarget = prometheus.target(
    |||
      rate(
      prometheus_remote_storage_samples_in_total{cluster=~"$cluster", instance=~"$instance"}[5m])
      -
      ignoring(remote_name, url) group_right(instance) (rate(prometheus_remote_storage_succeeded_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]))
      -
      (rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]))
    |||,
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Rate, in vs. succeeded or dropped [5m]', datasource='$datasource', span=12)
  .addTarget(backlogTarget)
);
|
|
|
|
// Plots prometheus_remote_storage_shards for the selected cluster/instance/url.
local currentShards = (
  local shardsTarget = prometheus.target(
    'prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Current Shards', datasource='$datasource', span=12, min_span=6)
  .addTarget(shardsTarget)
);
|
|
|
|
// Plots prometheus_remote_storage_shards_max for the selected cluster/instance/url.
local maxShards = (
  local maxShardsTarget = prometheus.target(
    'prometheus_remote_storage_shards_max{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Max Shards', datasource='$datasource', span=4)
  .addTarget(maxShardsTarget)
);
|
|
|
|
// Plots prometheus_remote_storage_shards_min for the selected cluster/instance/url.
local minShards = (
  local minShardsTarget = prometheus.target(
    'prometheus_remote_storage_shards_min{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Min Shards', datasource='$datasource', span=4)
  .addTarget(minShardsTarget)
);
|
|
|
|
// Plots prometheus_remote_storage_shards_desired for the selected cluster/instance/url.
local desiredShards = (
  local desiredShardsTarget = prometheus.target(
    'prometheus_remote_storage_shards_desired{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Desired Shards', datasource='$datasource', span=4)
  .addTarget(desiredShardsTarget)
);
|
|
|
|
// Plots prometheus_remote_storage_shard_capacity for the selected cluster/instance/url.
local shardsCapacity = (
  local capacityTarget = prometheus.target(
    'prometheus_remote_storage_shard_capacity{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Shard Capacity', datasource='$datasource', span=6)
  .addTarget(capacityTarget)
);
|
|
|
|
|
|
// Pending samples per queue. The `or` falls back to a second metric spelling —
// presumably to cover a metric rename across Prometheus versions; confirm.
local pendingSamples = (
  local pendingTarget = prometheus.target(
    'prometheus_remote_storage_pending_samples{cluster=~"$cluster", instance=~"$instance", url=~"$url"} or prometheus_remote_storage_samples_pending{cluster=~"$cluster", instance=~"$instance", url=~"$url"}',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Pending Samples', datasource='$datasource', span=6)
  .addTarget(pendingTarget)
);
|
|
|
|
// Plots prometheus_tsdb_wal_segment_current; the Y axis uses the 'none' format.
local walSegment = (
  local walTarget = prometheus.target(
    'prometheus_tsdb_wal_segment_current{cluster=~"$cluster", instance=~"$instance"}',
    legendFormat='{{cluster}}:{{instance}}'
  );
  graphPanel.new('TSDB Current Segment', datasource='$datasource', span=6, formatY1='none')
  .addTarget(walTarget)
);
|
|
|
|
// Plots prometheus_wal_watcher_current_segment, one series per {{consumer}} label value.
local queueSegment = (
  local watcherTarget = prometheus.target(
    'prometheus_wal_watcher_current_segment{cluster=~"$cluster", instance=~"$instance"}',
    legendFormat='{{cluster}}:{{instance}} {{consumer}}'
  );
  graphPanel.new('Remote Write Current Segment', datasource='$datasource', span=6, formatY1='none')
  .addTarget(watcherTarget)
);
|
|
|
|
// 5m rate of dropped samples. The `or` falls back to a second metric spelling —
// presumably to cover a metric rename across Prometheus versions; confirm.
local droppedSamples = (
  local droppedTarget = prometheus.target(
    'rate(prometheus_remote_storage_dropped_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_dropped_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Dropped Samples', datasource='$datasource', span=3)
  .addTarget(droppedTarget)
);
|
|
|
|
// 5m rate of failed samples. The `or` falls back to a second metric spelling —
// presumably to cover a metric rename across Prometheus versions; confirm.
local failedSamples = (
  local failedTarget = prometheus.target(
    'rate(prometheus_remote_storage_failed_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_failed_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Failed Samples', datasource='$datasource', span=3)
  .addTarget(failedTarget)
);
|
|
|
|
// 5m rate of retried samples. The `or` falls back to a second metric spelling —
// presumably to cover a metric rename across Prometheus versions; confirm.
local retriedSamples = (
  local retriedTarget = prometheus.target(
    'rate(prometheus_remote_storage_retried_samples_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m]) or rate(prometheus_remote_storage_samples_retried_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Retried Samples', datasource='$datasource', span=3)
  .addTarget(retriedTarget)
);
|
|
|
|
// 5m rate of prometheus_remote_storage_enqueue_retries_total for the selected queues.
local enqueueRetries = (
  local enqueueTarget = prometheus.target(
    'rate(prometheus_remote_storage_enqueue_retries_total{cluster=~"$cluster", instance=~"$instance", url=~"$url"}[5m])',
    legendFormat='{{cluster}}:{{instance}} {{remote_name}}:{{url}}'
  );
  graphPanel.new('Enqueue Retries', datasource='$datasource', span=3)
  .addTarget(enqueueTarget)
);
|
|
|
|
// Template variables are declared first so the dashboard assembly below reads as a
// flat chain: datasource -> cluster -> instance -> url.
local datasourceTemplate = {
  hide: 0,
  label: null,
  name: 'datasource',
  options: [],
  query: 'prometheus',
  refresh: 1,
  regex: '',
  type: 'datasource',
};
// Pre-selected "All" entry shared by the cluster and instance variables.
local allSelected = {
  selected: true,
  text: 'All',
  value: '$__all',
};
// NOTE(review): none of the three query strings below contains a %-verb, so
// `% $._config` leaves them unchanged; kept as-is to preserve the original expression.
local clusterTemplate = template.new(
  'cluster',
  '$datasource',
  'label_values(prometheus_build_info, cluster)' % $._config,
  refresh='time',
  current=allSelected,
  includeAll=true,
);
local instanceTemplate = template.new(
  'instance',
  '$datasource',
  'label_values(prometheus_build_info{cluster=~"$cluster"}, instance)' % $._config,
  refresh='time',
  current=allSelected,
  includeAll=true,
);
local urlTemplate = template.new(
  'url',
  '$datasource',
  'label_values(prometheus_remote_storage_shards{cluster=~"$cluster", instance=~"$instance"}, url)' % $._config,
  refresh='time',
  includeAll=true,
);

dashboard.new(
  title='%(prefix)sRemote Write' % $._config.grafanaPrometheus,
  editable=true
)
.addTemplate(datasourceTemplate)
.addTemplate(clusterTemplate)
.addTemplate(instanceTemplate)
.addTemplate(urlTemplate)
.addRow(
  row.new('Timestamps')
  .addPanel(timestampComparison)
  .addPanel(timestampComparisonRate)
)
.addRow(
  row.new('Samples')
  .addPanel(samplesRate)
)
.addRow(
  row.new('Shards')
  .addPanel(currentShards)
  .addPanel(maxShards)
  .addPanel(minShards)
  .addPanel(desiredShards)
)
.addRow(
  row.new('Shard Details')
  .addPanel(shardsCapacity)
  .addPanel(pendingSamples)
)
.addRow(
  row.new('Segments')
  .addPanel(walSegment)
  .addPanel(queueSegment)
)
.addRow(
  row.new('Misc. Rates')
  .addPanel(droppedSamples)
  .addPanel(failedSamples)
  .addPanel(retriedSamples)
  .addPanel(enqueueRetries)
) + {
  // Overlay the tags and refresh settings taken from $._config.grafanaPrometheus.
  refresh: $._config.grafanaPrometheus.refresh,
  tags: $._config.grafanaPrometheus.tags,
},
|
|
},
|
|
}
|