ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet
Aashish Sharma 2573426f54 mgr/dashboard: upgrade from old 'graph' type panels to the new 'timeseries' panel

The 'graph' panel type is deprecated and disappears after Grafana v9.1 (the current version is 10.0), which prevents new panels of the old type from being created. Existing panels should be migrated to the 'timeseries' panel type to avoid potential problems with future Grafana versions.

Fixes: https://tracker.ceph.com/issues/61720

Signed-off-by: Aashish Sharma <aasharma@redhat.com>
2023-12-22 11:19:40 +05:30
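
The migration keeps the existing graphPanel/graphPanelSchema builders and converts each
rendered panel by merging { type: 'timeseries' } together with a matching fieldConfig, as
seen on the panels below. A minimal sketch of the pattern (the asTimeseries helper name is
illustrative only; this file merges the objects inline on each panel):

local asTimeseries(panel, unit) =
  panel
  + { type: 'timeseries' }  // replace the deprecated 'graph' panel type
  + { fieldConfig: { defaults: { unit: unit, custom: { fillOpacity: 8, showPoints: 'never' } } } };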

local g = import 'grafonnet/grafana.libsonnet';
(import 'utils.libsonnet') {
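// 'osds-overview.json': cluster-wide OSD overview (read/write latencies, OSD type,
// objectstore and size summaries, PG distribution, onode hit ratio, R/W profile, slow ops).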
'osds-overview.json':
$.dashboardSchema(
'OSD Overview',
'',
'lo02I1Aiz',
'now-1h',
'30s',
16,
$._config.dashboardTags,
''
)
.addAnnotation(
$.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addRequired(
type='panel', id='table', name='Table', version='5.0.0'
)
.addTemplate(
g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
)
.addTemplate(
$.addClusterTemplate()
)
.addTemplate(
$.addJobTemplate()
)
.addPanels([
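// Read latency: per-OSD mean op latency, rate(..._latency_sum) / rate(..._latency_count)
// scaled to milliseconds, aggregated as avg / max / 95th percentile across all OSDs.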
$.simpleGraphPanel(
{ '@95%ile': '#e0752d' },
'OSD Read Latencies',
'',
'ms',
null,
'0',
|||
avg (
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
)
||| % $.matchers(),
'AVG read',
0,
0,
8,
8
)
.addTargets(
[
$.addTargetSchema(
|||
max(
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
)
||| % $.matchers(),
'MAX read'
),
$.addTargetSchema(
|||
quantile(0.95,
(
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
* 1000
)
)
||| % $.matchers(),
'@95%ile'
),
],
),
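// Table of the 10 OSDs currently reporting the highest read latencies (topk over the same ratio).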
$.addTableSchema(
'$datasource',
"This table shows the osd's that are delivering the 10 highest read latencies within the cluster",
{ col: 2, desc: true },
[
$.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
$.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
$.overviewStyle('', '/.*/', 'hidden', 'short'),
],
'Highest READ Latencies',
'table'
)
.addTarget(
$.addTargetSchema(
|||
topk(10,
(sort(
(
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) *
1000
)
))
)
||| % $.matchers(),
'',
'table',
1,
true
)
) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
$.simpleGraphPanel(
{
'@95%ile write': '#e0752d',
},
'OSD Write Latencies',
'',
'ms',
null,
'0',
|||
avg(
rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
* 1000
)
||| % $.matchers(),
'AVG write',
12,
0,
8,
8
)
.addTargets(
[
$.addTargetSchema(
|||
max(
rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
1000
)
||| % $.matchers(), 'MAX write'
),
$.addTargetSchema(
|||
quantile(0.95, (
rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
1000
))
||| % $.matchers(), '@95%ile write'
),
],
),
$.addTableSchema(
'$datasource',
"This table shows the osd's that are delivering the 10 highest write latencies within the cluster",
{ col: 2, desc: true },
[
$.overviewStyle(
'OSD ID', 'ceph_daemon', 'string', 'short'
),
$.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
$.overviewStyle('', '/.*/', 'hidden', 'short'),
],
'Highest WRITE Latencies',
'table'
)
.addTarget(
$.addTargetSchema(
|||
topk(10,
(sort(
(rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
1000)
))
)
||| % $.matchers(),
'',
'table',
1,
true
)
) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
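// Pie chart: OSD counts grouped by CRUSH device class (hdd, ssd, nvme, ...).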
$.simplePieChart(
{}, '', 'OSD Types Summary'
)
.addTarget(
$.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(), '{{device_class}}')
) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
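// Objectstore detection: BlueStore OSDs export ceph_bluefs_wal_total_bytes, so counting
// that series gives the BlueStore total; absent() flags an all-Filestore cluster instead.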
$.simplePieChart(
{ 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types'
)
.addTarget(
$.addTargetSchema(
'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % $.matchers(), 'bluestore', 'time_series', 2
)
)
.addTarget(
$.addTargetSchema(
'absent(ceph_bluefs_wal_total_bytes{%(matchers)s}) * count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), 'filestore', 'time_series', 2
)
) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
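// OSD size summary: counts OSDs into fixed capacity buckets (1099511627776 bytes = 1 TiB).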
$.simplePieChart(
{}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
)
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % $.matchers(), '<1TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % $.matchers(), '<2TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % $.matchers(), '<3TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % $.matchers(), '<4TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % $.matchers(), '<6TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % $.matchers(), '<8TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % $.matchers(), '<10TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % $.matchers(), '<12TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % $.matchers(), '12TB+', 'time_series', 2
)) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
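// PG distribution: keeps the legacy graphPanel histogram options, then overrides the panel
// to the new 'timeseries' type via the object merges at the end of the definition.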
g.graphPanel.new(bars=true,
datasource='$datasource',
title='Distribution of PGs per OSD',
x_axis_buckets=20,
x_axis_mode='histogram',
x_axis_values=['total'],
formatY1='short',
formatY2='short',
labelY1='# of OSDs',
min='0',
nullPointMode='null')
.addTarget($.addTargetSchema(
'ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true
)) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: 'short', custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
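// BlueStore onode cache efficiency: hits / (hits + misses), with a 0.75 threshold; a low
// ratio suggests that more RAM per OSD may improve performance.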
$.gaugeSingleStatPanel(
'percentunit',
'OSD onode Hits Ratio',
'This gauge panel shows the onode hit ratio, to help determine whether increasing RAM per OSD could improve cluster performance',
'current',
true,
1,
true,
false,
'.75',
|||
sum(ceph_bluestore_onode_hits{%(matchers)s}) / (
sum(ceph_bluestore_onode_hits{%(matchers)s}) +
sum(ceph_bluestore_onode_misses{%(matchers)s})
)
||| % $.matchers(),
'time_series',
20,
8,
4,
8
),
$.addRowSchema(false,
true,
'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
$.simpleGraphPanel(
{},
'Read/Write Profile',
'Shows the read/write workload profile over time',
'short',
null,
null,
'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % $.matchers(),
'Reads',
0,
17,
24,
8
)
.addTargets([$.addTargetSchema(
'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
)]),
$.addTableSchema(
'$datasource',
'This table shows the 10 OSDs with the highest number of slow ops',
{ col: 2, desc: true },
[
$.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
$.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
$.overviewStyle('', '/.*/', 'hidden', 'short'),
],
'Top Slow Ops',
'table'
)
.addTarget(
$.addTargetSchema(
|||
topk(10,
(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"})
)
||| % $.matchers(),
'',
'table',
1,
true
)
) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
]),
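// 'osd-device-details.json': per-OSD drill-down dashboard, driven by the $osd template variable.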
'osd-device-details.json':
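// Helper: builds a legacy graph panel schema with two targets, then patches it to the
// 'timeseries' type and places it on the grid.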
local OsdDeviceDetailsPanel(title,
description,
formatY1,
labelY1,
expr1,
expr2,
legendFormat1,
legendFormat2,
x,
y,
w,
h) =
$.graphPanelSchema({},
title,
description,
'null as zero',
false,
formatY1,
'short',
labelY1,
null,
null,
1,
'$datasource')
.addTargets(
[
$.addTargetSchema(expr1,
legendFormat1),
$.addTargetSchema(expr2, legendFormat2),
]
) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: formatY1, custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: x, y: y, w: w, h: h } };
$.dashboardSchema(
'OSD device details',
'',
'CrAHE0iZz',
'now-3h',
'30s',
16,
$._config.dashboardTags,
''
)
.addAnnotation(
$.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.3.2'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
g.template.datasource('datasource',
'prometheus',
'default',
label='Data Source')
)
.addTemplate(
$.addClusterTemplate()
)
.addTemplate(
$.addJobTemplate()
)
.addTemplate(
$.addTemplateSchema('osd',
'$datasource',
'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
1,
false,
1,
'OSD',
'(.*)')
)
.addPanels([
$.addRowSchema(
false, true, 'OSD Performance'
) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
OsdDeviceDetailsPanel(
'$osd Latency',
'',
's',
'Read (-) / Write (+)',
|||
rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
||| % $.matchers(),
|||
rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
||| % $.matchers(),
'read',
'write',
0,
1,
6,
9
)
.addSeriesOverride(
{
alias: 'read',
transform: 'negative-Y',
}
),
OsdDeviceDetailsPanel(
'$osd R/W IOPS',
'',
'short',
'Read (-) / Write (+)',
'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
'Reads',
'Writes',
6,
1,
6,
9
)
.addSeriesOverride(
{ alias: 'Reads', transform: 'negative-Y' }
),
OsdDeviceDetailsPanel(
'$osd R/W Bytes',
'',
'bytes',
'Read (-) / Write (+)',
'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
'Read Bytes',
'Write Bytes',
12,
1,
6,
9
)
.addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
$.addRowSchema(
false, true, 'Physical Device Performance'
) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
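// The physical device panels join node_exporter disk metrics to the selected OSD:
// label_replace() normalizes 'instance' (dropping port/domain) on both sides and strips
// the /dev/ prefix from ceph_disk_occupation_human's 'device' label, so that
// 'and on (instance, device)' keeps only the disks backing $osd.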
OsdDeviceDetailsPanel(
'Physical Device Latency for $osd',
'',
's',
'Read (-) / Write (+)',
|||
(
label_replace(
rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
)
||| % $.matchers(),
|||
(
label_replace(
rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*") and on (instance, device)
label_replace(
label_replace(
ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
)
||| % $.matchers(),
'{{instance}}/{{device}} Reads',
'{{instance}}/{{device}} Writes',
0,
11,
6,
9
)
.addSeriesOverride(
{ alias: '/.*Reads/', transform: 'negative-Y' }
),
OsdDeviceDetailsPanel(
'Physical Device R/W IOPS for $osd',
'',
'short',
'Read (-) / Write (+)',
|||
label_replace(
rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
|||
label_replace(
rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
'{{device}} on {{instance}} Writes',
'{{device}} on {{instance}} Reads',
6,
11,
6,
9
)
.addSeriesOverride(
{ alias: '/.*Reads/', transform: 'negative-Y' }
),
OsdDeviceDetailsPanel(
'Physical Device R/W Bytes for $osd',
'',
'Bps',
'Read (-) / Write (+)',
|||
label_replace(
rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
|||
label_replace(
rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
'{{instance}} {{device}} Reads',
'{{instance}} {{device}} Writes',
12,
11,
6,
9
)
.addSeriesOverride(
{ alias: '/.*Reads/', transform: 'negative-Y' }
),
$.graphPanelSchema(
{},
'Physical Device Util% for $osd',
'',
'null',
false,
'percentunit',
'short',
null,
null,
null,
1,
'$datasource'
)
.addTarget($.addTargetSchema(
|||
label_replace(
rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
'{{device}} on {{instance}}'
)) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: 'percentunit', custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },
]),
}