ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet
Aashish Sharma 6f3f58cb8e mgr/dashboard: Consider null values as zero in grafana panels
After upgrading from RHCS4 to RHCS5, some of the Grafana charts broke.
This is because RHCS5 does not generate a metric when its value is
zero; as a result, the null value from that metric breaks the Grafana
charts or graphs. This PR fixes the above-mentioned issue.

Fixes: https://tracker.ceph.com/issues/63088

Signed-off-by: Aashish Sharma <aasharma@redhat.com>
2023-10-04 12:31:42 +05:30

619 lines
20 KiB
Plaintext

local g = import 'grafonnet/grafana.libsonnet';
(import 'utils.libsonnet') {
'osds-overview.json':
// 'OSD Overview' dashboard definition. The schema arguments are passed
// positionally to the shared helper pulled in from utils.libsonnet.
$.dashboardSchema('OSD Overview', '', 'lo02I1Aiz', 'now-1h', '30s', 16, $._config.dashboardTags, '')
.addAnnotation(
  $.addAnnotationSchema(1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
)
// Grafana itself plus the panel types this dashboard declares as required.
.addRequired(type='grafana', id='grafana', name='Grafana', version='5.0.0')
.addRequired(type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3')
.addRequired(type='panel', id='graph', name='Graph', version='5.0.0')
.addRequired(type='panel', id='table', name='Table', version='5.0.0')
// Templates: the Prometheus data source, then the shared cluster and job
// templates provided by the helpers.
.addTemplate(g.template.datasource('datasource', 'prometheus', 'default', label='Data Source'))
.addTemplate($.addClusterTemplate())
.addTemplate($.addJobTemplate())
.addPanels([
// Read latency graph in milliseconds (latency sum rate divided by op count
// rate, times 1000). The panel's own target averages the per-daemon values;
// the two extra targets plot the maximum and the 0.95 quantile.
local matchers = $.matchers();
$.simpleGraphPanel(
  { '@95%ile': '#e0752d' },
  'OSD Read Latencies',
  '',
  'ms',
  null,
  '0',
  |||
    avg (
    rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
    on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
    )
  ||| % matchers,
  'AVG read',
  // Presumably x, y, w, h of the panel (the neighbouring table sits at x=8).
  0, 0, 8, 8
)
.addTargets([
  $.addTargetSchema(
    |||
      max(
      rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
      on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
      )
    ||| % matchers,
    'MAX read'
  ),
  $.addTargetSchema(
    |||
      quantile(0.95,
      (
      rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
      on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
      * 1000
      )
      )
    ||| % matchers,
    '@95%ile'
  ),
]),
// Table of the ten OSDs with the highest read latency (ms), one row per
// ceph_daemon; every column other than the two named ones is hidden.
local columnStyles = [
  $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
  $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
  $.overviewStyle('', '/.*/', 'hidden', 'short'),
];
local topReadLatency = |||
  topk(10,
  (sort(
  (
  rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
  on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) *
  1000
  )
  ))
  )
||| % $.matchers();
$.addTableSchema(
  '$datasource',
  "This table shows the osd's that are delivering the 10 highest read latencies within the cluster",
  { col: 2, desc: true },
  columnStyles,
  'Highest READ Latencies',
  'table'
)
.addTarget($.addTargetSchema(topReadLatency, '', 'table', 1, true))
+ { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
// Write latency graph in milliseconds, mirroring the read latency panel:
// average as the panel's own target, maximum and 0.95 quantile as extras.
local matchers = $.matchers();
$.simpleGraphPanel(
  { '@95%ile write': '#e0752d' },
  'OSD Write Latencies',
  '',
  'ms',
  null,
  '0',
  |||
    avg(
    rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
    on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
    * 1000
    )
  ||| % matchers,
  'AVG write',
  // Presumably x, y, w, h of the panel.
  12, 0, 8, 8
)
.addTargets([
  $.addTargetSchema(
    |||
      max(
      rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
      on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
      1000
      )
    ||| % matchers,
    'MAX write'
  ),
  $.addTargetSchema(
    |||
      quantile(0.95, (
      rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
      on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
      1000
      ))
    ||| % matchers,
    '@95%ile write'
  ),
]),
// Table of the ten OSDs with the highest write latency (ms), one row per
// ceph_daemon; every column other than the two named ones is hidden.
local columnStyles = [
  $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
  $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
  $.overviewStyle('', '/.*/', 'hidden', 'short'),
];
local topWriteLatency = |||
  topk(10,
  (sort(
  (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
  on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
  1000)
  ))
  )
||| % $.matchers();
$.addTableSchema(
  '$datasource',
  "This table shows the osd's that are delivering the 10 highest write latencies within the cluster",
  { col: 2, desc: true },
  columnStyles,
  'Highest WRITE Latencies',
  'table'
)
.addTarget($.addTargetSchema(topWriteLatency, '', 'table', 1, true))
+ { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
// Pie chart with one slice per device_class, counted from ceph_osd_metadata.
$.simplePieChart({}, '', 'OSD Types Summary')
.addTarget(
  $.addTargetSchema(
    'count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(),
    '{{device_class}}'
  )
)
+ { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
// Pie chart with two slices: 'bluestore' counts the
// ceph_bluefs_wal_total_bytes series, 'filestore' is the ceph_osd_metadata
// count, produced only when that bluefs metric is absent.
// NOTE(review): the 'Non-Encrypted' colour key matches neither legend used
// here - it looks like a leftover; confirm before removing.
local matchers = $.matchers();
$.simplePieChart({ 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types')
.addTarget(
  $.addTargetSchema(
    'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % matchers,
    'bluestore',
    'time_series',
    2
  )
)
.addTarget(
  $.addTargetSchema(
    'absent(ceph_bluefs_wal_total_bytes{%(matchers)s}) * count(ceph_osd_metadata{%(matchers)s})' % matchers,
    'filestore',
    'time_series',
    2
  )
)
+ { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
// Pie chart grouping OSDs by ceph_osd_stat_bytes into size buckets.
// Each entry is [comparison applied to ceph_osd_stat_bytes, legend]; the
// bounds are multiples of 1099511627776 (2^40 bytes). Targets are added in
// list order, the same order as the previous hand-written chain.
// The open-ended last bucket was previously labelled '<12TB+', which
// contradicted its own '>=' query; it is now '12TB+'.
local matchers = $.matchers();
local sizeBuckets = [
  ['< 1099511627776', '<1TB'],
  ['>= 1099511627776 < 2199023255552', '<2TB'],
  ['>= 2199023255552 < 3298534883328', '<3TB'],
  ['>= 3298534883328 < 4398046511104', '<4TB'],
  ['>= 4398046511104 < 6597069766656', '<6TB'],
  ['>= 6597069766656 < 8796093022208', '<8TB'],
  ['>= 8796093022208 < 10995116277760', '<10TB'],
  ['>= 10995116277760 < 13194139533312', '<12TB'],
  ['>= 13194139533312', '12TB+'],
];
std.foldl(
  function(panel, bucket)
    panel.addTarget($.addTargetSchema(
      // Yields e.g. count(ceph_osd_stat_bytes{<matchers>} < 1099511627776)
      'count(ceph_osd_stat_bytes{%(matchers)s} %(range)s)' % (matchers { range: bucket[0] }),
      bucket[1],
      'time_series',
      2
    )),
  sizeBuckets,
  $.simplePieChart({}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary')
) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
// Bar graph in histogram x-axis mode over ceph_osd_numpg, with the y axis
// labelled as the number of OSDs.
g.graphPanel.new(
  bars=true,
  datasource='$datasource',
  title='Distribution of PGs per OSD',
  x_axis_buckets=20,
  x_axis_mode='histogram',
  x_axis_values=['total'],
  formatY1='short',
  formatY2='short',
  labelY1='# of OSDs',
  min='0',
  nullPointMode='null'
)
.addTarget(
  $.addTargetSchema('ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true)
)
+ { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
// Gauge of the bluestore onode hit ratio: hits / (hits + misses), each summed
// over all matching series.
local onodeHitRatio = |||
  sum(ceph_bluestore_onode_hits{%(matchers)s}) / (
  sum(ceph_bluestore_onode_hits{%(matchers)s}) +
  sum(ceph_bluestore_onode_misses{%(matchers)s})
  )
||| % $.matchers();
$.gaugeSingleStatPanel(
  'percentunit',
  'OSD onode Hits Ratio',
  'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster',
  'current',
  true,
  1,
  true,
  false,
  '.75',
  onodeHitRatio,
  'time_series',
  // Presumably x, y, w, h of the panel.
  20, 8, 4, 8
),
// Row header for the read/write profile section.
$.addRowSchema(false, true, 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
// Graph of the rounded, summed per-second rates of ceph_pool_rd ('Reads') and
// ceph_pool_wr ('Writes'). The description previously read 'overtime'; it is
// now 'over time'.
local matchers = $.matchers();
$.simpleGraphPanel(
  {},
  'Read/Write Profile',
  'Show the read/write workload profile over time',
  'short',
  null,
  null,
  'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % matchers,
  'Reads',
  // Presumably x, y, w, h of the panel.
  0, 17, 24, 8
)
.addTargets([
  $.addTargetSchema(
    'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % matchers,
    'Writes'
  ),
]),
// Table of the ten OSD daemons with the highest SLOW_OPS value taken from
// ceph_daemon_health_metrics.
// The query was already formatted with $.matchers() but had no
// %(matchers)s placeholder, so the cluster/job selection was ignored.
// The placeholder is now present, in the same form used by the other queries
// that combine the matchers with extra label filters.
// NOTE(review): this gridPos (y=20) overlaps the 'Read/Write Profile' graph
// (y=17, h=8); left unchanged here - confirm the intended layout.
local columnStyles = [
  $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
  $.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
  $.overviewStyle('', '/.*/', 'hidden', 'short'),
];
$.addTableSchema(
  '$datasource',
  'This table shows the 10 OSDs with the highest number of slow ops',
  { col: 2, desc: true },
  columnStyles,
  'Top Slow Ops',
  'table'
)
.addTarget(
  $.addTargetSchema(
    |||
      topk(10,
      (ceph_daemon_health_metrics{%(matchers)s, type="SLOW_OPS", ceph_daemon=~"osd.*"})
      )
    ||| % $.matchers(),
    '',
    'table',
    1,
    true
  )
) + { gridPos: { x: 0, y: 20, w: 4, h: 8 } },
]),
'osd-device-details.json':
// Builds a graph panel with exactly two targets for this dashboard.
//   title, description           - forwarded to $.graphPanelSchema
//   formatY1, labelY1            - forwarded to $.graphPanelSchema
//   expr1, expr2                 - queries of the first and second target
//   legendFormat1, legendFormat2 - legend of the first and second target
//   x, y, w, h                   - become the panel's gridPos
// The panel is always created with 'null as zero' and the '$datasource'
// data source.
local OsdDeviceDetailsPanel(title, description, formatY1, labelY1, expr1, expr2, legendFormat1, legendFormat2, x, y, w, h) =
  local placement = { gridPos: { x: x, y: y, w: w, h: h } };
  local queries = [
    $.addTargetSchema(expr1, legendFormat1),
    $.addTargetSchema(expr2, legendFormat2),
  ];
  $.graphPanelSchema(
    {},
    title,
    description,
    'null as zero',
    false,
    formatY1,
    'short',
    labelY1,
    null,
    null,
    1,
    '$datasource'
  ).addTargets(queries) + placement;
// 'OSD device details' dashboard definition. The schema arguments are passed
// positionally to the shared helper pulled in from utils.libsonnet.
$.dashboardSchema('OSD device details', '', 'CrAHE0iZz', 'now-3h', '30s', 16, $._config.dashboardTags, '')
.addAnnotation(
  $.addAnnotationSchema(1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard')
)
// Grafana itself plus the graph panel type are declared as required.
.addRequired(type='grafana', id='grafana', name='Grafana', version='5.3.2')
.addRequired(type='panel', id='graph', name='Graph', version='5.0.0')
// Templates: data source, the shared cluster and job templates, and an 'osd'
// variable filled from the ceph_daemon label values of ceph_osd_metadata.
.addTemplate(g.template.datasource('datasource', 'prometheus', 'default', label='Data Source'))
.addTemplate($.addClusterTemplate())
.addTemplate($.addJobTemplate())
.addTemplate(
  $.addTemplateSchema(
    'osd',
    '$datasource',
    'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
    1,
    false,
    1,
    'OSD',
    '(.*)'
  )
)
.addPanels([
// Row header for the per-OSD performance panels.
$.addRowSchema(false, true, 'OSD Performance') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
// Read and write latency (seconds) of the selected OSD: latency sum rate
// divided by op count rate, matched on ceph_daemon. The 'read' series is
// drawn on the negative Y side.
local matchers = $.matchers();
OsdDeviceDetailsPanel(
  '$osd Latency',
  '',
  's',
  'Read (-) / Write (+)',
  |||
    rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
    on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
  ||| % matchers,
  |||
    rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
    on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
  ||| % matchers,
  'read',
  'write',
  0, 1, 6, 9
)
.addSeriesOverride({ alias: 'read', transform: 'negative-Y' }),
// Per-second rates of ceph_osd_op_r and ceph_osd_op_w for the selected OSD;
// the 'Reads' series is drawn on the negative Y side.
local matchers = $.matchers();
OsdDeviceDetailsPanel(
  '$osd R/W IOPS',
  '',
  'short',
  'Read (-) / Write (+)',
  'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % matchers,
  'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % matchers,
  'Reads',
  'Writes',
  6, 1, 6, 9
)
.addSeriesOverride({ alias: 'Reads', transform: 'negative-Y' }),
// Per-second rates of ceph_osd_op_r_out_bytes and ceph_osd_op_w_in_bytes for
// the selected OSD; the 'Read Bytes' series is drawn on the negative Y side.
local matchers = $.matchers();
OsdDeviceDetailsPanel(
  '$osd R/W Bytes',
  '',
  'bytes',
  'Read (-) / Write (+)',
  'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % matchers,
  'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % matchers,
  'Read Bytes',
  'Write Bytes',
  12, 1, 6, 9
)
.addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
// Row header for the physical-device panels.
$.addRowSchema(false, true, 'Physical Device Performance') + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
// Per-operation disk latency (time spent / operations completed) from the
// node_disk_* metrics, kept only for (instance, device) pairs that also
// appear in ceph_disk_occupation_human for the selected OSD. label_replace
// rewrites "instance" with the "([^:.]*).*" capture and strips the "/dev/"
// prefix from "device" so that both sides carry comparable labels.
// Series whose legend ends in 'Reads' are drawn on the negative Y side.
local matchers = $.matchers();
OsdDeviceDetailsPanel(
  'Physical Device Latency for $osd',
  '',
  's',
  'Read (-) / Write (+)',
  |||
    (
    label_replace(
    rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
    rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
    "instance", "$1", "instance", "([^:.]*).*"
    ) and on (instance, device) label_replace(
    label_replace(
    ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
    "device", "$1", "device", "/dev/(.*)"
    ), "instance", "$1", "instance", "([^:.]*).*"
    )
    )
  ||| % matchers,
  |||
    (
    label_replace(
    rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
    rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
    "instance", "$1", "instance", "([^:.]*).*") and on (instance, device)
    label_replace(
    label_replace(
    ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
    ), "instance", "$1", "instance", "([^:.]*).*"
    )
    )
  ||| % matchers,
  '{{instance}}/{{device}} Reads',
  '{{instance}}/{{device}} Writes',
  0, 11, 6, 9
)
.addSeriesOverride({ alias: '/.*Reads/', transform: 'negative-Y' }),
// Completed disk operations per second for the device(s) joined to the
// selected OSD through ceph_disk_occupation_human. In this panel the first
// query is the write rate and the second the read rate; the legends follow
// that order. Series whose legend ends in 'Reads' are drawn on the negative
// Y side.
local matchers = $.matchers();
OsdDeviceDetailsPanel(
  'Physical Device R/W IOPS for $osd',
  '',
  'short',
  'Read (-) / Write (+)',
  |||
    label_replace(
    rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
    "instance", "$1", "instance", "([^:.]*).*"
    ) and on (instance, device) label_replace(
    label_replace(
    ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
    "device", "$1", "device", "/dev/(.*)"
    ), "instance", "$1", "instance", "([^:.]*).*"
    )
  ||| % matchers,
  |||
    label_replace(
    rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
    "instance", "$1", "instance", "([^:.]*).*"
    ) and on (instance, device) label_replace(
    label_replace(
    ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
    "device", "$1", "device", "/dev/(.*)"
    ), "instance", "$1", "instance", "([^:.]*).*"
    )
  ||| % matchers,
  '{{device}} on {{instance}} Writes',
  '{{device}} on {{instance}} Reads',
  6, 11, 6, 9
)
.addSeriesOverride({ alias: '/.*Reads/', transform: 'negative-Y' }),
// Bytes read and written per second for the device(s) joined to the selected
// OSD through ceph_disk_occupation_human. Series whose legend ends in 'Reads'
// are drawn on the negative Y side.
local matchers = $.matchers();
OsdDeviceDetailsPanel(
  'Physical Device R/W Bytes for $osd',
  '',
  'Bps',
  'Read (-) / Write (+)',
  |||
    label_replace(
    rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
    ) and on (instance, device) label_replace(
    label_replace(
    ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
    "device", "$1", "device", "/dev/(.*)"
    ), "instance", "$1", "instance", "([^:.]*).*"
    )
  ||| % matchers,
  |||
    label_replace(
    rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
    ) and on (instance, device) label_replace(
    label_replace(
    ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
    "device", "$1", "device", "/dev/(.*)"
    ), "instance", "$1", "instance", "([^:.]*).*"
    )
  ||| % matchers,
  '{{instance}} {{device}} Reads',
  '{{instance}} {{device}} Writes',
  12, 11, 6, 9
)
.addSeriesOverride({ alias: '/.*Reads/', transform: 'negative-Y' }),
// Rate of node_disk_io_time_seconds_total for the device(s) joined to the
// selected OSD through ceph_disk_occupation_human, shown as 'percentunit'.
// Built directly with $.graphPanelSchema because it has a single target, and
// it passes 'null' where OsdDeviceDetailsPanel passes 'null as zero'.
local deviceUtilization = |||
  label_replace(
  rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]),
  "instance", "$1", "instance", "([^:.]*).*"
  ) and on (instance, device) label_replace(
  label_replace(
  ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
  ), "instance", "$1", "instance", "([^:.]*).*"
  )
||| % $.matchers();
$.graphPanelSchema(
  {},
  'Physical Device Util% for $osd',
  '',
  'null',
  false,
  'percentunit',
  'short',
  null,
  null,
  null,
  1,
  '$datasource'
)
.addTarget($.addTargetSchema(deviceUtilization, '{{device}} on {{instance}}'))
+ { gridPos: { x: 18, y: 11, w: 6, h: 9 } },
]),
}