ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet
Christian Rohmann 090b8e17f1 Cleanup of variables, queries and tests to enable showMultiCluster=True
Rendering the dashboards with showMultiCluster=True allows them to work
with multiple clusters storing their metrics in a single Prometheus
instance. This works via the `cluster` label, and that functionality
already existed; this change just fixes some inconsistencies in applying
the label filters.

Additionally this contains updates to the tests to have them succeed
with both configurations and to avoid the introduction of multi-cluster
regressions in the future.

There are also some consistency cleanups here and there:
 * `datasource` was not used consistently
 * `cluster` label_values are determined from `ceph_health_status`
 * the `job` template and filters on this label were removed, to base multi-cluster
    support solely on the `cluster` label
 * the `ceph_hosts` filter now uses label_values from any Ceph metadata metric,
    so as to not show all instance values, but only those of hosts running some
    Ceph component / daemon
 * showMultiCluster=True is enabled, since the `cluster` label is now always
    present via https://github.com/ceph/ceph/pull/54964

Improves: https://tracker.ceph.com/issues/64321
Signed-off-by: Christian Rohmann <christian.rohmann@inovex.de>
2024-04-22 08:29:37 +02:00
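
For reference, a minimal sketch of the `matchers` helper this file leans on
(an assumption about its shape; the real implementation lives in
utils.libsonnet and may differ):

    // Hypothetical shape: returns the label filters spliced into every
    // PromQL selector below via the '%(matchers)s' placeholder. With
    // showMultiCluster=True the queries get pinned to the $cluster
    // template variable.
    matchers():: {
      matchers: if $._config.showMultiCluster then 'cluster=~"$cluster"' else '',
    },

Selectors interpolate it as e.g. ceph_osd_metadata{%(matchers)s}; PromQL
accepts the trailing comma left behind when the filter is empty, so a
selector like {ceph_daemon=~"$osd", } remains valid.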


local g = import 'grafonnet/grafana.libsonnet';
(import 'utils.libsonnet') {
'osds-overview.json':
$.dashboardSchema(
'OSD Overview',
'',
'lo02I1Aiz',
'now-1h',
'30s',
16,
$._config.dashboardTags,
''
)
.addAnnotation(
$.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addRequired(
type='panel', id='table', name='Table', version='5.0.0'
)
.addTemplate(
g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
)
.addTemplate(
$.addClusterTemplate()
)
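// Per the commit message, the $cluster template is populated from
// label_values on ceph_health_status; $.matchers() then pins every
// query below to the selected cluster(s).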
.addPanels([
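// The latency graphs below derive milliseconds from Prometheus sum/count
// counter pairs: rate(*_latency_sum) / rate(*_latency_count) * 1000.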
$.simpleGraphPanel(
{ '@95%ile': '#e0752d' },
'OSD Read Latencies',
'',
'ms',
null,
'0',
|||
avg (
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
)
||| % $.matchers(),
'AVG read',
0,
0,
8,
8
)
.addTargets(
[
$.addTargetSchema(
|||
max(
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
)
||| % $.matchers(),
'MAX read'
),
$.addTargetSchema(
|||
quantile(0.95,
(
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
* 1000
)
)
||| % $.matchers(),
'@95%ile'
),
],
),
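// Companion table: an instant ('table') query using topk(10, sort(...))
// to list the ten OSDs with the highest read latency.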
$.addTableExtended(
datasource='${datasource}',
title='Highest READ Latencies',
description='This table shows the OSDs that are delivering the 10 highest read latencies within the cluster',
gridPosition={ h: 8, w: 4, x: 8, y: 0 },
options={
footer: {
fields: '',
reducer: ['sum'],
countRows: false,
enablePagination: false,
show: false,
},
frameIndex: 1,
showHeader: true,
},
custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false },
thresholds={
mode: 'absolute',
steps: [
{ color: 'green', value: null },
{ color: 'red', value: 80 },
],
},
overrides=[
{
matcher: { id: 'byName', options: 'ceph_daemon' },
properties: [
{ id: 'displayName', value: 'OSD ID' },
{ id: 'unit', value: 'short' },
{ id: 'decimals', value: 2 },
{ id: 'custom.align', value: null },
],
},
{
matcher: { id: 'byName', options: 'Value' },
properties: [
{ id: 'displayName', value: 'Latency (ms)' },
{ id: 'unit', value: 'none' },
{ id: 'decimals', value: 2 },
{ id: 'custom.align', value: null },
],
},
],
pluginVersion='10.4.0'
)
.addTransformations([
{
id: 'merge',
options: { reducers: [] },
},
{
id: 'organize',
options: {
excludeByName: {
Time: true,
cluster: true,
},
indexByName: {},
renameByName: {},
includeByName: {},
},
},
]).addTarget(
$.addTargetSchema(
|||
topk(10,
(sort(
(
rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) *
1000
)
))
)
||| % $.matchers(),
'',
'table',
1,
true
)
),
$.simpleGraphPanel(
{
'@95%ile write': '#e0752d',
},
'OSD Write Latencies',
'',
'ms',
null,
'0',
|||
avg(
rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
* 1000
)
||| % $.matchers(),
'AVG write',
12,
0,
8,
8
)
.addTargets(
[
$.addTargetSchema(
|||
max(
rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
1000
)
||| % $.matchers(), 'MAX write'
),
$.addTargetSchema(
|||
quantile(0.95, (
rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
1000
))
||| % $.matchers(), '@95%ile write'
),
],
),
$.addTableExtended(
datasource='${datasource}',
title='Highest WRITE Latencies',
description='This table shows the OSDs that are delivering the 10 highest write latencies within the cluster',
gridPosition={ h: 8, w: 4, x: 20, y: 0 },
options={
footer: {
fields: '',
reducer: ['sum'],
countRows: false,
enablePagination: false,
show: false,
},
frameIndex: 1,
showHeader: true,
},
custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false },
thresholds={
mode: 'absolute',
steps: [
{ color: 'green', value: null },
{ color: 'red', value: 80 },
],
},
overrides=[
{
matcher: { id: 'byName', options: 'ceph_daemon' },
properties: [
{ id: 'displayName', value: 'OSD ID' },
{ id: 'unit', value: 'short' },
{ id: 'decimals', value: 2 },
{ id: 'custom.align', value: null },
],
},
{
matcher: { id: 'byName', options: 'Value' },
properties: [
{ id: 'displayName', value: 'Latency (ms)' },
{ id: 'unit', value: 'none' },
{ id: 'decimals', value: 2 },
{ id: 'custom.align', value: null },
],
},
{
matcher: { id: 'byName', options: 'Value' },
properties: [
{ id: 'mappings', value: [{ type: 'value', options: { NaN: { text: '0.00', index: 0 } } }] },
{ id: 'unit', value: 'none' },
{ id: 'decimals', value: 2 },
{ id: 'custom.align', value: null },
],
},
],
pluginVersion='10.4.0'
)
.addTransformations([
{
id: 'merge',
options: { reducers: [] },
},
{
id: 'organize',
options: {
excludeByName: {
Time: true,
cluster: true,
},
indexByName: {},
renameByName: {},
includeByName: {},
},
},
]).addTarget(
$.addTargetSchema(
|||
topk(10,
(sort(
(rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
1000)
))
)
||| % $.matchers(),
'',
'table',
1,
true
)
),
$.pieChartPanel('OSD Types Summary', '', '$datasource', { x: 0, y: 8, w: 4, h: 8 }, 'table', 'bottom', true, ['percent'], { mode: 'single', sort: 'none' }, 'pie', ['percent', 'value'], 'palette-classic')
.addTarget(
$.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(), '{{device_class}}')
),
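// Objectstore detection: ceph_bluefs_wal_total_bytes only exists for
// BlueStore OSDs; absent() of it multiplied by the OSD count reports an
// all-FileStore cluster.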
$.pieChartPanel('OSD Objectstore Types', '', '$datasource', { x: 4, y: 8, w: 4, h: 8 }, 'table', 'bottom', true, ['percent'], { mode: 'single', sort: 'none' }, 'pie', ['percent', 'value'], 'palette-classic')
.addTarget($.addTargetSchema(
'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % $.matchers(), 'bluestore', 'time_series', 2
))
.addTarget($.addTargetSchema(
'absent(ceph_bluefs_wal_total_bytes{%(matchers)s}) * count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), 'filestore', 'time_series', 2
)),
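// Capacity buckets below use chained PromQL comparisons: `m >= A < B`
// filters the vector to [A, B); the thresholds are whole TiB
// (1 TiB = 1099511627776 bytes).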
$.pieChartPanel('OSD Size Summary', 'The pie chart shows the various OSD sizes used within the cluster', '$datasource', { x: 8, y: 8, w: 4, h: 8 }, 'table', 'bottom', true, ['percent'], { mode: 'single', sort: 'none' }, 'pie', ['percent', 'value'], 'palette-classic')
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % $.matchers(), '<1TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % $.matchers(), '<2TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % $.matchers(), '<3TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % $.matchers(), '<4TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % $.matchers(), '<6TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % $.matchers(), '<8TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % $.matchers(), '<10TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % $.matchers(), '<12TB', 'time_series', 2
))
.addTarget($.addTargetSchema(
'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % $.matchers(), '12TB+', 'time_series', 2
)),
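// Distribution of ceph_osd_numpg across OSDs; the legacy graph panel is
// coerced to the 'timeseries' type via the object composition below.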
g.graphPanel.new(bars=true,
datasource='$datasource',
title='Distribution of PGs per OSD',
x_axis_buckets=20,
x_axis_mode='histogram',
x_axis_values=['total'],
formatY1='short',
formatY2='short',
labelY1='# of OSDs',
min='0',
nullPointMode='null')
.addTarget($.addTargetSchema(
'ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true
)) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: 'short', custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
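// onode cache effectiveness: hits / (hits + misses) across all BlueStore OSDs.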
$.gaugeSingleStatPanel(
'percentunit',
'OSD onode Hits Ratio',
'This gauge panel shows the onode hit ratio, to help determine whether increasing RAM per OSD could improve cluster performance',
'current',
true,
1,
true,
false,
'.75',
|||
sum(ceph_bluestore_onode_hits{%(matchers)s}) / (
sum(ceph_bluestore_onode_hits{%(matchers)s}) +
sum(ceph_bluestore_onode_misses{%(matchers)s})
)
||| % $.matchers(),
'time_series',
20,
8,
4,
8
),
$.addRowSchema(false,
true,
'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
$.simpleGraphPanel(
{},
'Read/Write Profile',
'Shows the read/write workload profile over time',
'short',
null,
null,
'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % $.matchers(),
'Reads',
0,
17,
24,
8
)
.addTargets([$.addTargetSchema(
'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
)]),
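// Slow ops are read from ceph_daemon_health_metrics, filtered to OSD daemons.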
$.addTableExtended(
datasource='${datasource}',
title='Top Slow Ops',
description='This table shows the 10 OSDs with the highest number of slow ops',
gridPosition={ h: 8, w: 5, x: 0, y: 25 },
options={
footer: {
fields: '',
reducer: ['sum'],
countRows: false,
enablePagination: false,
show: false,
},
frameIndex: 1,
showHeader: true,
},
custom={ align: 'null', cellOptions: { type: 'auto' }, filterable: true, inspect: false },
thresholds={
mode: 'absolute',
steps: [
{ color: 'green', value: null },
{ color: 'red', value: 80 },
],
},
overrides=[
{
matcher: { id: 'byName', options: 'ceph_daemon' },
properties: [
{ id: 'displayName', value: 'OSD ID' },
{ id: 'unit', value: 'short' },
{ id: 'decimals', value: 2 },
{ id: 'custom.align', value: null },
],
},
{
matcher: { id: 'byName', options: 'Value' },
properties: [
{ id: 'displayName', value: 'Slow Ops' },
{ id: 'unit', value: 'none' },
{ id: 'decimals', value: 2 },
{ id: 'custom.align', value: null },
],
},
],
pluginVersion='10.4.0'
)
.addTransformations([
{
id: 'merge',
options: { reducers: [] },
},
{
id: 'organize',
options: {
excludeByName: {
Time: true,
__name__: true,
instance: true,
job: true,
type: true,
cluster: true,
},
indexByName: {},
renameByName: {},
includeByName: {},
},
},
]).addTarget(
$.addTargetSchema(
|||
topk(10,
(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*", %(matchers)s})
)
||| % $.matchers(),
'',
'table',
1,
true
)
),
]),
'osd-device-details.json':
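// Local helper that renders a read/write graph panel pair from two
// expressions and legends, converted to a 'timeseries' panel at the
// given grid position.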
local OsdDeviceDetailsPanel(title,
description,
formatY1,
labelY1,
expr1,
expr2,
legendFormat1,
legendFormat2,
x,
y,
w,
h) =
$.graphPanelSchema({},
title,
description,
'null as zero',
false,
formatY1,
'short',
labelY1,
null,
null,
1,
'$datasource')
.addTargets(
[
$.addTargetSchema(expr1,
legendFormat1),
$.addTargetSchema(expr2, legendFormat2),
]
) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: formatY1, custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: x, y: y, w: w, h: h } };
$.dashboardSchema(
'OSD device details',
'',
'CrAHE0iZz',
'now-3h',
'30s',
16,
$._config.dashboardTags,
''
)
.addAnnotation(
$.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.3.2'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
g.template.datasource('datasource',
'prometheus',
'default',
label='Data Source')
)
.addTemplate(
$.addClusterTemplate()
)
.addTemplate(
$.addTemplateSchema('osd',
'$datasource',
'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
1,
false,
1,
'OSD',
'(.*)')
)
.addPanels([
$.addRowSchema(
false, true, 'OSD Performance'
) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
OsdDeviceDetailsPanel(
'$osd Latency',
'',
's',
'Read (-) / Write (+)',
|||
rate(ceph_osd_op_r_latency_sum{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
||| % $.matchers(),
|||
rate(ceph_osd_op_w_latency_sum{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval]) /
on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
||| % $.matchers(),
'read',
'write',
0,
1,
6,
9
)
.addSeriesOverride(
{
alias: 'read',
transform: 'negative-Y',
}
),
OsdDeviceDetailsPanel(
'$osd R/W IOPS',
'',
'short',
'Read (-) / Write (+)',
'rate(ceph_osd_op_r{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval])' % $.matchers(),
'rate(ceph_osd_op_w{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval])' % $.matchers(),
'Reads',
'Writes',
6,
1,
6,
9
)
.addSeriesOverride(
{ alias: 'Reads', transform: 'negative-Y' }
),
OsdDeviceDetailsPanel(
'$osd R/W Bytes',
'',
'bytes',
'Read (-) / Write (+)',
'rate(ceph_osd_op_r_out_bytes{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval])' % $.matchers(),
'rate(ceph_osd_op_w_in_bytes{ceph_daemon=~"$osd", %(matchers)s}[$__rate_interval])' % $.matchers(),
'Read Bytes',
'Write Bytes',
12,
1,
6,
9
)
.addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
$.addRowSchema(
false, true, 'Physical Device Performance'
) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
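// The node_disk_* series below are joined to Ceph's view of each disk via
// ceph_disk_occupation_human: label_replace() trims the scrape instance to a
// short hostname ("host.example.com:9100" -> "host") and strips the "/dev/"
// prefix from the device, so `and on (instance, device)` keeps only the
// physical devices backing $osd.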
OsdDeviceDetailsPanel(
'Physical Device Latency for $osd',
'',
's',
'Read (-) / Write (+)',
|||
(
label_replace(
rate(node_disk_read_time_seconds_total[$__rate_interval]) /
rate(node_disk_reads_completed_total[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
)
||| % $.matchers(),
|||
(
label_replace(
rate(node_disk_write_time_seconds_total[$__rate_interval]) /
rate(node_disk_writes_completed_total[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*") and on (instance, device)
label_replace(
label_replace(
ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s}, "device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
)
||| % $.matchers(),
'{{instance}}/{{device}} Reads',
'{{instance}}/{{device}} Writes',
0,
11,
6,
9
)
.addSeriesOverride(
{ alias: '/.*Reads/', transform: 'negative-Y' }
),
OsdDeviceDetailsPanel(
'Physical Device R/W IOPS for $osd',
'',
'short',
'Read (-) / Write (+)',
|||
label_replace(
rate(node_disk_writes_completed_total[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
|||
label_replace(
rate(node_disk_reads_completed_total[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
'{{device}} on {{instance}} Writes',
'{{device}} on {{instance}} Reads',
6,
11,
6,
9
)
.addSeriesOverride(
{ alias: '/.*Reads/', transform: 'negative-Y' }
),
OsdDeviceDetailsPanel(
'Physical Device R/W Bytes for $osd',
'',
'Bps',
'Read (-) / Write (+)',
|||
label_replace(
rate(node_disk_read_bytes_total[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
|||
label_replace(
rate(node_disk_written_bytes_total[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s},
"device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
'{{instance}} {{device}} Reads',
'{{instance}} {{device}} Writes',
12,
11,
6,
9
)
.addSeriesOverride(
{ alias: '/.*Reads/', transform: 'negative-Y' }
),
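// node_disk_io_time_seconds_total counts seconds spent doing I/O, so its
// rate() approximates device utilisation as a 0-1 fraction ('percentunit').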
$.graphPanelSchema(
{},
'Physical Device Util% for $osd',
'',
'null',
false,
'percentunit',
'short',
null,
null,
null,
1,
'$datasource'
)
.addTarget($.addTargetSchema(
|||
label_replace(
rate(node_disk_io_time_seconds_total[$__rate_interval]),
"instance", "$1", "instance", "([^:.]*).*"
) and on (instance, device) label_replace(
label_replace(
ceph_disk_occupation_human{ceph_daemon=~"$osd", %(matchers)s}, "device", "$1", "device", "/dev/(.*)"
), "instance", "$1", "instance", "([^:.]*).*"
)
||| % $.matchers(),
'{{device}} on {{instance}}'
)) + { type: 'timeseries' } + { fieldConfig: { defaults: { unit: 'percentunit', custom: { fillOpacity: 8, showPoints: 'never' } } } } + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },
]),
}