mirror of
https://github.com/ceph/ceph
synced 2025-01-24 20:13:45 +00:00
8e07fbd2ea
mgr/prometheus: expose daemon health metrics Reviewed-by: Anthony D Atri <anthony.datri@gmail.com> Reviewed-by: Avan Thakkar <athakkar@redhat.com> Reviewed-by: Ernesto Puerta <epuertat@redhat.com> Reviewed-by: Nizamudeen A <nia@redhat.com>
749 lines
25 KiB
Plaintext
local g = import 'grafonnet/grafana.libsonnet';
|
|
|
|
(import 'utils.libsonnet') {
|
|
'hosts-overview.json':
|
|
$.dashboardSchema(
|
|
'Host Overview',
|
|
'',
|
|
'y0KGL0iZz',
|
|
'now-1h',
|
|
'30s',
|
|
16,
|
|
$._config.dashboardTags,
|
|
'',
|
|
)
|
|
.addRequired(
|
|
type='grafana', id='grafana', name='Grafana', version='5.3.2'
|
|
)
|
|
.addRequired(
|
|
type='panel', id='graph', name='Graph', version='5.0.0'
|
|
)
|
|
.addRequired(
|
|
type='panel', id='singlestat', name='Singlestat', version='5.0.0'
|
|
)
|
|
.addAnnotation(
|
|
$.addAnnotationSchema(
|
|
1,
|
|
'-- Grafana --',
|
|
true,
|
|
true,
|
|
'rgba(0, 211, 255, 1)',
|
|
'Annotations & Alerts',
|
|
'dashboard'
|
|
)
|
|
)
|
|
.addTemplate(
|
|
g.template.datasource('datasource',
|
|
'prometheus',
|
|
'default',
|
|
label='Data Source')
|
|
)
|
|
.addTemplate(
|
|
$.addClusterTemplate()
|
|
)
|
|
.addTemplate(
|
|
$.addJobTemplate()
|
|
)
|
|
.addTemplate(
|
|
$.addTemplateSchema('osd_hosts',
|
|
'$datasource',
|
|
'label_values(ceph_disk_occupation{%(matchers)s}, exported_instance)' % $.matchers(),
|
|
1,
|
|
true,
|
|
1,
|
|
null,
|
|
'([^.]*).*')
|
|
)
|
|
.addTemplate(
|
|
$.addTemplateSchema('mon_hosts',
|
|
'$datasource',
|
|
'label_values(ceph_mon_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
|
|
1,
|
|
true,
|
|
1,
|
|
null,
|
|
'mon.(.*)')
|
|
)
|
|
.addTemplate(
|
|
$.addTemplateSchema('mds_hosts',
|
|
'$datasource',
|
|
'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(),
|
|
1,
|
|
true,
|
|
1,
|
|
null,
|
|
'mds.(.*)')
|
|
)
|
|
.addTemplate(
|
|
$.addTemplateSchema('rgw_hosts',
|
|
'$datasource',
|
|
'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
|
|
1,
|
|
true,
|
|
1,
|
|
null,
|
|
'rgw.(.*)')
|
|
)
|
|
.addPanels([
|
|
$.simpleSingleStatPanel(
|
|
'none',
|
|
'OSD Hosts',
|
|
'',
|
|
'current',
|
|
'count(sum by (hostname) (ceph_osd_metadata{%(matchers)s}))' % $.matchers(),
|
|
true,
|
|
'time_series',
|
|
0,
|
|
0,
|
|
4,
|
|
5
|
|
),
|
|
$.simpleSingleStatPanel(
|
|
'percentunit',
|
|
'AVG CPU Busy',
|
|
'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster',
|
|
'current',
|
|
|||
|
|
avg(1 - (
|
|
avg by(instance) (
|
|
rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
|
|
rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
|
|
)
|
|
))
|
|
|||,
|
|
true,
|
|
'time_series',
|
|
4,
|
|
0,
|
|
4,
|
|
5
|
|
),
|
|
$.simpleSingleStatPanel(
|
|
'percentunit',
|
|
'AVG RAM Utilization',
|
|
'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)',
|
|
'current',
|
|
|||
|
|
avg ((
|
|
(
|
|
node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
|
|
node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
|
|
) - ((
|
|
node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
|
|
node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) +
|
|
(
|
|
node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
|
|
node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
|
|
) + (
|
|
node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
|
|
node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
|
|
) + (
|
|
node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
|
|
node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
|
|
)
|
|
)
|
|
) / (
|
|
node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
|
|
node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"}
|
|
))
|
|
|||,
|
|
true,
|
|
'time_series',
|
|
8,
|
|
0,
|
|
4,
|
|
5
|
|
),
|
|
$.simpleSingleStatPanel(
|
|
'none',
|
|
'Physical IOPS',
|
|
'IOPS Load at the device as reported by the OS on all OSD hosts',
|
|
'current',
|
|
|||
|
|
sum ((
|
|
rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
|
|
rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
|
|
) + (
|
|
rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
|
|
rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
|
|
))
|
|
|||,
|
|
true,
|
|
'time_series',
|
|
12,
|
|
0,
|
|
4,
|
|
5
|
|
),
|
|
$.simpleSingleStatPanel(
|
|
'percent',
|
|
'AVG Disk Utilization',
|
|
'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)',
|
|
'current',
|
|
|||
|
|
avg (
|
|
label_replace(
|
|
(rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or
|
|
(rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),
|
|
"instance", "$1", "instance", "([^.:]*).*"
|
|
) * on(instance, device) group_left(ceph_daemon) label_replace(
|
|
label_replace(
|
|
ceph_disk_occupation_human{%(matchers)s, instance=~"($osd_hosts).*"},
|
|
"device", "$1", "device", "/dev/(.*)"
|
|
), "instance", "$1", "instance", "([^.:]*).*"
|
|
)
|
|
)
|
|
||| % $.matchers(),
|
|
true,
|
|
'time_series',
|
|
16,
|
|
0,
|
|
4,
|
|
5
|
|
),
|
|
$.simpleSingleStatPanel(
|
|
'bytes',
|
|
'Network Load',
|
|
'Total send/receive network load across all hosts in the ceph cluster',
|
|
'current',
|
|
|||
|
|
sum (
|
|
(
|
|
rate(node_network_receive_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or
|
|
rate(node_network_receive_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval])
|
|
) unless on (device, instance)
|
|
label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
|
|
) +
|
|
sum (
|
|
(
|
|
rate(node_network_transmit_bytes{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval]) or
|
|
rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|mon_hosts|mds_hosts|rgw_hosts).*",device!="lo"}[$__rate_interval])
|
|
) unless on (device, instance)
|
|
label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
|
|
)
|
|
|||,
|
|
true,
|
|
'time_series',
|
|
20,
|
|
0,
|
|
4,
|
|
5
|
|
),
|
|
$.simpleGraphPanel(
|
|
{},
|
|
'CPU Busy - Top 10 Hosts',
|
|
'Show the top 10 busiest hosts by cpu',
|
|
'percent',
|
|
null,
|
|
0,
|
|
|||
|
|
topk(10,
|
|
100 * (
|
|
1 - (
|
|
avg by(instance) (
|
|
rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
|
|
rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
|
|
)
|
|
)
|
|
)
|
|
)
|
|
|||,
|
|
'{{instance}}',
|
|
0,
|
|
5,
|
|
12,
|
|
9
|
|
),
|
|
$.simpleGraphPanel(
|
|
{},
|
|
'Network Load - Top 10 Hosts',
|
|
'Top 10 hosts by network load',
|
|
'Bps',
|
|
null,
|
|
0,
|
|
|||
|
|
topk(10, (sum by(instance) (
|
|
(
|
|
rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
|
|
rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
|
|
) +
|
|
(
|
|
rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
|
|
rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
|
|
) unless on (device, instance)
|
|
label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)"))
|
|
))
|
|
|||,
|
|
'{{instance}}',
|
|
12,
|
|
5,
|
|
12,
|
|
9
|
|
),
|
|
]),
|
|
'host-details.json':
|
|
$.dashboardSchema(
|
|
'Host Details',
|
|
'',
|
|
'rtOg0AiWz',
|
|
'now-1h',
|
|
'30s',
|
|
16,
|
|
$._config.dashboardTags + ['overview'],
|
|
''
|
|
)
|
|
.addRequired(
|
|
type='grafana', id='grafana', name='Grafana', version='5.3.2'
|
|
)
|
|
.addRequired(
|
|
type='panel', id='graph', name='Graph', version='5.0.0'
|
|
)
|
|
.addRequired(
|
|
type='panel', id='singlestat', name='Singlestat', version='5.0.0'
|
|
)
|
|
.addAnnotation(
|
|
$.addAnnotationSchema(
|
|
1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard'
|
|
)
|
|
)
|
|
.addTemplate(
|
|
g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
|
|
)
|
|
.addTemplate(
|
|
$.addClusterTemplate()
|
|
)
|
|
.addTemplate(
|
|
$.addJobTemplate()
|
|
)
|
|
.addTemplate(
|
|
$.addTemplateSchema('ceph_hosts',
|
|
'$datasource',
|
|
if $._config.showMultiCluster then ('label_values({%(clusterMatcher)s}, instance)' % $.matchers()) else 'label_values(instance)',
|
|
1,
|
|
false,
|
|
3,
|
|
'Hostname',
|
|
'([^.:]*).*')
|
|
)
|
|
.addPanels([
|
|
$.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
|
|
$.simpleSingleStatPanel(
|
|
'none',
|
|
'OSDs',
|
|
'',
|
|
'current',
|
|
"count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s, hostname='$ceph_hosts'}))" % $.matchers(),
|
|
null,
|
|
'time_series',
|
|
0,
|
|
1,
|
|
3,
|
|
5
|
|
),
|
|
$.simpleGraphPanel(
|
|
{
|
|
interrupt: '#447EBC',
|
|
steal: '#6D1F62',
|
|
system: '#890F02',
|
|
user: '#3F6833',
|
|
wait: '#C15C17',
|
|
},
|
|
'CPU Utilization',
|
|
"Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown",
|
|
'percent',
|
|
'% Utilization',
|
|
null,
|
|
|||
|
|
sum by (mode) (
|
|
rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or
|
|
rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval])
|
|
) / (
|
|
scalar(
|
|
sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
|
|
rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]))
|
|
) * 100
|
|
)
|
|
|||,
|
|
'{{mode}}',
|
|
3,
|
|
1,
|
|
6,
|
|
10
|
|
),
|
|
$.simpleGraphPanel(
|
|
{
|
|
Available: '#508642',
|
|
Free: '#508642',
|
|
Total: '#bf1b00',
|
|
Used: '#bf1b00',
|
|
total: '#bf1b00',
|
|
used: '#0a50a1',
|
|
},
|
|
'RAM Usage',
|
|
'',
|
|
'bytes',
|
|
'RAM used',
|
|
null,
|
|
|||
|
|
node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
|||,
|
|
'Free',
|
|
9,
|
|
1,
|
|
6,
|
|
10
|
|
)
|
|
.addTargets(
|
|
[
|
|
$.addTargetSchema(
|
|
|||
|
|
node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
|||,
|
|
'total'
|
|
),
|
|
$.addTargetSchema(
|
|
|||
|
|
(
|
|
node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
) + (
|
|
node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
) + (
|
|
node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
)
|
|
|||,
|
|
'buffers/cache'
|
|
),
|
|
$.addTargetSchema(
|
|
|||
|
|
(
|
|
node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
) - (
|
|
(
|
|
node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
) + (
|
|
node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
) + (
|
|
node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
) +
|
|
(
|
|
node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or
|
|
node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
|
|
)
|
|
)
|
|
|||,
|
|
'used'
|
|
),
|
|
]
|
|
)
|
|
.addSeriesOverride(
|
|
{
|
|
alias: 'total',
|
|
color: '#bf1b00',
|
|
fill: 0,
|
|
linewidth: 2,
|
|
stack: false,
|
|
}
|
|
),
|
|
$.simpleGraphPanel(
|
|
{},
|
|
'Network Load',
|
|
"Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')",
|
|
'decbytes',
|
|
'Send (-) / Receive (+)',
|
|
null,
|
|
|||
|
|
sum by (device) (
|
|
rate(
|
|
node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
|
|
rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]
|
|
)
|
|
)
|
|
|||,
|
|
'{{device}}.rx',
|
|
15,
|
|
1,
|
|
6,
|
|
10
|
|
)
|
|
.addTargets(
|
|
[
|
|
$.addTargetSchema(
|
|
|||
|
|
sum by (device) (
|
|
rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
|
|
rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval])
|
|
)
|
|
|||,
|
|
'{{device}}.tx'
|
|
),
|
|
]
|
|
)
|
|
.addSeriesOverride(
|
|
{ alias: '/.*tx/', transform: 'negative-Y' }
|
|
),
|
|
$.simpleGraphPanel(
|
|
{},
|
|
'Network drop rate',
|
|
'',
|
|
'pps',
|
|
'Send (-) / Receive (+)',
|
|
null,
|
|
|||
|
|
rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
|
|
rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
|
|
|||,
|
|
'{{device}}.rx',
|
|
21,
|
|
1,
|
|
3,
|
|
5
|
|
)
|
|
.addTargets(
|
|
[
|
|
$.addTargetSchema(
|
|
|||
|
|
rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
|
|
rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
|
|
|||,
|
|
'{{device}}.tx'
|
|
),
|
|
]
|
|
)
|
|
.addSeriesOverride(
|
|
{
|
|
alias: '/.*tx/',
|
|
transform: 'negative-Y',
|
|
}
|
|
),
|
|
$.simpleSingleStatPanel(
|
|
'bytes',
|
|
'Raw Capacity',
|
|
'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.',
|
|
'current',
|
|
|||
|
|
sum(
|
|
ceph_osd_stat_bytes{%(matchers)s} and
|
|
on (ceph_daemon) ceph_disk_occupation{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}
|
|
)
|
|
||| % $.matchers(),
|
|
null,
|
|
'time_series',
|
|
0,
|
|
6,
|
|
3,
|
|
5
|
|
),
|
|
$.simpleGraphPanel(
|
|
{},
|
|
'Network error rate',
|
|
'',
|
|
'pps',
|
|
'Send (-) / Receive (+)',
|
|
null,
|
|
|||
|
|
rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
|
|
rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
|
|
|||,
|
|
'{{device}}.rx',
|
|
21,
|
|
6,
|
|
3,
|
|
5
|
|
)
|
|
.addTargets(
|
|
[$.addTargetSchema(
|
|
|||
|
|
rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
|
|
rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
|
|
|||,
|
|
'{{device}}.tx'
|
|
)]
|
|
)
|
|
.addSeriesOverride(
|
|
{
|
|
alias: '/.*tx/',
|
|
transform: 'negative-Y',
|
|
}
|
|
),
|
|
$.addRowSchema(false,
|
|
true,
|
|
'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } },
|
|
$.simpleGraphPanel(
|
|
{},
|
|
'$ceph_hosts Disk IOPS',
|
|
"For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value",
|
|
'ops',
|
|
'Read (-) / Write (+)',
|
|
null,
|
|
|||
|
|
label_replace(
|
|
(
|
|
rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
|
|
rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
|
|
), "instance", "$1", "instance", "([^:.]*).*"
|
|
) * on(instance, device) group_left(ceph_daemon) label_replace(
|
|
label_replace(
|
|
ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"
|
|
), "instance", "$1", "instance", "([^:.]*).*"
|
|
)
|
|
||| % $.matchers(),
|
|
'{{device}}({{ceph_daemon}}) writes',
|
|
0,
|
|
12,
|
|
11,
|
|
9
|
|
)
|
|
.addTargets(
|
|
[
|
|
$.addTargetSchema(
|
|
|||
|
|
label_replace(
|
|
(
|
|
rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
|
|
rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
|
|
), "instance", "$1", "instance", "([^:.]*).*"
|
|
) * on(instance, device) group_left(ceph_daemon) label_replace(
|
|
label_replace(
|
|
ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)"
|
|
), "instance", "$1", "instance", "([^:.]*).*"
|
|
)
|
|
||| % $.matchers(),
|
|
'{{device}}({{ceph_daemon}}) reads'
|
|
),
|
|
]
|
|
)
|
|
.addSeriesOverride(
|
|
{ alias: '/.*reads/', transform: 'negative-Y' }
|
|
),
|
|
$.simpleGraphPanel(
|
|
{},
|
|
'$ceph_hosts Throughput by Disk',
|
|
'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id',
|
|
'Bps',
|
|
'Read (-) / Write (+)',
|
|
null,
|
|
|||
|
|
label_replace(
|
|
(
|
|
rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
|
|
rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
|
|
), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
|
|
group_left(ceph_daemon) label_replace(
|
|
label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"),
|
|
"instance", "$1", "instance", "([^:.]*).*"
|
|
)
|
|
||| % $.matchers(),
|
|
'{{device}}({{ceph_daemon}}) write',
|
|
12,
|
|
12,
|
|
11,
|
|
9
|
|
)
|
|
.addTargets(
|
|
[$.addTargetSchema(
|
|
|||
|
|
label_replace(
|
|
(
|
|
rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
|
|
rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
|
|
),
|
|
"instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
|
|
group_left(ceph_daemon) label_replace(
|
|
label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"),
|
|
"instance", "$1", "instance", "([^:.]*).*"
|
|
)
|
|
||| % $.matchers(),
|
|
'{{device}}({{ceph_daemon}}) read'
|
|
)]
|
|
)
|
|
.addSeriesOverride(
|
|
{ alias: '/.*read/', transform: 'negative-Y' }
|
|
),
|
|
$.simpleGraphPanel(
|
|
{},
|
|
'$ceph_hosts Disk Latency',
|
|
"For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it's corresponding OSD id",
|
|
's',
|
|
'',
|
|
null,
|
|
|||
|
|
max by(instance, device) (label_replace(
|
|
(rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
|
|
clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or
|
|
(rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
|
|
clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001),
|
|
"instance", "$1", "instance", "([^:.]*).*"
|
|
)) * on(instance, device) group_left(ceph_daemon) label_replace(
|
|
label_replace(
|
|
ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"},
|
|
"device", "$1", "device", "/dev/(.*)"
|
|
), "instance", "$1", "instance", "([^:.]*).*"
|
|
)
|
|
||| % $.matchers(),
|
|
'{{device}}({{ceph_daemon}})',
|
|
0,
|
|
21,
|
|
11,
|
|
9
|
|
),
|
|
$.simpleGraphPanel(
|
|
{},
|
|
'$ceph_hosts Disk utilization',
|
|
'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.',
|
|
'percent',
|
|
'%Util',
|
|
null,
|
|
|||
|
|
label_replace(
|
|
(
|
|
(rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or
|
|
rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100
|
|
), "instance", "$1", "instance", "([^:.]*).*"
|
|
) * on(instance, device) group_left(ceph_daemon) label_replace(
|
|
label_replace(ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"},
|
|
"device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"
|
|
)
|
|
||| % $.matchers(),
|
|
'{{device}}({{ceph_daemon}})',
|
|
12,
|
|
21,
|
|
11,
|
|
9
|
|
),
|
|
$.addTableSchema(
|
|
'$datasource',
|
|
'This table shows the 10 hosts with the highest number of slow ops',
|
|
{ col: 2, desc: true },
|
|
[
|
|
$.overviewStyle('Instance', 'instance', 'string', 'short'),
|
|
$.overviewStyle('Slow Ops', 'Value', 'number', 'none'),
|
|
$.overviewStyle('', '/.*/', 'hidden', 'short'),
|
|
],
|
|
'Top Slow Ops per Host',
|
|
'table'
|
|
)
|
|
.addTarget(
|
|
$.addTargetSchema(
|
|
|||
|
|
topk(10,
|
|
(sum by (instance)(ceph_daemon_health_metrics{type="SLOW_OPS", ceph_daemon=~"osd.*"}))
|
|
)
|
|
||| % $.matchers(),
|
|
'',
|
|
'table',
|
|
1,
|
|
true
|
|
)
|
|
) + { gridPos: { x: 0, y: 40, w: 4, h: 8 } },
|
|
]),
|
|
}
|