ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet
Tatjana Dehler 15fa97d49d
monitoring/ceph-mixin: add RGW host to label info
Add the missing information about the RGW instance to the labels of the
"Average GET/PUT Latencies" panel on the "RGW Overview" dashboard.

Fixes: https://tracker.ceph.com/issues/57166
Signed-off-by: Tatjana Dehler <tdehler@suse.com>
2022-09-06 16:19:19 +02:00
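For reference, the host names in that panel's legend come from PromQL label_replace(): the ceph_daemon label of ceph_rgw_metadata (values with an "rgw." prefix) is rewritten into an rgw_host label. A minimal sketch of the pattern, with the cluster/job matchers omitted for brevity:

    label_replace(
      rate(ceph_rgw_get_initial_lat_sum[$__rate_interval]) /
      rate(ceph_rgw_get_initial_lat_count[$__rate_interval]) *
      on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,
      "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
    )

The panel's legend formats 'GET {{rgw_host}}' and 'PUT {{rgw_host}}' then pick up the derived label.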

local g = import 'grafonnet/grafana.libsonnet';
local u = import 'utils.libsonnet';
(import 'utils.libsonnet') {
'radosgw-sync-overview.json':
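// Helper: single-target graph panel for one sync metric, rated over $__rate_interval and summed by source_zone.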
local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
$.graphPanelSchema({},
title,
'',
'null as zero',
true,
formatY1,
'short',
labelY1,
null,
0,
1,
'$datasource')
.addTargets(
[
$.addTargetSchema(
'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))'
% ($.matchers() + { rgwMetric: rgwMetric }),
'{{source_zone}}'
),
]
) + { gridPos: { x: x, y: y, w: w, h: h } };
$.dashboardSchema(
'RGW Sync Overview',
'',
'rgw-sync-overview',
'now-1h',
'30s',
16,
$._config.dashboardTags + ['overview'],
''
)
.addAnnotation(
$.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
)
.addTemplate(
$.addClusterTemplate()
)
.addTemplate(
$.addJobTemplate()
)
.addTemplate(
$.addTemplateSchema(
'rgw_servers',
'$datasource',
'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
1,
true,
1,
'',
'RGW Server'
)
)
.addPanels([
RgwSyncOverviewPanel(
'Replication (throughput) from Source Zone',
'Bps',
null,
'ceph_data_sync_from_zone_fetch_bytes_sum',
0,
0,
8,
7
),
RgwSyncOverviewPanel(
'Replication (objects) from Source Zone',
'short',
'Objects/s',
'ceph_data_sync_from_zone_fetch_bytes_count',
8,
0,
8,
7
),
RgwSyncOverviewPanel(
'Polling Request Latency from Source Zone',
'ms',
null,
'ceph_data_sync_from_zone_poll_latency_sum',
16,
0,
8,
7
),
RgwSyncOverviewPanel(
'Unsuccessful Object Replications from Source Zone',
'short',
'Count/s',
'ceph_data_sync_from_zone_fetch_errors',
0,
7,
8,
7
),
]),
'radosgw-overview.json':
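// Helper: graph panel with one initial target; panels that need more series attach them later via addTargets().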
local RgwOverviewPanel(
title,
description,
formatY1,
formatY2,
expr1,
legendFormat1,
x,
y,
w,
h,
datasource='$datasource',
legend_alignAsTable=false,
legend_avg=false,
legend_min=false,
legend_max=false,
legend_current=false,
legend_values=false
) =
$.graphPanelSchema(
{},
title,
description,
'null',
false,
formatY1,
formatY2,
null,
null,
0,
1,
datasource,
legend_alignAsTable,
legend_avg,
legend_min,
legend_max,
legend_current,
legend_values
)
.addTargets(
[$.addTargetSchema(expr1, legendFormat1)]
) + { gridPos: { x: x, y: y, w: w, h: h } };
$.dashboardSchema(
'RGW Overview',
'',
'WAkugZpiz',
'now-1h',
'30s',
16,
$._config.dashboardTags + ['overview'],
''
)
.addAnnotation(
$.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
g.template.datasource('datasource',
'prometheus',
'default',
label='Data Source')
)
.addTemplate(
$.addClusterTemplate()
)
.addTemplate(
$.addJobTemplate()
)
.addTemplate(
$.addTemplateSchema(
'rgw_servers',
'$datasource',
'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
1,
true,
1,
'',
'RGW Server'
)
)
.addTemplate(
$.addTemplateSchema(
'code',
'$datasource',
'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)',
1,
true,
1,
'HTTP Code',
''
)
)
.addTemplate(
$.addTemplateSchema(
'job_haproxy',
'$datasource',
'label_values(haproxy_server_status, job)',
1,
true,
1,
'job haproxy',
'(.*)',
multi=true,
allValues='.+',
),
)
.addTemplate(
$.addTemplateSchema(
'ingress_service',
'$datasource',
'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)',
1,
true,
1,
'Ingress Service',
''
)
)
.addPanels([
$.addRowSchema(false,
true,
'RGW Overview - All Gateways') +
{
gridPos: { x: 0, y: 0, w: 24, h: 1 },
},
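// Average GET/PUT latencies per gateway: label_replace() rewrites ceph_daemon ("rgw.<id>") into an rgw_host label so the legend identifies the RGW host.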
RgwOverviewPanel(
'Average GET/PUT Latencies by RGW Instance',
'',
's',
'short',
|||
label_replace(
rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
'GET {{rgw_host}}',
0,
1,
8,
7
).addTargets(
[
$.addTargetSchema(
|||
label_replace(
rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
'PUT {{rgw_host}}'
),
]
),
RgwOverviewPanel(
'Total Requests/sec by RGW Instance',
'',
'none',
'short',
|||
sum by (rgw_host) (
label_replace(
rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
)
||| % $.matchers(),
'{{rgw_host}}',
8,
1,
7,
7
),
RgwOverviewPanel(
'GET Latencies by RGW Instance',
'Latencies are shown stacked, without a y-axis, to provide a visual indication of GET latency imbalance across RGW hosts',
's',
'short',
|||
label_replace(
rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
'{{rgw_host}}',
15,
1,
6,
7
),
RgwOverviewPanel(
'Bandwidth Consumed by Type',
'Total bytes transferred in/out of all radosgw instances within the cluster',
'bytes',
'short',
'sum(rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
'GETs',
0,
8,
8,
6
).addTargets(
[$.addTargetSchema('sum(rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
'PUTs')]
),
RgwOverviewPanel(
'Bandwidth by RGW Instance',
'Total bytes transferred in/out through get/put operations, by radosgw instance',
'bytes',
'short',
|||
label_replace(sum by (instance_id) (
rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) +
rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
'{{rgw_host}}',
8,
8,
7,
6
),
RgwOverviewPanel(
'PUT Latencies by RGW Instance',
'Latencies are shown stacked, without a y-axis, to provide a visual indication of PUT latency imbalance across RGW hosts',
's',
'short',
|||
label_replace(
rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
"rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
)
||| % $.matchers(),
'{{rgw_host}}',
15,
8,
6,
6
),
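// HAProxy / ingress metrics for the RGW frontends.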
$.addRowSchema(
false, true, 'RGW Overview - HAProxy Metrics'
) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
RgwOverviewPanel(
'Total responses by HTTP code',
'',
'short',
'short',
|||
sum(
rate(
haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval]
)
) by (code)
|||,
'Frontend {{ code }}',
0,
12,
5,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
$.addTargetSchema(
|||
sum(
rate(
haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval]
)
) by (code)
|||, 'Backend {{ code }}'
),
]
)
.addSeriesOverride([
{
alias: '/.*Back.*/',
transform: 'negative-Y',
},
{ alias: '/.*1.*/' },
{ alias: '/.*2.*/' },
{ alias: '/.*3.*/' },
{ alias: '/.*4.*/' },
{ alias: '/.*5.*/' },
{ alias: '/.*other.*/' },
]),
RgwOverviewPanel(
'Total requests / responses',
'',
'short',
'short',
|||
sum(
rate(
haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
)
) by (instance)
|||,
'Requests',
5,
12,
5,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
$.addTargetSchema(
|||
sum(
rate(
haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
)
) by (instance)
|||, 'Response errors', 'time_series', 2
),
$.addTargetSchema(
|||
sum(
rate(
haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
)
) by (instance)
|||, 'Request errors'
),
$.addTargetSchema(
|||
sum(
rate(
haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
)
) by (instance)
|||, 'Backend redispatch', 'time_series', 2
),
$.addTargetSchema(
|||
sum(
rate(
haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
)
) by (instance)
|||, 'Backend retry', 'time_series', 2
),
$.addTargetSchema(
|||
sum(
rate(
haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
)
) by (instance)
|||, 'Request denied', 'time_series', 2
),
$.addTargetSchema(
|||
sum(
haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}
) by (instance)
|||, 'Backend Queued', 'time_series', 2
),
]
)
.addSeriesOverride([
{
alias: '/.*Response.*/',
transform: 'negative-Y',
},
{
alias: '/.*Backend.*/',
transform: 'negative-Y',
},
]),
RgwOverviewPanel(
'Total number of connections',
'',
'short',
'short',
|||
sum(
rate(
haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
)
) by (instance)
|||,
'Front',
10,
12,
5,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
$.addTargetSchema(
|||
sum(
rate(
haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
)
) by (instance)
|||, 'Back'
),
$.addTargetSchema(
|||
sum(
rate(
haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
)
) by (instance)
|||, 'Back errors'
),
]
)
.addSeriesOverride([
{
alias: '/.*Back.*/',
transform: 'negative-Y',
},
]),
RgwOverviewPanel(
'Current total of incoming / outgoing bytes',
'',
'short',
'short',
|||
sum(
rate(
haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
) * 8
) by (instance)
|||,
'IN Front',
15,
12,
6,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
$.addTargetSchema(
|||
sum(
rate(
haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
) * 8
) by (instance)
|||, 'OUT Front', 'time_series', 2
),
$.addTargetSchema(
|||
sum(
rate(
haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
) * 8
) by (instance)
|||, 'IN Back', 'time_series', 2
),
$.addTargetSchema(
|||
sum(
rate(
haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
) * 8
) by (instance)
|||, 'OUT Back', 'time_series', 2
),
]
)
.addSeriesOverride([
{
alias: '/.*OUT.*/',
transform: 'negative-Y',
},
]),
]),
'radosgw-detail.json':
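// Helper: two-target graph panel (e.g. paired GET/PUT series) for the per-instance detail dashboard.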
local RgwDetailsPanel(aliasColors,
title,
description,
formatY1,
formatY2,
expr1,
expr2,
legendFormat1,
legendFormat2,
x,
y,
w,
h) =
$.graphPanelSchema(aliasColors,
title,
description,
'null',
false,
formatY1,
formatY2,
null,
null,
0,
1,
'$datasource')
.addTargets(
[$.addTargetSchema(expr1, legendFormat1), $.addTargetSchema(expr2, legendFormat2)]
) + { gridPos: { x: x, y: y, w: w, h: h } };
$.dashboardSchema(
'RGW Instance Detail',
'',
'x5ARzZtmk',
'now-1h',
'30s',
16,
$._config.dashboardTags + ['overview'],
''
)
.addAnnotation(
$.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel',
id='grafana-piechart-panel',
name='Pie Chart',
version='1.3.3'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
g.template.datasource('datasource',
'prometheus',
'default',
label='Data Source')
)
.addTemplate(
$.addClusterTemplate()
)
.addTemplate(
$.addJobTemplate()
)
.addTemplate(
$.addTemplateSchema('rgw_servers',
'$datasource',
'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
1,
true,
1,
'',
'')
)
.addPanels([
$.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
RgwDetailsPanel(
{},
'$rgw_servers GET/PUT Latencies',
'',
's',
'short',
|||
sum by (instance_id) (
rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval])
) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
|||
sum by (instance_id) (
rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval])
) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
'GET {{ceph_daemon}}',
'PUT {{ceph_daemon}}',
0,
1,
6,
8
),
RgwDetailsPanel(
{},
'Bandwidth by HTTP Operation',
'',
'bytes',
'short',
|||
rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
|||
rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon)
ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
'GETs {{ceph_daemon}}',
'PUTs {{ceph_daemon}}',
6,
1,
7,
8
),
RgwDetailsPanel(
{
GETs: '#7eb26d',
Other: '#447ebc',
PUTs: '#eab839',
Requests: '#3f2b5b',
'Requests Failed': '#bf1b00',
},
'HTTP Request Breakdown',
'',
'short',
'short',
|||
rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s,ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
|||
rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
'Requests Failed {{ceph_daemon}}',
'GETs {{ceph_daemon}}',
13,
1,
7,
8
)
.addTargets(
[
$.addTargetSchema(
|||
rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
'PUTs {{ceph_daemon}}'
),
$.addTargetSchema(
|||
(
rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
(
rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
)
) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
'Other {{ceph_daemon}}'
),
]
),
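// Pie chart summarizing the workload mix (GETs, PUTs, failures, other ops) for the selected RGW daemon(s).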
$.simplePieChart(
{
GETs: '#7eb26d',
'Other (HEAD,POST,DELETE)': '#447ebc',
PUTs: '#eab839',
Requests: '#3f2b5b',
Failures: '#bf1b00',
}, '', 'Workload Breakdown'
)
.addTarget($.addTargetSchema(
|||
rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
'Failures {{ceph_daemon}}'
))
.addTarget($.addTargetSchema(
|||
rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
'GETs {{ceph_daemon}}'
))
.addTarget($.addTargetSchema(
|||
rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
'PUTs {{ceph_daemon}}'
))
.addTarget($.addTargetSchema(
|||
(
rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
(
rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
)
) * on (instance_id) group_left (ceph_daemon)
ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
||| % $.matchers(),
'Other (DELETE,LIST) {{ceph_daemon}}'
)) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },
]),
}