ceph/monitoring/ceph-mixin/dashboards/rgw.libsonnet
Arthur Outhenin-Chalandre 98236e3a1d
mgr/dashboard: monitoring: refactor into ceph-mixin
A mixin is a way to bundle dashboards, Prometheus rules and alerts into a
jsonnet package. Shifting to a mixin will allow easier integration with the
monitoring automation that some users may use.

This commit moves `/monitoring/grafana/dashboards` and
`/monitoring/prometheus` to `/monitoring/ceph-mixin`. The Prometheus alerts
were also converted to Jsonnet automatically (from yaml to json to
jsonnet). This commit minimises the changes made to the generated files
and should change neither the dashboards nor the Prometheus alerts.

In the future, some configuration will also be added to the jsonnet to
enable more functionality in the dashboards or alerts (e.g. multi-cluster
support).

Fixes: https://tracker.ceph.com/issues/53374
Signed-off-by: Arthur Outhenin-Chalandre <arthur.outhenin-chalandre@cern.ch>
2022-02-03 13:08:20 +01:00
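
As a sketch of how a consumer might use the bundled package (a minimal,
hypothetical example: the `vendor/` path, the `mixin.libsonnet` entry point
and the jsonnet invocation are assumptions, not part of this change):

    // consumer.jsonnet -- hypothetical; names and paths are assumptions
    // Render with: jsonnet -J . consumer.jsonnet
    local ceph = import 'vendor/ceph-mixin/mixin.libsonnet';
    {
      // grafanaDashboards is a hidden (::) field; reading it by name works
      dashboards: ceph.grafanaDashboards,
    }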


local g = import 'grafonnet/grafana.libsonnet';
local u = import 'utils.libsonnet';
{
grafanaDashboards+:: {
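// The +:: makes this a hidden, mergeable field, so the dashboards are not
// rendered directly; each key below is the output filename of one dashboard.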
'radosgw-sync-overview.json':
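// Panel factory for this dashboard: every panel plots a per-source_zone
// 30s rate of the given rgwMetric; x/y/w/h become the panel's gridPos.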
local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
u.graphPanelSchema({},
title,
'',
'null as zero',
true,
formatY1,
'short',
labelY1,
null,
0,
1,
'$datasource')
.addTargets(
[u.addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric,
1,
'time_series',
'{{source_zone}}')]
) + { gridPos: { x: x, y: y, w: w, h: h } };
u.dashboardSchema(
'RGW Sync Overview',
'',
'rgw-sync-overview',
'now-1h',
'15s',
16,
['overview'],
'',
{
refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
}
)
.addAnnotation(
u.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
u.addTemplateSchema('rgw_servers', '$datasource', 'prometheus', 1, true, 1, '', '')
)
.addTemplate(
g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
)
.addPanels([
RgwSyncOverviewPanel(
'Replication (throughput) from Source Zone',
'Bps',
null,
'ceph_data_sync_from_zone_fetch_bytes_sum',
0,
0,
8,
7
),
RgwSyncOverviewPanel(
'Replication (objects) from Source Zone',
'short',
'Objects/s',
'ceph_data_sync_from_zone_fetch_bytes_count',
8,
0,
8,
7
),
RgwSyncOverviewPanel(
'Polling Request Latency from Source Zone',
'ms',
null,
'ceph_data_sync_from_zone_poll_latency_sum',
16,
0,
8,
7
),
RgwSyncOverviewPanel(
'Unsuccessful Object Replications from Source Zone',
'short',
'Count/s',
'ceph_data_sync_from_zone_fetch_errors',
0,
7,
8,
7
),
]),
'radosgw-overview.json':
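// Panel factory shared by the RGW Overview panels. Compared to the
// sync-overview helper it also takes a description, a second y-axis format,
// an overridable datasource and optional legend settings; extra queries can
// be attached with .addTargets() on the returned panel.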
local RgwOverviewPanel(
title,
description,
formatY1,
formatY2,
expr1,
legendFormat1,
x,
y,
w,
h,
datasource='$datasource',
legend_alignAsTable=false,
legend_avg=false,
legend_min=false,
legend_max=false,
legend_current=false,
legend_values=false
) =
u.graphPanelSchema(
{},
title,
description,
'null',
false,
formatY1,
formatY2,
null,
null,
0,
1,
datasource,
legend_alignAsTable,
legend_avg,
legend_min,
legend_max,
legend_current,
legend_values
)
.addTargets(
[u.addTargetSchema(expr1, 1, 'time_series', legendFormat1)]
) + { gridPos: { x: x, y: y, w: w, h: h } };
u.dashboardSchema(
'RGW Overview',
'',
'WAkugZpiz',
'now-1h',
'15s',
16,
['overview'],
'',
{
refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
}
)
.addAnnotation(
u.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
u.addTemplateSchema(
'rgw_servers',
'$datasource',
'label_values(ceph_rgw_metadata, ceph_daemon)',
1,
true,
1,
'',
''
)
)
.addTemplate(
u.addTemplateSchema(
'code',
'$datasource',
'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)',
1,
true,
1,
'HTTP Code',
''
)
)
.addTemplate(
u.addTemplateSchema(
'ingress_service',
'$datasource',
'label_values(haproxy_server_status, instance)',
1,
true,
1,
'Ingress Service',
''
)
)
.addTemplate(
g.template.datasource('datasource',
'prometheus',
'default',
label='Data Source')
)
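// The $code and $ingress_service variables defined above feed the HAProxy
// panels in the second row below.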
.addPanels([
u.addRowSchema(false,
true,
'RGW Overview - All Gateways') +
{
gridPos: { x: 0, y: 0, w: 24, h: 1 },
},
RgwOverviewPanel(
'Average GET/PUT Latencies',
'',
's',
'short',
'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
'GET AVG',
0,
1,
8,
7
).addTargets(
[
u.addTargetSchema(
'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
1,
'time_series',
'PUT AVG'
),
]
),
RgwOverviewPanel(
'Total Requests/sec by RGW Instance',
'',
'none',
'short',
'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))',
'{{rgw_host}}',
8,
1,
7,
7
),
RgwOverviewPanel(
'GET Latencies by RGW Instance',
'Latencies are shown stacked, without a y-axis, to provide a visual indication of GET latency imbalance across RGW hosts',
's',
'short',
'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
'{{rgw_host}}',
15,
1,
6,
7
),
RgwOverviewPanel(
'Bandwidth Consumed by Type',
'Total bytes transferred in/out of all radosgw instances within the cluster',
'bytes',
'short',
'sum(rate(ceph_rgw_get_b[30s]))',
'GETs',
0,
8,
8,
6
).addTargets(
[u.addTargetSchema('sum(rate(ceph_rgw_put_b[30s]))',
1,
'time_series',
'PUTs')]
),
RgwOverviewPanel(
'Bandwidth by RGW Instance',
'Total bytes transferred in/out through get/put operations, by radosgw instance',
'bytes',
'short',
'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
'{{rgw_host}}',
8,
8,
7,
6
),
RgwOverviewPanel(
'PUT Latencies by RGW Instance',
'Latencies are shown stacked, without a y-axis, to provide a visual indication of PUT latency imbalance across RGW hosts',
's',
'short',
'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
'{{rgw_host}}',
15,
8,
6,
6
),
u.addRowSchema(
false, true, 'RGW Overview - HAProxy Metrics'
) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
RgwOverviewPanel(
'Total responses by HTTP code',
'',
'short',
'short',
'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)',
'Frontend {{ code }}',
0,
12,
5,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[u.addTargetSchema('sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', 1, 'time_series', 'Backend {{ code }}')]
)
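// Flip the backend series below the x-axis so frontend and backend
// response counts can be compared on a single panel.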
.addSeriesOverride([
{
alias: '/.*Back.*/',
transform: 'negative-Y',
},
{ alias: '/.*1.*/' },
{ alias: '/.*2.*/' },
{ alias: '/.*3.*/' },
{ alias: '/.*4.*/' },
{ alias: '/.*5.*/' },
{ alias: '/.*other.*/' },
]),
RgwOverviewPanel(
'Total requests / responses',
'',
'short',
'short',
'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
'Requests',
5,
12,
5,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
u.addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'),
u.addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'),
u.addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend redispatch'),
u.addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'),
u.addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'),
u.addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'),
]
)
.addSeriesOverride([
{
alias: '/.*Response.*/',
transform: 'negative-Y',
},
{
alias: '/.*Backend.*/',
transform: 'negative-Y',
},
]),
RgwOverviewPanel(
'Total number of connections',
'',
'short',
'short',
'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
'Front',
10,
12,
5,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
u.addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'),
u.addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back errors'),
]
)
.addSeriesOverride([
{
alias: '/.*Back.*/',
transform: 'negative-Y',
},
]),
RgwOverviewPanel(
'Current total of incoming / outgoing bytes',
'',
'short',
'short',
'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)',
'IN Front',
15,
12,
6,
12,
'$datasource',
true,
true,
true,
true,
true,
true
)
.addTargets(
[
u.addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'),
u.addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'),
u.addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back'),
]
)
.addSeriesOverride([
{
alias: '/.*OUT.*/',
transform: 'negative-Y',
},
]),
]),
'radosgw-detail.json':
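// Panel factory for the per-instance detail dashboard: two queries per
// panel, with aliasColors pinning fixed colours to named series.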
local RgwDetailsPanel(aliasColors,
title,
description,
formatY1,
formatY2,
expr1,
expr2,
legendFormat1,
legendFormat2,
x,
y,
w,
h) =
u.graphPanelSchema(aliasColors,
title,
description,
'null',
false,
formatY1,
formatY2,
null,
null,
0,
1,
'$datasource')
.addTargets(
[u.addTargetSchema(expr1, 1, 'time_series', legendFormat1), u.addTargetSchema(expr2, 1, 'time_series', legendFormat2)]
) + { gridPos: { x: x, y: y, w: w, h: h } };
u.dashboardSchema(
'RGW Instance Detail',
'',
'x5ARzZtmk',
'now-1h',
'15s',
16,
['overview'],
'',
{
refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
}
)
.addAnnotation(
u.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addRequired(
type='grafana', id='grafana', name='Grafana', version='5.0.0'
)
.addRequired(
type='panel',
id='grafana-piechart-panel',
name='Pie Chart',
version='1.3.3'
)
.addRequired(
type='panel', id='graph', name='Graph', version='5.0.0'
)
.addTemplate(
g.template.datasource('datasource',
'prometheus',
'default',
label='Data Source')
)
.addTemplate(
u.addTemplateSchema('rgw_servers',
'$datasource',
'label_values(ceph_rgw_metadata, ceph_daemon)',
1,
true,
1,
'',
'')
)
.addPanels([
u.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
RgwDetailsPanel(
{},
'$rgw_servers GET/PUT Latencies',
'',
's',
'short',
'sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'GET {{ceph_daemon}}',
'PUT {{ceph_daemon}}',
0,
1,
6,
8
),
RgwDetailsPanel(
{},
'Bandwidth by HTTP Operation',
'',
'bytes',
'short',
'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'GETs {{ceph_daemon}}',
'PUTs {{ceph_daemon}}',
6,
1,
7,
8
),
RgwDetailsPanel(
{
GETs: '#7eb26d',
Other: '#447ebc',
PUTs: '#eab839',
Requests: '#3f2b5b',
'Requests Failed': '#bf1b00',
},
'HTTP Request Breakdown',
'',
'short',
'short',
'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
'Requests Failed {{ceph_daemon}}',
'GETs {{ceph_daemon}}',
13,
1,
7,
8
)
.addTargets(
[
u.addTargetSchema(
'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'PUTs {{ceph_daemon}}'
),
u.addTargetSchema(
'(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'Other {{ceph_daemon}}'
),
]
),
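// Workload breakdown pie chart; requires the grafana-piechart-panel
// plugin declared via addRequired() above.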
u.addPieChartSchema(
{
GETs: '#7eb26d',
'Other (HEAD,POST,DELETE)': '#447ebc',
PUTs: '#eab839',
Requests: '#3f2b5b',
Failures: '#bf1b00',
}, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current'
)
.addTarget(u.addTargetSchema(
'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'Failures {{ceph_daemon}}'
))
.addTarget(u.addTargetSchema(
'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'GETs {{ceph_daemon}}'
))
.addTarget(u.addTargetSchema(
'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'PUTs {{ceph_daemon}}'
))
.addTarget(u.addTargetSchema(
'(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
1,
'time_series',
'Other (DELETE,LIST) {{ceph_daemon}}'
)) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },
]),
},
}