ceph/monitoring/ceph-mixin/dashboards/multi-cluster.libsonnet

947 lines
27 KiB
Plaintext

local g = import 'grafonnet/grafana.libsonnet';
(import 'utils.libsonnet') {
'multi-cluster-overview.json':
$.dashboardSchema(
'Ceph - Multi-cluster',
'',
'BnxelG7Sx',
'now-1h',
'30s',
22,
$._config.dashboardTags,
''
)
.addAnnotation(
$.addAnnotationSchema(
1,
'-- Grafana --',
true,
true,
'rgba(0, 211, 255, 1)',
'Annotations & Alerts',
'dashboard'
)
)
.addTemplate(
g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
)
.addTemplate(
$.addTemplateSchema(
'cluster',
'$datasource',
'label_values(ceph_health_status, %s)' % $._config.clusterLabel,
1,
true,
1,
'cluster',
'(.*)',
if !$._config.showMultiCluster then 'variable' else '',
multi=true,
allValues='.*',
),
)
.addLinks([
$.addLinkSchema(
asDropdown=true,
icon='external link',
includeVars=true,
keepTime=true,
tags=[],
targetBlank=false,
title='Browse Dashboards',
tooltip='',
type='dashboards',
url=''
),
])
.addPanels([
$.addRowSchema(false, true, 'Clusters') + { gridPos: { x: 0, y: 1, w: 24, h: 1 } },
$.addStatPanel(
title='Status',
datasource='$datasource',
gridPosition={ x: 0, y: 2, w: 5, h: 7 },
graphMode='none',
colorMode='value',
orientation='auto',
justifyMode='center',
thresholdsMode='absolute',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'text', value: null },
])
.addOverrides(
[
{
matcher: { id: 'byName', options: 'Warning' },
properties: [
{
id: 'thresholds',
value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-yellow', value: 1 }] },
},
],
},
{
matcher: { id: 'byName', options: 'Error' },
properties: [
{
id: 'thresholds',
value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-red', value: 1 }] },
},
],
},
{
matcher: { id: 'byName', options: 'Healthy' },
properties: [
{
id: 'thresholds',
value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-green', value: 1 }] },
},
],
},
]
)
.addTargets([
$.addTargetSchema(
expr='count(ceph_health_status==0) or vector(0)',
datasource='$datasource',
legendFormat='Healthy',
),
$.addTargetSchema(
expr='count(ceph_health_status==1)',
datasource='$datasource',
legendFormat='Warning'
),
$.addTargetSchema(
expr='count(ceph_health_status==2)',
datasource='$datasource',
legendFormat='Error'
),
]),
$.addTableExtended(
datasource='$datasource',
title='Details',
gridPosition={ h: 7, w: 19, x: 5, y: 2 },
options={
footer: {
fields: '',
reducer: ['sum'],
countRows: false,
enablePagination: false,
show: false,
},
frameIndex: 1,
showHeader: true,
},
custom={ align: 'left', cellOptions: { type: 'color-text' }, filterable: false, inspect: false },
thresholds={
mode: 'absolute',
steps: [
{ color: 'text' },
],
},
overrides=[
{
matcher: { id: 'byName', options: 'Value #A' },
properties: [
{ id: 'mappings', value: [{ options: { '0': { color: 'semi-dark-green', index: 2, text: 'Healthy' }, '1': { color: 'semi-dark-yellow', index: 0, text: 'Warning' }, '2': { color: 'semi-dark-red', index: 1, text: 'Error' } }, type: 'value' }] },
],
},
{
matcher: { id: 'byName', options: 'Capacity Used' },
properties: [
{ id: 'unit', value: 'bytes' },
],
},
{
matcher: { id: 'byName', options: 'Cluster' },
properties: [
{ id: 'links', value: [{ title: '', url: '/d/edtb0oxdq/ceph-cluster?var-cluster=${__data.fields.Cluster}&${DS_PROMETHEUS:queryparam}' }] },
],
},
{
matcher: { id: 'byName', options: 'Alerts' },
properties: [
{ id: 'mappings', value: [{ options: { match: null, result: { index: 0, text: '0' } }, type: 'special' }] },
],
},
],
pluginVersion='9.4.7'
)
.addTransformations([
{
id: 'joinByField',
options: { byField: 'cluster', mode: 'outer' },
},
{
id: 'organize',
options: {
excludeByName: {
'Time 1': true,
'Time 2': true,
'Time 3': true,
'Time 4': true,
'Time 5': true,
'Time 6': true,
'Value #B': true,
'__name__ 1': true,
'__name__ 2': true,
'__name__ 3': true,
ceph_daemon: true,
device_class: true,
hostname: true,
'instance 1': true,
'instance 2': true,
'instance 3': true,
'job 1': true,
'job 2': true,
'job 3': true,
'replica 1': true,
'replica 2': true,
'replica 3': true,
},
indexByName: {
'Time 1': 8,
'Time 2': 13,
'Time 3': 21,
'Time 4': 7,
'Time 5': 22,
'Time 6': 23,
'Value #A': 1,
'Value #B': 20,
'Value #C': 3,
'Value #D': 6,
'__name__ 1': 9,
'__name__ 2': 14,
'__name__ 3': 24,
ceph_daemon: 15,
ceph_version: 2,
cluster: 0,
device_class: 25,
hostname: 16,
'instance 1': 10,
'instance 2': 17,
'instance 3': 26,
'job 1': 11,
'job 2': 18,
'job 3': 27,
'replica 1': 12,
'replica 2': 19,
'replica 3': 28,
},
renameByName: {
'Value #A': 'Status',
'Value #C': 'Alerts',
'Value #D': 'Capacity Used',
ceph_version: 'Version',
cluster: 'Cluster',
},
},
},
]).addTargets([
$.addTargetSchema(
expr='ceph_health_status',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=true,
interval='',
legendFormat='__auto',
range=false,
),
$.addTargetSchema(
expr='ceph_mgr_metadata',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=true,
interval='',
legendFormat='__auto',
range=false,
),
$.addTargetSchema(
expr='count(ALERTS{alertstate="firing", cluster=~"$cluster"})',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=true,
interval='',
legendFormat='__auto',
range=false,
),
$.addTargetSchema(
expr='ceph_cluster_by_class_total_used_bytes',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=true,
interval='',
legendFormat='__auto',
range=false,
),
]),
$.addRowSchema(false, true, 'Overview') + { gridPos: { x: 0, y: 9, w: 24, h: 1 } },
$.addStatPanel(
title='Cluster Count',
datasource='$datasource',
gridPosition={ x: 0, y: 10, w: 3, h: 4 },
graphMode='none',
colorMode='value',
orientation='auto',
justifyMode='center',
thresholdsMode='absolute',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'text', value: null },
{ color: 'red', value: 80 },
])
.addTargets([
$.addTargetSchema(
expr='count(ceph_health_status{cluster=~"$cluster"}) or vector(0)',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=true,
interval='',
legendFormat='__auto',
range=false,
),
]),
$.addGaugePanel(
title='Capacity Used',
gridPosition={ h: 8, w: 4, x: 3, y: 10 },
unit='percentunit',
max=1,
min=0,
interval='1m',
pluginVersion='9.4.7'
)
.addThresholds([
{ color: 'green', value: null },
{ color: 'semi-dark-yellow', value: 0.75 },
{ color: 'red', value: 0.85 },
])
.addTarget($.addTargetSchema(
expr='sum(ceph_cluster_total_used_bytes{cluster=~"$cluster"}) / sum(ceph_cluster_total_bytes{cluster=~"$cluster"})',
instant=true,
legendFormat='Used',
datasource='$datasource',
)),
$.addStatPanel(
title='Total Capacity',
datasource='$datasource',
gridPosition={ x: 7, y: 10, w: 3, h: 4 },
graphMode='area',
colorMode='none',
orientation='auto',
justifyMode='auto',
thresholdsMode='absolute',
unit='bytes',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='sum(ceph_cluster_total_bytes{cluster=~"$cluster"})',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=false,
interval='',
legendFormat='__auto',
range=true,
),
]),
$.addStatPanel(
title='OSDs',
datasource='$datasource',
gridPosition={ x: 10, y: 10, w: 3, h: 4 },
graphMode='area',
colorMode='none',
orientation='auto',
justifyMode='auto',
thresholdsMode='absolute',
unit='none',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='count(ceph_osd_metadata{cluster=~"$cluster"})',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=false,
interval='',
legendFormat='__auto',
range=true,
),
]),
$.addStatPanel(
title='Hosts',
datasource='$datasource',
gridPosition={ x: 13, y: 10, w: 3, h: 4 },
graphMode='area',
colorMode='none',
orientation='auto',
justifyMode='auto',
thresholdsMode='absolute',
unit='none',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='count(sum by (hostname) (ceph_osd_metadata{cluster=~"$cluster"}))',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=false,
interval='',
legendFormat='__auto',
range=true,
),
]),
$.addStatPanel(
title='Client IOPS',
datasource='$datasource',
gridPosition={ x: 16, y: 10, w: 4, h: 4 },
graphMode='area',
colorMode='none',
orientation='auto',
justifyMode='center',
thresholdsMode='absolute',
unit='ops',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='sum(irate(ceph_pool_wr{cluster=~"$cluster"}[$__interval]))',
datasource={ type: 'prometheus', uid: '$datasource' },
hide=false,
exemplar=false,
instant=false,
legendFormat='Write',
range=true,
),
$.addTargetSchema(
expr='sum(irate(ceph_pool_rd{cluster=~"$cluster"}[$__interval]))',
datasource={ type: 'prometheus', uid: '$datasource' },
hide=false,
exemplar=false,
legendFormat='Read',
range=true,
),
]),
$.addStatPanel(
title='OSD Latencies',
datasource='$datasource',
gridPosition={ x: 20, y: 10, w: 4, h: 4 },
graphMode='area',
colorMode='none',
orientation='auto',
justifyMode='center',
thresholdsMode='absolute',
unit='ms',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='avg(ceph_osd_apply_latency_ms{cluster=~"$cluster"})',
datasource={ type: 'prometheus', uid: '$datasource' },
hide=false,
exemplar=false,
instant=false,
legendFormat='Apply',
range=true,
),
$.addTargetSchema(
expr='avg(ceph_osd_commit_latency_ms{cluster=~"$cluster"})',
datasource={ type: 'prometheus', uid: '$datasource' },
hide=false,
exemplar=false,
legendFormat='Commit',
range=true,
),
]),
$.addStatPanel(
title='Alert Count',
datasource='$datasource',
gridPosition={ x: 0, y: 14, w: 3, h: 4 },
graphMode='none',
colorMode='value',
orientation='auto',
justifyMode='center',
thresholdsMode='absolute',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'text', value: null },
{ color: 'red', value: 80 },
])
.addTargets([
$.addTargetSchema(
expr='count(ALERTS{alertstate="firing", cluster=~"$cluster"}) or vector(0)',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=true,
interval='',
legendFormat='__auto',
range=false,
),
]),
$.addStatPanel(
title='Total Used',
datasource='$datasource',
gridPosition={ x: 7, y: 14, w: 3, h: 4 },
graphMode='area',
colorMode='none',
orientation='auto',
justifyMode='auto',
thresholdsMode='absolute',
unit='bytes',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='sum(ceph_cluster_total_used_bytes{cluster=~"$cluster"})',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=false,
interval='',
legendFormat='__auto',
range=true,
),
]),
$.addStatPanel(
title='Capacity Prediction',
datasource='$datasource',
gridPosition={ x: 10, y: 14, w: 3, h: 4 },
graphMode='none',
colorMode='none',
orientation='auto',
justifyMode='auto',
unit='s',
thresholdsMode='absolute',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='predict_linear(avg(increase(ceph_cluster_total_used_bytes{cluster=~"${Cluster}"}[1d]))[7d:1h],120)',
datasource={ type: 'prometheus', uid: '$datasource' },
hide=false,
exemplar=false,
legendFormat='__auto',
range=true,
),
]),
$.addStatPanel(
title='Pools',
datasource='$datasource',
gridPosition={ x: 13, y: 14, w: 3, h: 4 },
graphMode='area',
colorMode='none',
orientation='auto',
justifyMode='auto',
thresholdsMode='absolute',
unit='none',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='count(ceph_pool_metadata{cluster=~"$cluster"})',
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=false,
interval='',
legendFormat='__auto',
range=true,
),
]),
$.addStatPanel(
title='Client Bandwidth',
datasource='$datasource',
gridPosition={ x: 16, y: 14, w: 4, h: 4 },
graphMode='area',
colorMode='none',
orientation='auto',
justifyMode='center',
thresholdsMode='absolute',
unit='binBps',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='sum(irate(ceph_pool_rd_bytes{cluster=~"$cluster"}[$__interval]))',
datasource={ type: 'prometheus', uid: '$datasource' },
hide=false,
exemplar=false,
instant=false,
legendFormat='Write',
range=true,
),
$.addTargetSchema(
expr='sum(irate(ceph_pool_wr_bytes{cluster=~"$cluster"}[$__interval]))',
datasource={ type: 'prometheus', uid: '$datasource' },
hide=false,
exemplar=false,
legendFormat='Read',
range=true,
),
]),
$.addStatPanel(
title='Recovery Rate',
datasource='$datasource',
gridPosition={ x: 20, y: 14, w: 4, h: 4 },
graphMode='area',
colorMode='none',
orientation='auto',
justifyMode='center',
thresholdsMode='absolute',
unit='binBps',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'green', value: null },
])
.addTargets([
$.addTargetSchema(
expr='sum(irate(ceph_osd_recovery_ops{cluster=~"$cluster"}[$__interval]))',
datasource={ type: 'prometheus', uid: '$datasource' },
hide=false,
exemplar=false,
instant=false,
legendFormat='Write',
range=true,
),
]),
$.addRowSchema(false, true, 'Alerts', collapsed=true)
.addPanels([
$.addStatPanel(
title='Status',
datasource='$datasource',
gridPosition={ x: 0, y: 19, w: 5, h: 7 },
graphMode='area',
colorMode='value',
orientation='auto',
justifyMode='center',
thresholdsMode='absolute',
pluginVersion='9.4.7',
).addThresholds([
{ color: 'text', value: null },
])
.addOverrides(
[
{
matcher: { id: 'byName', options: 'Critical' },
properties: [
{
id: 'thresholds',
value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-red', value: 1 }] },
},
],
},
{
matcher: { id: 'byName', options: 'Warning' },
properties: [
{
id: 'thresholds',
value: { mode: 'absolute', steps: [{ color: 'text', value: null }, { color: 'semi-dark-yellow', value: 1 }] },
},
],
},
]
)
.addTargets([
$.addTargetSchema(
expr='count(ALERTS{alertstate="firing",severity="critical", cluster=~"$cluster"}) OR vector(0)',
datasource='$datasource',
legendFormat='Critical',
instant=true,
range=false
),
$.addTargetSchema(
expr='count(ALERTS{alertstate="firing",severity="warning", cluster=~"$cluster"}) OR vector(0)',
datasource='$datasource',
legendFormat='Warning',
instant=true,
range=false
),
]),
$.addTableExtended(
datasource='$datasource',
title='Alerts',
gridPosition={ h: 7, w: 19, x: 5, y: 19 },
options={
footer: {
fields: '',
reducer: ['sum'],
countRows: false,
enablePagination: false,
show: false,
},
frameIndex: 1,
showHeader: true,
sortBy: [{ desc: false, displayName: 'Severity' }],
},
custom={ align: 'auto', cellOptions: { type: 'auto' }, filterable: true, inspect: false },
thresholds={
mode: 'absolute',
steps: [
{ color: 'green' },
{ color: 'red', value: 80 },
],
},
pluginVersion='9.4.7'
)
.addTransformations([
{
id: 'joinByField',
options: { byField: 'cluster', mode: 'outer' },
},
{
id: 'organize',
options: {
excludeByName: {
Time: true,
Value: true,
__name__: true,
instance: true,
job: true,
oid: true,
replica: true,
type: true,
},
indexByName: {
Time: 0,
Value: 9,
__name__: 1,
alertname: 2,
alertstate: 4,
cluster: 3,
instance: 6,
job: 7,
severity: 5,
type: 8,
},
renameByName: {
alertname: 'Name',
alertstate: 'State',
cluster: 'Cluster',
severity: 'Severity',
},
},
},
]).addTargets([
$.addTargetSchema(
expr='ALERTS{alertstate="firing", %(matchers)s}' % $.matchers(),
datasource={ type: 'prometheus', uid: '$datasource' },
format='table',
hide=false,
exemplar=false,
instant=true,
interval='',
legendFormat='__auto',
range=false,
),
]),
$.addAlertListPanel(
title='Alerts(Grouped)',
datasource={
type: 'datasource',
uid: 'grafana',
},
gridPosition={ h: 8, w: 24, x: 0, y: 26 },
alertName='',
dashboardAlerts=false,
groupBy=[],
groupMode='default',
maxItems=20,
sortOrder=1,
stateFilter={
'error': true,
firing: true,
noData: false,
normal: false,
pending: true,
},
),
]) + { gridPos: { x: 0, y: 18, w: 24, h: 1 } },
$.addRowSchema(false, true, 'Cluster Stats', collapsed=true)
.addPanels([
$.timeSeriesPanel(
lineInterpolation='linear',
lineWidth=1,
drawStyle='line',
axisPlacement='auto',
title='Top 5 - Capacity Utilization(%)',
datasource='$datasource',
gridPosition={ h: 7, w: 8, x: 0, y: 30 },
fillOpacity=0,
pointSize=5,
showPoints='auto',
unit='percentunit',
displayMode='table',
showLegend=true,
placement='bottom',
tooltip={ mode: 'multi', sort: 'desc' },
stackingMode='none',
spanNulls=false,
decimals=2,
thresholdsMode='percentage',
sortBy='Last',
sortDesc=true
)
.addCalcs(['last'])
.addThresholds([
{ color: 'green' },
])
.addTargets(
[
$.addTargetSchema(
expr='topk(5, ceph_cluster_total_used_bytes/ceph_cluster_total_bytes)',
datasource='$datasource',
instant=false,
legendFormat='{{cluster}}',
step=300,
range=true,
),
]
),
$.timeSeriesPanel(
lineInterpolation='linear',
lineWidth=1,
drawStyle='line',
axisPlacement='auto',
title='Top 5 - Cluster IOPS',
datasource='$datasource',
gridPosition={ h: 7, w: 8, x: 8, y: 30 },
fillOpacity=0,
pointSize=5,
showPoints='auto',
unit='ops',
displayMode='table',
showLegend=true,
placement='bottom',
tooltip={ mode: 'multi', sort: 'desc' },
stackingMode='none',
spanNulls=false,
decimals=2,
thresholdsMode='percentage',
sortBy='Last',
sortDesc=true
)
.addCalcs(['last'])
.addThresholds([
{ color: 'green' },
])
.addTargets(
[
$.addTargetSchema(
expr='topk(10, sum by (cluster) (irate(ceph_osd_op_w[$__interval])) \n+ sum by (cluster) (irate(ceph_osd_op_r[$__interval])) )',
datasource='$datasource',
instant=false,
legendFormat='{{cluster}}',
step=300,
range=true,
),
]
),
$.timeSeriesPanel(
lineInterpolation='linear',
lineWidth=1,
drawStyle='line',
axisPlacement='auto',
title='Top 10 - Capacity Utilization(%) by Pool',
datasource='$datasource',
gridPosition={ h: 7, w: 8, x: 16, y: 30 },
fillOpacity=0,
pointSize=5,
showPoints='auto',
unit='percentunit',
displayMode='table',
showLegend=true,
placement='bottom',
tooltip={ mode: 'multi', sort: 'desc' },
stackingMode='none',
spanNulls=false,
decimals=2,
thresholdsMode='absolute',
sortBy='Last',
sortDesc=true
)
.addCalcs(['last'])
.addThresholds([
{ color: 'green' },
])
.addTargets(
[
$.addTargetSchema(
expr='topk(10, ceph_pool_bytes_used{%(matchers)s}/ceph_pool_max_avail{%(matchers)s} * on(pool_id, cluster) group_left(instance, name) ceph_pool_metadata{%(matchers)s})' % $.matchers(),
datasource='$datasource',
instant=false,
legendFormat='{{cluster}} - {{name}}',
step=300,
range=true,
),
]
),
]) + { gridPos: { x: 0, y: 29, w: 24, h: 1 } },
]),
}