467 lines
24 KiB
Plaintext
467 lines
24 KiB
Plaintext
local grafana = import 'github.com/grafana/grafonnet-lib/grafonnet/grafana.libsonnet';
|
|
local dashboard = grafana.dashboard;
|
|
local row = grafana.row;
|
|
local prometheus = grafana.prometheus;
|
|
local template = grafana.template;
|
|
local graphPanel = grafana.graphPanel;
|
|
|
|
local c = import '../config.libsonnet';
|
|
|
|
// Template variable of type 'datasource' named 'datasource', queried with
// 'prometheus' and preselected to 'default'. Panels and other templates in
// this file reference it as '$datasource'.
local datasourceTemplate = {
  name: 'datasource',
  label: 'Data Source',
  type: 'datasource',
  query: 'prometheus',
  regex: '',
  options: [],
  current: { text: 'default', value: 'default' },
  hide: 0,
  refresh: 1,
};
|
|
|
|
// Target-less base panel for CPU utilisation: a stacked graph with fill 10,
// 'percentunit' format, span 6 and the legend hidden. Dashboards attach their
// own queries with .addTarget().
local CPUUtilisation =
  graphPanel.new(
    'CPU Utilisation',
    datasource='$datasource',
    format='percentunit',
    span=6,
    fill=10,
    stack=true,
    legend_show=false,
  )
  // NOTE(review): sort 2 presumably orders tooltip entries by decreasing
  // value - confirm against the Grafana graph panel schema.
  + { tooltip+: { sort: 2 } };
|
|
|
|
// Target-less base panel for CPU saturation, titled as load1 per CPU. Same
// layout as the utilisation panels: stacked, fill 10, 'percentunit', span 6,
// legend hidden.
//
// TODO: Is this a useful panel? At least there should be some explanation of
// how load average relates to the "CPU saturation" in the title.
local CPUSaturation =
  graphPanel.new(
    'CPU Saturation (Load1 per CPU)',
    datasource='$datasource',
    format='percentunit',
    span=6,
    fill=10,
    stack=true,
    legend_show=false,
  )
  // NOTE(review): sort 2 presumably orders tooltip entries by decreasing
  // value - confirm against the Grafana graph panel schema.
  + { tooltip+: { sort: 2 } };
|
|
|
|
// Target-less base panel for memory utilisation: stacked graph, fill 10,
// 'percentunit' format, span 6, legend hidden. Targets are added per
// dashboard via .addTarget().
local memoryUtilisation =
  graphPanel.new(
    'Memory Utilisation',
    datasource='$datasource',
    format='percentunit',
    span=6,
    fill=10,
    stack=true,
    legend_show=false,
  )
  // NOTE(review): sort 2 presumably orders tooltip entries by decreasing
  // value - confirm against the Grafana graph panel schema.
  + { tooltip+: { sort: 2 } };
|
|
|
|
// Target-less base panel for memory saturation (major page faults): stacked
// graph, fill 10, span 6, legend hidden.
local memorySaturation =
  graphPanel.new(
    'Memory Saturation (Major Page Faults)',
    datasource='$datasource',
    // NOTE(review): 'rds' may not be a recognised Grafana unit id (reads/sec
    // is usually 'rps') - confirm how the target Grafana version renders it.
    format='rds',
    span=6,
    fill=10,
    stack=true,
    legend_show=false,
  )
  // NOTE(review): sort 2 presumably orders tooltip entries by decreasing
  // value - confirm against the Grafana graph panel schema.
  + { tooltip+: { sort: 2 } };
|
|
|
|
// Target-less base panel for network throughput in 'Bps'. Series whose alias
// matches /Receive/ go to stack 'A'; series matching /Transmit/ go to stack
// 'B' and are drawn with the 'negative-Y' transform, so the two directions
// are stacked separately.
local networkUtilisation =
  graphPanel.new(
    'Network Utilisation (Bytes Receive/Transmit)',
    datasource='$datasource',
    format='Bps',
    span=6,
    fill=10,
    stack=true,
    legend_show=false,
  )
  .addSeriesOverride({ alias: '/Receive/', stack: 'A' })
  .addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' })
  // NOTE(review): sort 2 presumably orders tooltip entries by decreasing
  // value - confirm against the Grafana graph panel schema.
  + { tooltip+: { sort: 2 } };
|
|
|
|
// Target-less base panel for network saturation (receive/transmit drops):
// stacked graph, fill 10, span 6, legend hidden. Receive series go to stack
// 'A'; Transmit series go to stack 'B' with the 'negative-Y' transform.
local networkSaturation =
  graphPanel.new(
    'Network Saturation (Drops Receive/Transmit)',
    datasource='$datasource',
    span=6,
    // The drop metrics are packet-drop rates, so label the axis packets/sec
    // ('pps') instead of bytes/sec.
    format='pps',
    stack=true,
    fill=10,
    legend_show=false,
  )
  // The alias regexes carry no leading space so that they match both the
  // bare 'Receive' / 'Transmit' legends and the '{{...}} Receive' /
  // '{{...}} Transmit' legends used by the dashboards in this file. With a
  // leading space the bare legends were never matched and Transmit drops
  // were not mirrored below the axis.
  .addSeriesOverride({ alias: '/Receive/', stack: 'A' })
  .addSeriesOverride({ alias: '/Transmit/', stack: 'B', transform: 'negative-Y' })
  // NOTE(review): sort 2 presumably orders tooltip entries by decreasing
  // value - confirm against the Grafana graph panel schema.
  { tooltip+: { sort: 2 } };
|
|
|
|
// Target-less base panel for disk IO utilisation: stacked graph, fill 10,
// 'percentunit' format, span 6, legend hidden.
local diskIOUtilisation =
  graphPanel.new(
    'Disk IO Utilisation',
    datasource='$datasource',
    format='percentunit',
    span=6,
    fill=10,
    stack=true,
    legend_show=false,
  )
  // NOTE(review): sort 2 presumably orders tooltip entries by decreasing
  // value - confirm against the Grafana graph panel schema.
  + { tooltip+: { sort: 2 } };
|
|
|
|
// Target-less base panel for disk IO saturation: stacked graph, fill 10,
// 'percentunit' format, span 6, legend hidden.
local diskIOSaturation =
  graphPanel.new(
    'Disk IO Saturation',
    datasource='$datasource',
    format='percentunit',
    span=6,
    fill=10,
    stack=true,
    legend_show=false,
  )
  // NOTE(review): sort 2 presumably orders tooltip entries by decreasing
  // value - confirm against the Grafana graph panel schema.
  + { tooltip+: { sort: 2 } };
|
|
|
|
// Target-less base panel for disk space utilisation. Unlike the other panels
// in this file it uses span 12 instead of 6; otherwise the same stacked,
// fill-10, 'percentunit', legend-hidden layout.
local diskSpaceUtilisation =
  graphPanel.new(
    'Disk Space Utilisation',
    datasource='$datasource',
    format='percentunit',
    span=12,
    fill=10,
    stack=true,
    legend_show=false,
  )
  // NOTE(review): sort 2 presumably orders tooltip entries by decreasing
  // value - confirm against the Grafana graph panel schema.
  + { tooltip+: { sort: 2 } };
|
|
|
|
{
|
|
// Hidden field holding the 'cluster' template variable. Its values are the
// values of the configured cluster label on node_time_seconds. The `hide`
// argument is '' when $._config.showMultiCluster is set and '2' otherwise.
_clusterTemplate::
  local cfg = $._config;
  template.new(
    name='cluster',
    datasource='$datasource',
    query='label_values(node_time_seconds, %s)' % cfg.clusterLabel,
    current='',
    hide=(if cfg.showMultiCluster then '' else '2'),
    refresh=2,
    includeAll=false,
    sort=1
  ),
|
|
|
|
grafanaDashboards+:: {
|
|
// Per-node USE dashboard. It adds an 'instance' template variable and one row
// each for CPU, Memory, Network, Disk IO and Disk Space; every target is
// restricted to the selected $instance and $cluster.
'node-rsrc-use.json':
  local cfg = $._config;

  // Builds '<metric>{<node selector>, instance, cluster} != 0'. `metric` may
  // contain %(...)s placeholders; they are resolved from cfg together with
  // the selector placeholders.
  local nodeQuery(metric) =
    (metric + '{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} != 0') % cfg;

  // Wraps nodeQuery() in a Prometheus target with the given legend.
  local nodeTarget(metric, legend) =
    prometheus.target(nodeQuery(metric), legendFormat=legend);

  dashboard.new(
    '%sUSE Method / Node' % cfg.dashboardNamePrefix,
    time_from='now-1h',
    tags=(cfg.dashboardTags),
    timezone='utc',
    refresh='30s',
    graphTooltip='shared_crosshair'
  )
  .addTemplate(datasourceTemplate)
  .addTemplate($._clusterTemplate)
  .addTemplate(
    template.new(
      'instance',
      '$datasource',
      'label_values(node_exporter_build_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}, instance)' % cfg,
      refresh='time',
      sort=1
    )
  )
  .addRow(
    row.new('CPU')
    .addPanel(CPUUtilisation.addTarget(nodeTarget('instance:node_cpu_utilisation:rate%(rateInterval)s', 'Utilisation')))
    .addPanel(CPUSaturation.addTarget(nodeTarget('instance:node_load1_per_cpu:ratio', 'Saturation')))
  )
  .addRow(
    row.new('Memory')
    .addPanel(memoryUtilisation.addTarget(nodeTarget('instance:node_memory_utilisation:ratio', 'Utilisation')))
    .addPanel(memorySaturation.addTarget(nodeTarget('instance:node_vmstat_pgmajfault:rate%(rateInterval)s', 'Major page Faults')))
  )
  .addRow(
    row.new('Network')
    .addPanel(
      networkUtilisation
      .addTarget(nodeTarget('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s', 'Receive'))
      .addTarget(nodeTarget('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s', 'Transmit'))
    )
    .addPanel(
      networkSaturation
      .addTarget(nodeTarget('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s', 'Receive'))
      .addTarget(nodeTarget('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s', 'Transmit'))
    )
  )
  .addRow(
    row.new('Disk IO')
    .addPanel(diskIOUtilisation.addTarget(nodeTarget('instance_device:node_disk_io_time_seconds:rate%(rateInterval)s', '{{device}}')))
    .addPanel(diskIOSaturation.addTarget(nodeTarget('instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s', '{{device}}')))
  )
  .addRow(
    row.new('Disk Space')
    .addPanel(
      // 1 - avail/size per device, largest first. The filesystem filter
      // comes from the configurable %(fsSelector)s, matching the cluster
      // dashboards, instead of a hard-coded fstype!="" matcher.
      diskSpaceUtilisation.addTarget(prometheus.target(
        |||
          sort_desc(1 -
            (
              max without (mountpoint, fstype) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance", %(clusterLabel)s="$cluster"})
              /
              max without (mountpoint, fstype) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance", %(clusterLabel)s="$cluster"})
            ) != 0
          )
        ||| % cfg, legendFormat='{{device}}'
      ))
    )
  ),
|
|
|
|
// Cluster-wide USE dashboard: the same five rows as the per-node dashboard,
// but each target is scoped only to the selected $cluster and yields one
// series per instance (or per instance and device for Disk IO).
'node-cluster-rsrc-use.json':
  local cfg = $._config;

  // Builds '<metric>{<node selector>, cluster} != 0'. `metric` may contain
  // %(...)s placeholders, which are resolved from cfg.
  local clusterQuery(metric) =
    (metric + '{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"} != 0') % cfg;

  dashboard.new(
    '%sUSE Method / Cluster' % cfg.dashboardNamePrefix,
    time_from='now-1h',
    tags=(cfg.dashboardTags),
    timezone='utc',
    refresh='30s',
    graphTooltip='shared_crosshair'
  )
  .addTemplate(datasourceTemplate)
  .addTemplate($._clusterTemplate)
  .addRow(
    row.new('CPU')
    .addPanel(
      CPUUtilisation
      // Per-instance utilisation weighted by CPU count, divided by the
      // total CPU count of the cluster.
      .addTarget(prometheus.target(
        |||
          ((
            instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            *
            instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
          ) != 0 )
          / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
        ||| % cfg, legendFormat='{{ instance }}'
      ))
    )
    .addPanel(
      CPUSaturation
      // Per-instance load ratio divided by the number of series.
      .addTarget(prometheus.target(
        |||
          (
            instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
          ) != 0
        ||| % cfg, legendFormat='{{instance}}'
      ))
    )
  )
  .addRow(
    row.new('Memory')
    .addPanel(
      memoryUtilisation
      .addTarget(prometheus.target(
        |||
          (
            instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
          ) != 0
        ||| % cfg, legendFormat='{{instance}}',
      ))
    )
    // Uses the '!= 0' filter like every other target in this dashboard, so
    // all-zero series are dropped here as well.
    .addPanel(memorySaturation.addTarget(prometheus.target(clusterQuery('instance:node_vmstat_pgmajfault:rate%(rateInterval)s'), legendFormat='{{instance}}')))
  )
  .addRow(
    row.new('Network')
    .addPanel(
      networkUtilisation
      .addTarget(prometheus.target(clusterQuery('instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s'), legendFormat='{{instance}} Receive'))
      .addTarget(prometheus.target(clusterQuery('instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s'), legendFormat='{{instance}} Transmit'))
    )
    .addPanel(
      networkSaturation
      .addTarget(prometheus.target(clusterQuery('instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s'), legendFormat='{{instance}} Receive'))
      .addTarget(prometheus.target(clusterQuery('instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s'), legendFormat='{{instance}} Transmit'))
    )
  )
  .addRow(
    row.new('Disk IO')
    .addPanel(
      diskIOUtilisation
      .addTarget(prometheus.target(
        |||
          (
            instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
          ) != 0
        ||| % cfg, legendFormat='{{instance}} {{device}}'
      ))
    )
    .addPanel(
      diskIOSaturation
      .addTarget(prometheus.target(
        |||
          (
            instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}
            / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster"}))
          ) != 0
        ||| % cfg, legendFormat='{{instance}} {{device}}'
      ))
    )
  )
  .addRow(
    row.new('Disk Space')
    .addPanel(
      diskSpaceUtilisation
      // Used bytes (size - avail) summed over devices per instance, divided
      // by the total filesystem size selected for the cluster.
      .addTarget(prometheus.target(
        |||
          sum without (device) (
            max without (fstype, mountpoint) ((
              node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(clusterLabel)s="$cluster"}
              -
              node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(clusterLabel)s="$cluster"}
            ) != 0)
          )
          / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, %(clusterLabel)s="$cluster"})))
        ||| % cfg, legendFormat='{{instance}}'
      ))
    )
  ),
|
|
} +
|
|
// The multi-cluster dashboard is only contributed when
// $._config.showMultiCluster is set; otherwise an empty object is merged in.
// It has no cluster/instance template variables: every target aggregates
// with sum(...) by the configured cluster label.
if $._config.showMultiCluster then {
  'node-multicluster-rsrc-use.json':
    local cfg = $._config;

    // Legend strings built from the configured cluster label.
    local clusterLegend = '{{%(clusterLabel)s}}' % cfg;
    local receiveLegend = '{{%(clusterLabel)s}} Receive' % cfg;
    local transmitLegend = '{{%(clusterLabel)s}} Transmit' % cfg;
    local deviceLegend = '{{%(clusterLabel)s}} {{device}}' % cfg;

    // Resolves the %(...)s placeholders of `expr` from cfg and wraps the
    // result in a Prometheus target with the given legend.
    local clusterTarget(expr, legend) =
      prometheus.target(expr % cfg, legendFormat=legend);

    dashboard.new(
      '%sUSE Method / Multi-cluster' % cfg.dashboardNamePrefix,
      time_from='now-1h',
      tags=(cfg.dashboardTags),
      timezone='utc',
      refresh='30s',
      graphTooltip='shared_crosshair'
    )
    .addTemplate(datasourceTemplate)
    .addRow(
      row.new('CPU')
      .addPanel(
        CPUUtilisation
        .addTarget(clusterTarget(
          |||
            sum(
              ((
                instance:node_cpu_utilisation:rate%(rateInterval)s{%(nodeExporterSelector)s}
                *
                instance:node_num_cpu:sum{%(nodeExporterSelector)s}
              ) != 0)
              / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s}))
            ) by (%(clusterLabel)s)
          |||,
          clusterLegend
        ))
      )
      .addPanel(
        CPUSaturation
        .addTarget(clusterTarget(
          |||
            sum((
              instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
              / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}))
            ) != 0) by (%(clusterLabel)s)
          |||,
          clusterLegend
        ))
      )
    )
    .addRow(
      row.new('Memory')
      .addPanel(
        memoryUtilisation
        .addTarget(clusterTarget(
          |||
            sum((
              instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
              / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}))
            ) != 0) by (%(clusterLabel)s)
          |||,
          clusterLegend
        ))
      )
      .addPanel(
        memorySaturation
        .addTarget(clusterTarget(
          |||
            sum((
              instance:node_vmstat_pgmajfault:rate%(rateInterval)s{%(nodeExporterSelector)s}
            ) != 0) by (%(clusterLabel)s)
          |||,
          clusterLegend
        ))
      )
    )
    .addRow(
      row.new('Network')
      .addPanel(
        networkUtilisation
        .addTarget(clusterTarget(
          |||
            sum((
              instance:node_network_receive_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
            ) != 0) by (%(clusterLabel)s)
          |||,
          receiveLegend
        ))
        .addTarget(clusterTarget(
          |||
            sum((
              instance:node_network_transmit_bytes_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
            ) != 0) by (%(clusterLabel)s)
          |||,
          transmitLegend
        ))
      )
      .addPanel(
        networkSaturation
        .addTarget(clusterTarget(
          |||
            sum((
              instance:node_network_receive_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
            ) != 0) by (%(clusterLabel)s)
          |||,
          receiveLegend
        ))
        .addTarget(clusterTarget(
          |||
            sum((
              instance:node_network_transmit_drop_excluding_lo:rate%(rateInterval)s{%(nodeExporterSelector)s}
            ) != 0) by (%(clusterLabel)s)
          |||,
          transmitLegend
        ))
      )
    )
    .addRow(
      row.new('Disk IO')
      .addPanel(
        diskIOUtilisation
        .addTarget(clusterTarget(
          |||
            sum((
              instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
              / scalar(count(instance_device:node_disk_io_time_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
            ) != 0) by (%(clusterLabel)s, device)
          |||,
          deviceLegend
        ))
      )
      .addPanel(
        diskIOSaturation
        .addTarget(clusterTarget(
          |||
            sum((
              instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}
              / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate%(rateInterval)s{%(nodeExporterSelector)s}))
            ) != 0) by (%(clusterLabel)s, device)
          |||,
          deviceLegend
        ))
      )
    )
    .addRow(
      row.new('Disk Space')
      .addPanel(
        diskSpaceUtilisation
        // NOTE(review): the numerator takes max without (..., instance, pod)
        // before summing, while the denominator keeps instances apart -
        // confirm this asymmetry is intended.
        .addTarget(clusterTarget(
          |||
            sum (
              sum without (device) (
                max without (fstype, mountpoint, instance, pod) ((
                  node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
                ) != 0)
              )
              / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s})))
            ) by (%(clusterLabel)s)
          |||,
          clusterLegend
        ))
      )
    ),
} else {},
|
|
}
|