Address first batch of old review comments
Signed-off-by: beorn7 <beorn@grafana.com>
This commit is contained in:
parent
b3b47f2d07
commit
2180c2f3bf
|
@ -21,22 +21,23 @@ local gauge = promgrafonnet.gauge;
|
|||
)
|
||||
.addTarget(prometheus.target(
|
||||
|||
|
||||
1 - avg by (cpu) (irate(node_cpu{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
|
||||
1 - avg by (cpu) (irate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}[1m]))
|
||||
||| % $._config,
|
||||
legendFormat='{{cpu}}',
|
||||
intervalFactor=10,
|
||||
));
|
||||
|
||||
// TODO: Is this panel useful?
|
||||
local systemLoad =
|
||||
graphPanel.new(
|
||||
'System load',
|
||||
'Load Average',
|
||||
datasource='$datasource',
|
||||
span=6,
|
||||
format='percentunit',
|
||||
format='short',
|
||||
)
|
||||
.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 1m'))
|
||||
.addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 5m'))
|
||||
.addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='load 15m'));
|
||||
.addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='1m load average'))
|
||||
.addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='5m load average'))
|
||||
.addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='15m load average'));
|
||||
|
||||
local memoryGraph =
|
||||
graphPanel.new(
|
||||
|
@ -48,27 +49,27 @@ local gauge = promgrafonnet.gauge;
|
|||
.addTarget(prometheus.target(
|
||||
|||
|
||||
(
|
||||
node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
|
||||
node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
-
|
||||
node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}
|
||||
node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
-
|
||||
node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}
|
||||
node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
-
|
||||
node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}
|
||||
node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
)
|
||||
||| % $._config, legendFormat='memory used'
|
||||
))
|
||||
.addTarget(prometheus.target('node_memory_Buffers{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
|
||||
.addTarget(prometheus.target('node_memory_Cached{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
|
||||
.addTarget(prometheus.target('node_memory_MemFree{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
|
||||
.addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory buffers'))
|
||||
.addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory cached'))
|
||||
.addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % $._config, legendFormat='memory free'));
|
||||
|
||||
local memoryGauge = gauge.new(
|
||||
'Memory Usage',
|
||||
|||
|
||||
(
|
||||
node_memory_MemAvailable{%(nodeExporterSelector)s, instance="$instance"}
|
||||
node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
/
|
||||
node_memory_MemTotal{%(nodeExporterSelector)s, instance="$instance"}
|
||||
node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}
|
||||
)
|
||||
* 100
|
||||
||| % $._config,
|
||||
|
@ -80,9 +81,9 @@ local gauge = promgrafonnet.gauge;
|
|||
datasource='$datasource',
|
||||
span=9,
|
||||
)
|
||||
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_read_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
|
||||
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_bytes_written_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
|
||||
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_ms{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
|
||||
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='read'))
|
||||
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='written'))
|
||||
.addTarget(prometheus.target('sum by (instance) (irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance"}[1m]))' % $._config, legendFormat='io time')) +
|
||||
{
|
||||
seriesOverrides: [
|
||||
{
|
||||
|
@ -96,18 +97,19 @@ local gauge = promgrafonnet.gauge;
|
|||
],
|
||||
yaxes: [
|
||||
self.yaxe(format='bytes'),
|
||||
self.yaxe(format='ms'),
|
||||
self.yaxe(format='s'),
|
||||
],
|
||||
};
|
||||
|
||||
// TODO: Should this be partitioned by mountpoint?
|
||||
local diskSpaceUsage = gauge.new(
|
||||
'Disk Space Usage',
|
||||
|||
|
||||
100 -
|
||||
(
|
||||
sum(node_filesystem_free{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}
|
||||
sum(node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}
|
||||
/
|
||||
sum(node_filesystem_size{%(nodeExporterSelector)s, device!="rootfs", instance="$instance"}
|
||||
sum(node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s, instance="$instance"}
|
||||
* 100
|
||||
)
|
||||
||| % $._config,
|
||||
|
@ -120,7 +122,7 @@ local gauge = promgrafonnet.gauge;
|
|||
span=6,
|
||||
format='bytes',
|
||||
)
|
||||
.addTarget(prometheus.target('irate(node_network_receive_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
||||
.addTarget(prometheus.target('irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
||||
|
||||
local networkTransmitted =
|
||||
graphPanel.new(
|
||||
|
@ -129,7 +131,7 @@ local gauge = promgrafonnet.gauge;
|
|||
span=6,
|
||||
format='bytes',
|
||||
)
|
||||
.addTarget(prometheus.target('irate(node_network_transmit_bytes{%(nodeExporterSelector)s, instance="$instance", device!~"lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
||||
.addTarget(prometheus.target('irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[1m])' % $._config, legendFormat='{{device}}'));
|
||||
|
||||
dashboard.new('Nodes', time_from='now-1h')
|
||||
.addTemplate(
|
||||
|
@ -152,7 +154,7 @@ local gauge = promgrafonnet.gauge;
|
|||
template.new(
|
||||
'instance',
|
||||
'$datasource',
|
||||
'label_values(node_boot_time{%(nodeExporterSelector)s}, instance)' % $._config,
|
||||
'label_values(node_boot_time_seconds{%(nodeExporterSelector)s}, instance)' % $._config,
|
||||
refresh='time',
|
||||
)
|
||||
)
|
||||
|
|
|
@ -10,16 +10,30 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
|||
g.row('CPU')
|
||||
.addPanel(
|
||||
g.panel('CPU Utilisation') +
|
||||
g.queryPanel('instance:node_cpu_utilisation:avg1m * instance:node_num_cpu:sum / scalar(sum(instance:node_num_cpu:sum))', '{{instance}}', legendLink) +
|
||||
g.queryPanel(|||
|
||||
(
|
||||
instance:node_cpu_utilisation:avg1m
|
||||
*
|
||||
instance:node_num_cpu:sum
|
||||
/ ignoring (instance) group_left
|
||||
sum without (instance) (instance:node_num_cpu:sum)
|
||||
)
|
||||
|||, '{{instance}}', legendLink) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
)
|
||||
.addPanel(
|
||||
g.panel('CPU Saturation (Load1)') +
|
||||
// TODO: Is this a useful panel?
|
||||
g.panel('CPU Saturation (load1 per CPU)') +
|
||||
g.queryPanel(|||
|
||||
instance:node_cpu_saturation_load1: / scalar(sum(up{%(nodeExporterSelector)s}))
|
||||
||| % $._config, '{{instance}}', legendLink) +
|
||||
(
|
||||
instance:node_load1_per_cpu:ratio
|
||||
/ ignoring (instance) group_left
|
||||
count without (instance) (instance:node_load1_per_cpu:ratio)
|
||||
)
|
||||
|||, '{{instance}}', legendLink) +
|
||||
g.stack +
|
||||
// TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
)
|
||||
)
|
||||
|
@ -43,16 +57,26 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
|||
.addPanel(
|
||||
g.panel('Disk IO Utilisation') +
|
||||
// Full utilisation would be all disks on each node spending an average of
|
||||
// 1 sec per second doing I/O, normalize by node count for stacked charts
|
||||
g.queryPanel('instance:node_disk_utilisation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))' % $._config, '{{instance}}', legendLink) +
|
||||
// 1 second per second doing I/O, normalize by metric cardinality for stacked charts.
|
||||
g.queryPanel(|||
|
||||
(
|
||||
instance:node_disk_utilisation:sum_irate
|
||||
/ ignoring (instance) group_left
|
||||
count without (instance) (instance:node_disk_utilisation:sum_irate)
|
||||
)
|
||||
|||, '{{instance}}', legendLink) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
)
|
||||
.addPanel(
|
||||
g.panel('Disk IO Saturation') +
|
||||
g.queryPanel(|||
|
||||
instance:node_disk_saturation:sum_irate / scalar(sum(up{%(nodeExporterSelector)s}))
|
||||
||| % $._config, '{{instance}}', legendLink) +
|
||||
(
|
||||
instance:node_disk_saturation:sum_irate
|
||||
/ ignoring (instance) group_left
|
||||
count without (instance) (instance:node_disk_saturation:sum_irate)
|
||||
)
|
||||
|||, '{{instance}}', legendLink) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
)
|
||||
|
@ -76,7 +100,21 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
|||
g.row('Storage')
|
||||
.addPanel(
|
||||
g.panel('Disk Capacity') +
|
||||
g.queryPanel('sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_free_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)) by (instance,namespace) / scalar(sum(max(node_filesystem_size_bytes{fstype=~"ext[24]"}) by (device,instance,namespace)))', '{{instance}}', legendLink) +
|
||||
g.queryPanel(|||
|
||||
(
|
||||
sum without (device) (
|
||||
max without (fstype, mountpoint) (
|
||||
node_filesystem_size_bytes{fstype=~"ext[24]"} - node_filesystem_avail_bytes{fstype=~"ext[24]"}
|
||||
)
|
||||
)
|
||||
/ ignoring (instance) group_left
|
||||
sum without (instance, device) (
|
||||
max without (fstype, mountpoint) (
|
||||
node_filesystem_size_bytes{fstype=~"ext[24]"}
|
||||
)
|
||||
)
|
||||
)
|
||||
|||, '{{instance}}', legendLink) +
|
||||
g.stack +
|
||||
{ yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
|
||||
),
|
||||
|
@ -106,9 +144,9 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
|||
{ yaxes: g.yaxes('percentunit') },
|
||||
)
|
||||
.addPanel(
|
||||
g.panel('Memory Saturation (Swap I/O)') +
|
||||
g.queryPanel('instance:node_memory_swap_io_bytes:sum_rate{instance="$instance"}', 'Swap IO') +
|
||||
{ yaxes: g.yaxes('Bps') },
|
||||
g.panel('Memory Saturation (pages swapped per second)') +
|
||||
g.queryPanel('instance:node_memory_swap_io_pages:sum_rate{instance="$instance"}', 'Swap IO') +
|
||||
{ yaxes: g.yaxes('short') },
|
||||
)
|
||||
)
|
||||
.addRow(
|
||||
|
@ -141,7 +179,14 @@ local g = import 'grafana-builder/grafana.libsonnet';
|
|||
g.row('Disk')
|
||||
.addPanel(
|
||||
g.panel('Disk Utilisation') +
|
||||
g.queryPanel('1 - sum(max by (device, node) (node_filesystem_free_bytes{fstype=~"ext[24]"})) / sum(max by (device, node) (node_filesystem_size_bytes{fstype=~"ext[24]"}))', 'Disk') +
|
||||
g.queryPanel(|||
|
||||
1 -
|
||||
(
|
||||
sum(max without (mountpoint, fstype) (node_filesystem_avail_bytes{fstype=~"ext[24]"}))
|
||||
/
|
||||
sum(max without (mountpoint, fstype) (node_filesystem_size_bytes{fstype=~"ext[24]"}))
|
||||
)
|
||||
|||, 'Disk') +
|
||||
{ yaxes: g.yaxes('percentunit') },
|
||||
),
|
||||
),
|
||||
|
|
|
@ -8,8 +8,8 @@
|
|||
// This rule gives the number of CPUs per node.
|
||||
record: 'instance:node_num_cpu:sum',
|
||||
expr: |||
|
||||
count by (instance) (
|
||||
sum by (instance, cpu) (
|
||||
count without (cpu) (
|
||||
sum without (mode) (
|
||||
node_cpu_seconds_total{%(nodeExporterSelector)s}
|
||||
)
|
||||
)
|
||||
|
@ -19,29 +19,20 @@
|
|||
// CPU utilisation is the fraction of time the CPU is not idle.
|
||||
record: 'instance:node_cpu_utilisation:avg1m',
|
||||
expr: |||
|
||||
1 - avg by (instance) (
|
||||
rate(node_cpu_seconds_total{%(nodeExporterSelector)s,mode="idle"}[1m])
|
||||
1 - avg without (cpu, mode) (
|
||||
rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
// CPU saturation is 1min avg run queue length / number of CPUs.
|
||||
// Can go over 100%. >100% is bad.
|
||||
record: 'instance:node_cpu_saturation_load1:',
|
||||
// This is CPU saturation: 1min avg run queue length / number of CPUs.
|
||||
// Can go over 1. >1 is bad.
|
||||
record: 'instance:node_load1_per_cpu:ratio',
|
||||
expr: |||
|
||||
(
|
||||
sum by (instance) (node_load1{%(nodeExporterSelector)s})
|
||||
node_load1{%(nodeExporterSelector)s}
|
||||
/
|
||||
instance:node_num_cpu:sum
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
// Total memory per node
|
||||
record: 'instance:node_memory_bytes_total:sum',
|
||||
expr: |||
|
||||
sum by (instance) (
|
||||
node_memory_MemTotal_bytes{%(nodeExporterSelector)s}
|
||||
instance:node_num_cpu:sum{%(nodeExporterSelector)s}
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
|
@ -57,9 +48,9 @@
|
|||
||| % $._config,
|
||||
},
|
||||
{
|
||||
record: 'instance:node_memory_swap_io_bytes:sum_rate',
|
||||
record: 'instance:node_memory_swap_io_pages:sum_rate',
|
||||
expr: |||
|
||||
1e3 * sum by (instance) (
|
||||
(
|
||||
rate(node_vmstat_pgpgin{%(nodeExporterSelector)s}[1m])
|
||||
+
|
||||
rate(node_vmstat_pgpgout{%(nodeExporterSelector)s}[1m])
|
||||
|
@ -70,7 +61,7 @@
|
|||
// Disk utilisation (seconds spent, 1 second irate())
|
||||
record: 'instance:node_disk_utilisation:sum_irate',
|
||||
expr: |||
|
||||
sum by (instance) (
|
||||
sum without (device) (
|
||||
irate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
|
@ -79,28 +70,30 @@
|
|||
// Disk saturation (weighted seconds spent, by irate() it's bound by 1 second)
|
||||
record: 'instance:node_disk_saturation:sum_irate',
|
||||
expr: |||
|
||||
sum by (instance) (
|
||||
sum without (device) (
|
||||
irate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s,device=~"(sd|xvd).+"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
// TODO: For the following two rules, consider configurable filtering to exclude more network
|
||||
// device names than just "lo".
|
||||
{
|
||||
record: 'instance:node_net_utilisation:sum_irate',
|
||||
expr: |||
|
||||
sum by (instance) (
|
||||
irate(node_network_receive_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
|
||||
sum without (device) (
|
||||
irate(node_network_receive_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||
+
|
||||
irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
|
||||
irate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
{
|
||||
record: 'instance:node_net_saturation:sum_irate',
|
||||
expr: |||
|
||||
sum by (instance) (
|
||||
irate(node_network_receive_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
|
||||
sum without (device) (
|
||||
irate(node_network_receive_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||
+
|
||||
irate(node_network_transmit_drop_total{%(nodeExporterSelector)s,device=~"eth[0-9]+"}[1m])
|
||||
irate(node_network_transmit_drop_total{%(nodeExporterSelector)s, device!="lo"}[1m])
|
||||
)
|
||||
||| % $._config,
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue