Fix the normalization for the cluster-wide dashboards

For the cluster-wide view, we actually have to count or sum
(depending on the panel) _all_ the selected metrics, which means it's
easiest to use the `scalar` approach after all (but only in the
cluster dashboard). Dividing by a scalar still propagates all the
labels.
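
To illustrate with the CPU saturation panel (using the default
`job="node"` selector in place of `%(nodeExporterSelector)s`): the
denominator now aggregates over _all_ selected series, and dividing a
vector by a scalar keeps the labels of the left-hand side, so every
instance still shows up as its own series in the stacked panel:

    instance:node_load1_per_cpu:ratio{job="node"}
      / scalar(count(instance:node_load1_per_cpu:ratio{job="node"}))

The sum-based panels do the same with `scalar(sum(...))`. A
vector-matching alternative like `/ on () group_left count(...)`
would behave roughly the same here, but the `scalar` form is simpler.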

I have extended the comment for the `nodeExporterSelector` to note
that the cluster dashboard only makes sense if all the selected node
exporters actually belong to the same cluster.

Since this is jsonnet, users can easily disable the cluster
dashboard, or even create multiple instances of the dashboards with
different `nodeExporterSelector`s for different clusters.
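
As a rough sketch of how that could look (the import path, the extra
`cluster` label, and the `node-cluster-rsrc-use.json` dashboard key
are assumptions; adjust them to your setup):

    local nodeMixin = import 'node-mixin/mixin.libsonnet';

    {
      // One set of dashboards per cluster, each selecting only that
      // cluster's node exporters:
      clusterA: nodeMixin { _config+:: { nodeExporterSelector: 'job="node", cluster="a"' } },
      clusterB: nodeMixin { _config+:: { nodeExporterSelector: 'job="node", cluster="b"' } },

      // Or keep a single instance and hide the cluster-wide dashboard
      // from the rendered output:
      noClusterDashboard: nodeMixin {
        grafanaDashboards+:: { 'node-cluster-rsrc-use.json':: {} },
      },
    }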

Signed-off-by: beorn7 <beorn@grafana.com>
beorn7 2019-10-30 22:52:36 +01:00
parent f9d2bbe854
commit c6914477f5
2 changed files with 20 additions and 35 deletions

@@ -2,7 +2,12 @@
   _config+:: {
     // Selectors are inserted between {} in Prometheus queries.
-    // Select the metrics coming from the node exporter.
+    // Select the metrics coming from the node exporter. Note that all
+    // the selected metrics are shown stacked on top of each other in
+    // the 'USE Method / Cluster' dashboard. Consider disabling that
+    // dashboard if mixing up all those metrics in the same dashboard
+    // doesn't make sense (e.g. because they are coming from different
+    // clusters).
     nodeExporterSelector: 'job="node"',
     // Select the fstype for filesystem-related queries. If left

@@ -15,9 +15,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
       instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s}
       *
       instance:node_num_cpu:sum{%(nodeExporterSelector)s}
-      / ignoring (instance) group_left
-      sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s})
     )
+    / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s}))
   ||| % $._config, '{{instance}}', legendLink) +
   g.stack +
   { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -27,11 +26,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
   // average relates to the "CPU saturation" in the title.
   g.panel('CPU Saturation (load1 per CPU)') +
   g.queryPanel(|||
-    (
-      instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
-      / ignoring (instance) group_left
-      count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})
-    )
+    instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
+    / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}))
   ||| % $._config, '{{instance}}', legendLink) +
   g.stack +
   // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
@@ -43,11 +39,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
 .addPanel(
   g.panel('Memory Utilisation') +
   g.queryPanel(|||
-    (
-      instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
-      / ignoring (instance) group_left
-      count without (instance) (instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})
-    )
+    instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
+    / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}))
   ||| % $._config, '{{instance}}', legendLink) +
   g.stack +
   { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -123,11 +116,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
   // TODO: Does the partition by device make sense? Using the most utilized device per
   // instance might make more sense.
   g.queryPanel(|||
-    (
-      instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
-      / ignoring (instance, device) group_left
-      count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s})
-    )
+    instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
+    / scalar(count(instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}))
   ||| % $._config, '{{instance}} {{device}}', legendLink) +
   g.stack +
   { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -135,11 +125,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
 .addPanel(
   g.panel('Disk IO Saturation') +
   g.queryPanel(|||
-    (
-      instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
-      / ignoring (instance, device) group_left
-      count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s})
-    )
+    instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
+    / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}))
   ||| % $._config, '{{instance}} {{device}}', legendLink) +
   g.stack +
   { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -150,19 +137,12 @@ local g = import 'grafana-builder/grafana.libsonnet';
 .addPanel(
   g.panel('Disk Space Utilisation') +
   g.queryPanel(|||
-    (
-      sum without (device) (
-        max without (fstype, mountpoint) (
-          node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
-        )
-      )
-      / ignoring (instance) group_left
-      sum without (instance, device) (
-        max without (fstype, mountpoint) (
-          node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
-        )
+    sum without (device) (
+      max without (fstype, mountpoint) (
+        node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
       )
-    )
+    )
+    / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s})))
   ||| % $._config, '{{instance}}', legendLink) +
   g.stack +
   { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },