From cc491333212472ba78ad421a3a2958846cfcb2ef Mon Sep 17 00:00:00 2001 From: Adrian Berger <43774417+adberger@users.noreply.github.com> Date: Fri, 8 Mar 2024 14:41:36 +0100 Subject: [PATCH] Add multi-cluster support for Nodes dashboard (#2945) Signed-off-by: Adrian Berger --- docs/node-mixin/lib/prom-mixin.libsonnet | 95 ++++++++++++++---------- 1 file changed, 56 insertions(+), 39 deletions(-) diff --git a/docs/node-mixin/lib/prom-mixin.libsonnet b/docs/node-mixin/lib/prom-mixin.libsonnet index 6c4d9904..5d2e7bd0 100644 --- a/docs/node-mixin/lib/prom-mixin.libsonnet +++ b/docs/node-mixin/lib/prom-mixin.libsonnet @@ -27,6 +27,23 @@ local table = grafana70.panel.table; type: 'datasource', }, + local clusterTemplatePrototype = + template.new( + 'cluster', + '$datasource', + '', + hide=if config.showMultiCluster then '' else '2', + refresh='time', + label='Cluster', + ), + local clusterTemplate = + if platform == 'Darwin' then + clusterTemplatePrototype + { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname="Darwin"}, %(clusterLabel)s)' % config } + else + clusterTemplatePrototype + { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname!="Darwin"}, %(clusterLabel)s)' % config }, + local instanceTemplatePrototype = template.new( 'instance', @@ -38,11 +55,10 @@ local table = grafana70.panel.table; local instanceTemplate = if platform == 'Darwin' then instanceTemplatePrototype - { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname="Darwin"}, instance)' % config } + { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster", sysname="Darwin"}, instance)' % config } else instanceTemplatePrototype - { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, sysname!="Darwin"}, instance)' % config }, - + { query: 'label_values(node_uname_info{%(nodeExporterSelector)s, %(clusterLabel)s="$cluster", sysname!="Darwin"}, instance)' % config }, local idleCPU = graphPanel.new( @@ -57,9 +73,9 @@ local table = grafana70.panel.table; .addTarget(prometheus.target( ||| ( - (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance"}[$__rate_interval]))) + (1 - sum without (mode) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode=~"idle|iowait|steal", instance="$instance", %(clusterLabel)s="$cluster"}[$__rate_interval]))) / ignoring(cpu) group_left - count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance"}) + count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle", instance="$instance", %(clusterLabel)s="$cluster"}) ) ||| % config, legendFormat='{{cpu}}', @@ -75,10 +91,10 @@ local table = grafana70.panel.table; min=0, fill=0, ) - .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='1m load average')) - .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='5m load average')) - .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='15m load average')) - .addTarget(prometheus.target('count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", mode="idle"})' % config, legendFormat='logical cores')), + .addTarget(prometheus.target('node_load1{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='1m load average')) + .addTarget(prometheus.target('node_load5{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='5m load average')) + .addTarget(prometheus.target('node_load15{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='15m load average')) + .addTarget(prometheus.target('count(node_cpu_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", mode="idle"})' % config, legendFormat='logical cores')), local memoryGraphPanelPrototype = graphPanel.new( @@ -94,44 +110,44 @@ local table = grafana70.panel.table; .addTarget(prometheus.target( ||| ( - node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - - node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"} + node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - - node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"} + node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - - node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"} + node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} ) ||| % config, legendFormat='memory used' )) - .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory buffers')) - .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory cached')) - .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='memory free')) + .addTarget(prometheus.target('node_memory_Buffers_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='memory buffers')) + .addTarget(prometheus.target('node_memory_Cached_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='memory cached')) + .addTarget(prometheus.target('node_memory_MemFree_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='memory free')) else if platform == 'Darwin' then // not useful to stack memoryGraphPanelPrototype { stack: false } - .addTarget(prometheus.target('node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Physical Memory')) + .addTarget(prometheus.target('node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='Physical Memory')) .addTarget(prometheus.target( ||| ( - node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"} + - node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"} + node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - + node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + + node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} + + node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} ) ||| % config, legendFormat='Memory Used' )) .addTarget(prometheus.target( ||| ( - node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"} - - node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"} + node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} - + node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"} ) ||| % config, legendFormat='App Memory' )) - .addTarget(prometheus.target('node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Wired Memory')) - .addTarget(prometheus.target('node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"}' % config, legendFormat='Compressed')), + .addTarget(prometheus.target('node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='Wired Memory')) + .addTarget(prometheus.target('node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}' % config, legendFormat='Compressed')), // NOTE: avg() is used to circumvent a label change caused by a node_exporter rollout. local memoryGaugePanelPrototype = @@ -155,8 +171,8 @@ local table = grafana70.panel.table; ||| 100 - ( - avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance"}) / - avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance"}) + avg(node_memory_MemAvailable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) / + avg(node_memory_MemTotal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) * 100 ) ||| % config, @@ -168,12 +184,12 @@ local table = grafana70.panel.table; ||| ( ( - avg(node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance"}) - - avg(node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance"}) + - avg(node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance"}) + - avg(node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance"}) + avg(node_memory_internal_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) - + avg(node_memory_purgeable_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + + avg(node_memory_wired_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) + + avg(node_memory_compressed_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) ) / - avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance"}) + avg(node_memory_total_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster"}) ) * 100 @@ -190,17 +206,17 @@ local table = grafana70.panel.table; ) // TODO: Does it make sense to have those three in the same panel? .addTarget(prometheus.target( - 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, + 'rate(node_disk_read_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])' % config, legendFormat='{{device}} read', intervalFactor=1, )) .addTarget(prometheus.target( - 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, + 'rate(node_disk_written_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])' % config, legendFormat='{{device}} written', intervalFactor=1, )) .addTarget(prometheus.target( - 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(diskDeviceSelector)s}[$__rate_interval])' % config, + 'rate(node_disk_io_time_seconds_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(diskDeviceSelector)s}[$__rate_interval])' % config, legendFormat='{{device}} io time', intervalFactor=1, )) + @@ -232,7 +248,7 @@ local table = grafana70.panel.table; .addThresholdStep(color='red', value=0.9) .addTarget(prometheus.target( ||| - max by (mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s, %(fsMountpointSelector)s}) + max by (mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(fsSelector)s, %(fsMountpointSelector)s}) ||| % config, legendFormat='', instant=true, @@ -240,7 +256,7 @@ local table = grafana70.panel.table; )) .addTarget(prometheus.target( ||| - max by (mountpoint) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(fsSelector)s, %(fsMountpointSelector)s}) + max by (mountpoint) (node_filesystem_avail_bytes{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", %(fsSelector)s, %(fsMountpointSelector)s}) ||| % config, legendFormat='', instant=true, @@ -421,7 +437,7 @@ local table = grafana70.panel.table; fill=0, ) .addTarget(prometheus.target( - 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval]) * 8' % config, + 'rate(node_network_receive_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", device!="lo"}[$__rate_interval]) * 8' % config, legendFormat='{{device}}', intervalFactor=1, )), @@ -437,7 +453,7 @@ local table = grafana70.panel.table; fill=0, ) .addTarget(prometheus.target( - 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", device!="lo"}[$__rate_interval]) * 8' % config, + 'rate(node_network_transmit_bytes_total{%(nodeExporterSelector)s, instance="$instance", %(clusterLabel)s="$cluster", device!="lo"}[$__rate_interval]) * 8' % config, legendFormat='{{device}}', intervalFactor=1, )), @@ -473,6 +489,7 @@ local table = grafana70.panel.table; local templates = [ prometheusDatasourceTemplate, + clusterTemplate, instanceTemplate, ],