Merge pull request #1530 from prometheus/beorn7/mixin

Fix the normalization for the cluster-wide dashboards
2019-11-01 13:38:41 +01:00 · 2019-11-01 13:38:41 +01:00 · 44774994fe
parent f9d2bbe854 c6914477f5
commit 44774994fe
2 changed files with 20 additions and 35 deletions
--- a/docs/node-mixin/config.libsonnet
+++ b/docs/node-mixin/config.libsonnet
@@ -2,7 +2,12 @@
  _config+:: {
    // Selectors are inserted between {} in Prometheus queries.

-    // Select the metrics coming from the node exporter.
+    // Select the metrics coming from the node exporter. Note that all
+    // the selected metrics are shown stacked on top of each other in
+    // the 'USE Method / Cluster' dashboard. Consider disabling that
+    // dashboard if mixing up all those metrics in the same dashboard
+    // doesn't make sense (e.g. because they are coming from different
+    // clusters).
    nodeExporterSelector: 'job="node"',

    // Select the fstype for filesystem-related queries. If left
--- a/docs/node-mixin/dashboards/use.libsonnet
+++ b/docs/node-mixin/dashboards/use.libsonnet
@@ -15,9 +15,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
              instance:node_cpu_utilisation:rate1m{%(nodeExporterSelector)s}
            *
              instance:node_num_cpu:sum{%(nodeExporterSelector)s}
-            / ignoring (instance) group_left
-              sum without (instance) (instance:node_num_cpu:sum{%(nodeExporterSelector)s})
            )
+            / scalar(sum(instance:node_num_cpu:sum{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -27,11 +26,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
          // average relates to the "CPU saturation" in the title.
          g.panel('CPU Saturation (load1 per CPU)') +
          g.queryPanel(|||
-            (
            instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}
-            / ignoring (instance) group_left
-              count without (instance) (instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s})
-            )
+            / scalar(count(instance:node_load1_per_cpu:ratio{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          // TODO: Does `max: 1` make sense? The stack can go over 1 in high-load scenarios.
@@ -43,11 +39,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
        .addPanel(
          g.panel('Memory Utilisation') +
          g.queryPanel(|||
-            (
            instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}
-            / ignoring (instance) group_left
-              count without (instance) (instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s})
-            )
+            / scalar(count(instance:node_memory_utilisation:ratio{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -123,11 +116,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
          // TODO: Does the partition by device make sense? Using the most utilized device per
          // instance might make more sense.
          g.queryPanel(|||
-            (
            instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}
-            / ignoring (instance, device) group_left
-              count without (instance, device) (instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s})
-            )
+            / scalar(count(instance_device:node_disk_io_time_seconds:rate1m{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}} {{device}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -135,11 +125,8 @@ local g = import 'grafana-builder/grafana.libsonnet';
        .addPanel(
          g.panel('Disk IO Saturation') +
          g.queryPanel(|||
-            (
            instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}
-            / ignoring (instance, device) group_left
-              count without (instance, device) (instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s})
-            )
+            / scalar(count(instance_device:node_disk_io_time_weighted_seconds:rate1m{%(nodeExporterSelector)s}))
          ||| % $._config, '{{instance}} {{device}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },
@@ -150,19 +137,12 @@ local g = import 'grafana-builder/grafana.libsonnet';
        .addPanel(
          g.panel('Disk Space Utilisation') +
          g.queryPanel(|||
-            (
            sum without (device) (
              max without (fstype, mountpoint) (
                node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s} - node_filesystem_avail_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
              )
            ) 
-            / ignoring (instance) group_left
-              sum without (instance, device) (
-                max without (fstype, mountpoint) (
-                  node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s}
-                )
-              )
-            )  
+            / scalar(sum(max without (fstype, mountpoint) (node_filesystem_size_bytes{%(nodeExporterSelector)s, %(fsSelector)s})))
          ||| % $._config, '{{instance}}', legendLink) +
          g.stack +
          { yaxes: g.yaxes({ format: 'percentunit', max: 1 }) },