From 77ae769179acf92cee40fa5d27aa41d817496a7a Mon Sep 17 00:00:00 2001
From: Vitaly Zhuravlev
Date: Thu, 6 Apr 2023 00:21:50 +0800
Subject: [PATCH] Add thresholds for memory alerts

Signed-off-by: Vitaly Zhuravlev
---
Review notes (not part of the commit message):
- A text block followed by `% $._config` is passed through Jsonnet's
  std.format, so every literal percent sign inside it is written as `%%`
  (including the one in the Prometheus template `printf "%%.2f"`).
- The NodeMemoryHighUtilization description now ends in `||| % $._config`
  so that its %(memoryHighUtilizationThreshold)s placeholder is substituted
  instead of being emitted verbatim.
- Blob ids on the `index` lines were not recomputed after these changes.

 docs/node-mixin/alerts/alerts.libsonnet | 12 ++++++------
 docs/node-mixin/config.libsonnet        |  8 ++++++++
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet
index 81ad3c64..a51e6f2c 100644
--- a/docs/node-mixin/alerts/alerts.libsonnet
+++ b/docs/node-mixin/alerts/alerts.libsonnet
@@ -344,7 +344,7 @@
           {
             alert: 'NodeMemoryMajorPagesFaults',
             expr: |||
-              rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > 500
+              rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsWarningThreshold)s
             ||| % $._config,
             'for': '15m',
             labels: {
@@ -353,15 +353,15 @@
             annotations: {
               summary: 'Memory major page faults are occurring at very high rate.',
               description: |||
-                Memory major pages are occurring at very high rate at {{ $labels.instance }}, 500 major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+                Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsWarningThreshold)s major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
                 Please check that there is enough memory available at this instance.
-              |||,
+              ||| % $._config,
             },
           },
           {
             alert: 'NodeMemoryHighUtilization',
             expr: |||
-              100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > 90
+              100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)s
             ||| % $._config,
             'for': '15m',
             labels: {
@@ -370,7 +370,7 @@
             annotations: {
               summary: 'Host is running out of memory.',
               description: |||
-                Memory is filling up at {{ $labels.instance }}, has been above 90% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
-              |||,
+                Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)s%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%.
+              ||| % $._config,
             },
           },
diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet
index 86179c8f..0e32ac15 100644
--- a/docs/node-mixin/config.libsonnet
+++ b/docs/node-mixin/config.libsonnet
@@ -60,6 +60,14 @@
     fsSpaceAvailableWarningThreshold: 5,
     fsSpaceAvailableCriticalThreshold: 3,
 
+    // Memory utilization (%) level on which to trigger the
+    // 'NodeMemoryHighUtilization' alert.
+    memoryHighUtilizationThreshold: 90,
+
+    // Threshold for the rate of memory major page faults to trigger
+    // 'NodeMemoryMajorPagesFaults' alert.
+    memoryMajorPagesFaultsWarningThreshold: 500,
+
     rateInterval: '5m',
     // Opt-in for multi-cluster support.
     showMultiCluster: false,