From 6bdc1d9c98f237b6c165b5365c96a200fe3c667a Mon Sep 17 00:00:00 2001
From: Vitaly Zhuravlev
Date: Thu, 6 Apr 2023 00:56:00 +0800
Subject: [PATCH] Add thresholds for memory, disk and system alerts

Signed-off-by: Vitaly Zhuravlev
---
Review notes (ignored when the patch is applied):
  * Every description touched here is rendered with `% $._config`, so a
    literal percent sign inside it has to be written as `%%`. This covers
    the Prometheus template call `printf "%%.2f"` and the trailing percent
    sign of the NodeMemoryHighUtilization description.
  * The NodeDiskIOSaturation description now says "30 minutes" to agree
    with the rule's `'for': '30m'`.
  * Thresholds are interpolated with `%(name)d`; a fractional override
    would presumably be rendered as an integer - confirm before relying
    on non-integer thresholds.

 docs/node-mixin/alerts/alerts.libsonnet | 22 +++++++++++-----------
 docs/node-mixin/config.libsonnet        | 11 ++++++++++-
 2 files changed, 21 insertions(+), 12 deletions(-)

diff --git a/docs/node-mixin/alerts/alerts.libsonnet b/docs/node-mixin/alerts/alerts.libsonnet
index a51e6f2c..68455e44 100644
--- a/docs/node-mixin/alerts/alerts.libsonnet
+++ b/docs/node-mixin/alerts/alerts.libsonnet
@@ -327,7 +327,7 @@
             alert: 'NodeSystemSaturation',
             expr: |||
               node_load1{%(nodeExporterSelector)s}
-              / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > 2
+              / count without (cpu, mode) (node_cpu_seconds_total{%(nodeExporterSelector)s, mode="idle"}) > %(systemSaturationPerCoreThreshold)d
             ||| % $._config,
             'for': '15m',
             labels: {
@@ -336,15 +336,15 @@
             annotations: {
               summary: 'System saturated, load per core is very high.',
               description: |||
-                System load per core at {{ $labels.instance }} has been above 2 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+                System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
                 This might indicate this instance resources saturation and can cause it becoming unresponsive.
-              |||,
+              ||| % $._config,
             },
           },
           {
             alert: 'NodeMemoryMajorPagesFaults',
             expr: |||
-              rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsWarningThreshold)s
+              rate(node_vmstat_pgmajfault{%(nodeExporterSelector)s}[5m]) > %(memoryMajorPagesFaultsThreshold)d
             ||| % $._config,
             'for': '15m',
             labels: {
@@ -353,7 +353,7 @@
             annotations: {
               summary: 'Memory major page faults are occurring at very high rate.',
               description: |||
-                Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsWarningThreshold)s major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+                Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
                 Please check that there is enough memory available at this instance.
               ||| % $._config,
             },
@@ -361,7 +361,7 @@
           {
             alert: 'NodeMemoryHighUtilization',
             expr: |||
-              100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)s
+              100 - (node_memory_MemAvailable_bytes{%(nodeExporterSelector)s} / node_memory_MemTotal_bytes{%(nodeExporterSelector)s} * 100) > %(memoryHighUtilizationThreshold)d
             ||| % $._config,
             'for': '15m',
             labels: {
@@ -370,14 +370,14 @@
             annotations: {
               summary: 'Host is running out of memory.',
               description: |||
-                Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)s% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
-              |||,
+                Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%.
+              ||| % $._config,
             },
           },
           {
             alert: 'NodeDiskIOSaturation',
             expr: |||
-              rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > 10
+              rate(node_disk_io_time_weighted_seconds_total{%(nodeExporterSelector)s, %(diskDeviceSelector)s}[5m]) > %(diskIOSaturationThreshold)d
             ||| % $._config,
             'for': '30m',
             labels: {
@@ -386,9 +386,9 @@
             annotations: {
               summary: 'Disk IO queue is high.',
               description: |||
-                Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above 10 for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
+                Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 30 minutes, is currently at {{ printf "%%.2f" $value }}.
                 This symptom might indicate disk saturation.
-              |||,
+              ||| % $._config,
             },
           },
           {
diff --git a/docs/node-mixin/config.libsonnet b/docs/node-mixin/config.libsonnet
index 0e32ac15..49ca6ff8 100644
--- a/docs/node-mixin/config.libsonnet
+++ b/docs/node-mixin/config.libsonnet
@@ -43,6 +43,11 @@
     // just a warning for K8s nodes.
     nodeCriticalSeverity: 'critical',
 
+
+    // Load average 1m (per core) on which to trigger the
+    // 'NodeSystemSaturation' alert.
+    systemSaturationPerCoreThreshold: 2,
+
     // Available disk space (%) thresholds on which to trigger the
     // 'NodeFilesystemSpaceFillingUp' alerts. These alerts fire if the disk
     // usage grows in a way that it is predicted to run out in 4h or 1d
@@ -66,7 +71,11 @@
 
     // Threshold for the rate of memory major page faults to trigger
     // 'NodeMemoryMajorPagesFaults' alert.
-    memoryMajorPagesFaultsWarningThreshold: 500,
+    memoryMajorPagesFaultsThreshold: 500,
+
+    // Disk IO queue level above which to trigger
+    // 'NodeDiskIOSaturation' alert.
+    diskIOSaturationThreshold: 10,
 
     rateInterval: '5m',
     // Opt-in for multi-cluster support.