Set severity of NodeCPUHighUsage to info
Signed-off-by: Vitaly Zhuravlev <v-zhuravlev@users.noreply.github.com>
This commit is contained in:
parent
6bdc1d9c98
commit
b7dfb32bfc
|
@@ -312,15 +312,17 @@
|
||||||
{
|
{
|
||||||
alert: 'NodeCPUHighUsage',
|
alert: 'NodeCPUHighUsage',
|
||||||
expr: |||
|
expr: |||
|
||||||
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) > 0.8
|
sum without(mode) (avg without (cpu) (rate(node_cpu_seconds_total{%(nodeExporterSelector)s, mode!="idle"}[2m]))) * 100 > %(cpuHighUsageThreshold)d
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
'for': '15m',
|
'for': '15m',
|
||||||
labels: {
|
labels: {
|
||||||
severity: 'warning',
|
severity: 'info',
|
||||||
},
|
},
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'High CPU usage.',
|
summary: 'High CPU usage.',
|
||||||
description: 'CPU usage at {{ $labels.instance }} has been above 80% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.',
|
description: |||
|
||||||
|
CPU usage at {{ $labels.instance }} has been above %(cpuHighUsageThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%.
|
||||||
|
||| % $._config,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
|
@@ -336,7 +338,7 @@
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'System saturated, load per core is very high.',
|
summary: 'System saturated, load per core is very high.',
|
||||||
description: |||
|
description: |||
|
||||||
System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
System load per core at {{ $labels.instance }} has been above %(systemSaturationPerCoreThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
|
||||||
This might indicate this instance resources saturation and can cause it becoming unresponsive.
|
This might indicate this instance resources saturation and can cause it becoming unresponsive.
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
|
@@ -353,7 +355,7 @@
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Memory major page faults are occurring at very high rate.',
|
summary: 'Memory major page faults are occurring at very high rate.',
|
||||||
description: |||
|
description: |||
|
||||||
Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
Memory major pages are occurring at very high rate at {{ $labels.instance }}, %(memoryMajorPagesFaultsThreshold)d major page faults per second for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
|
||||||
Please check that there is enough memory available at this instance.
|
Please check that there is enough memory available at this instance.
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
|
@@ -370,7 +372,7 @@
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Host is running out of memory.',
|
summary: 'Host is running out of memory.',
|
||||||
description: |||
|
description: |||
|
||||||
Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%.2f" $value }}%.
|
Memory is filling up at {{ $labels.instance }}, has been above %(memoryHighUtilizationThreshold)d%% for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}%%.
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
|
@@ -386,7 +388,7 @@
|
||||||
annotations: {
|
annotations: {
|
||||||
summary: 'Disk IO queue is high.',
|
summary: 'Disk IO queue is high.',
|
||||||
description: |||
|
description: |||
|
||||||
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%.2f" $value }}.
|
Disk IO queue (aqu-sq) is high on {{ $labels.device }} at {{ $labels.instance }}, has been above %(diskIOSaturationThreshold)d for the last 15 minutes, is currently at {{ printf "%%.2f" $value }}.
|
||||||
This symptom might indicate disk saturation.
|
This symptom might indicate disk saturation.
|
||||||
||| % $._config,
|
||| % $._config,
|
||||||
},
|
},
|
||||||
|
|
|
@@ -43,7 +43,9 @@
|
||||||
// just a warning for K8s nodes.
|
// just a warning for K8s nodes.
|
||||||
nodeCriticalSeverity: 'critical',
|
nodeCriticalSeverity: 'critical',
|
||||||
|
|
||||||
|
// CPU utilization (%) on which to trigger the
|
||||||
|
// 'NodeCPUHighUsage' alert.
|
||||||
|
cpuHighUsageThreshold: 90,
|
||||||
// Load average 1m (per core) on which to trigger the
|
// Load average 1m (per core) on which to trigger the
|
||||||
// 'NodeSystemSaturation' alert.
|
// 'NodeSystemSaturation' alert.
|
||||||
systemSaturationPerCoreThreshold: 2,
|
systemSaturationPerCoreThreshold: 2,
|
||||||
|
|
Loading…
Reference in New Issue