Merge pull request #60404 from VallariAg/wip-nvmeof-listeners-prometheus-alerts

monitoring: add 2 nvmeof alerts to prometheus_alerts.yaml
This commit is contained in:
Vallari Agrawal 2024-11-14 16:23:55 +05:30 committed by GitHub
commit 4509ec2d14
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 107 additions and 0 deletions

View File

@ -935,6 +935,26 @@
description: 'The supported limit for clients connecting to a subsystem is %(NVMeoFHighClientCount)d' % $._config,
},
},
{
  // Warns about a single listener-count series that reports zero while the
  // summed listener count for the same nqn is greater than zero.
  alert: 'NVMeoFMissingListener',
  // Comparisons bind tighter than `and`: the left side keeps series equal to
  // zero, the right side (matched on nqn only) keeps nqns whose total is > 0.
  expr: 'ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0',
  // `for` is a Jsonnet keyword, so the field name has to be quoted.
  'for': '10m',
  labels: {
    severity: 'warning',
    type: 'ceph_default',
  },
  annotations: {
    summary: 'No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem',
    description: 'For every subsystem, each gateway should have a listener to balance traffic between gateways.',
  },
},
{
  // Warns when the listener counts of all series sharing an nqn add up to zero.
  alert: 'NVMeoFZeroListenerSubsystem',
  // Aggregating by nqn leaves nqn as the only label on the result, which is
  // the only label the summary template below references.
  expr: 'sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0',
  // `for` is a Jsonnet keyword, so the field name has to be quoted.
  'for': '10m',
  labels: {
    severity: 'warning',
    type: 'ceph_default',
  },
  annotations: {
    summary: 'No listeners added to {{ $labels.nqn }} subsystem',
    description: 'NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners.',
  },
},
{
alert: 'NVMeoFHighHostCPU',
'for': '10m',

View File

@ -837,6 +837,24 @@ groups:
labels:
severity: "warning"
type: "ceph_default"
# A listener-count series is zero while the summed count for its nqn is > 0.
- alert: "NVMeoFMissingListener"
  # `== 0` and `> 0` are applied before `and`; matching is on the nqn label only.
  expr: "ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0"
  for: "10m"
  labels:
    severity: "warning"
    type: "ceph_default"
  annotations:
    summary: "No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem"
    description: "For every subsystem, each gateway should have a listener to balance traffic between gateways."
# The listener counts of all series sharing an nqn add up to zero.
- alert: "NVMeoFZeroListenerSubsystem"
  # Aggregation by nqn leaves nqn as the only label available to the templates.
  expr: "sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0"
  for: "10m"
  labels:
    severity: "warning"
    type: "ceph_default"
  annotations:
    summary: "No listeners added to {{ $labels.nqn }} subsystem"
    description: "NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners."
- alert: "NVMeoFHighHostCPU"
annotations:
description: "High CPU on a gateway host can lead to CPU contention and performance degradation"

View File

@ -2522,6 +2522,75 @@ tests:
exp_annotations:
summary: "The number of clients connected to nqn1 is too high on cluster mycluster"
description: "The supported limit for clients connecting to a subsystem is 32"
# NVMeoFMissingListener
# node-1 reports zero listeners for nqn1 while node-2 and node-3 report one
# each, so only the node-1 series should match the expression and fire.
- interval: 1m
  input_series:
    - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1", instance="node-1:9100"}'
      values: '0 0 0 0 0 0 0 0 0 0 0'
    - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1", instance="node-2:9100"}'
      values: '1 1 1 1 1 1 1 1 1 1 1'
    - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1", instance="node-3:9100"}'
      values: '1 1 1 1 1 1 1 1 1 1 1'
    # The gateway_info series below are not referenced by the expression
    # evaluated in this test.
    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1", instance="node-1:9100"}'
      values: '1+0x20'
    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2", instance="node-2:9100"}'
      values: '1+0x20'
    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3", instance="node-3:9100"}'
      values: '1+0x20'
    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4", instance="node-4:9100"}'
      values: '1+0x20'
  promql_expr_test:
    - expr: 'ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0'
      eval_time: 1m
      exp_samples:
        # The left-hand side of `and` is a plain selector, so the metric name
        # is still present on the resulting sample.
        - labels: '{__name__="ceph_nvmeof_subsystem_listener_count", instance="node-1:9100", nqn="nqn1"}'
          value: 0
  alert_rule_test:
    - eval_time: 10m
      alertname: 'NVMeoFMissingListener'
      exp_alerts:
        - exp_labels:
            instance: 'node-1:9100'
            nqn: 'nqn1'
            severity: 'warning'
            type: 'ceph_default'
          exp_annotations:
            summary: "No listener added for node-1:9100 NVMe-oF Gateway to nqn1 subsystem"
            description: "For every subsystem, each gateway should have a listener to balance traffic between gateways."
# NVMeoFZeroListenerSubsystem
# nqn1 never gets a listener; nqn2 starts at zero but has at least one
# listener from the 1m sample onwards, so only nqn1 should match and fire.
- interval: 1m
  input_series:
    - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1"}'
      values: '0 0 0 0 0 0 0 0'
    - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn2"}'
      values: '0 1 1 1 2 2 3 4'
    # The gateway_info series below are not referenced by the expression
    # evaluated in this test.
    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1"}'
      values: '1+0x20'
    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2"}'
      values: '1+0x20'
    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3"}'
      values: '1+0x20'
    - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4"}'
      values: '1+0x20'
  promql_expr_test:
    # Evaluate the same aggregated expression the alert rule uses, rather than
    # the raw per-series comparison, so the `sum(...) by (nqn)` is exercised.
    - expr: 'sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0'
      eval_time: 1m
      exp_samples:
        # sum() drops the metric name; only the grouping label nqn remains.
        - labels: '{nqn="nqn1"}'
          value: 0
  alert_rule_test:
    - eval_time: 10m
      alertname: NVMeoFZeroListenerSubsystem
      exp_alerts:
        - exp_labels:
            nqn: nqn1
            severity: warning
            type: ceph_default
          exp_annotations:
            summary: "No listeners added to nqn1 subsystem"
            description: "NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners."
# NVMeoFHighHostCPU
- interval: 1m