diff --git a/monitoring/ceph-mixin/prometheus_alerts.libsonnet b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
index cde1a736f8c..30b6b07d463 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.libsonnet
+++ b/monitoring/ceph-mixin/prometheus_alerts.libsonnet
@@ -935,6 +935,28 @@
               description: 'The supported limit for clients connecting to a subsystem is %(NVMeoFHighClientCount)d' % $._config,
             },
           },
+          // Per-series check: a series reports 0 listeners for an nqn whose summed listener count is above 0.
+          {
+            alert: 'NVMeoFMissingListener',
+            'for': '10m',
+            expr: 'ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0',
+            labels: { severity: 'warning', type: 'ceph_default' },
+            annotations: {
+              summary: 'No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem',
+              description: 'For every subsystem, each gateway should have a listener to balance traffic between gateways.',
+            },
+          },
+          // Per-subsystem check: the listener count summed by nqn is exactly 0.
+          {
+            alert: 'NVMeoFZeroListenerSubsystem',
+            'for': '10m',
+            expr: 'sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0',
+            labels: { severity: 'warning', type: 'ceph_default' },
+            annotations: {
+              summary: 'No listeners added to {{ $labels.nqn }} subsystem',
+              description: 'NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners.',
+            },
+          },
           {
             alert: 'NVMeoFHighHostCPU',
             'for': '10m',
diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml
index ba6a6ded0a3..805ecb1188a 100644
--- a/monitoring/ceph-mixin/prometheus_alerts.yml
+++ b/monitoring/ceph-mixin/prometheus_alerts.yml
@@ -837,6 +837,24 @@ groups:
         labels:
           severity: "warning"
           type: "ceph_default"
+      - alert: "NVMeoFMissingListener"
+        annotations:
+          description: "For every subsystem, each gateway should have a listener to balance traffic between gateways."
+          summary: "No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem"
+        expr: "ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0"
+        for: "10m"
+        labels:
+          severity: "warning"
+          type: "ceph_default"
+      - alert: "NVMeoFZeroListenerSubsystem"
+        annotations:
+          description: "NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners."
+          summary: "No listeners added to {{ $labels.nqn }} subsystem"
+        expr: "sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0"
+        for: "10m"
+        labels:
+          severity: "warning"
+          type: "ceph_default"
       - alert: "NVMeoFHighHostCPU"
         annotations:
           description: "High CPU on a gateway host can lead to CPU contention and performance degradation"
diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
index a269ff74227..6bcaa53b851 100644
--- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
+++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml
@@ -2522,6 +2522,78 @@ tests:
             exp_annotations:
               summary: "The number of clients connected to nqn1 is too high on cluster mycluster"
               description: "The supported limit for clients connecting to a subsystem is 32"
+
+  # NVMeoFMissingListener
+  # node-1 reports 0 listeners for nqn1 while node-2 and node-3 each report 1, so only node-1 is expected to alert.
+  - interval: 1m
+    input_series:
+      - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1", instance="node-1:9100"}'
+        values: '0 0 0 0 0 0 0 0 0 0 0'
+      - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1", instance="node-2:9100"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1'
+      - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1", instance="node-3:9100"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1'
+      - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1", instance="node-1:9100"}'
+        values: '1+0x20'
+      - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2", instance="node-2:9100"}'
+        values: '1+0x20'
+      - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3", instance="node-3:9100"}'
+        values: '1+0x20'
+      - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4", instance="node-4:9100"}'
+        values: '1+0x20'
+    promql_expr_test:
+      - expr: ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0
+        eval_time: 1m
+        exp_samples:
+          - labels: '{__name__="ceph_nvmeof_subsystem_listener_count", instance="node-1:9100", nqn="nqn1"}'
+            value: 0
+    alert_rule_test:
+      - eval_time: 10m
+        alertname: NVMeoFMissingListener
+        exp_alerts:
+          - exp_labels:
+              instance: node-1:9100
+              nqn: nqn1
+              severity: warning
+              type: ceph_default
+            exp_annotations:
+              summary: "No listener added for node-1:9100 NVMe-oF Gateway to nqn1 subsystem"
+              description: "For every subsystem, each gateway should have a listener to balance traffic between gateways."
+
+  # NVMeoFZeroListenerSubsystem
+  # nqn1 stays at 0 listeners for every sample; nqn2 is 0 only at the first sample, so only nqn1 is expected to alert.
+  - interval: 1m
+    input_series:
+      - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1"}'
+        values: '0 0 0 0 0 0 0 0'
+      - series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn2"}'
+        values: '0 1 1 1 2 2 3 4'
+      - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1"}'
+        values: '1+0x20'
+      - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2"}'
+        values: '1+0x20'
+      - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3"}'
+        values: '1+0x20'
+      - series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4"}'
+        values: '1+0x20'
+    promql_expr_test:
+      # Same expression as the alert rule; sum() by (nqn) drops the metric name, so the sample carries only nqn.
+      - expr: sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0
+        eval_time: 1m
+        exp_samples:
+          - labels: '{nqn="nqn1"}'
+            value: 0
+    alert_rule_test:
+      - eval_time: 10m
+        alertname: NVMeoFZeroListenerSubsystem
+        exp_alerts:
+          - exp_labels:
+              nqn: nqn1
+              severity: warning
+              type: ceph_default
+            exp_annotations:
+              summary: "No listeners added to nqn1 subsystem"
+              description: "NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners."
 
   # NVMeoFHighHostCPU
   - interval: 1m