mirror of https://github.com/ceph/ceph
Merge pull request #60404 from VallariAg/wip-nvmeof-listeners-prometheus-alerts
monitoring: add 2 nvmeof alerts to prometheus_alerts.yaml
This commit is contained in:
commit
4509ec2d14
|
@ -935,6 +935,26 @@
|
|||
description: 'The supported limit for clients connecting to a subsystem is %(NVMeoFHighClientCount)d' % $._config,
|
||||
},
|
||||
},
|
||||
# NVMeoFMissingListener: warns when a gateway instance reports zero listeners
# for a subsystem (nqn) while the listener count summed over all series of the
# same nqn is greater than zero. The left-hand side selects the zero-valued
# per-instance series; `and on(nqn)` keeps only those whose nqn also appears in
# the right-hand aggregate. The condition must hold for 10 minutes.
{
|
||||
alert: 'NVMeoFMissingListener',
|
||||
'for': '10m',
|
||||
expr: 'ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0',
|
||||
labels: { severity: 'warning', type: 'ceph_default' },
|
||||
annotations: {
|
||||
summary: 'No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem',
|
||||
description: 'For every subsystem, each gateway should have a listener to balance traffic between gateways.',
|
||||
},
|
||||
},
|
||||
# NVMeoFZeroListenerSubsystem: warns when the listener count summed per
# subsystem (nqn) equals zero for 10 minutes, i.e. no series for that nqn
# reports any listener. Only the nqn label survives the aggregation, so the
# summary can reference {{ $labels.nqn }} but no per-instance label.
# NOTE(review): the description reads "one of the subsystems have"; it should be
# "has". The same string also appears in the rules YAML and in the unit test's
# exp_annotations, so any wording fix must be applied to all three together.
{
|
||||
alert: 'NVMeoFZeroListenerSubsystem',
|
||||
'for': '10m',
|
||||
expr: 'sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0',
|
||||
labels: { severity: 'warning', type: 'ceph_default' },
|
||||
annotations: {
|
||||
summary: 'No listeners added to {{ $labels.nqn }} subsystem',
|
||||
description: 'NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners.',
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'NVMeoFHighHostCPU',
|
||||
'for': '10m',
|
||||
|
|
|
@ -837,6 +837,24 @@ groups:
|
|||
labels:
|
||||
severity: "warning"
|
||||
type: "ceph_default"
|
||||
# Fires (after 10m) for each per-instance series that reports zero listeners
# for a subsystem whose listener count, summed over all series with the same
# nqn, is greater than zero. `and on(nqn)` matches the two sides on nqn only.
- alert: "NVMeoFMissingListener"
|
||||
annotations:
|
||||
description: "For every subsystem, each gateway should have a listener to balance traffic between gateways."
|
||||
summary: "No listener added for {{ $labels.instance }} NVMe-oF Gateway to {{ $labels.nqn }} subsystem"
|
||||
expr: "ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0"
|
||||
for: "10m"
|
||||
labels:
|
||||
severity: "warning"
|
||||
type: "ceph_default"
|
||||
# Fires (after 10m) for each subsystem (nqn) whose listener count, summed over
# all series carrying that nqn, is exactly zero. The aggregation keeps only the
# nqn label, which is the one label the summary template uses.
- alert: "NVMeoFZeroListenerSubsystem"
|
||||
annotations:
|
||||
description: "NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners."
|
||||
summary: "No listeners added to {{ $labels.nqn }} subsystem"
|
||||
expr: "sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0"
|
||||
for: "10m"
|
||||
labels:
|
||||
severity: "warning"
|
||||
type: "ceph_default"
|
||||
- alert: "NVMeoFHighHostCPU"
|
||||
annotations:
|
||||
description: "High CPU on a gateway host can lead to CPU contention and performance degradation"
|
||||
|
|
|
@ -2522,6 +2522,75 @@ tests:
|
|||
exp_annotations:
|
||||
summary: "The number of clients connected to nqn1 is too high on cluster mycluster"
|
||||
description: "The supported limit for clients connecting to a subsystem is 32"
|
||||
|
||||
# NVMeoFMissingListener
# Fixture: subsystem nqn1 is reported by three instances. node-1 reports zero
# listeners throughout, while node-2 and node-3 each report one, so the
# per-nqn sum is greater than zero and only the node-1 series should match.
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1", instance="node-1:9100"}'
|
||||
values: '0 0 0 0 0 0 0 0 0 0 0'
|
||||
- series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1", instance="node-2:9100"}'
|
||||
values: '1 1 1 1 1 1 1 1 1 1 1'
|
||||
- series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1", instance="node-3:9100"}'
|
||||
values: '1 1 1 1 1 1 1 1 1 1 1'
|
||||
# The ceph_nvmeof_gateway_info series below are not referenced by the
# expression under test; node-4 has no listener_count series at all.
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1", instance="node-1:9100"}'
|
||||
values: '1+0x20'
|
||||
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2", instance="node-2:9100"}'
|
||||
values: '1+0x20'
|
||||
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3", instance="node-3:9100"}'
|
||||
values: '1+0x20'
|
||||
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4", instance="node-4:9100"}'
|
||||
values: '1+0x20'
|
||||
promql_expr_test:
|
||||
# Same expression as the alert rule; the result keeps the left-hand series'
# labels (including __name__) because `and` returns left-hand elements.
- expr: ceph_nvmeof_subsystem_listener_count == 0 and on(nqn) sum(ceph_nvmeof_subsystem_listener_count) by (nqn) > 0
|
||||
eval_time: 1m
|
||||
exp_samples:
|
||||
- labels: '{__name__="ceph_nvmeof_subsystem_listener_count", instance="node-1:9100", nqn="nqn1"}'
|
||||
value: 0
|
||||
alert_rule_test:
|
||||
# Evaluated at 10m, which matches the rule's `for: 10m` hold period.
- eval_time: 10m
|
||||
alertname: NVMeoFMissingListener
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
instance: node-1:9100
|
||||
nqn: nqn1
|
||||
severity: warning
|
||||
type: ceph_default
|
||||
exp_annotations:
|
||||
summary: "No listener added for node-1:9100 NVMe-oF Gateway to nqn1 subsystem"
|
||||
description: "For every subsystem, each gateway should have a listener to balance traffic between gateways."
|
||||
|
||||
# NVMeoFZeroListenerSubsystem
# Fixture: nqn1 never gets a listener, while nqn2 gains listeners from the
# second sample onwards. Only nqn1 should therefore satisfy the per-subsystem
# "sum == 0" condition once the first minute has passed.
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn1"}'
|
||||
values: '0 0 0 0 0 0 0 0'
|
||||
- series: 'ceph_nvmeof_subsystem_listener_count{nqn="nqn2"}'
|
||||
values: '0 1 1 1 2 2 3 4'
|
||||
# The ceph_nvmeof_gateway_info series below are not referenced by the
# expression under test.
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.1"}'
|
||||
values: '1+0x20'
|
||||
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.2"}'
|
||||
values: '1+0x20'
|
||||
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.3"}'
|
||||
values: '1+0x20'
|
||||
- series: 'ceph_nvmeof_gateway_info{addr="1.1.1.4"}'
|
||||
values: '1+0x20'
|
||||
promql_expr_test:
|
||||
# Evaluate the exact expression used by the alert rule so the aggregation is
# exercised. `sum ... by (nqn)` drops __name__, leaving only the nqn label.
- expr: sum(ceph_nvmeof_subsystem_listener_count) by (nqn) == 0
|
||||
eval_time: 1m
|
||||
exp_samples:
|
||||
- labels: '{nqn="nqn1"}'
|
||||
value: 0
|
||||
alert_rule_test:
|
||||
# Evaluated at 10m, which matches the rule's `for: 10m` hold period.
- eval_time: 10m
|
||||
alertname: NVMeoFZeroListenerSubsystem
|
||||
exp_alerts:
|
||||
- exp_labels:
|
||||
nqn: nqn1
|
||||
severity: warning
|
||||
type: ceph_default
|
||||
exp_annotations:
|
||||
summary: "No listeners added to nqn1 subsystem"
|
||||
description: "NVMeoF gateway configuration incomplete; one of the subsystems have zero listeners."
|
||||
|
||||
# NVMeoFHighHostCPU
|
||||
- interval: 1m
|
||||
|
|
Loading…
Reference in New Issue