diff --git a/monitoring/ceph-mixin/prometheus_alerts.yml b/monitoring/ceph-mixin/prometheus_alerts.yml index fc38678f99d..578596f4af0 100644 --- a/monitoring/ceph-mixin/prometheus_alerts.yml +++ b/monitoring/ceph-mixin/prometheus_alerts.yml @@ -704,7 +704,18 @@ groups: rate of the past 48 hours. - alert: CephNodeInconsistentMTU - expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"})) + expr: | + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == + scalar( + max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != + quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + ) + or + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == + scalar( + min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != + quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + ) labels: severity: warning type: ceph_default @@ -712,7 +723,7 @@ groups: summary: MTU settings across Ceph hosts are inconsistent description: > Node {{ $labels.instance }} has a different MTU size ({{ $value }}) - than the median value on device {{ $labels.device }}. + than the median of devices named {{ $labels.device }}. - name: pools rules: diff --git a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml index d7fe0132d2d..6ff221e7000 100644 --- a/monitoring/ceph-mixin/tests_alerts/test_alerts.yml +++ b/monitoring/ceph-mixin/tests_alerts/test_alerts.yml @@ -546,6 +546,12 @@ tests: - series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter", job="node-exporter"}' values: '9000 9000 9000 9000 9000' + - series: 'node_network_mtu_bytes{device="eth4",instance="hostname1", + job="node-exporter"}' + values: '2200 2200 2200 2200 2200' + - series: 'node_network_mtu_bytes{device="eth4",instance="hostname2", + job="node-exporter"}' + values: '2400 2400 2400 2400 2400' - series: 'node_network_up{device="eth0",instance="node-exporter", job="node-exporter"}' values: '0 0 0 0 0' @@ -557,21 +563,50 @@ tests: values: '1 1 1 1 1' - series: 'node_network_up{device="eth3",instance="node-exporter", job="node-exporter"}' - values: '0 0 0 0 0' + values: '1 1 1 1 1' - series: 'node_network_up{device="eth4",instance="node-exporter", job="node-exporter"}' values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="hostname1", + job="node-exporter"}' + values: '1 1 1 1 1' + - series: 'node_network_up{device="eth4",instance="hostname2", + job="node-exporter"}' + values: '0 0 0 0 0' promql_expr_test: - - expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() - (quantile(0.5, node_network_mtu_bytes{device!="lo"})) + - expr: | + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == + scalar( + max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != + quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + ) + or + node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) == + scalar( + min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) != + quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) + ) eval_time: 1m exp_samples: - labels: '{device="eth4", instance="node-exporter", job="node-exporter"}' value: 9000 + - labels: '{device="eth4", instance="hostname1", job="node-exporter"}' + value: 2200 alert_rule_test: - eval_time: 1m alertname: CephNodeInconsistentMTU exp_alerts: + - exp_labels: + device: eth4 + instance: hostname1 + job: node-exporter + severity: warning + type: ceph_default + exp_annotations: + summary: MTU settings across Ceph hosts are inconsistent + description: > + Node hostname1 has a different MTU size (2200) + than the median of devices named eth4. - exp_labels: device: eth4 instance: node-exporter @@ -582,7 +617,7 @@ tests: summary: MTU settings across Ceph hosts are inconsistent description: > Node node-exporter has a different MTU size (9000) - than the median value on device eth4. + than the median of devices named eth4. # pool full, data series has 6 but using topk(5) so to ensure the # results are working as expected