Merge pull request #45583 from p-se/monitoring-alert-mtu-group-by-devices

mgr/dashboard: Compare values of MTU alert by device

Reviewed-by: Aashish Sharma <aasharma@redhat.com>
Reviewed-by: Ernesto Puerta <epuertat@redhat.com>
Reviewed-by: Nizamudeen A <nia@redhat.com>
Reviewed-by: p-se <NOT@FOUND>
This commit is contained in:
Ernesto Puerta 2022-04-01 11:11:30 +02:00 committed by GitHub
commit 2d1c480f5a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 52 additions and 6 deletions

View File

@ -704,7 +704,18 @@ groups:
rate of the past 48 hours.
- alert: CephNodeInconsistentMTU
expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"}))
expr: |
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
scalar(
max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
)
or
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
scalar(
min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
)
labels:
severity: warning
type: ceph_default
@ -712,7 +723,7 @@ groups:
summary: MTU settings across Ceph hosts are inconsistent
description: >
Node {{ $labels.instance }} has a different MTU size ({{ $value }})
than the median value on device {{ $labels.device }}.
than the median of devices named {{ $labels.device }}.
- name: pools
rules:

View File

@ -546,6 +546,12 @@ tests:
- series: 'node_network_mtu_bytes{device="eth4",instance="node-exporter",
job="node-exporter"}'
values: '9000 9000 9000 9000 9000'
- series: 'node_network_mtu_bytes{device="eth4",instance="hostname1",
job="node-exporter"}'
values: '2200 2200 2200 2200 2200'
- series: 'node_network_mtu_bytes{device="eth4",instance="hostname2",
job="node-exporter"}'
values: '2400 2400 2400 2400 2400'
- series: 'node_network_up{device="eth0",instance="node-exporter",
job="node-exporter"}'
values: '0 0 0 0 0'
@ -557,21 +563,50 @@ tests:
values: '1 1 1 1 1'
- series: 'node_network_up{device="eth3",instance="node-exporter",
job="node-exporter"}'
values: '0 0 0 0 0'
values: '1 1 1 1 1'
- series: 'node_network_up{device="eth4",instance="node-exporter",
job="node-exporter"}'
values: '1 1 1 1 1'
- series: 'node_network_up{device="eth4",instance="hostname1",
job="node-exporter"}'
values: '1 1 1 1 1'
- series: 'node_network_up{device="eth4",instance="hostname2",
job="node-exporter"}'
values: '0 0 0 0 0'
promql_expr_test:
- expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left()
(quantile(0.5, node_network_mtu_bytes{device!="lo"}))
- expr: |
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
scalar(
max by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
)
or
node_network_mtu_bytes * (node_network_up{device!="lo"} > 0) ==
scalar(
min by (device) (node_network_mtu_bytes * (node_network_up{device!="lo"} > 0)) !=
quantile by (device) (.5, node_network_mtu_bytes * (node_network_up{device!="lo"} > 0))
)
eval_time: 1m
exp_samples:
- labels: '{device="eth4", instance="node-exporter", job="node-exporter"}'
value: 9000
- labels: '{device="eth4", instance="hostname1", job="node-exporter"}'
value: 2200
alert_rule_test:
- eval_time: 1m
alertname: CephNodeInconsistentMTU
exp_alerts:
- exp_labels:
device: eth4
instance: hostname1
job: node-exporter
severity: warning
type: ceph_default
exp_annotations:
summary: MTU settings across Ceph hosts are inconsistent
description: >
Node hostname1 has a different MTU size (2200)
than the median of devices named eth4.
- exp_labels:
device: eth4
instance: node-exporter
@ -582,7 +617,7 @@ tests:
summary: MTU settings across Ceph hosts are inconsistent
description: >
Node node-exporter has a different MTU size (9000)
than the median value on device eth4.
than the median of devices named eth4.
# pool full: the data series has 6 entries but the rule uses topk(5), so this
# ensures the results are as expected