Merge pull request #38030 from p-se/prom-alert-package-drops-leeway

mgr/dashboard: prometheus alerting: add some leeway for packet drops and errors

Reviewed-by: Stephan Müller <smueller@suse.com>
Reviewed-by: Ernesto Puerta <epuertat@redhat.com>
Reviewed-by: Nizamudeen A <nia@redhat.com>
This commit is contained in:
Ernesto Puerta 2021-02-16 20:45:44 +01:00 committed by GitHub
commit e2d73297cf
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@ -175,30 +175,48 @@ groups:
description: >
Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
# alert on nic packet errors and drops rates > 1 packet/s
# alert on nic packet errors and drops when they reach 0.01% of all packets or the absolute threshold of 10 set in each rule's expr
# Warns when dropped packets (receive + transmit) on any non-loopback interface
# reach 0.01% of all packets handled, or reach 10 packets/s averaged over 1m.
# rate() is used rather than increase() so the absolute threshold is really
# per second, as the description states: increase(...[1m]) >= 10 would mean
# 10 packets per minute. The ratio clause is unaffected by this choice because
# the window length cancels out of the division.
# NOTE(review): this view appears to interleave the removed and added sides of
# the diff hunk; the single-line irate() expr and the "> 1 packet/s"
# description lines look like the superseded version - confirm before applying.
- alert: network packets dropped
expr: irate(node_network_receive_drop_total{device!="lo"}[5m]) + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1
expr: |
(
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
rate(node_network_transmit_drop_total{device!="lo"}[1m])
) >= 10
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.15.1.2.8.2
annotations:
description: >
Node {{ $labels.instance }} experiences packet drop > 1
packet/s on interface {{ $labels.device }}.
Node {{ $labels.instance }} experiences packet drop > 0.01% or >
10 packets/s on interface {{ $labels.device }}.
# Warns when packet errors (receive + transmit) on any non-loopback interface
# reach 0.01% of all packets handled, or reach 10 packets/s averaged over 1m.
# rate() is used rather than increase() so the absolute threshold is really
# per second, as the description states: increase(...[1m]) >= 10 would mean
# 10 packets per minute. The ratio clause is unaffected by this choice because
# the window length cancels out of the division.
# NOTE(review): this view appears to interleave the removed and added sides of
# the diff hunk; the two irate() lines directly under `expr: |` and the
# "> 1 packet/s" description lines look like the superseded version - confirm
# before applying.
- alert: network packet errors
expr: |
irate(node_network_receive_errs_total{device!="lo"}[5m]) +
irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
(
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) / (
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
rate(node_network_transmit_packets_total{device!="lo"}[1m])
) >= 0.0001 or (
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
rate(node_network_transmit_errs_total{device!="lo"}[1m])
) >= 10
labels:
severity: warning
type: ceph_default
oid: 1.3.6.1.4.1.50495.15.1.2.8.3
annotations:
description: >
Node {{ $labels.instance }} experiences packet errors > 1
packet/s on interface {{ $labels.device }}.
Node {{ $labels.instance }} experiences packet errors > 0.01% or
> 10 packets/s on interface {{ $labels.device }}.
- alert: storage filling up
expr: |