mirror of
https://github.com/ceph/ceph
synced 2025-04-27 21:29:57 +00:00
Merge pull request #38030 from p-se/prom-alert-package-drops-leeway
mgr/dashboard: prometheus alerting: add some leeway for package drops and errors Reviewed-by: Stephan Müller <smueller@suse.com> Reviewed-by: Ernesto Puerta <epuertat@redhat.com> Reviewed-by: Nizamudeen A <nia@redhat.com>
This commit is contained in:
commit
e2d73297cf
@ -175,30 +175,48 @@ groups:
|
|||||||
description: >
|
description: >
|
||||||
Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
|
Root volume (OSD and MON store) is dangerously full: {{ $value | humanize }}% free.
|
||||||
|
|
||||||
# alert on nic packet errors and drops rates > 1 packet/s
|
# alert on nic packet error and drop rates >= 0.01% of packets or >= 10 packets/min
|
||||||
- alert: network packets dropped
|
- alert: network packets dropped
|
||||||
expr: irate(node_network_receive_drop_total{device!="lo"}[5m]) + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1
|
expr: |
|
||||||
|
(
|
||||||
|
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||||
|
increase(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||||
|
) / (
|
||||||
|
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||||
|
increase(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||||
|
) >= 0.0001 or (
|
||||||
|
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||||
|
increase(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||||
|
) >= 10
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
type: ceph_default
|
type: ceph_default
|
||||||
oid: 1.3.6.1.4.1.50495.15.1.2.8.2
|
oid: 1.3.6.1.4.1.50495.15.1.2.8.2
|
||||||
annotations:
|
annotations:
|
||||||
description: >
|
description: >
|
||||||
Node {{ $labels.instance }} experiences packet drop > 1
|
Node {{ $labels.instance }} experiences packet drop > 0.01% or >
|
||||||
packet/s on interface {{ $labels.device }}.
|
10 packets/min on interface {{ $labels.device }}.
|
||||||
|
|
||||||
- alert: network packet errors
|
- alert: network packet errors
|
||||||
expr: |
|
expr: |
|
||||||
irate(node_network_receive_errs_total{device!="lo"}[5m]) +
|
(
|
||||||
irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
|
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||||
|
increase(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||||
|
) / (
|
||||||
|
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||||
|
increase(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||||
|
) >= 0.0001 or (
|
||||||
|
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||||
|
increase(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||||
|
) >= 10
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
type: ceph_default
|
type: ceph_default
|
||||||
oid: 1.3.6.1.4.1.50495.15.1.2.8.3
|
oid: 1.3.6.1.4.1.50495.15.1.2.8.3
|
||||||
annotations:
|
annotations:
|
||||||
description: >
|
description: >
|
||||||
Node {{ $labels.instance }} experiences packet errors > 1
|
Node {{ $labels.instance }} experiences packet errors > 0.01% or
|
||||||
packet/s on interface {{ $labels.device }}.
|
> 10 packets/min on interface {{ $labels.device }}.
|
||||||
|
|
||||||
- alert: storage filling up
|
- alert: storage filling up
|
||||||
expr: |
|
expr: |
|
||||||
|
Loading…
Reference in New Issue
Block a user