mirror of
https://github.com/ceph/ceph
synced 2025-02-21 18:17:42 +00:00
ceph-mixin: fix CephNodeNetworkPacket alerts
Signed-off-by: Aswin Toni <aswin.toni@cern.ch>
This commit is contained in:
parent
50f4580686
commit
351e1ac639
@ -4,5 +4,8 @@
|
||||
|
||||
clusterLabel: 'cluster',
|
||||
showMultiCluster: false,
|
||||
|
||||
CephNodeNetworkPacketDropsThreshold: 0.005,
|
||||
CephNodeNetworkPacketDropsPerSec: 10,
|
||||
},
|
||||
}
|
||||
|
@ -512,16 +512,38 @@
|
||||
},
|
||||
{
|
||||
alert: 'CephNodeNetworkPacketDrops',
|
||||
expr: '( increase(node_network_receive_drop_total{device!="lo"}[1m]) + increase(node_network_transmit_drop_total{device!="lo"}[1m])) / ( increase(node_network_receive_packets_total{device!="lo"}[1m]) + increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or ( increase(node_network_receive_drop_total{device!="lo"}[1m]) + increase(node_network_transmit_drop_total{device!="lo"}[1m])) >= 10',
|
||||
expr: |||
|
||||
(
|
||||
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
) / (
|
||||
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
) >= %(CephNodeNetworkPacketDropsThreshold)s and (
|
||||
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
) >= %(CephNodeNetworkPacketDropsPerSec)s
|
||||
||| % $._config,
|
||||
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.2' },
|
||||
annotations: {
|
||||
summary: 'One or more NICs reports packet drops%(cluster)s' % $.MultiClusterSummary(),
|
||||
description: 'Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}.',
|
||||
// Human-readable alert text; the ratio threshold from $._config is multiplied by 100 so it renders as a percentage.
// NOTE(review): the expr for this alert joins the ratio and per-second conditions with `and`,
// while this text says "or" — confirm the intended wording (the generated rules and unit tests mirror this string).
description: 'Node {{ $labels.instance }} experiences packet drop > %(CephNodeNetworkPacketDropsThreshold)s%% or > %(CephNodeNetworkPacketDropsPerSec)s packets/s on interface {{ $labels.device }}.' % { CephNodeNetworkPacketDropsThreshold: $._config.CephNodeNetworkPacketDropsThreshold * 100, CephNodeNetworkPacketDropsPerSec: $._config.CephNodeNetworkPacketDropsPerSec },
|
||||
},
|
||||
},
|
||||
{
|
||||
alert: 'CephNodeNetworkPacketErrors',
|
||||
expr: '( increase(node_network_receive_errs_total{device!="lo"}[1m]) + increase(node_network_transmit_errs_total{device!="lo"}[1m])) / ( increase(node_network_receive_packets_total{device!="lo"}[1m]) + increase(node_network_transmit_packets_total{device!="lo"}[1m])) >= 0.0001 or ( increase(node_network_receive_errs_total{device!="lo"}[1m]) + increase(node_network_transmit_errs_total{device!="lo"}[1m])) >= 10',
|
||||
expr: |||
|
||||
(
|
||||
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
) / (
|
||||
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
) >= 0.0001 or (
|
||||
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
) >= 10
|
||||
|||,
|
||||
labels: { severity: 'warning', type: 'ceph_default', oid: '1.3.6.1.4.1.50495.1.2.1.8.3' },
|
||||
annotations: {
|
||||
summary: 'One or more NICs reports packet errors%(cluster)s' % $.MultiClusterSummary(),
|
||||
|
@ -459,9 +459,19 @@ groups:
|
||||
type: "ceph_default"
|
||||
- alert: "CephNodeNetworkPacketDrops"
|
||||
annotations:
|
||||
description: "Node {{ $labels.instance }} experiences packet drop > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
|
||||
description: "Node {{ $labels.instance }} experiences packet drop > 0.5% or > 10 packets/s on interface {{ $labels.device }}."
|
||||
summary: "One or more NICs reports packet drops"
|
||||
expr: "( increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) / ( increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or ( increase(node_network_receive_drop_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_drop_total{device!=\"lo\"}[1m])) >= 10"
|
||||
expr: |
|
||||
(
|
||||
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
) / (
|
||||
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
) >= 0.0050000000000000001 and (
|
||||
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
) >= 10
|
||||
labels:
|
||||
oid: "1.3.6.1.4.1.50495.1.2.1.8.2"
|
||||
severity: "warning"
|
||||
@ -470,7 +480,17 @@ groups:
|
||||
annotations:
|
||||
description: "Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}."
|
||||
summary: "One or more NICs reports packet errors"
|
||||
expr: "( increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) / ( increase(node_network_receive_packets_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_packets_total{device!=\"lo\"}[1m])) >= 0.0001 or ( increase(node_network_receive_errs_total{device!=\"lo\"}[1m]) + increase(node_network_transmit_errs_total{device!=\"lo\"}[1m])) >= 10"
|
||||
expr: |
|
||||
(
|
||||
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
) / (
|
||||
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
) >= 0.0001 or (
|
||||
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
) >= 10
|
||||
labels:
|
||||
oid: "1.3.6.1.4.1.50495.1.2.1.8.3"
|
||||
severity: "warning"
|
||||
|
@ -375,32 +375,38 @@ tests:
|
||||
description: "Root volume is dangerously full: 4.811% free."
|
||||
|
||||
# network packets dropped
|
||||
- interval: 1s
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'node_network_receive_drop_total{device="eth0",
|
||||
instance="node-exporter",job="node-exporter"}'
|
||||
values: '1+1x500'
|
||||
values: '0+600x10'
|
||||
- series: 'node_network_transmit_drop_total{device="eth0",
|
||||
instance="node-exporter",job="node-exporter"}'
|
||||
values: '1+1x500'
|
||||
values: '0+600x10'
|
||||
- series: 'node_network_receive_packets_total{device="eth0",
|
||||
instance="node-exporter",job="node-exporter"}'
|
||||
values: '0+750x10'
|
||||
- series: 'node_network_transmit_packets_total{device="eth0",
|
||||
instance="node-exporter",job="node-exporter"}'
|
||||
values: '0+750x10'
|
||||
promql_expr_test:
|
||||
- expr: |
|
||||
(
|
||||
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
increase(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
) / (
|
||||
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
increase(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
) >= 0.0001 or (
|
||||
increase(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
increase(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
) >= 0.0050000000000000001 and (
|
||||
rate(node_network_receive_drop_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_drop_total{device!="lo"}[1m])
|
||||
) >= 10
|
||||
|
||||
eval_time: 5m
|
||||
exp_samples:
|
||||
- labels: '{device="eth0", instance="node-exporter",
|
||||
job="node-exporter"}'
|
||||
value: 1.2E+02
|
||||
value: 8E-1
|
||||
alert_rule_test:
|
||||
- eval_time: 5m
|
||||
alertname: CephNodeNetworkPacketDrops
|
||||
@ -414,35 +420,41 @@ tests:
|
||||
type: ceph_default
|
||||
exp_annotations:
|
||||
summary: One or more NICs reports packet drops
|
||||
description: "Node node-exporter experiences packet drop > 0.01% or > 10 packets/s on interface eth0."
|
||||
description: "Node node-exporter experiences packet drop > 0.5% or > 10 packets/s on interface eth0."
|
||||
|
||||
# network packets errors
|
||||
- interval: 1s
|
||||
- interval: 1m
|
||||
input_series:
|
||||
- series: 'node_network_receive_errs_total{device="eth0",
|
||||
instance="node-exporter",job="node-exporter"}'
|
||||
values: '1+1x500'
|
||||
values: '0+600x10'
|
||||
- series: 'node_network_transmit_errs_total{device="eth0",
|
||||
instance="node-exporter",job="node-exporter"}'
|
||||
values: '1+1x500'
|
||||
values: '0+600x10'
|
||||
- series: 'node_network_transmit_packets_total{device="eth0",
|
||||
instance="node-exporter",job="node-exporter"}'
|
||||
values: '0+750x10'
|
||||
- series: 'node_network_receive_packets_total{device="eth0",
|
||||
instance="node-exporter",job="node-exporter"}'
|
||||
values: '0+750x10'
|
||||
promql_expr_test:
|
||||
- expr: |
|
||||
(
|
||||
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
increase(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
) / (
|
||||
increase(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
increase(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
rate(node_network_receive_packets_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_packets_total{device!="lo"}[1m])
|
||||
) >= 0.0001 or (
|
||||
increase(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
increase(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
rate(node_network_receive_errs_total{device!="lo"}[1m]) +
|
||||
rate(node_network_transmit_errs_total{device!="lo"}[1m])
|
||||
) >= 10
|
||||
|
||||
eval_time: 5m
|
||||
exp_samples:
|
||||
- labels: '{device="eth0", instance="node-exporter",
|
||||
job="node-exporter"}'
|
||||
value: 1.2E+02
|
||||
value: 8E-01
|
||||
alert_rule_test:
|
||||
- eval_time: 5m
|
||||
alertname: CephNodeNetworkPacketErrors
|
||||
|
Loading…
Reference in New Issue
Block a user