groups:
  - name: cluster health
    rules:
      - alert: health error
        expr: ceph_health_status == 2
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.2.1
        annotations:
          description: Ceph is in the HEALTH_ERROR state for more than 5 minutes.
      - alert: health warn
        expr: ceph_health_status == 1
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.2.2
        annotations:
          description: Ceph is in the HEALTH_WARN state for more than 15 minutes.
  - name: mon
    rules:
      - alert: low monitor quorum count
        expr: sum(ceph_mon_quorum_status) < 3
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.3.1
        annotations:
          description: The number of monitors in quorum is low.
  - name: osd
    rules:
      - alert: 10% OSDs down
        expr: sum(ceph_osd_up) / count(ceph_osd_in) <= 0.9
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.1
        annotations:
          description: 10% or more of OSDs are down.
      - alert: OSD down
        expr: count(ceph_osd_up == 0) > 0
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.2
        annotations:
          description: One or more OSDs have been down for more than 15 minutes.
      - alert: OSDs near full
        expr: >
          ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon)
          ceph_osd_up == 1) > 0.8
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.3
        annotations:
          description: OSD {{ $labels.ceph_daemon }} is dangerously full, over 80%.
      # alert on single OSDs flapping
      - alert: flap osd
        expr: rate(ceph_osd_up[5m]) * 60 > 1
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.4
        annotations:
          description: >
            OSD {{ $labels.ceph_daemon }} was marked down and back up at least
            once a minute for 5 minutes.
      # alert on high deviation from average PG count
      - alert: high pg count deviation
        expr: |
          abs(
            ((ceph_osd_numpg > 0) - on (job) group_left avg(ceph_osd_numpg > 0) by (job))
            / on (job) group_left avg(ceph_osd_numpg > 0) by (job)
          ) > 0.35
        for: 5m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.4.5
        annotations:
          description: >
            OSD {{ $labels.ceph_daemon }} deviates by more than 35% from the
            average PG count.
      # alert on high commit latency...but how high is too high?
  - name: mds
    rules:
    # no mds metrics are exported yet
  - name: mgr
    rules:
    # no mgr metrics are exported yet
  - name: pgs
    rules:
      - alert: pgs inactive
        expr: ceph_pg_total - ceph_pg_active > 0
        for: 5m
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.7.1
        annotations:
          description: One or more PGs have been inactive for more than 5 minutes.
      - alert: pgs unclean
        expr: ceph_pg_total - ceph_pg_clean > 0
        for: 15m
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.7.2
        annotations:
          description: One or more PGs have not been clean for more than 15 minutes.
  - name: nodes
    rules:
      - alert: root volume full
        expr: >
          node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"} < 0.05
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.1
        annotations:
          description: Root volume (OSD and MON store) is dangerously full (< 5% free).
      # alert on nic packet error and drop rates > 1 packet/s
      - alert: network packets dropped
        expr: >
          irate(node_network_receive_drop_total{device!="lo"}[5m])
          + irate(node_network_transmit_drop_total{device!="lo"}[5m]) > 1
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.2
        annotations:
          description: >
            Node {{ $labels.instance }} experiences packet drop > 1 packet/s on
            interface {{ $labels.device }}.
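      # note: irate() derives the per-second rate from the two most recent
      # samples in the 5m window, so these interface alerts react quickly;
      # this assumes a reasonably steady node_exporter scrape interval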
      - alert: network packet errors
        expr: >
          irate(node_network_receive_errs_total{device!="lo"}[5m])
          + irate(node_network_transmit_errs_total{device!="lo"}[5m]) > 1
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.3
        annotations:
          description: >
            Node {{ $labels.instance }} experiences packet errors > 1 packet/s on
            interface {{ $labels.device }}.
      # predict fs fill-up times
      - alert: storage filling
        # days until the filesystem is full, extrapolated from the 48h trend;
        # deriv() is negative while space is being consumed, so it is inverted,
        # and the trailing "> 0" drops filesystems that are gaining free space
        expr: >
          ((node_filesystem_free_bytes / (deriv(node_filesystem_free_bytes[2d]) * -1))
          / 86400 <= 5) > 0
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.8.4
        annotations:
          description: >
            Mountpoint {{ $labels.mountpoint }} will be full in less than 5 days,
            assuming the average fill-up rate of the past 48 hours.
  - name: pools
    rules:
      - alert: pool full
        # ceph_pool_max_avail is the free space still available to the pool, so
        # the used fraction is stored / (stored + max_avail)
        expr: >
          ceph_pool_stored / (ceph_pool_stored + ceph_pool_max_avail)
          * on(pool_id) group_right ceph_pool_metadata > 0.9
        labels:
          severity: critical
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.9.1
        annotations:
          description: Pool {{ $labels.name }} is at 90% capacity or over.
      - alert: pool filling up
        # days until the pool's available space is exhausted, extrapolated from
        # the 48h trend; same sign convention as the "storage filling" alert
        expr: >
          ((ceph_pool_max_avail / (deriv(ceph_pool_max_avail[2d]) * -1) / 86400)
          * on(pool_id) group_right ceph_pool_metadata <= 5) > 0
        labels:
          severity: warning
          type: ceph_default
          oid: 1.3.6.1.4.1.50495.15.1.2.9.2
        annotations:
          description: >
            Pool {{ $labels.name }} will be full in less than 5 days, assuming
            the average fill-up rate of the past 48 hours.
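
# A quick way to sanity-check edits to this file is Prometheus' own rule
# linter; a minimal sketch, assuming promtool is installed and this file is
# saved as ceph_default_alerts.yml (the filename here is illustrative):
#
#   promtool check rules ceph_default_alerts.yml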