diff --git a/doc/mgr/prometheus.rst b/doc/mgr/prometheus.rst index 0328a582f14..733c4bfdb4f 100644 --- a/doc/mgr/prometheus.rst +++ b/doc/mgr/prometheus.rst @@ -54,7 +54,7 @@ is registered with Prometheus's `registry The :confval:`mgr/prometheus/scrape_interval` of this module should always be set to match Prometheus' scrape interval to work properly and not cause any issues. - + The scrape interval in the module is used for caching purposes and to determine when a cache is stale. @@ -98,6 +98,43 @@ If you are confident that you don't require the cache, you can disable it:: .. _prometheus-rbd-io-statistics: +Ceph Health Checks +------------------ + +The mgr/prometheus module also tracks and maintains a history of Ceph health checks, +exposing them to the Prometheus server as discrete metrics. This allows Prometheus +alert rules to be configured for specific health check events. + +The metrics take the following form: + +:: + + # HELP ceph_health_detail healthcheck status by type (0=inactive, 1=active) + # TYPE ceph_health_detail gauge + ceph_health_detail{name="OSDMAP_FLAGS",severity="HEALTH_WARN"} 0.0 + ceph_health_detail{name="OSD_DOWN",severity="HEALTH_WARN"} 1.0 + ceph_health_detail{name="PG_DEGRADED",severity="HEALTH_WARN"} 1.0 + +The health check history is made available through the following commands: + +:: + + healthcheck history ls [--format {plain|json|json-pretty}] + healthcheck history clear + +The ``ls`` command provides an overview of the health checks that the cluster has +encountered, or those recorded since the last ``clear`` command was issued. For example: + +:: + + [ceph: root@c8-node1 /]# ceph healthcheck history ls + Healthcheck Name First Seen (UTC) Last seen (UTC) Count Active + OSDMAP_FLAGS 2021/09/16 03:17:47 2021/09/16 22:07:40 2 No + OSD_DOWN 2021/09/17 00:11:59 2021/09/17 00:11:59 1 Yes + PG_DEGRADED 2021/09/17 00:11:59 2021/09/17 00:11:59 1 Yes + 3 health check(s) listed + + RBD IO statistics ----------------- @@ -311,8 +348,8 @@ node_targets.yml Notes ===== -Counters and gauges are exported; currently histograms and long-running -averages are not. It's possible that Ceph's 2-D histograms could be +Counters and gauges are exported; currently histograms and long-running +averages are not. It's possible that Ceph's 2-D histograms could be reduced to two separate 1-D histograms, and that long-running averages could be exported as Prometheus' Summary type. diff --git a/monitoring/prometheus/alerts/ceph_default_alerts.yml b/monitoring/prometheus/alerts/ceph_default_alerts.yml index 71fc864cddf..420472d35ee 100644 --- a/monitoring/prometheus/alerts/ceph_default_alerts.yml +++ b/monitoring/prometheus/alerts/ceph_default_alerts.yml @@ -27,22 +27,86 @@ groups: - name: mon rules: - - alert: low monitor quorum count - expr: sum(ceph_mon_quorum_status) < 3 + - alert: Monitor down, quorum is at risk + expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1 + for: 30s labels: severity: critical type: ceph_default oid: 1.3.6.1.4.1.50495.15.1.2.3.1 annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-down description: | - Monitor count in quorum is below three. - - Only {{ $value }} of {{ with query "count(ceph_mon_quorum_status)" }}{{ . | first | value }}{{ end }} monitors are active.
+ {{ $min := query "floor(count(ceph_mon_metadata) / 2) +1" | first | value }}Quorum requires a majority of monitors (x {{ $min }}) to be active + Without quorum the cluster will become inoperable, affecting all connected clients and services. The following monitors are down: {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} + - alert: Monitor down + expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)) + for: 30s + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-down + description: | + {{ $down := query "count(ceph_mon_quorum_status == 0)" | first | value }}{{ $s := "" }}{{ if gt $down 1.0 }}{{ $s = "s" }}{{ end }}You have {{ $down }} monitor{{ $s }} down. + Quorum is still intact, but the loss of further monitors will make your cluster inoperable. + + The following monitors are down: + {{- range query "(ceph_mon_quorum_status == 0) + on(ceph_daemon) group_left(hostname) (ceph_mon_metadata * 0)" }} + - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} + {{- end }} + - alert: Ceph mon disk space critically low + expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-disk-crit + description: | + The free space available to a monitor's store is critically low (<5% by default). + You should increase the space available to the monitor(s). The + default location for the store sits under /var/lib/ceph. Your monitor hosts are; + {{- range query "ceph_mon_metadata"}} + - {{ .Labels.hostname }} + {{- end }} + + - alert: Ceph mon disk space running low + expr: ceph_health_detail{name="MON_DISK_LOW"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-disk-low + description: | + The space available to a monitor's store is approaching full (>70% is the default). + You should increase the space available to the monitor store. The + default location for the store sits under /var/lib/ceph. Your monitor hosts are; + {{- range query "ceph_mon_metadata"}} + - {{ .Labels.hostname }} + {{- end }} + + - alert: Clock skew detected across Ceph Monitor daemons + expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-clock-skew + description: | + The ceph monitors rely on a consistent time reference to maintain + quorum and cluster consistency. This event indicates that at least + one of your mons is not sync'd correctly. + + Review the cluster status with ceph -s. This will show which monitors + are affected. Check the time sync status on each monitor host. 
- name: osd rules: @@ -60,43 +124,163 @@ groups: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} - + - alert: OSD Host is down + expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + annotations: + description: | + The following OSDs are down: + {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0" }} + - {{ .Labels.hostname }} : {{ .Labels.ceph_daemon }} + {{- end }} - alert: OSD down - expr: count(ceph_osd_up == 0) > 0 - for: 15m + expr: ceph_health_detail{name="OSD_DOWN"} == 1 + for: 5m labels: severity: warning type: ceph_default oid: 1.3.6.1.4.1.50495.15.1.2.4.2 annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-down description: | - {{ $s := "" }}{{ if gt $value 1.0 }}{{ $s = "s" }}{{ end }} - {{ $value }} OSD{{ $s }} down for more than 15 minutes. - - {{ $value }} of {{ query "count(ceph_osd_up)" | first | value }} OSDs are down. + {{ $num := query "count(ceph_osd_up == 0)" | first | value }}{{ $s := "" }}{{ if gt $num 1.0 }}{{ $s = "s" }}{{ end }}{{ $num }} OSD{{ $s }} down for over 5mins. The following OSD{{ $s }} {{ if eq $s "" }}is{{ else }}are{{ end }} down: {{- range query "(ceph_osd_up * on(ceph_daemon) group_left(hostname) ceph_osd_metadata) == 0"}} - - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} + - {{ .Labels.ceph_daemon }} on {{ .Labels.hostname }} {{- end }} - alert: OSDs near full - expr: | - ( - ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) ceph_osd_up == 1) - * on(ceph_daemon) group_left(hostname) ceph_osd_metadata - ) * 100 > 90 + expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 for: 5m labels: - severity: critical + severity: warning type: ceph_default oid: 1.3.6.1.4.1.50495.15.1.2.4.3 annotations: - description: > - OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} is - dangerously full: {{ $value | humanize }}% + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-nearfull + description: | + One or more OSDs have reached their NEARFULL threshold - - alert: flapping OSD + Use 'ceph health detail' to identify which OSDs have reached this threshold. + To resolve, either add capacity to the cluster, or delete unwanted data + - alert: OSD Full + expr: ceph_health_detail{name="OSD_FULL"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-full + description: | + An OSD has reached it's full threshold. Writes from all pools that share the + affected OSD will be blocked. + + To resolve, either add capacity to the cluster, or delete unwanted data + - alert: OSD unable to perform rebalance + expr: ceph_health_detail{name="OSD_BACKFILLFULL"} > 0 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-backfillfull + description: | + An OSD has reached it's BACKFILL FULL threshold. This will prevent rebalance operations + completing for some pools. 
Check the current capacity utilisation with 'ceph df' + + To resolve, either add capacity to the cluster, or delete unwanted data + - alert: OSD too many read repairs + expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 1 + for: 30s + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-too-many-repairs + description: | + Reads from an OSD have used a secondary PG to return data to the client, indicating + a potential failing disk. + - alert: OSD hearbeats running slow (frontend) + expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + description: | + OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network + for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs. + - alert: OSD hearbeats running slow (backend) + expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + description: | + OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network + for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs. + - alert: OSD disk size mismatch + expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#bluestore-disk-size-mismatch + description: | + One or more OSDs have an internal inconsistency between the size of the physical device and it's metadata. + This could lead to the OSD(s) crashing in future. You should redeploy the effected OSDs. + - alert: Device failure predicted + expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#id2 + description: | + The device health module has determined that one or more devices will fail + soon. To review the device states use 'ceph device ls'. To show a specific + device use 'ceph device info '. + + Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once + the osd is empty remove and replace the OSD. + - alert: Too many devices predicted to fail + expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#device-health-toomany + description: | + The device health module has determined that the number of devices predicted to + fail can not be remediated automatically, since it would take too many osd's out of + the cluster, impacting performance and potentially availabililty. You should add new + OSDs to the cluster to allow data to be relocated to avoid the data integrity issues. 
+ - alert: Device failure predicted, but automatic drain is incomplete + expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#device-health-in-use + description: | + The device health module has determined that one or more devices will fail + soon, but the normal process of relocating the data on the device to other + OSDs in the cluster is blocked. + + Check the the cluster has available freespace. It may be necessary to add + more disks to the cluster to allow the data from the failing device to + successfully migrate. + + - alert: Flapping OSD expr: | ( rate(ceph_osd_up[5m]) @@ -107,11 +291,25 @@ groups: type: ceph_default oid: 1.3.6.1.4.1.50495.15.1.2.4.4 annotations: + documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd/#flapping-osds description: > OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} was marked down and back up at {{ $value | humanize }} times once a - minute for 5 minutes. + minute for 5 minutes. This could indicate a network issue (latency, + packet drop, disruption) on the clusters "cluster network". Check the + network environment on the listed host(s). + - alert: OSD Read errors + expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 + for: 30s + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#bluestore-spurious-read-errors + description: > + An OSD has encountered read errors, but the OSD has recovered by retrying + the reads. This may indicate an issue with the Hardware or Kernel. # alert on high deviation from average PG count - alert: high pg count deviation expr: | @@ -130,12 +328,69 @@ groups: OSD {{ $labels.ceph_daemon }} on {{ $labels.hostname }} deviates by more than 30% from average PG count. # alert on high commit latency...but how high is too high + - name: mds rules: - # no mds metrics are exported yet + - alert: Ceph Filesystem damage detected + expr: ceph_health_detail{name="MDS_DAMAGE"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#cephfs-health-messages + description: > + The filesystems metadata has been corrupted. Data access + may be blocked. + + Either analyse the output from the mds daemon admin socket, or + escalate to support + - alert: Ceph Filesystem switched to READ ONLY + expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#cephfs-health-messages + description: > + The filesystem has switched to READ ONLY due to an unexpected + write error, when writing to the metadata pool + + Either analyse the output from the mds daemon admin socket, or + escalate to support + - name: mgr rules: - # no mgr metrics are exported yet + - alert: mgr module failure + expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1 + for: 5m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-mgr-module-crash + description: > + One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A + crashed module may impact functionality within the cluster. 
Use the 'ceph crash' commands to + investigate which module has failed, and archive it to acknowledge the failure. + - alert: mgr prometheus module is not active + expr: up{job="ceph"} == 0 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + description: > + The mgr/prometheus module at {{ $labels.instance }} is unreachable. This + could mean that the module has been disabled or the mgr itself is down. + + Without the mgr/prometheus module metrics and alerts will no longer + function. Open a shell to ceph and use 'ceph -s' to to determine whether the + mgr is active. If the mgr is not active, restart it, otherwise you can check + the mgr/prometheus module is loaded with 'ceph mgr module ls' and if it's + not listed as enabled, enable it with 'ceph mgr module enable prometheus' + - name: pgs rules: - alert: pgs inactive @@ -160,8 +415,89 @@ groups: annotations: description: > {{ $value }} PGs haven't been clean for more than 15 minutes in pool {{ $labels.name }}. - Unclean PGs haven't been able to completely recover from a - previous failure. + Unclean PGs haven't been able to completely recover from a previous failure. + - alert: Placement Group (PG) damaged + expr: ceph_health_detail{name=~"PG_DAMAGED|OSD_SCRUB_ERRORS"} == 1 + for: 5m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-damaged + description: > + During data consistency checks (scrub), at least one PG has been flagged as being + damaged or inconsistent. + + Check to see which PG is affected, and attempt a manual repair if neccessary. To list + problematic placement groups, use 'rados list-inconsistent-pg '. To repair PGs use + the 'ceph pg repair ' command. + - alert: Recovery at risk, cluster too full + expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 1 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-recovery-full + description: > + Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their + 'full' threshold. Add more capacity to the cluster, or delete unwanted data. + - alert: I/O blocked to some data + # PG_AVAILABILITY, but an OSD is not in a DOWN state + expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"})) == 1 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-availability + description: > + Data availability is reduced impacting the clusters abilty to service I/O to some data. One or + more placement groups (PGs) are in a state that blocks IO. + - alert: Cluster too full, automatic data recovery impaired + expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 1 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-backfill-full + description: > + Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs + have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data. 
+ - alert: Placement Group(s) have not been scrubbed + expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-not-scrubbed + description: | + One or more PGs have not been scrubbed recently. The scrub process is a data integrity + feature, protectng against bit-rot. It checks that objects and their metadata (size and + attributes) match across object replicas. When PGs miss their scrub window, it may + indicate the scrub window is too small, or PGs were not in a 'clean' state during the + scrub window. + + You can manually initiate a scrub with: ceph pg scrub + - alert: Placement Group(s) have not been 'DEEP' scrubbed + expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1 + for: 5m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-not-deep-scrubbed + description: | + One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity + feature, protectng against bit-rot. It compares the contents of objects and their + replicas for inconsistency. When PGs miss their deep scrub window, it may indicate + that the window is too small or PGs were not in a 'clean' state during the deep-scrub + window. + + You can manually initiate a deep scrub with: ceph pg deep-scrub + - name: nodes rules: - alert: root volume full @@ -218,9 +554,11 @@ groups: Node {{ $labels.instance }} experiences packet errors > 0.01% or > 10 packets/s on interface {{ $labels.device }}. + # Restrict to device names beginning with '/' to skip false alarms from + # tmpfs, overlay type filesystems - alert: storage filling up expr: | - predict_linear(node_filesystem_free_bytes[2d], 3600 * 24 * 5) * + predict_linear(node_filesystem_free_bytes{device=~"/.*"}[2d], 3600 * 24 * 5) * on(instance) group_left(nodename) node_uname_info < 0 labels: severity: warning @@ -256,7 +594,7 @@ groups: annotations: description: Pool {{ $labels.name }} at {{ $value | humanize }}% capacity. - - alert: pool filling up + - alert: pool filling up (growth forecast) expr: | ( predict_linear(ceph_pool_stored[2d], 3600 * 24 * 5) @@ -271,6 +609,51 @@ groups: Pool {{ $labels.name }} will be full in less than 5 days assuming the average fill-up rate of the past 48 hours. + - alert: Ceph pool is too full for recovery/rebalance + expr: ceph_health_detail{name="POOL_BACKFILLFULL"} > 0 + labels: + severity: warning + type: ceph_default + annotations: + description: > + A pool is approaching it's near full threshold, which will + prevent rebalance operations from completing. You should + consider adding more capacity to the pool. + + - alert: Ceph pool is full - writes blocked + expr: ceph_health_detail{name="POOL_FULL"} > 0 + for: 1m + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pool-full + description: | + A pool has reached it's MAX quota, or the OSDs supporting the pool + have reached their FULL threshold. Until this is resolved, writes to + the pool will be blocked. + + Determine the affected pool with 'ceph df detail', for example looking + at QUOTA BYTES and STORED. Either increase the pools quota, or add + capacity to the cluster first then increase it's quota + (e.g. 
ceph osd pool set quota max_bytes ) + - alert: Ceph pool is approaching full + expr: ceph_health_detail{name="POOL_NEAR_FULL"} > 0 + for: 5m + labels: + severity: warning + type: ceph_default + annotations: + description: | + A pool has exceeeded it warning (percent full) threshold, or the OSDs + supporting the pool have reached their NEARFULL thresholds. Writes may + continue, but you are at risk of the pool going read only if more capacity + isn't made available. + + Determine the affected pool with 'ceph df detail', for example looking + at QUOTA BYTES and STORED. Either increase the pools quota, or add + capacity to the cluster first then increase it's quota + (e.g. ceph osd pool set quota max_bytes ) - name: healthchecks rules: - alert: Slow OSD Ops @@ -280,5 +663,76 @@ groups: severity: warning type: ceph_default annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#slow-ops description: > {{ $value }} OSD requests are taking too long to process (osd_op_complaint_time exceeded) +# cephadm alerts + - name: cephadm + rules: + - alert: Cluster upgrade has failed + expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0 + for: 30s + labels: + severity: critical + type: ceph_default + annotations: + description: > + The cephadm cluster upgrade process has failed. The cluster remains in + an undetermined state. + + Please review the cephadm logs, to understand the nature of the issue + - alert: A daemon managed by cephadm is down + expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0 + for: 30s + labels: + severity: critical + type: ceph_default + annotations: + description: > + A daemon managed by cephadm is no longer active. Determine, which + daemon is down with 'ceph health detail'. you may start daemons with + the 'ceph orch daemon start ' + - alert: cephadm management has been paused + expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0 + for: 1m + labels: + severity: warning + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/cephadm/operations/#cephadm-paused + description: > + Cluster management has been paused manually. This will prevent the + orchestrator from service management and reconciliation. If this is + not intentional, resume cephadm operations with 'ceph orch resume' + +# prometheus alerts + - name: prometheus + rules: + - alert: Scrape job is missing + expr: absent(up{job="ceph"}) + for: 30s + labels: + severity: critical + type: ceph_default + annotations: + description: | + The prometheus job that scrapes from Ceph is no longer defined, this + will effectively mean you'll have no metrics or alerts for the cluster. + + Please review the job definitions in the prometheus.yml file of the prometheus + instance. +# Object related events + - name: rados + rules: + - alert: Data not found/missing + expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1 + for: 30s + labels: + severity: critical + type: ceph_default + annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#object-unfound + description: | + A version of a RADOS object can not be found, even though all OSDs are up. I/O + requests for this object from clients will block (hang). Resolving this issue may + require the object to be rolled back to a prior version manually, and manually verified. 
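+# The rule definitions above can be sanity checked locally; assuming the promtool binary
+# from the Prometheus distribution is available, something like the following should work:
+#   promtool check rules ceph_default_alerts.yml
+#   promtool test rules test_alerts.yml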
\ No newline at end of file diff --git a/monitoring/prometheus/alerts/test_alerts.yml b/monitoring/prometheus/alerts/test_alerts.yml index 913c207339b..1e855c0902b 100644 --- a/monitoring/prometheus/alerts/test_alerts.yml +++ b/monitoring/prometheus/alerts/test_alerts.yml @@ -59,54 +59,54 @@ tests: Please check "ceph health detail" for more information. # low monitor quorum count - - interval: 1m - input_series: - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283", - job="ceph"}' - values: '1 1 1 1 1' - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283", - job="ceph"}' - values: '1 1 1 1 1' - - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283", - job="ceph"}' - values: '0 0 0 0 0' - - series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version - 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific - (dev)",hostname="ceph",instance="ceph:9283",job="ceph", - public_addr="172.20.0.2",rank="0"}' - values: '1 1 1 1 1' - - series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version - 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific - (dev)",hostname="ceph",instance="ceph:9283",job="ceph", - public_addr="172.20.0.2",rank="1"}' - values: '1 1 1 1 1' - - series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version - 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific - (dev)",hostname="ceph",instance="ceph:9283",job="ceph", - public_addr="172.20.0.2",rank="2"}' - values: '1 1 1 1 1' - promql_expr_test: - - expr: sum(ceph_mon_quorum_status) < 3 - eval_time: 1m - exp_samples: - - labels: '{}' - value: 2 - alert_rule_test: - - eval_time: 1m - alertname: low monitor quorum count - exp_alerts: - - exp_labels: - oid: 1.3.6.1.4.1.50495.15.1.2.3.1 - type: ceph_default - severity: critical - exp_annotations: - description: | - Monitor count in quorum is below three. 
+# - interval: 1m +# input_series: +# - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a",instance="ceph:9283", +# job="ceph"}' +# values: '1 1 1 1 1' +# - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b",instance="ceph:9283", +# job="ceph"}' +# values: '1 1 1 1 1' +# - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c",instance="ceph:9283", +# job="ceph"}' +# values: '0 0 0 0 0' +# - series: 'ceph_mon_metadata{ceph_daemon="mon.a",ceph_version="ceph version +# 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific +# (dev)",hostname="ceph",instance="ceph:9283",job="ceph", +# public_addr="172.20.0.2",rank="0"}' +# values: '1 1 1 1 1' +# - series: 'ceph_mon_metadata{ceph_daemon="mon.b",ceph_version="ceph version +# 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific +# (dev)",hostname="ceph",instance="ceph:9283",job="ceph", +# public_addr="172.20.0.2",rank="1"}' +# values: '1 1 1 1 1' +# - series: 'ceph_mon_metadata{ceph_daemon="mon.c",ceph_version="ceph version +# 17.0.0-189-g3558fd72 (3558fd7291855971aa6481a2ade468ad61fbb346) pacific +# (dev)",hostname="ceph",instance="ceph:9283",job="ceph", +# public_addr="172.20.0.2",rank="2"}' +# values: '1 1 1 1 1' +# promql_expr_test: +# - expr: sum(ceph_mon_quorum_status) < 3 +# eval_time: 1m +# exp_samples: +# - labels: '{}' +# value: 2 +# alert_rule_test: +# - eval_time: 1m +# alertname: low monitor quorum count +# exp_alerts: +# - exp_labels: +# oid: 1.3.6.1.4.1.50495.15.1.2.3.1 +# type: ceph_default +# severity: critical +# exp_annotations: +# description: | +# Monitor count in quorum is below three. - Only 2 of 3 monitors are active. +# Only 2 of 3 monitors are active. - The following monitors are down: - - mon.c on ceph +# The following monitors are down: +# - mon.c on ceph # 10% OSDs down @@ -161,141 +161,141 @@ tests: - osd.1 on ceph # OSD down - - interval: 1m - input_series: - - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' - values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' - - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' - values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' - promql_expr_test: - - expr: count(ceph_osd_up == 0) > 0 - eval_time: 1m - 
exp_samples: - - labels: '{}' - value: 1 - alert_rule_test: - - eval_time: 15m - alertname: OSD down - exp_alerts: - - exp_labels: - oid: 1.3.6.1.4.1.50495.15.1.2.4.2 - type: ceph_default - severity: warning - exp_annotations: - description: | +# - interval: 1m +# input_series: +# - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' +# values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' +# - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' +# values: '0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' +# - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' +# values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' +# - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", +# ceph_version="ceph version 17.0.0-189-g3558fd72 +# (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", +# cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", +# hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", +# public_addr="172.20.0.2"}' +# values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' +# - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", +# ceph_version="ceph version 17.0.0-189-g3558fd72 +# (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", +# cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", +# hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", +# public_addr="172.20.0.2"}' +# values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' +# - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", +# ceph_version="ceph version 17.0.0-189-g3558fd72 +# (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", +# cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", +# hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", +# public_addr="172.20.0.2"}' +# values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' +# promql_expr_test: +# - expr: count(ceph_osd_up == 0) > 0 +# eval_time: 1m +# exp_samples: +# - labels: '{}' +# value: 1 +# alert_rule_test: +# - eval_time: 15m +# alertname: OSD down +# exp_alerts: +# - exp_labels: +# oid: 1.3.6.1.4.1.50495.15.1.2.4.2 +# type: ceph_default +# severity: warning +# exp_annotations: +# description: | - 1 OSD down for more than 15 minutes. +# 1 OSD down for more than 15 minutes. - 1 of 3 OSDs are down. +# 1 of 3 OSDs are down. 
- The following OSD is down: - - osd.1 on ceph +# The following OSD is down: +# - osd.1 on ceph - # OSDs near full - - interval: 1m - input_series: - - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283" - ,job="ceph"}' - values: '1076310016 1076310016 1076310016 1076310016 1076310016 - 1076310016' - - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283" - ,job="ceph"}' - values: '1076310016 1076310016 1076310016 1076310016 1076310016 - 1076310016' - - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283" - ,job="ceph"}' - values: '1076310016 1076310016 1076310016 1076310016 1076310016 - 100856561909.76' - - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283" - ,job="ceph"}' - values: '108447916032 108447916032 108447916032 108447916032 108447916032 - 108447916032' - - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283" - ,job="ceph"}' - values: '108447916032 108447916032 108447916032 108447916032 108447916032 - 108447916032' - - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283" - ,job="ceph"}' - values: '108447916032 108447916032 108447916032 108447916032 108447916032 - 108447916032' - - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' - values: '1 1 1 1 1 1' - - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' - values: '1 1 1 1 1 1' - - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' - values: '1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' - values: '1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' - values: '1 1 1 1 1 1' - - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", - ceph_version="ceph version 17.0.0-189-g3558fd72 - (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", - cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", - hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", - public_addr="172.20.0.2"}' - values: '1 1 1 1 1 1' - promql_expr_test: - - expr: | - ( - ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) - ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname) - ceph_osd_metadata - ) * 100 > 90 +# OSDs near full +# - interval: 1m +# input_series: +# - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.0",instance="ceph:9283" +# ,job="ceph"}' +# values: '1076310016 1076310016 1076310016 1076310016 1076310016 +# 1076310016' +# - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.1",instance="ceph:9283" +# ,job="ceph"}' +# values: '1076310016 1076310016 1076310016 1076310016 1076310016 +# 1076310016' +# - series: 'ceph_osd_stat_bytes_used{ceph_daemon="osd.2",instance="ceph:9283" +# ,job="ceph"}' +# values: '1076310016 1076310016 1076310016 1076310016 1076310016 +# 100856561909.76' +# - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.0",instance="ceph:9283" +# ,job="ceph"}' +# 
values: '108447916032 108447916032 108447916032 108447916032 108447916032 +# 108447916032' +# - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.1",instance="ceph:9283" +# ,job="ceph"}' +# values: '108447916032 108447916032 108447916032 108447916032 108447916032 +# 108447916032' +# - series: 'ceph_osd_stat_bytes{ceph_daemon="osd.2",instance="ceph:9283" +# ,job="ceph"}' +# values: '108447916032 108447916032 108447916032 108447916032 108447916032 +# 108447916032' +# - series: 'ceph_osd_up{ceph_daemon="osd.0",instance="ceph:9283",job="ceph"}' +# values: '1 1 1 1 1 1' +# - series: 'ceph_osd_up{ceph_daemon="osd.1",instance="ceph:9283",job="ceph"}' +# values: '1 1 1 1 1 1' +# - series: 'ceph_osd_up{ceph_daemon="osd.2",instance="ceph:9283",job="ceph"}' +# values: '1 1 1 1 1 1' +# - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.0", +# ceph_version="ceph version 17.0.0-189-g3558fd72 +# (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", +# cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", +# hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", +# public_addr="172.20.0.2"}' +# values: '1 1 1 1 1 1' +# - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.1", +# ceph_version="ceph version 17.0.0-189-g3558fd72 +# (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", +# cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", +# hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", +# public_addr="172.20.0.2"}' +# values: '1 1 1 1 1 1' +# - series: 'ceph_osd_metadata{back_iface="eth0",ceph_daemon="osd.2", +# ceph_version="ceph version 17.0.0-189-g3558fd72 +# (3558fd7291855971aa6481a2ade468ad61fbb346) pacific (dev)", +# cluster_addr="172.20.0.2",device_class="hdd",front_iface="eth0", +# hostname="ceph",instance="ceph:9283",job="ceph",objectstore="bluestore", +# public_addr="172.20.0.2"}' +# values: '1 1 1 1 1 1' +# promql_expr_test: +# - expr: | +# ( +# ((ceph_osd_stat_bytes_used / ceph_osd_stat_bytes) and on(ceph_daemon) +# ceph_osd_up == 1) * on(ceph_daemon) group_left(hostname) +# ceph_osd_metadata +# ) * 100 > 90 - eval_time: 5m - exp_samples: - - labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283", - job="ceph"}' - value: 9.3E+01 - alert_rule_test: - - eval_time: 10m - alertname: OSDs near full - exp_alerts: - - exp_labels: - ceph_daemon: osd.2 - hostname: ceph - instance: ceph:9283 - job: ceph - oid: 1.3.6.1.4.1.50495.15.1.2.4.3 - type: ceph_default - severity: critical - exp_annotations: - description: > - OSD osd.2 on ceph is dangerously full: 93% +# eval_time: 5m +# exp_samples: +# - labels: '{ceph_daemon="osd.2",hostname="ceph",instance="ceph:9283", +# job="ceph"}' +# value: 9.3E+01 +# alert_rule_test: +# - eval_time: 10m +# alertname: OSDs near full +# exp_alerts: +# - exp_labels: +# ceph_daemon: osd.2 +# hostname: ceph +# instance: ceph:9283 +# job: ceph +# oid: 1.3.6.1.4.1.50495.15.1.2.4.3 +# type: ceph_default +# severity: critical +# exp_annotations: +# description: > +# OSD osd.2 on ceph is dangerously full: 93% # flapping OSD - interval: 1s @@ -340,7 +340,7 @@ tests: value: 1.2200000000000001E+01 alert_rule_test: - eval_time: 5m - alertname: flapping OSD + alertname: Flapping OSD exp_alerts: - exp_labels: ceph_daemon: osd.0 @@ -351,10 +351,13 @@ tests: severity: warning type: ceph_default exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/troubleshooting/troubleshooting-osd/#flapping-osds description: > OSD osd.0 on ceph was - marked down and back up at 
20.1 times once a - minute for 5 minutes. + marked down and back up at 20.1 times once a minute for 5 minutes. + This could indicate a network issue (latency, packet drop, disruption) + on the clusters "cluster network". Check the network environment on the + listed host(s). # high pg count deviation - interval: 1m @@ -694,7 +697,7 @@ tests: values: '0 0 0 0 0' - series: 'node_network_up{device="eth4",instance="node-exporter", job="node-exporter"}' - values: '1 1 1 1 1' + values: '1 1 1 1 1' promql_expr_test: - expr: node_network_mtu_bytes{device!="lo"} * (node_network_up{device!="lo"} > 0) != on() group_left() (quantile(0.5, node_network_mtu_bytes{device!="lo"})) @@ -792,6 +795,1012 @@ tests: severity: warning type: ceph_default exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#slow-ops description: > 1 OSD requests are taking too long to process (osd_op_complaint_time exceeded) + +# CEPHADM orchestrator alert triggers + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="UPGRADE_EXCEPTION"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="UPGRADE_EXCEPTION"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="UPGRADE_EXCEPTION"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Cluster upgrade has failed + - eval_time: 5m + alertname: Cluster upgrade has failed + exp_alerts: + - exp_labels: + name: UPGRADE_EXCEPTION + severity: critical + type: ceph_default + exp_annotations: + description: > + The cephadm cluster upgrade process has failed. The cluster remains in + an undetermined state. + + Please review the cephadm logs, to understand the nature of the issue + - interval: 30s + input_series: + - series: 'ceph_health_detail{name="CEPHADM_FAILED_DAEMON"}' + values: '1+0x40' + promql_expr_test: + - expr: ceph_health_detail{name="CEPHADM_FAILED_DAEMON"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="CEPHADM_FAILED_DAEMON"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: A daemon managed by cephadm is down + - eval_time: 5m + alertname: A daemon managed by cephadm is down + exp_alerts: + - exp_labels: + name: CEPHADM_FAILED_DAEMON + severity: critical + type: ceph_default + exp_annotations: + description: > + A daemon managed by cephadm is no longer active. Determine, which + daemon is down with 'ceph health detail'. you may start daemons with + the 'ceph orch daemon start ' + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="CEPHADM_PAUSED"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="CEPHADM_PAUSED"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="CEPHADM_PAUSED"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: cephadm management has been paused + - eval_time: 5m + alertname: cephadm management has been paused + exp_alerts: + - exp_labels: + name: CEPHADM_PAUSED + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephadm/operations/#cephadm-paused + description: > + Cluster management has been paused manually. This will prevent the + orchestrator from service management and reconciliation. 
If this is + not intentional, resume cephadm operations with 'ceph orch resume' +# MDS + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MDS_DAMAGE"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="MDS_DAMAGE"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MDS_DAMAGE"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Ceph Filesystem damage detected + - eval_time: 5m + alertname: Ceph Filesystem damage detected + exp_alerts: + - exp_labels: + name: MDS_DAMAGE + severity: critical + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#cephfs-health-messages + description: > + The filesystems metadata has been corrupted. Data access + may be blocked. + + Either analyse the output from the mds daemon admin socket, or + escalate to support + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MDS_HEALTH_READ_ONLY"}' + values: '1 1 1 1 1 1 1 1 1' + promql_expr_test: + - expr: ceph_health_detail{name="MDS_HEALTH_READ_ONLY"} > 0 + eval_time: 2m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MDS_HEALTH_READ_ONLY"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Ceph Filesystem switched to READ ONLY + - eval_time: 5m + alertname: Ceph Filesystem switched to READ ONLY + exp_alerts: + - exp_labels: + name: MDS_HEALTH_READ_ONLY + severity: critical + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/cephfs/health-messages/#cephfs-health-messages + description: > + The filesystem has switched to READ ONLY due to an unexpected + write error, when writing to the metadata pool + + Either analyse the output from the mds daemon admin socket, or + escalate to support +# MGR + - interval: 1m + input_series: + - series: 'up{job="ceph", instance="ceph-mgr:9283"}' + values: '1+0x2 0+0x10' + promql_expr_test: + - expr: up{job="ceph"} == 0 + eval_time: 3m + exp_samples: + - labels: '{__name__="up", job="ceph", instance="ceph-mgr:9283"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: mgr prometheus module is not active + - eval_time: 10m + alertname: mgr prometheus module is not active + exp_alerts: + - exp_labels: + instance: ceph-mgr:9283 + job: ceph + severity: critical + type: ceph_default + exp_annotations: + description: > + The mgr/prometheus module at ceph-mgr:9283 is unreachable. This + could mean that the module has been disabled or the mgr itself is down. + + Without the mgr/prometheus module metrics and alerts will no longer + function. Open a shell to ceph and use 'ceph -s' to to determine whether the + mgr is active. 
If the mgr is not active, restart it, otherwise you can check + the mgr/prometheus module is loaded with 'ceph mgr module ls' and if it's + not listed as enabled, enable it with 'ceph mgr module enable prometheus' + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="RECENT_MGR_MODULE_CRASH"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="RECENT_MGR_MODULE_CRASH"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: mgr module failure + - eval_time: 15m + alertname: mgr module failure + exp_alerts: + - exp_labels: + name: RECENT_MGR_MODULE_CRASH + severity: critical + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#recent-mgr-module-crash + description: > + One or more mgr modules have crashed and are yet to be acknowledged by the administrator. A + crashed module may impact functionality within the cluster. Use the 'ceph crash' commands to + investigate which module has failed, and archive it to acknowledge the failure. +# MON + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MON_DISK_CRIT"}' + values: '0+0x2 1+0x10' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}' + values: '1+0x13' + promql_expr_test: + - expr: ceph_health_detail{name="MON_DISK_CRIT"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MON_DISK_CRIT"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Ceph mon disk space critically low + - eval_time: 10m + alertname: Ceph mon disk space critically low + exp_alerts: + - exp_labels: + name: "MON_DISK_CRIT" + severity: critical + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-disk-crit + description: | + The free space available to a monitor's store is critically low (<5% by default). + You should increase the space available to the monitor(s). The + default location for the store sits under /var/lib/ceph. Your monitor hosts are; + - ceph-mon-a + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MON_DISK_LOW"}' + values: '0+0x2 1+0x10' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-a"}' + values: '1+0x13' + promql_expr_test: + - expr: ceph_health_detail{name="MON_DISK_LOW"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MON_DISK_LOW"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Ceph mon disk space running low + - eval_time: 10m + alertname: Ceph mon disk space running low + exp_alerts: + - exp_labels: + name: "MON_DISK_LOW" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-disk-low + description: | + The space available to a monitor's store is approaching full (>70% is the default). + You should increase the space available to the monitor store. The + default location for the store sits under /var/lib/ceph. 
Your monitor hosts are; + - ceph-mon-a + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MON_CLOCK_SKEW"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="MON_CLOCK_SKEW"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="MON_CLOCK_SKEW"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Clock skew detected across Ceph Monitor daemons + - eval_time: 10m + alertname: Clock skew detected across Ceph Monitor daemons + exp_alerts: + - exp_labels: + name: "MON_CLOCK_SKEW" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-clock-skew + description: | + The ceph monitors rely on a consistent time reference to maintain + quorum and cluster consistency. This event indicates that at least + one of your mons is not sync'd correctly. + + Review the cluster status with ceph -s. This will show which monitors + are affected. Check the time sync status on each monitor host. + +# Check 3 mons one down, quorum at risk + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="MON_DOWN"}' + values: '0+0x2 1+0x12' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}' + values: '1+0x2 0+0x12' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}' + values: '1+0x14' + promql_expr_test: + - expr: ((ceph_health_detail{name="MON_DOWN"} == 1) * on() (count(ceph_mon_quorum_status == 1) == bool (floor(count(ceph_mon_metadata) / 2) + 1))) == 1 + eval_time: 3m + exp_samples: + - labels: '{}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Monitor down, quorum is at risk + # shouldn't fire + - eval_time: 10m + alertname: Monitor down, quorum is at risk + exp_alerts: + - exp_labels: + severity: critical + type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.3.1 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-down + description: | + Quorum requires a majority of monitors (x 2) to be active + Without quorum the cluster will become inoperable, affecting all connected clients and services. 
+ + The following monitors are down: + - mon.c on ceph-mon-3 +# check 5 mons, 1 down - warning only + - interval: 1m + input_series: + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.a"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.b"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.c"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.d"}' + values: '1+0x14' + - series: 'ceph_mon_quorum_status{ceph_daemon="mon.e"}' + values: '1+0x2 0+0x12' + - series: 'ceph_mon_metadata{ceph_daemon="mon.a", hostname="ceph-mon-1"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.b", hostname="ceph-mon-2"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.c", hostname="ceph-mon-3"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.d", hostname="ceph-mon-4"}' + values: '1+0x14' + - series: 'ceph_mon_metadata{ceph_daemon="mon.e", hostname="ceph-mon-5"}' + values: '1+0x14' + promql_expr_test: + - expr: (count(ceph_mon_quorum_status == 0) <= (count(ceph_mon_metadata) - floor(count(ceph_mon_metadata) / 2) + 1)) + eval_time: 3m + exp_samples: + - labels: '{}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Monitor down + - eval_time: 10m + alertname: Monitor down + exp_alerts: + - exp_labels: + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#mon-down + description: | + You have 1 monitor down. + Quorum is still intact, but the loss of further monitors will make your cluster inoperable. + + The following monitors are down: + - mon.e on ceph-mon-5 +# Device Health + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="DEVICE_HEALTH"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="DEVICE_HEALTH"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Device failure predicted + - eval_time: 10m + alertname: Device failure predicted + exp_alerts: + - exp_labels: + name: "DEVICE_HEALTH" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#id2 + description: | + The device health module has determined that one or more devices will fail + soon. To review the device states use 'ceph device ls'. To show a specific + device use 'ceph device info '. + + Mark the OSD as out (so data may migrate to other OSDs in the cluster). Once + the osd is empty remove and replace the OSD. 
+ - interval: 1m + input_series: + - series: 'ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="DEVICE_HEALTH_TOOMANY"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_TOOMANY"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Too many devices predicted to fail + - eval_time: 10m + alertname: Too many devices predicted to fail + exp_alerts: + - exp_labels: + name: "DEVICE_HEALTH_TOOMANY" + severity: critical + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#device-health-toomany + description: | + The device health module has determined that the number of devices predicted to + fail can not be remediated automatically, since it would take too many osd's out of + the cluster, impacting performance and potentially availabililty. You should add new + OSDs to the cluster to allow data to be relocated to avoid the data integrity issues. + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="DEVICE_HEALTH_IN_USE"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="DEVICE_HEALTH_IN_USE"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="DEVICE_HEALTH_IN_USE"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: Device failure predicted, but automatic drain is incomplete + - eval_time: 10m + alertname: Device failure predicted, but automatic drain is incomplete + exp_alerts: + - exp_labels: + name: "DEVICE_HEALTH_IN_USE" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#device-health-in-use + description: | + The device health module has determined that one or more devices will fail + soon, but the normal process of relocating the data on the device to other + OSDs in the cluster is blocked. + + Check the the cluster has available freespace. It may be necessary to add + more disks to the cluster to allow the data from the failing device to + successfully migrate. 
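+# Note on the input_series values used in these tests: promtool expands 'a+bxN' into N+1
+# samples starting at a and stepping by b per interval, so a values string such as
+# '0+0x2 1+0x10' represents three samples of 0 followed by eleven samples of 1.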
+# OSD + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_HOST_DOWN"}' + values: '0+0x2 1+0x10' + - series: 'ceph_osd_up{ceph_daemon="osd.0"}' + values: '1+0x2 0+0x10' + - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}' + values: '1+0x12' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_HOST_DOWN"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_HOST_DOWN"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: OSD Host is down + - eval_time: 10m + alertname: OSD Host is down + exp_alerts: + - exp_labels: + name: "OSD_HOST_DOWN" + severity: warning + type: ceph_default + exp_annotations: + description: | + The following OSDs are down: + - ceph-osd-1 : osd.0 + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_FRONT"} == 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_FRONT"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: OSD hearbeats running slow (frontend) + - eval_time: 10m + alertname: OSD hearbeats running slow (frontend) + exp_alerts: + - exp_labels: + name: "OSD_SLOW_PING_TIME_FRONT" + severity: warning + type: ceph_default + exp_annotations: + description: | + OSD heartbeats on the cluster's 'public' network (frontend) are running slow. Investigate the network + for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs. + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_SLOW_PING_TIME_BACK"} == 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_SLOW_PING_TIME_BACK"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: OSD hearbeats running slow (backend) + - eval_time: 10m + alertname: OSD hearbeats running slow (backend) + exp_alerts: + - exp_labels: + name: "OSD_SLOW_PING_TIME_BACK" + severity: warning + type: ceph_default + exp_annotations: + description: | + OSD heartbeats on the cluster's 'cluster' network (backend) are running slow. Investigate the network + for any latency issues on this subnet. Use 'ceph health detail' to show the affected OSDs. + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="BLUESTORE_DISK_SIZE_MISMATCH"} == 0 + eval_time: 1m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="BLUESTORE_DISK_SIZE_MISMATCH"}' + value: 0 + alert_rule_test: + - eval_time: 1m + alertname: OSD disk size mismatch + - eval_time: 10m + alertname: OSD disk size mismatch + exp_alerts: + - exp_labels: + name: "BLUESTORE_DISK_SIZE_MISMATCH" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#bluestore-disk-size-mismatch + description: | + One or more OSDs have an internal inconsistency between the size of the physical device and it's metadata. + This could lead to the OSD(s) crashing in future. You should redeploy the effected OSDs. 
+ - interval: 30s + input_series: + - series: 'ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"}' + values: '0+0x2 1+0x20' + promql_expr_test: + - expr: ceph_health_detail{name="BLUESTORE_SPURIOUS_READ_ERRORS"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="BLUESTORE_SPURIOUS_READ_ERRORS"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: OSD Read errors + - eval_time: 10m + alertname: OSD Read errors + exp_alerts: + - exp_labels: + name: "BLUESTORE_SPURIOUS_READ_ERRORS" + severity: warning + type: ceph_default + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#bluestore-spurious-read-errors + description: > + An OSD has encountered read errors, but the OSD has recovered by retrying + the reads. This may indicate an issue with the Hardware or Kernel. + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_DOWN"}' + values: '0+0x2 1+0x10' + - series: 'ceph_osd_up{ceph_daemon="osd.0"}' + values: '1+0x12' + - series: 'ceph_osd_up{ceph_daemon="osd.1"}' + values: '1+0x2 0+0x10' + - series: 'ceph_osd_up{ceph_daemon="osd.2"}' + values: '1+0x12' + - series: 'ceph_osd_metadata{ceph_daemon="osd.0", hostname="ceph-osd-1"}' + values: '1+0x12' + - series: 'ceph_osd_metadata{ceph_daemon="osd.1", hostname="ceph-osd-2"}' + values: '1+0x12' + - series: 'ceph_osd_metadata{ceph_daemon="osd.2", hostname="ceph-osd-3"}' + values: '1+0x12' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_DOWN"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_DOWN"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: OSD down + - eval_time: 10m + alertname: OSD down + exp_alerts: + - exp_labels: + name: "OSD_DOWN" + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.4.2 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-down + description: | + 1 OSD down for over 5mins. + + The following OSD is down: + - osd.1 on ceph-osd-2 + - interval: 1m + input_series: + - series: 'ceph_health_detail{name="OSD_NEARFULL"}' + values: '0+0x2 1+0x10' + promql_expr_test: + - expr: ceph_health_detail{name="OSD_NEARFULL"} == 1 + eval_time: 3m + exp_samples: + - labels: '{__name__="ceph_health_detail", name="OSD_NEARFULL"}' + value: 1 + alert_rule_test: + - eval_time: 1m + alertname: OSDs near full + - eval_time: 10m + alertname: OSDs near full + exp_alerts: + - exp_labels: + name: "OSD_NEARFULL" + severity: warning + type: ceph_default + oid: 1.3.6.1.4.1.50495.15.1.2.4.3 + exp_annotations: + documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-nearfull + description: | + One or more OSDs have reached their NEARFULL threshold + + Use 'ceph health detail' to identify which OSDs have reached this threshold. 
+            To resolve, either add capacity to the cluster, or delete unwanted data
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_BACKFILLFULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_BACKFILLFULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_BACKFILLFULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD unable to perform rebalance
+    - eval_time: 10m
+      alertname: OSD unable to perform rebalance
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_BACKFILLFULL"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-backfillfull
+          description: |
+            An OSD has reached its BACKFILL FULL threshold. This will prevent rebalance operations
+            from completing for some pools. Check the current capacity utilisation with 'ceph df'
+
+            To resolve, either add capacity to the cluster, or delete unwanted data
+ - interval: 30s
+   input_series:
+    - series: 'ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="OSD_TOO_MANY_REPAIRS"} == 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="OSD_TOO_MANY_REPAIRS"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: OSD too many read repairs
+    - eval_time: 10m
+      alertname: OSD too many read repairs
+      exp_alerts:
+      - exp_labels:
+          name: "OSD_TOO_MANY_REPAIRS"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#osd-too-many-repairs
+          description: |
+            Reads from an OSD have used a secondary PG to return data to the client, indicating
+            a potentially failing disk.
+# Pools
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="POOL_BACKFILLFULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="POOL_BACKFILLFULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="POOL_BACKFILLFULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph pool is too full for recovery/rebalance
+    - eval_time: 5m
+      alertname: Ceph pool is too full for recovery/rebalance
+      exp_alerts:
+      - exp_labels:
+          name: "POOL_BACKFILLFULL"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          description: >
+            A pool is approaching its near full threshold, which will
+            prevent rebalance operations from completing. You should
+            consider adding more capacity to the pool.
+
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="POOL_FULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="POOL_FULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="POOL_FULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph pool is full - writes blocked
+    - eval_time: 10m
+      alertname: Ceph pool is full - writes blocked
+      exp_alerts:
+      - exp_labels:
+          name: "POOL_FULL"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pool-full
+          description: |
+            A pool has reached its MAX quota, or the OSDs supporting the pool
+            have reached their FULL threshold. Until this is resolved, writes to
+            the pool will be blocked.
+
+            Determine the affected pool with 'ceph df detail', for example looking
+            at QUOTA BYTES and STORED. Either increase the pool's quota, or add
+            capacity to the cluster first, then increase its quota
+            (e.g. ceph osd pool set quota max_bytes )
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="POOL_NEAR_FULL"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="POOL_NEAR_FULL"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="POOL_NEAR_FULL"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Ceph pool is approaching full
+    - eval_time: 10m
+      alertname: Ceph pool is approaching full
+      exp_alerts:
+      - exp_labels:
+          name: "POOL_NEAR_FULL"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          description: |
+            A pool has exceeded its warning (percent full) threshold, or the OSDs
+            supporting the pool have reached their NEARFULL thresholds. Writes may
+            continue, but you are at risk of the pool going read-only if more capacity
+            isn't made available.
+
+            Determine the affected pool with 'ceph df detail', for example looking
+            at QUOTA BYTES and STORED. Either increase the pool's quota, or add
+            capacity to the cluster first, then increase its quota
+            (e.g. ceph osd pool set quota max_bytes )
+
+# PGs
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_NOT_SCRUBBED"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="PG_NOT_SCRUBBED"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="PG_NOT_SCRUBBED"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Placement Group(s) have not been scrubbed
+    - eval_time: 10m
+      alertname: Placement Group(s) have not been scrubbed
+      exp_alerts:
+      - exp_labels:
+          name: "PG_NOT_SCRUBBED"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-not-scrubbed
+          description: |
+            One or more PGs have not been scrubbed recently. The scrub process is a data integrity
+            feature, protecting against bit-rot. It checks that objects and their metadata (size and
+            attributes) match across object replicas. When PGs miss their scrub window, it may
+            indicate the scrub window is too small, or PGs were not in a 'clean' state during the
+            scrub window.
+
+            You can manually initiate a scrub with: ceph pg scrub
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_RECOVERY_FULL"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="PG_RECOVERY_FULL"} == 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="PG_RECOVERY_FULL"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Recovery at risk, cluster too full
+    - eval_time: 10m
+      alertname: Recovery at risk, cluster too full
+      exp_alerts:
+      - exp_labels:
+          name: "PG_RECOVERY_FULL"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-recovery-full
+          description: >
+            Data redundancy may be reduced, or is at risk, since one or more OSDs are at or above their
+            'full' threshold. Add more capacity to the cluster, or delete unwanted data.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_BACKFILL_FULL"}'
+      values: '0+0x2 1+0x20'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="PG_BACKFILL_FULL"} == 0
+       eval_time: 1m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="PG_BACKFILL_FULL"}'
+           value: 0
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Cluster too full, automatic data recovery impaired
+    - eval_time: 10m
+      alertname: Cluster too full, automatic data recovery impaired
+      exp_alerts:
+      - exp_labels:
+          name: "PG_BACKFILL_FULL"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-backfill-full
+          description: >
+            Data redundancy may be at risk due to lack of free space within the cluster. One or more OSDs
+            have breached their 'backfillfull' threshold. Add more capacity, or delete unwanted data.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_AVAILABILITY"}'
+      values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_health_detail{name="OSD_DOWN"}'
+      values: '0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0'
+   promql_expr_test:
+     - expr: ((ceph_health_detail{name="PG_AVAILABILITY"} == 1) - scalar(ceph_health_detail{name="OSD_DOWN"}))
+       eval_time: 1m
+       # empty set at 1m
+       exp_samples:
+   alert_rule_test:
+    # PG_AVAILABILITY and OSD_DOWN not firing .. no alert
+    - eval_time: 1m
+      alertname: I/O blocked to some data
+      exp_alerts:
+    # PG_AVAILABILITY firing, but OSD_DOWN is active .. no alert
+    - eval_time: 5m
+      alertname: I/O blocked to some data
+      exp_alerts:
+    # PG_AVAILABILITY firing, and OSD_DOWN is not active .. raise the alert
+    - eval_time: 15m
+      alertname: I/O blocked to some data
+      exp_alerts:
+      - exp_labels:
+          name: "PG_AVAILABILITY"
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-availability
+          description: >
+            Data availability is reduced, impacting the cluster's ability to service I/O to some data. One or
+            more placement groups (PGs) are in a state that blocks I/O.
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"}'
+      values: '0+0x2 1+0x10'
+   promql_expr_test:
+     - expr: ceph_health_detail{name="PG_NOT_DEEP_SCRUBBED"} == 1
+       eval_time: 3m
+       exp_samples:
+         - labels: '{__name__="ceph_health_detail", name="PG_NOT_DEEP_SCRUBBED"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 1m
+      alertname: Placement Group(s) have not been 'DEEP' scrubbed
+    - eval_time: 10m
+      alertname: Placement Group(s) have not been 'DEEP' scrubbed
+      exp_alerts:
+      - exp_labels:
+          name: "PG_NOT_DEEP_SCRUBBED"
+          severity: warning
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#pg-not-deep-scrubbed
+          description: |
+            One or more PGs have not been deep scrubbed recently. Deep scrub is a data integrity
+            feature, protecting against bit-rot. It compares the contents of objects and their
+            replicas for inconsistency. When PGs miss their deep scrub window, it may indicate
+            that the window is too small or PGs were not in a 'clean' state during the deep-scrub
+            window.
+
+            You can manually initiate a deep scrub with: ceph pg deep-scrub
+
+# Prometheus
+ - interval: 1m
+   input_series:
+    - series: 'up{job="myjob"}'
+      values: '1+0x10'
+   promql_expr_test:
+     - expr: absent(up{job="ceph"})
+       eval_time: 1m
+       exp_samples:
+         - labels: '{job="ceph"}'
+           value: 1
+   alert_rule_test:
+    - eval_time: 5m
+      alertname: Scrape job is missing
+      exp_alerts:
+      - exp_labels:
+          job: ceph
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          description: |
+            The Prometheus job that scrapes from Ceph is no longer defined. This
+            effectively means you'll have no metrics or alerts for the cluster.
+
+            Please review the job definitions in the prometheus.yml file of the Prometheus
+            instance.
+# RADOS
+ - interval: 1m
+   input_series:
+    - series: 'ceph_health_detail{name="OBJECT_UNFOUND"}'
+      values: '0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_up{ceph_daemon="osd.0"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_up{ceph_daemon="osd.1"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_up{ceph_daemon="osd.2"}'
+      values: '1 1 1 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.0"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.1"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    - series: 'ceph_osd_metadata{ceph_daemon="osd.2"}'
+      values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+   promql_expr_test:
+     - expr: (ceph_health_detail{name="OBJECT_UNFOUND"} == 1) * on() (count(ceph_osd_up == 1) == bool count(ceph_osd_metadata)) == 1
+       eval_time: 1m
+       exp_samples:
+   alert_rule_test:
+    # OBJECT_UNFOUND but osd.2 is down, so don't fire
+    - eval_time: 5m
+      alertname: Data not found/missing
+      exp_alerts:
+    # OBJECT_UNFOUND and all OSDs are online, so fire
+    - eval_time: 15m
+      alertname: Data not found/missing
+      exp_alerts:
+      - exp_labels:
+          severity: critical
+          type: ceph_default
+        exp_annotations:
+          documentation: https://docs.ceph.com/en/latest/rados/operations/health-checks/#object-unfound
+          description: |
+            A version of a RADOS object cannot be found, even though all OSDs are up. I/O
+            requests for this object from clients will block (hang). Resolving this issue may
+            require the object to be rolled back to a prior version manually, and then verified.
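The ``values`` strings in the tests above use promtool's expanding notation (for example ``'0+0x2 1+0x10'``). As an illustrative aside that is not part of the patch, the following standalone Python sketch expands such strings into explicit sample lists; it assumes ``a+bxn`` yields n+1 samples (a, a+b, ..., a+n*b) and ignores the ``_`` and ``stale`` markers that promtool also accepts::

    import re
    from typing import List

    _EXPAND = re.compile(r'^(-?\d+(?:\.\d+)?)([+-]\d+(?:\.\d+)?)x(\d+)$')


    def expand_values(spec: str) -> List[float]:
        """Expand a promtool 'values' specification into a list of samples."""
        samples: List[float] = []
        for token in spec.split():
            m = _EXPAND.match(token)
            if m:
                start, step, repeat = float(m.group(1)), float(m.group(2)), int(m.group(3))
                # 'a+bxn' (or 'a-bxn') -> n+1 samples: a, a+b, ..., a+n*b
                samples.extend(start + i * step for i in range(repeat + 1))
            else:
                # a bare number is a single sample
                samples.append(float(token))
        return samples


    if __name__ == '__main__':
        # '0+0x2 1+0x10' -> three 0.0 samples followed by eleven 1.0 samples
        print(expand_values('0+0x2 1+0x10'))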
\ No newline at end of file
diff --git a/src/pybind/mgr/prometheus/module.py b/src/pybind/mgr/prometheus/module.py
index 746ad9ceaf3..8806ccf5fbd 100644
--- a/src/pybind/mgr/prometheus/module.py
+++ b/src/pybind/mgr/prometheus/module.py
@@ -7,10 +7,12 @@ import os
 import re
 import threading
 import time
-from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT
+import enum
+from mgr_module import CLIReadCommand, MgrModule, MgrStandbyModule, PG_STATES, Option, ServiceInfoT, HandleCommandResult, CLIWriteCommand
 from mgr_util import get_default_addr, profile_method, build_url
 from rbd import RBD
 from collections import namedtuple
+import yaml

 from typing import DefaultDict, Optional, Dict, Any, Set, cast, Tuple, Union, List

@@ -115,6 +117,189 @@ HEALTH_CHECKS = [
     alert_metric('SLOW_OPS', 'OSD or Monitor requests taking a long time to process'),
 ]

+HEALTHCHECK_DETAIL = ('name', 'severity')
+
+
+class Severity(enum.Enum):
+    ok = "HEALTH_OK"
+    warn = "HEALTH_WARN"
+    error = "HEALTH_ERR"
+
+
+class Format(enum.Enum):
+    plain = 'plain'
+    json = 'json'
+    json_pretty = 'json-pretty'
+    yaml = 'yaml'
+
+
+class HealthCheckEvent:
+
+    def __init__(self, name: str, severity: Severity, first_seen: float, last_seen: float, count: int, active: bool = True):
+        self.name = name
+        self.severity = severity
+        self.first_seen = first_seen
+        self.last_seen = last_seen
+        self.count = count
+        self.active = active
+
+    def as_dict(self) -> Dict[str, Any]:
+        """Return the instance as a dictionary."""
+        return self.__dict__
+
+
+class HealthHistory:
+    kv_name = 'health_history'
+    titles = "{healthcheck_name:<24} {first_seen:<20} {last_seen:<20} {count:>5} {active:^6}"
+    date_format = "%Y/%m/%d %H:%M:%S"
+
+    def __init__(self, mgr: MgrModule):
+        self.mgr = mgr
+        self.lock = threading.Lock()
+        self.healthcheck: Dict[str, HealthCheckEvent] = {}
+        self._load()
+
+    def _load(self) -> None:
+        """Load the current state from the mon KV store."""
+        data = self.mgr.get_store(self.kv_name)
+        if data:
+            try:
+                healthcheck_data = json.loads(data)
+            except json.JSONDecodeError:
+                self.mgr.log.warning(
+                    f"INVALID data read from mgr/prometheus/{self.kv_name}. Resetting")
+                self.reset()
+                return
+            else:
+                for k, v in healthcheck_data.items():
+                    self.healthcheck[k] = HealthCheckEvent(
+                        name=k,
+                        severity=v.get('severity'),
+                        first_seen=v.get('first_seen', 0),
+                        last_seen=v.get('last_seen', 0),
+                        count=v.get('count', 1),
+                        active=v.get('active', True))
+        else:
+            self.reset()
+
+    def reset(self) -> None:
+        """Reset the healthcheck history."""
+        with self.lock:
+            self.mgr.set_store(self.kv_name, "{}")
+            self.healthcheck = {}
+
+    def save(self) -> None:
+        """Save the current in-memory healthcheck history to the KV store."""
+        with self.lock:
+            self.mgr.set_store(self.kv_name, self.as_json())
+
+    def check(self, health_checks: Dict[str, Any]) -> None:
+        """Look at the current health checks and compare them against the existing history.
+
+        Args:
+            health_checks (Dict[str, Any]): current health check data
+        """
+
+        current_checks = health_checks.get('checks', {})
+        changes_made = False
+
+        # first turn off any active states we're tracking
+        for seen_check in self.healthcheck:
+            check = self.healthcheck[seen_check]
+            if check.active and seen_check not in current_checks:
+                check.active = False
+                changes_made = True
+
+        # now look for any additions to track
+        now = time.time()
+        for name, info in current_checks.items():
+            if name not in self.healthcheck:
+                # this healthcheck is new, so start tracking it
+                changes_made = True
+                self.healthcheck[name] = HealthCheckEvent(
+                    name=name,
+                    severity=info.get('severity'),
+                    first_seen=now,
+                    last_seen=now,
+                    count=1,
+                    active=True
+                )
+            else:
+                # seen it before, so update its metadata
+                check = self.healthcheck[name]
+                if check.active:
+                    # check has been registered as active already, so skip
+                    continue
+                else:
+                    check.last_seen = now
+                    check.count += 1
+                    check.active = True
+                    changes_made = True
+
+        if changes_made:
+            self.save()
+
+    def __str__(self) -> str:
+        """Return a human-readable representation of the healthcheck history.
+
+        Returns:
+            str: human readable representation of the healthcheck history
+        """
+        out = []
+
+        if len(self.healthcheck.keys()) == 0:
+            out.append("No healthchecks have been recorded")
+        else:
+            out.append(self.titles.format(
+                healthcheck_name="Healthcheck Name",
+                first_seen="First Seen (UTC)",
+                last_seen="Last seen (UTC)",
+                count="Count",
+                active="Active")
+            )
+            for k in sorted(self.healthcheck.keys()):
+                check = self.healthcheck[k]
+                out.append(self.titles.format(
+                    healthcheck_name=check.name,
+                    first_seen=time.strftime(self.date_format, time.gmtime(check.first_seen)),
+                    last_seen=time.strftime(self.date_format, time.gmtime(check.last_seen)),
+                    count=check.count,
+                    active="Yes" if check.active else "No")
+                )
+            out.extend([f"{len(self.healthcheck)} health check(s) listed", ""])
+
+        return "\n".join(out)
+
+    def as_dict(self) -> Dict[str, Any]:
+        """Return the history as a dictionary.
+
+        Returns:
+            Dict[str, Any]: dictionary indexed by the healthcheck name
+        """
+        return {name: self.healthcheck[name].as_dict() for name in self.healthcheck}
+
+    def as_json(self, pretty: bool = False) -> str:
+        """Return the healthcheck history as a JSON string.
+
+        Args:
+            pretty (bool, optional): whether to pretty-print the JSON output. Defaults to False.
+
+        Returns:
+            str: the healthcheck history in JSON format
+        """
+        if pretty:
+            return json.dumps(self.as_dict(), indent=2)
+        else:
+            return json.dumps(self.as_dict())
+
+    def as_yaml(self) -> str:
+        """Return the healthcheck history in YAML format.
+ + Returns: + str: YAML representation of the healthcheck history + """ + return yaml.safe_dump(self.as_dict(), explicit_start=True, default_flow_style=False) + class Metric(object): def __init__(self, mtype: str, name: str, desc: str, labels: Optional[Tuple[str, ...]] = None) -> None: @@ -335,6 +520,7 @@ class Module(MgrModule): global _global_instance _global_instance = self self.metrics_thread = MetricCollectionThread(_global_instance) + self.health_history = HealthHistory(self) def _setup_static_metrics(self) -> Dict[str, Metric]: metrics = {} @@ -436,6 +622,13 @@ class Module(MgrModule): ('pool_id',) ) + metrics['health_detail'] = Metric( + 'gauge', + 'health_detail', + 'healthcheck status by type (0=inactive, 1=active)', + HEALTHCHECK_DETAIL + ) + for flag in OSD_FLAGS: path = 'osd_flag_{}'.format(flag) metrics[path] = Metric( @@ -525,7 +718,7 @@ class Module(MgrModule): ) # Examine the health to see if any health checks triggered need to - # become a metric. + # become a specific metric with a value from the health detail active_healthchecks = health.get('checks', {}) active_names = active_healthchecks.keys() @@ -557,6 +750,15 @@ class Module(MgrModule): # health check is not active, so give it a default of 0 self.metrics[path].set(0) + self.health_history.check(health) + for name, info in self.health_history.healthcheck.items(): + v = 1 if info.active else 0 + self.metrics['health_detail'].set( + v, ( + name, + str(info.severity)) + ) + @profile_method() def get_pool_stats(self) -> None: # retrieve pool stats to provide per pool recovery metrics @@ -1424,6 +1626,37 @@ class Module(MgrModule): self.log.info('Stopping engine...') self.shutdown_event.set() + @CLIReadCommand('healthcheck history ls') + def _list_healthchecks(self, format: Format = Format.plain) -> HandleCommandResult: + """List all the healthchecks being tracked + + The format options are parsed in ceph_argparse, before they get evaluated here so + we can safely assume that what we have to process is valid. ceph_argparse will throw + a ValueError if the cast to our Format class fails. + + Args: + format (Format, optional): output format. Defaults to Format.plain. + + Returns: + HandleCommandResult: return code, stdout and stderr returned to the caller + """ + + out = "" + if format == Format.plain: + out = str(self.health_history) + elif format == Format.yaml: + out = self.health_history.as_yaml() + else: + out = self.health_history.as_json(format == Format.json_pretty) + + return HandleCommandResult(retval=0, stdout=out) + + @CLIWriteCommand('healthcheck history clear') + def _clear_healthchecks(self) -> HandleCommandResult: + """Clear the healthcheck history""" + self.health_history.reset() + return HandleCommandResult(retval=0, stdout="healthcheck history cleared") + class StandbyModule(MgrStandbyModule): def __init__(self, *args: Any, **kwargs: Any) -> None:
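The healthcheck history tracked above is reachable from any client through the new ``healthcheck history ls`` command. A minimal consumer sketch, assuming a working ``ceph`` CLI on the PATH and the field names produced by ``HealthCheckEvent.as_dict()`` (``name``, ``severity``, ``first_seen``, ``last_seen``, ``count``, ``active``), could look like this::

    import json
    import subprocess
    import time


    def active_healthchecks() -> None:
        """Print the healthchecks that the prometheus module reports as active."""
        raw = subprocess.check_output(
            ['ceph', 'healthcheck', 'history', 'ls', '--format', 'json'])
        history = json.loads(raw)  # dict keyed by healthcheck name
        for name, event in sorted(history.items()):
            if not event.get('active'):
                continue
            last_seen = time.strftime('%Y/%m/%d %H:%M:%S',
                                      time.gmtime(event.get('last_seen', 0)))
            print(f"{name}: severity={event.get('severity')} "
                  f"count={event.get('count')} last seen {last_seen} (UTC)")


    if __name__ == '__main__':
        active_healthchecks()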