diff --git a/doc/rados/operations/health-checks.rst b/doc/rados/operations/health-checks.rst
index cf10cd589fa..88ca193d8ad 100644
--- a/doc/rados/operations/health-checks.rst
+++ b/doc/rados/operations/health-checks.rst
@@ -489,16 +489,27 @@ The state of specific problematic PGs can be queried with::
 
   ceph tell <pgid> query
 
-PG_DEGRADED_FULL
+PG_RECOVERY_FULL
 ________________
 
 Data redundancy may be reduced or at risk for some data due to a lack
 of free space in the cluster.  Specifically, one or more PGs has the
-*backfill_toofull* or *recovery_toofull* flag set, meaning that the
+*recovery_toofull* flag set, meaning that the
+cluster is unable to migrate or recover data because one or more OSDs
+is above the *full* threshold.
+
+See the discussion for *OSD_FULL* above for steps to resolve this condition.
+
+PG_BACKFILL_FULL
+________________
+
+Data redundancy may be reduced or at risk for some data due to a lack
+of free space in the cluster.  Specifically, one or more PGs has the
+*backfill_toofull* flag set, meaning that the
 cluster is unable to migrate or recover data because one or more OSDs
 is above the *backfillfull* threshold.
 
-See the discussion for *OSD_BACKFILLFULL* or *OSD_FULL* above for
+See the discussion for *OSD_BACKFILLFULL* above for
 steps to resolve this condition.
 
 PG_DAMAGED
diff --git a/doc/rados/operations/monitoring-osd-pg.rst b/doc/rados/operations/monitoring-osd-pg.rst
index c490e1c3eb0..630d268b458 100644
--- a/doc/rados/operations/monitoring-osd-pg.rst
+++ b/doc/rados/operations/monitoring-osd-pg.rst
@@ -385,6 +385,11 @@ and, ``backfill_toofull`` indicates that a backfill operation was requested,
 but couldn't be completed due to insufficient storage capacity. When a
 placement group cannot be backfilled, it may be considered ``incomplete``.
 
+The ``backfill_toofull`` state may be transient. It is possible that as PGs
+are moved around, space may become available. The ``backfill_toofull`` state
+is similar to ``backfill_wait`` in that backfill can proceed as soon as
+conditions change.
+
 Ceph provides a number of settings to manage the load spike associated with
 reassigning placement groups to an OSD (especially a new OSD). By default,
 ``osd_max_backfills`` sets the maximum number of concurrent backfills to and from
diff --git a/doc/rados/operations/pg-states.rst b/doc/rados/operations/pg-states.rst
index 651d924d2d5..c38a683f0c5 100644
--- a/doc/rados/operations/pg-states.rst
+++ b/doc/rados/operations/pg-states.rst
@@ -69,8 +69,8 @@ map is ``active + clean``.
   The placement group is waiting in line to start backfill.
 
 *backfill_toofull*
-  A backfill operation is waiting because the destination OSD is over its
-  full ratio.
+  A backfill operation is waiting because the destination OSD is over
+  the backfillfull ratio.
 
 *backfill_unfound*
   Backfill stopped due to unfound objects.
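
As an operator-side illustration of the two health codes documented above, the
following shows one way to list the PGs that are blocked on space and to review
OSD fullness. It is only a sketch: the ``state`` field and the JSON output of
``pg dump pgs`` are the same ones the tests in this change rely on, while the
``pgid`` field name and the exact jq filter are assumptions here::

    # PGs whose state includes backfill_toofull or recovery_toofull
    ceph --format json pg dump pgs 2>/dev/null | \
        jq -r '.pg_stats[] | select(.state | contains("toofull")) | "\(.pgid) \(.state)"'

    # OSD utilization versus the nearfull/backfillfull/full ratios
    ceph osd df
    ceph health detail
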
diff --git a/qa/standalone/osd/osd-backfill-space.sh b/qa/standalone/osd/osd-backfill-space.sh
index 936043250ed..636601ec7bd 100755
--- a/qa/standalone/osd/osd-backfill-space.sh
+++ b/qa/standalone/osd/osd-backfill-space.sh
@@ -247,6 +247,21 @@ function TEST_backfill_test_multi() {
     fi
 
     ceph pg dump pgs
+    ceph status
+
+    ceph status --format=json-pretty > $dir/stat.json
+
+    eval SEV=$(jq '.health.checks.PG_BACKFILL_FULL.severity' $dir/stat.json)
+    if [ "$SEV" != "HEALTH_WARN" ]; then
+      echo "PG_BACKFILL_FULL severity $SEV not HEALTH_WARN"
+      ERRORS="$(expr $ERRORS + 1)"
+    fi
+    eval MSG=$(jq '.health.checks.PG_BACKFILL_FULL.summary.message' $dir/stat.json)
+    if [ "$MSG" != "Low space hindering backfill (add storage if this doesn't resolve itself): 4 pgs backfill_toofull" ]; then
+      echo "PG_BACKFILL_FULL message '$MSG' mismatched"
+      ERRORS="$(expr $ERRORS + 1)"
+    fi
+    rm -f $dir/stat.json
 
     if [ $ERRORS != "0" ];
     then
diff --git a/qa/standalone/osd/osd-recovery-space.sh b/qa/standalone/osd/osd-recovery-space.sh
new file mode 100755
index 00000000000..d12494a90ca
--- /dev/null
+++ b/qa/standalone/osd/osd-recovery-space.sh
@@ -0,0 +1,179 @@
+#!/usr/bin/env bash
+#
+# Copyright (C) 2018 Red Hat
+#
+# Author: David Zafman
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU Library Public License as published by
+# the Free Software Foundation; either version 2, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Library Public License for more details.
+#
+
+source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
+
+function run() {
+    local dir=$1
+    shift
+
+    export CEPH_MON="127.0.0.1:7221" # git grep '\<7221\>' : there must be only one
+    export CEPH_ARGS
+    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
+    CEPH_ARGS+="--mon-host=$CEPH_MON "
+    CEPH_ARGS+="--osd_max_backfills=10 "
+    export objects=600
+    export poolprefix=test
+
+    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
+    for func in $funcs ; do
+        setup $dir || return 1
+        $func $dir || return 1
+        teardown $dir || return 1
+    done
+}
+
+
+function get_num_in_state() {
+    local state=$1
+    local expression
+    expression+="select(contains(\"${state}\"))"
+    ceph --format json pg dump pgs 2>/dev/null | \
+        jq ".pg_stats | [.[] | .state | $expression] | length"
+}
+
+
+function wait_for_state() {
+    local state=$1
+    local num_in_state=-1
+    local cur_in_state
+    local -a delays=($(get_timeout_delays $2 5))
+    local -i loop=0
+
+    flush_pg_stats || return 1
+    while test $(get_num_pgs) == 0 ; do
+        sleep 1
+    done
+
+    while true ; do
+        cur_in_state=$(get_num_in_state ${state})
+        test $cur_in_state = "0" || break
+        if test $cur_in_state != $num_in_state ; then
+            loop=0
+            num_in_state=$cur_in_state
+        elif (( $loop >= ${#delays[*]} )) ; then
+            ceph pg dump pgs
+            return 1
+        fi
+        sleep ${delays[$loop]}
+        loop+=1
+    done
+    return 0
+}
+
+
+function wait_for_recovery_toofull() {
+    local timeout=$1
+    wait_for_state recovery_toofull $timeout
+}
+
+
+# Create 1 pool with size 1
+# Set full-ratio to 50%
+# Write 600 objects of 5K each (3000K)
+# Inject fake_statfs_for_testing to 3600K (83% full)
+# Increase the pool size to 2
+# The pool shouldn't have room to recover
+function TEST_recovery_test_simple() {
+    local dir=$1
+    local pools=1
+    local OSDS=2
+
+    run_mon $dir a || return 1
+    run_mgr $dir x || return 1
+    export CEPH_ARGS
+
+    for osd in $(seq 0 $(expr $OSDS - 1))
+    do
+      run_osd $dir $osd || return 1
+    done
+
+    ceph osd set-nearfull-ratio .40
+    ceph osd set-backfillfull-ratio .45
+    ceph osd set-full-ratio .50
+
+    for p in $(seq 1 $pools)
+    do
+      create_pool "${poolprefix}$p" 1 1
+      ceph osd pool set "${poolprefix}$p" size 1
+    done
+
+    wait_for_clean || return 1
+
+    dd if=/dev/urandom of=$dir/datafile bs=1024 count=5
+    for o in $(seq 1 $objects)
+    do
+      rados -p "${poolprefix}$p" put obj$o $dir/datafile
+    done
+
+    for o in $(seq 0 $(expr $OSDS - 1))
+    do
+      ceph tell osd.$o injectargs '--fake_statfs_for_testing 3686400' || return 1
+    done
+    sleep 5
+
+    ceph pg dump pgs
+
+    for p in $(seq 1 $pools)
+    do
+      ceph osd pool set "${poolprefix}$p" size 2
+    done
+
+    # If this times out, we'll detect errors below
+    wait_for_recovery_toofull 30
+
+    ERRORS=0
+    if [ "$(ceph pg dump pgs | grep +recovery_toofull | wc -l)" != "1" ];
+    then
+      echo "One pool should have been in recovery_toofull"
+      ERRORS="$(expr $ERRORS + 1)"
+    fi
+
+    ceph pg dump pgs
+    ceph status
+    ceph status --format=json-pretty > $dir/stat.json
+
+    eval SEV=$(jq '.health.checks.PG_RECOVERY_FULL.severity' $dir/stat.json)
+    if [ "$SEV" != "HEALTH_ERR" ]; then
+      echo "PG_RECOVERY_FULL severity $SEV not HEALTH_ERR"
+      ERRORS="$(expr $ERRORS + 1)"
+    fi
+    eval MSG=$(jq '.health.checks.PG_RECOVERY_FULL.summary.message' $dir/stat.json)
+    if [ "$MSG" != "Full OSDs blocking recovery: 1 pg recovery_toofull" ]; then
+      echo "PG_RECOVERY_FULL message '$MSG' mismatched"
+      ERRORS="$(expr $ERRORS + 1)"
+    fi
+    rm -f $dir/stat.json
+
+    if [ $ERRORS != "0" ];
+    then
+      return 1
+    fi
+
+    for i in $(seq 1 $pools)
+    do
+      delete_pool "${poolprefix}$i"
+    done
+    kill_daemons $dir || return 1
+}
+
+
+main osd-recovery-space "$@"
+
+# Local Variables:
+# compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-space.sh"
+# End:
diff --git a/src/mon/PGMap.cc b/src/mon/PGMap.cc
index d2af015378e..11bf4fda886 100644
--- a/src/mon/PGMap.cc
+++ b/src/mon/PGMap.cc
@@ -2336,10 +2336,11 @@ void PGMap::get_health_checks(
   typedef enum pg_consequence_t {
     UNAVAILABLE = 1,   // Client IO to the pool may block
     DEGRADED = 2,      // Fewer than the requested number of replicas are present
-    DEGRADED_FULL = 3, // Fewer than the request number of replicas may be present
-                       // and insufficiet resources are present to fix this
-    DAMAGED = 4        // The data may be missing or inconsistent on disk and
+    BACKFILL_FULL = 3, // Backfill is blocked for space considerations
+                       // This may or may not be a deadlock condition.
+    DAMAGED = 4,       // The data may be missing or inconsistent on disk and
                        // requires repair
+    RECOVERY_FULL = 5  // Recovery is blocked because OSDs are full
   } pg_consequence_t;
 
   // For a given PG state, how should it be reported at the pool level?
@@ -2382,8 +2383,8 @@ void PGMap::get_health_checks(
     { PG_STATE_SNAPTRIM_ERROR, {DAMAGED, {}} },
     { PG_STATE_RECOVERY_UNFOUND, {DAMAGED, {}} },
     { PG_STATE_BACKFILL_UNFOUND, {DAMAGED, {}} },
-    { PG_STATE_BACKFILL_TOOFULL, {DEGRADED_FULL, {}} },
-    { PG_STATE_RECOVERY_TOOFULL, {DEGRADED_FULL, {}} },
+    { PG_STATE_BACKFILL_TOOFULL, {BACKFILL_FULL, {}} },
+    { PG_STATE_RECOVERY_TOOFULL, {RECOVERY_FULL, {}} },
     { PG_STATE_DEGRADED, {DEGRADED, {}} },
     { PG_STATE_DOWN, {UNAVAILABLE, {}} },
     // Delayed (wait until stuck) reports
@@ -2527,16 +2528,21 @@ void PGMap::get_health_checks(
       summary = "Degraded data redundancy: ";
       sev = HEALTH_WARN;
       break;
-    case DEGRADED_FULL:
-      health_code = "PG_DEGRADED_FULL";
-      summary = "Degraded data redundancy (low space): ";
-      sev = HEALTH_ERR;
+    case BACKFILL_FULL:
+      health_code = "PG_BACKFILL_FULL";
+      summary = "Low space hindering backfill (add storage if this doesn't resolve itself): ";
+      sev = HEALTH_WARN;
       break;
     case DAMAGED:
       health_code = "PG_DAMAGED";
       summary = "Possible data damage: ";
       sev = HEALTH_ERR;
       break;
+    case RECOVERY_FULL:
+      health_code = "PG_RECOVERY_FULL";
+      summary = "Full OSDs blocking recovery: ";
+      sev = HEALTH_ERR;
+      break;
     default:
       ceph_abort();
   }
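
For reference, the health codes wired up above can be checked by hand in the
same way the standalone tests do. This is only a sketch that mirrors the jq
paths asserted in osd-backfill-space.sh and osd-recovery-space.sh; the
``// empty`` fallback merely silences output when a check is not currently
raised::

    ceph status --format=json-pretty > /tmp/stat.json

    # HEALTH_WARN while backfill is blocked by the backfillfull threshold
    jq -r '.health.checks.PG_BACKFILL_FULL.severity // empty' /tmp/stat.json
    jq -r '.health.checks.PG_BACKFILL_FULL.summary.message // empty' /tmp/stat.json

    # HEALTH_ERR while recovery is blocked by full OSDs
    jq -r '.health.checks.PG_RECOVERY_FULL.severity // empty' /tmp/stat.json
    jq -r '.health.checks.PG_RECOVERY_FULL.summary.message // empty' /tmp/stat.json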