ceph/qa/standalone/osd/osd-recovery-space.sh

#!/usr/bin/env bash
#
# Copyright (C) 2018 Red Hat <contact@redhat.com>
#
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Library Public License for more details.
#

source $CEPH_ROOT/qa/standalone/ceph-helpers.sh

function run() {
    local dir=$1
    shift

    export CEPH_MON="127.0.0.1:7221" # git grep '\<7221\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "
    CEPH_ARGS+="--osd_max_backfills=10 "
    export objects=600
    export poolprefix=test

    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        $func $dir || return 1
        teardown $dir || return 1
    done
}


function get_num_in_state() {
    local state=$1
    local expression
    expression+="select(contains(\"${state}\"))"
    ceph --format json pg dump pgs 2>/dev/null | \
        jq ".pg_stats | [.[] | .state | $expression] | length"
}


function wait_for_state() {
    local state=$1
    local cur_in_state
    local -a delays=($(get_timeout_delays $2 5))
    local -i loop=0

    flush_pg_stats || return 1
    while test $(get_num_pgs) == 0 ; do
	sleep 1
    done

    while true ; do
        cur_in_state=$(get_num_in_state ${state})
        test $cur_in_state -gt 0 && break
        if (( $loop >= ${#delays[*]} )) ; then
            ceph pg dump pgs
            return 1
        fi
        sleep ${delays[$loop]}
        loop+=1
    done
    return 0
}


function wait_for_recovery_toofull() {
    local timeout=$1
    wait_for_state recovery_toofull $timeout
}


# Create 1 pools with size 1
# set ful-ratio to 50%
# Write data 600 5K (3000K)
# Inject fake_statfs_for_testing to 3600K (83% full)
# Incresase the pool size to 2
# The pool shouldn't have room to recovery
function TEST_recovery_test_simple() {
    local dir=$1
    local pools=1
    local OSDS=2

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    export CEPH_ARGS

    for osd in $(seq 0 $(expr $OSDS - 1))
    do
      run_osd $dir $osd || return 1
    done

    ceph osd set-nearfull-ratio .40
    ceph osd set-backfillfull-ratio .45
    ceph osd set-full-ratio .50

    for p in $(seq 1 $pools)
    do
      create_pool "${poolprefix}$p" 1 1
      ceph osd pool set "${poolprefix}$p" size 1 --yes-i-really-mean-it
    done

    wait_for_clean || return 1

    dd if=/dev/urandom of=$dir/datafile bs=1024 count=5
    for o in $(seq 1 $objects)
    do
      rados -p "${poolprefix}$p" put obj$o $dir/datafile
    done

    for o in $(seq 0 $(expr $OSDS - 1))
    do
      ceph tell osd.$o injectargs '--fake_statfs_for_testing 3686400' || return 1
    done
    sleep 5

    ceph pg dump pgs

    for p in $(seq 1 $pools)
    do
      ceph osd pool set "${poolprefix}$p" size 2
    done

    # If this times out, we'll detected errors below
    wait_for_recovery_toofull 30

    ERRORS=0
    if [ "$(ceph pg dump pgs | grep +recovery_toofull | wc -l)" != "1" ];
    then
      echo "One pool should have been in recovery_toofull"
      ERRORS="$(expr $ERRORS + 1)"
    fi

    ceph pg dump pgs
    ceph status
    ceph status --format=json-pretty > $dir/stat.json

    eval SEV=$(jq '.health.checks.PG_RECOVERY_FULL.severity' $dir/stat.json)
    if [ "$SEV" != "HEALTH_ERR" ]; then
      echo "PG_RECOVERY_FULL severity $SEV not HEALTH_ERR"
      ERRORS="$(expr $ERRORS + 1)"
    fi
    eval MSG=$(jq '.health.checks.PG_RECOVERY_FULL.summary.message' $dir/stat.json)
    if [ "$MSG" != "Full OSDs blocking recovery: 1 pg recovery_toofull" ]; then
      echo "PG_RECOVERY_FULL message '$MSG' mismatched"
      ERRORS="$(expr $ERRORS + 1)"
    fi
    rm -f $dir/stat.json

    if [ $ERRORS != "0" ];
    then
      return 1
    fi

    for i in $(seq 1 $pools)
    do
      delete_pool "${poolprefix}$i"
    done
    kill_daemons $dir || return 1
}


main osd-recovery-space "$@"

# Local Variables:
# compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-space.sh"
# End:
mon: Improve health status for backfill_toofull and recovery_toofull Treat backfull_toofull as a warning condition because it can resolve itself. Includes test case for PG_BACKFILL_FULL Includes test case for recovery_toofull / PG_RECOVERY_FULL Fixes: https://tracker.ceph.com/issues/39555 Signed-off-by: David Zafman <dzafman@redhat.com> 2019-05-22 01:29:30 +00:00			`#!/usr/bin/env bash`
			`#`
			`# Copyright (C) 2018 Red Hat <contact@redhat.com>`
			`#`
			`# Author: David Zafman <dzafman@redhat.com>`
			`#`
			`# This program is free software; you can redistribute it and/or modify`
			`# it under the terms of the GNU Library Public License as published by`
			`# the Free Software Foundation; either version 2, or (at your option)`
			`# any later version.`
			`#`
			`# This program is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the`
			`# GNU Library Public License for more details.`
			`#`

			`source $CEPH_ROOT/qa/standalone/ceph-helpers.sh`

			`function run() {`
			`local dir=$1`
			`shift`

			`export CEPH_MON="127.0.0.1:7221" # git grep '\<7221\>' : there must be only one`
			`export CEPH_ARGS`
			`CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "`
			`CEPH_ARGS+="--mon-host=$CEPH_MON "`
			`CEPH_ARGS+="--osd_max_backfills=10 "`
			`export objects=600`
			`export poolprefix=test`

			`local funcs=${@:-$(set \| sed -n -e 's/^\(TEST_[0-9a-z_]\) ./\1/p')}`
			`for func in $funcs ; do`
			`setup $dir \|\| return 1`
			`$func $dir \|\| return 1`
			`teardown $dir \|\| return 1`
			`done`
			`}`


			`function get_num_in_state() {`
			`local state=$1`
			`local expression`
			`expression+="select(contains(\"${state}\"))"`
			`ceph --format json pg dump pgs 2>/dev/null \| \`
			`jq ".pg_stats \| [.[] \| .state \| $expression] \| length"`
			`}`


			`function wait_for_state() {`
			`local state=$1`
			`local cur_in_state`
			`local -a delays=($(get_timeout_delays $2 5))`
			`local -i loop=0`

			`flush_pg_stats \|\| return 1`
			`while test $(get_num_pgs) == 0 ; do`
			`sleep 1`
			`done`

			`while true ; do`
			`cur_in_state=$(get_num_in_state ${state})`
test: Fix wait_for_state() to wait for a PG to get into a state To avoid confusion fix function names in osd-backfill-space.sh for how they actually work. Fixes: https://tracker.ceph.com/issues/43592 Signed-off-by: David Zafman <dzafman@redhat.com> 2020-01-14 01:29:11 +00:00			`test $cur_in_state -gt 0 && break`
			`if (( $loop >= ${#delays[*]} )) ; then`
mon: Improve health status for backfill_toofull and recovery_toofull Treat backfull_toofull as a warning condition because it can resolve itself. Includes test case for PG_BACKFILL_FULL Includes test case for recovery_toofull / PG_RECOVERY_FULL Fixes: https://tracker.ceph.com/issues/39555 Signed-off-by: David Zafman <dzafman@redhat.com> 2019-05-22 01:29:30 +00:00			`ceph pg dump pgs`
			`return 1`
			`fi`
			`sleep ${delays[$loop]}`
			`loop+=1`
			`done`
			`return 0`
			`}`


			`function wait_for_recovery_toofull() {`
			`local timeout=$1`
			`wait_for_state recovery_toofull $timeout`
			`}`


			`# Create 1 pools with size 1`
			`# set ful-ratio to 50%`
			`# Write data 600 5K (3000K)`
			`# Inject fake_statfs_for_testing to 3600K (83% full)`
			`# Incresase the pool size to 2`
			`# The pool shouldn't have room to recovery`
			`function TEST_recovery_test_simple() {`
			`local dir=$1`
			`local pools=1`
			`local OSDS=2`

			`run_mon $dir a \|\| return 1`
			`run_mgr $dir x \|\| return 1`
			`export CEPH_ARGS`

			`for osd in $(seq 0 $(expr $OSDS - 1))`
			`do`
			`run_osd $dir $osd \|\| return 1`
			`done`

			`ceph osd set-nearfull-ratio .40`
			`ceph osd set-backfillfull-ratio .45`
			`ceph osd set-full-ratio .50`

			`for p in $(seq 1 $pools)`
			`do`
			`create_pool "${poolprefix}$p" 1 1`
mon/OSDMonitor: add flag `--yes-i-really-mean-it` for setting pool size 1 Adds option `mon_allow_pool_size_one` which will be disabled by default to ensure pools are not configured without replicas. If the user still wants to use pool size 1, they will have to change the value of `mon_allow_pool_size_one` to true and then have to pass flag `--yes-i-really-mean-it` to cli command: Example: `ceph osd pool test set size 1 --yes-i-really-mean-it` Fixes: https://tracker.ceph.com/issues/44025 Signed-off-by: Deepika Upadhyay <dupadhya@redhat.com> 2020-02-12 14:38:29 +00:00			`ceph osd pool set "${poolprefix}$p" size 1 --yes-i-really-mean-it`
mon: Improve health status for backfill_toofull and recovery_toofull Treat backfull_toofull as a warning condition because it can resolve itself. Includes test case for PG_BACKFILL_FULL Includes test case for recovery_toofull / PG_RECOVERY_FULL Fixes: https://tracker.ceph.com/issues/39555 Signed-off-by: David Zafman <dzafman@redhat.com> 2019-05-22 01:29:30 +00:00			`done`

			`wait_for_clean \|\| return 1`

			`dd if=/dev/urandom of=$dir/datafile bs=1024 count=5`
			`for o in $(seq 1 $objects)`
			`do`
			`rados -p "${poolprefix}$p" put obj$o $dir/datafile`
			`done`

			`for o in $(seq 0 $(expr $OSDS - 1))`
			`do`
			`ceph tell osd.$o injectargs '--fake_statfs_for_testing 3686400' \|\| return 1`
			`done`
			`sleep 5`

			`ceph pg dump pgs`

			`for p in $(seq 1 $pools)`
			`do`
			`ceph osd pool set "${poolprefix}$p" size 2`
			`done`

			`# If this times out, we'll detected errors below`
			`wait_for_recovery_toofull 30`

			`ERRORS=0`
			`if [ "$(ceph pg dump pgs \| grep +recovery_toofull \| wc -l)" != "1" ];`
			`then`
			`echo "One pool should have been in recovery_toofull"`
			`ERRORS="$(expr $ERRORS + 1)"`
			`fi`

			`ceph pg dump pgs`
			`ceph status`
			`ceph status --format=json-pretty > $dir/stat.json`

			`eval SEV=$(jq '.health.checks.PG_RECOVERY_FULL.severity' $dir/stat.json)`
			`if [ "$SEV" != "HEALTH_ERR" ]; then`
			`echo "PG_RECOVERY_FULL severity $SEV not HEALTH_ERR"`
			`ERRORS="$(expr $ERRORS + 1)"`
			`fi`
			`eval MSG=$(jq '.health.checks.PG_RECOVERY_FULL.summary.message' $dir/stat.json)`
			`if [ "$MSG" != "Full OSDs blocking recovery: 1 pg recovery_toofull" ]; then`
			`echo "PG_RECOVERY_FULL message '$MSG' mismatched"`
			`ERRORS="$(expr $ERRORS + 1)"`
			`fi`
			`rm -f $dir/stat.json`

			`if [ $ERRORS != "0" ];`
			`then`
			`return 1`
			`fi`

			`for i in $(seq 1 $pools)`
			`do`
			`delete_pool "${poolprefix}$i"`
			`done`
			`kill_daemons $dir \|\| return 1`
			`}`


			`main osd-recovery-space "$@"`

			`# Local Variables:`
			`# compile-command: "make -j4 && ../qa/run-standalone.sh osd-recovery-space.sh"`
			`# End:`