ceph/qa/standalone/scrub/osd-scrub-dump.sh
Sridhar Seshasayee 33d2a2c93b qa/standalone/scrub: Force a subset of scrub tests to use "wpq" scheduler
The following tests in the test files mentioned below use the
"osd_scrub_sleep" option to introduce delays during scrubbing, which
helps determine scrubbing states, validate reservations during
scrubbing, and so on. This works when using the "wpq" scheduler.

But when the "mclock_scheduler" is enabled, "osd_scrub_sleep" is
disabled and overridden to 0. This is done to delegate the scheduling of
the background scrubs to the "mclock_scheduler" based on the configured
QoS parameters. As a result, the checks that verify scrub states,
reservations etc. fail, because scrubs complete very quickly and the
window in which to check them is very short. This affects the small
subset of scrub tests listed below:

1. osd-scrub-dump.sh -> TEST_recover_unexpected()
2. osd-scrub-repair.sh -> TEST_auto_repair_bluestore_tag()
3. osd-scrub-test.sh -> TEST_scrub_abort(), TEST_deep_scrub_abort()

For the above tests only, the "osd_op_queue" config option is set to
"wpq" until there's a reliable way to query scrub states with
"--osd-scrub-sleep" set to 0.
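
For reference, a rough sketch (not part of this change) of one way to check
which scheduler and scrub sleep an OSD is actually using in a standalone
cluster, via the get_asok_path helper from ceph-helpers.sh:

    CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) config get osd_op_queue
    CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) config get osd_scrub_sleep

If the mclock override described above is in effect, the second query would
be expected to report 0 rather than the configured value.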

Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
2021-07-30 18:16:00 +05:30


#!/usr/bin/env bash
#
# Copyright (C) 2019 Red Hat <contact@redhat.com>
#
# Author: David Zafman <dzafman@redhat.com>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU Library Public License as published by
# the Free Software Foundation; either version 2, or (at your option)
# any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Library Public License for more details.
#
source $CEPH_ROOT/qa/standalone/ceph-helpers.sh
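# Test-wide settings: the per-OSD cap on concurrent scrub reservations,
# a scrub sleep that slows scrubbing down enough for the checks below,
# and the replica count used for the test pools.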
MAX_SCRUBS=4
SCRUB_SLEEP=2
POOL_SIZE=3
function run() {
    local dir=$1
    shift
    local CHUNK_MAX=5

    export CEPH_MON="127.0.0.1:7184" # git grep '\<7184\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "
    CEPH_ARGS+="--osd_max_scrubs=$MAX_SCRUBS "
    CEPH_ARGS+="--osd_scrub_chunk_max=$CHUNK_MAX "
    CEPH_ARGS+="--osd_scrub_sleep=$SCRUB_SLEEP "
    CEPH_ARGS+="--osd_pool_default_size=$POOL_SIZE "
    # Set scheduler to "wpq" until there's a reliable way to query scrub states
    # with "--osd-scrub-sleep" set to 0. The "mclock_scheduler" overrides the
    # scrub sleep to 0 and as a result the checks in the test fail.
    CEPH_ARGS+="--osd_op_queue=wpq "

    export -n CEPH_CLI_TEST_DUP_COMMAND
    local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')}
    for func in $funcs ; do
        setup $dir || return 1
        $func $dir || return 1
        teardown $dir || return 1
    done
}
function TEST_recover_unexpected() {
    local dir=$1
    shift
    local OSDS=6
    local PGS=16
    local POOLS=3
    local OBJS=1000

    run_mon $dir a || return 1
    run_mgr $dir x || return 1
    for o in $(seq 0 $(expr $OSDS - 1))
    do
        run_osd $dir $o
    done

    for i in $(seq 1 $POOLS)
    do
        create_pool test$i $PGS $PGS
    done
    wait_for_clean || return 1

    dd if=/dev/urandom of=datafile bs=4k count=2
    for i in $(seq 1 $POOLS)
    do
        for j in $(seq 1 $OBJS)
        do
            rados -p test$i put obj$j datafile
        done
    done
    rm datafile
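
    # Set the noscrub/nodeep-scrub flags so the per-PG scrub requests below
    # are only queued; noscrub is cleared further down so the scrubs start
    # together and contend for scrub reservations.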
    ceph osd set noscrub
    ceph osd set nodeep-scrub

    for qpg in $(ceph pg dump pgs --format=json-pretty | jq '.pg_stats[].pgid')
    do
        primary=$(ceph pg dump pgs --format=json | jq ".pg_stats[] | select(.pgid == $qpg) | .acting_primary")
        eval pg=$qpg # strip quotes around qpg
        ceph tell $pg scrub
    done

    ceph pg dump pgs
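
    # Sanity check: the osd_max_scrubs value reported by osd.0's scrub
    # reservation dump should match the value configured for this test.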
    max=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.0) dump_scrub_reservations | jq '.osd_max_scrubs')
    if [ $max != $MAX_SCRUBS ]; then
        echo "ERROR: Incorrect osd_max_scrubs from dump_scrub_reservations"
        return 1
    fi

    ceph osd unset noscrub

    ok=false
    for i in $(seq 0 300)
    do
        ceph pg dump pgs
        if ceph pg dump pgs | grep scrubbing; then
            ok=true
            break
        fi
        sleep 1
    done
    if test $ok = "false"; then
        echo "ERROR: Test set-up failed no scrubbing"
        return 1
    fi
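
    # Repeatedly sample each OSD's scrub reservations: fail if any OSD ever
    # holds more than osd_max_scrubs reservations (local + remote), and stop
    # once a few sampling passes have seen zero reservations.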
    local total=0
    local zerocount=0
    local maxzerocount=3
    while(true)
    do
        pass=0
        for o in $(seq 0 $(expr $OSDS - 1))
        do
            CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations
            scrubs=$(CEPH_ARGS='' ceph daemon $(get_asok_path osd.$o) dump_scrub_reservations | jq '.scrubs_local + .scrubs_remote')
            if [ $scrubs -gt $MAX_SCRUBS ]; then
                echo "ERROR: More than $MAX_SCRUBS currently reserved"
                return 1
            fi
            pass=$(expr $pass + $scrubs)
        done
        if [ $pass = "0" ]; then
            zerocount=$(expr $zerocount + 1)
        fi
        if [ $zerocount -gt $maxzerocount ]; then
            break
        fi
        total=$(expr $total + $pass)
        sleep $(expr $SCRUB_SLEEP \* 2)
    done
    # Check that there are no more scrubs
    for i in $(seq 0 5)
    do
        if ceph pg dump pgs | grep scrubbing; then
            echo "ERROR: Extra scrubs after test completion...not expected"
            return 1
        fi
        sleep $SCRUB_SLEEP
    done

    echo $total total reservations seen

    # Somewhat arbitrary number based on PGS * POOLS * POOL_SIZE as the number of total scrub
    # reservations that must occur. However, the loop above might see the same reservation more
    # than once.
    actual_reservations=$(expr $PGS \* $POOLS \* $POOL_SIZE)
    if [ $total -lt $actual_reservations ]; then
        echo "ERROR: Unexpectedly low amount of scrub reservations seen during test"
        return 1
    fi

    return 0
}
main osd-scrub-dump "$@"
# Local Variables:
# compile-command: "cd build ; make check && \
# ../qa/run-standalone.sh osd-scrub-dump.sh"
# End: