test/osd/osd-fast-mark-down.sh: introduce large timeout

One second might not be enough for a loaded system to fully process
the fast mark down cycle, so introduce a loop that checks for the OSD
to be marked as down within 30 seconds; that limit can later be
extended (or shortened) as necessary.

Fixes: http://tracker.ceph.com/issues/17918
Signed-off-by: Piotr Dałek <git@predictor.org.pl>
This commit is contained in:
Piotr Dałek 2016-10-14 04:42:18 +02:00
parent 8b3bc583c0
commit a269bb7188

View File

@ -18,6 +18,7 @@
source $(dirname $0)/../detect-build-env-vars.sh
source $CEPH_ROOT/qa/workunits/ceph-helpers.sh
MAX_PROPAGATION_TIME=30
function run() {
local dir=$1
@ -62,7 +63,7 @@ function test_fast_kill() {
killid=0
previd=0
# kill random osd and see if 1 sec after, the osd count decreased.
# kill random osd and see if after max MAX_PROPAGATION_TIME, the osd count decreased.
for i in {1..2}; do
while [ $killid -eq $previd ]; do
killid=${pids[$RANDOM%${#pids[@]}]}
@ -70,20 +71,37 @@ function test_fast_kill() {
previd=$killid
kill -9 $killid
sleep 1
time_left=$MAX_PROPAGATION_TIME
down_osds=0
while [ $time_left -gt 0 ]; do
sleep 1
time_left=$[$time_left - 1];
grep -m 1 -c -F "ms_handle_refused" $dir/osd.*.log > /dev/null
if [ $? -ne 0 ]; then
continue
fi
down_osds=$(ceph osd tree | grep -c down)
if [ $down_osds -lt $i ]; then
# osds not marked down yet, try again in a second
continue
elif [ $down_osds -gt $i ]; then
echo Too many \($down_osds\) osds died!
teardown $dir
return 1
else
break
fi
done
down_osds=$(ceph osd tree | grep -c down)
if [ $down_osds -lt $i ]; then
echo Killed the OSD, yet it is not marked down
ceph osd tree
teardown $dir
return 1
elif [ $down_osds -gt $i ]; then
echo Too many \($down_osds\) osds died!
teardown $dir
teardown $dir
return 1
fi
done
pkill -SIGTERM rados
teardown $dir || return 1