mirror of
https://github.com/ceph/ceph
synced 2025-01-01 00:22:25 +00:00
e6e10246c6
If the cluster dies during the rados bench, the maximum running time is no longer honored and all emitted aios stay pending. rados bench never quits, and the global testing timeout (3600 sec: 1 hour) has to be reached to get a failure. This situation is dramatic for a background test or a CI run, as it locks the whole job for too long waiting for an event that will never occur. The ideal solution would be for 'rados bench' to report a failure once the timeout is reached while aios are pending. A possible workaround here is to use the system command 'timeout' when calling rados bench and fail if rados didn't complete on time. To avoid side effects, this patch doubles the rados timeout. If rados hasn't completed after twice the expected time, it has to fail to avoid locking the whole testing job. Please find below the way it worked on a real test case. We can see no IO after t>2, but despite timeout=4 the bench continues. Thanks to this patch, the bench is stopped at t=8 and returns 1. 5: /home/erwan/ceph/src/test/smoke.sh:55: TEST_multimon: timeout 8 rados -p foo bench 4 write -b 4096 --no-cleanup 5: hints = 1 5: Maintaining 16 concurrent writes of 4096 bytes to objects of size 4096 for up to 4 seconds or 0 objects 5: Object prefix: benchmark_data_mr-meeseeks_184960 5: sec Cur ops started finished avg MB/s cur MB/s last lat(s) avg lat(s) 5: 0 0 0 0 0 0 - 0 5: 1 16 1144 1128 4.40538 4.40625 0.00412965 0.0141116 5: 2 16 2147 2131 4.16134 3.91797 0.00985654 0.0109079 5: 3 16 2147 2131 2.77424 0 - 0.0109079 5: 4 16 2147 2131 2.0807 0 - 0.0109079 5: 5 16 2147 2131 1.66456 0 - 0.0109079 5: 6 16 2147 2131 1.38714 0 - 0.0109079 5: 7 16 2147 2131 1.18897 0 - 0.0109079 5: /home/erwan/ceph/src/test/smoke.sh:55: TEST_multimon: return 1 5: /home/erwan/ceph/src/test/smoke.sh:18: run: return 1 Signed-off-by: Erwan Velu <erwan@redhat.com>
117 lines
3.2 KiB
Bash
Executable File
117 lines
3.2 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
#
|
|
# Copyright (C) 2016 Piotr Dałek <git@predictor.org.pl>
|
|
# Copyright (C) 2014, 2015 Red Hat <contact@redhat.com>
|
|
#
|
|
# Author: Piotr Dałek <git@predictor.org.pl>
|
|
#
|
|
# This program is free software; you can redistribute it and/or modify
|
|
# it under the terms of the GNU Library Public License as published by
|
|
# the Free Software Foundation; either version 2, or (at your option)
|
|
# any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU Library Public License for more details.
|
|
#
|
|
|
|
# Pull in the shared standalone-test helpers (not visible in this file;
# presumably the origin of setup, run_mon, run_osd, teardown, main, ...).
# The path is quoted so that a $CEPH_ROOT containing whitespace still works.
source "$CEPH_ROOT/qa/standalone/ceph-helpers.sh"

# Number of one-second polling rounds test_fast_kill waits for a killed OSD
# to be reported down before declaring a failure.
MAX_PROPAGATION_TIME=30
|
|
|
|
#
# Test driver: first verifies that, with
# osd-fast-fail-on-connection-refused disabled, killed OSDs are NOT marked
# down within the time budget (test_fast_kill must fail); then verifies
# that, with the option enabled, they ARE marked down, once with the simple
# messenger and once with the async messenger.
#
# Arguments: $1 - test directory (any further arguments are shifted away
#                 and not used)
# Globals:   CEPH_MON, CEPH_ARGS (exported and modified), OLD_ARGS (written)
# Returns:   0 if all three scenarios behave as expected, 1 otherwise
#
function run() {
    local dir=$1
    shift
    rm -f "$dir"/*.pid
    export CEPH_MON="127.0.0.1:7126" # git grep '\<7126\>' : there must be only one
    export CEPH_ARGS
    CEPH_ARGS+="--fsid=$(uuidgen) --auth-supported=none "
    CEPH_ARGS+="--mon-host=$CEPH_MON "

    OLD_ARGS=$CEPH_ARGS
    CEPH_ARGS+="--osd-fast-fail-on-connection-refused=false "
    echo "Ensuring old behavior is there..."
    # With fast-fail disabled test_fast_kill is expected to FAIL. A plain
    # `if` is required here: a `return` placed inside a ( ... ) subshell only
    # terminates the subshell and would let run() carry on regardless.
    if test_fast_kill "$dir"; then
        echo "OSDs died too early! Old behavior doesn't work."
        return 1
    fi

    # Drop the "=false" setting again and enable fast-fail for the
    # remaining scenarios.
    CEPH_ARGS=$OLD_ARGS"--osd-fast-fail-on-connection-refused=true "
    OLD_ARGS=$CEPH_ARGS

    CEPH_ARGS+="--ms_type=simple"
    echo "Testing simple msgr..."
    test_fast_kill "$dir" || return 1

    CEPH_ARGS=$OLD_ARGS"--ms_type=async"
    echo "Testing async msgr..."
    test_fast_kill "$dir" || return 1

    return 0
}
|
|
|
|
#
# Bring up a cluster with one mon, one mgr and three OSDs, then SIGKILL two
# different OSDs one after the other. After each kill, poll once per second
# (at most MAX_PROPAGATION_TIME rounds) until an OSD log mentions
# "ms_handle_refused" and `ceph osd tree` reports exactly as many "down"
# entries as OSDs killed so far.
#
# Arguments: $1 - test directory (falls back to the caller's $dir if omitted)
# Globals:   MAX_PROPAGATION_TIME (read)
# Returns:   0 on success; 1 if cluster setup or the bench fails, if more
#            OSDs than expected are down, or if a killed OSD is not
#            reported down in time
#
function test_fast_kill() {
    local dir=${1:-$dir}
    local -a pids=()
    local oi i killid previd time_left down_osds

    # create cluster with 3 osds
    setup "$dir" || return 1
    run_mon "$dir" a --osd_pool_default_size=3 || return 1
    run_mgr "$dir" x || return 1
    for oi in {0..2}; do
        run_osd "$dir" "$oi" || return 1
        # remember each daemon's pid; bail out if the pid file is unreadable
        pids[$oi]=$(cat "$dir/osd.$oi.pid") || return 1
    done

    create_rbd_pool || return 1

    # write some objects to ensure connectivity between the osds; `timeout`
    # is set to twice the bench duration so a stuck bench cannot hang the test
    timeout 20 rados -p rbd bench 10 write -b 4096 --max-objects 128 --no-cleanup || return 1
    sleep 1

    killid=0
    previd=0

    # kill random osd and see if after max MAX_PROPAGATION_TIME, the osd count decreased.
    for i in {1..2}; do
        # draw until the pid differs from the one killed in the previous round
        while [[ "$killid" -eq "$previd" ]]; do
            killid=${pids[RANDOM % ${#pids[@]}]}
        done
        previd=$killid

        # SIGKILL on purpose: the daemon must vanish without a clean shutdown
        kill -9 "$killid"
        time_left=$MAX_PROPAGATION_TIME
        down_osds=0

        while (( time_left > 0 )); do
            sleep 1
            time_left=$((time_left - 1))

            # nothing to check until some osd logged a refused connection
            grep -q -F "ms_handle_refused" "$dir"/osd.*.log || continue

            down_osds=$(ceph osd tree | grep -c down)
            if (( down_osds < i )); then
                # osds not marked down yet, try again in a second
                continue
            elif (( down_osds > i )); then
                echo "Too many ($down_osds) osds died!"
                return 1
            else
                break
            fi
        done

        # time budget exhausted without seeing the expected number of down osds
        if (( down_osds < i )); then
            echo "Killed the OSD, yet it is not marked down"
            ceph osd tree
            return 1
        fi
    done

    # terminate any rados process that is still around; status is ignored
    pkill -SIGTERM rados
    teardown "$dir" || return 1
}
|
|
|
|
# Hand over to `main`, which is not defined in this file (presumably it comes
# from the sourced ceph-helpers.sh -- confirm there), passing the test name
# and the script's command-line arguments.
main osd-fast-mark-down "$@"
|
|
|
|
# Local Variables:
|
|
# compile-command: "cd ../.. ; make -j4 && test/osd/osd-fast-mark-down.sh"
|
|
# End:
|