tests: ceph-helpers kill_daemons fails when kill fails

Instead of silently leaving the daemons running, it returns failure so
the caller can decide what to do with this situation. The timeout is
also extended to minutes instead of seconds to gracefully handle the
rare situations when a machine is extra slow for some reason.

http://tracker.ceph.com/issues/11398 Fixes: #11398

Signed-off-by: Loic Dachary <ldachary@redhat.com>
This commit is contained in:
Loic Dachary 2015-05-06 20:14:37 +02:00
parent 4a68b737da
commit 0e26e9f72b

View File

@ -149,7 +149,7 @@ function test_teardown() {
##
# Kill all daemons for which a .pid file exists in **dir**. Each
# daemon is sent a **signal** and kill_daemons waits for it to exit
# during a few seconds. By default all daemons are killed. If a
# during a few minutes. By default all daemons are killed. If a
# **name_prefix** is provided, only the daemons matching it are
# killed.
#
@ -158,28 +158,63 @@ function test_teardown() {
# Send TERM to all osds : kill_daemons $dir TERM osd
#
# If a daemon is sent the TERM signal and does not terminate
# within a few seconds, it will still be running even after
# kill_daemons returns.
# within a few minutes, it will still be running even after
# kill_daemons returns.
#
# If all daemons are kill successfully the function returns 0
# if at least one daemon remains, this is treated as an
# error and the function return 1.
#
# After the daemon is sent **signal**, its actual termination
# will be verified by sending it signal 0. If the daemon is
# still alive, kill_daemons will pause for a few seconds and
# try again. This will repeat for a fixed number of times
# before kill_daemons returns on failure. The list of
# sleep intervals can be specified as **delays** and defaults
# to:
#
# 0 1 1 1 2 3 5 5 5 10 10 20 60
#
# This sequence is designed to not require a sleep time (0) if the
# machine is fast enough and the daemon terminates in a fraction of a
# second. The increasing sleep numbers should give plenty of time for
# the daemon to die even on the slowest running machine. If a daemon
# takes more than two minutes to stop (the sum of all sleep times),
# there probably is no point in waiting more and a number of things
# are likely to go wrong anyway: better give up and return on error.
#
# @param dir path name of the environment
# @param signal name of the first signal (defaults to KILL)
# @param name_prefix only kill match daemons (defaults to all)
# @params delays sequence of sleep times before failure
# @return 0 on success, 1 on error
#
function kill_daemons() {
local dir=$1
local signal=${2:-KILL}
local name_prefix=$3 # optional, osd, mon, osd.1
local delays=${4:-0 1 1 1 2 3 5 5 5 10 10 20 60}
local status=0
for pidfile in $(find $dir | grep $name_prefix'[^/]*\.pid') ; do
pid=$(cat $pidfile)
local send_signal=$signal
for try in 0 1 1 1 2 3 ; do
kill -$send_signal $pid 2> /dev/null || break
local kill_complete=false
for try in $delays ; do
if kill -$send_signal $pid 2> /dev/null ; then
kill_complete=false
else
kill_complete=true
break
fi
send_signal=0
sleep $try
done
if ! $kill_complete ; then
status=1
fi
done
return $status
}
function test_kill_daemons() {
@ -188,9 +223,14 @@ function test_kill_daemons() {
run_mon $dir a --osd_pool_default_size=1 || return 1
run_osd $dir 0 || return 1
ceph osd dump | grep "osd.0 up" || return 1
kill_daemons $dir TERM osd
# sending signal 0 won't kill the daemon
# waiting just for one second instead of the default schedule
# allows us to quickly verify what happens when kill fails
# to stop the daemon (i.e. it must return false)
! kill_daemons $dir 0 osd 1 || return 1
kill_daemons $dir TERM osd || return 1
ceph osd dump | grep "osd.0 down" || return 1
kill_daemons $dir TERM
kill_daemons $dir TERM || return 1
! ceph --connect-timeout 1 status || return 1
teardown $dir || return 1
}
@ -289,20 +329,20 @@ function test_run_mon() {
! CEPH_ARGS='' ceph status || return 1
CEPH_ARGS='' ceph --conf $dir/ceph.conf status || return 1
kill_daemons $dir
kill_daemons $dir || return 1
run_mon $dir a --osd_pool_default_size=1 || return 1
local size=$(CEPH_ARGS='' ceph --format=json daemon $dir/ceph-mon.a.asok \
config get osd_pool_default_size)
test "$size" = '{"osd_pool_default_size":"1"}' || return 1
kill_daemons $dir
kill_daemons $dir || return 1
CEPH_ARGS="$CEPH_ARGS --osd_pool_default_size=2" \
run_mon $dir a || return 1
local size=$(CEPH_ARGS='' ceph --format=json daemon $dir/ceph-mon.a.asok \
config get osd_pool_default_size)
test "$size" = '{"osd_pool_default_size":"2"}' || return 1
kill_daemons $dir
kill_daemons $dir || return 1
teardown $dir || return 1
}
@ -484,7 +524,7 @@ function test_activate_osd() {
config get osd_max_backfills)
test "$backfills" = '{"osd_max_backfills":"10"}' || return 1
kill_daemons $dir TERM osd
kill_daemons $dir TERM osd || return 1
activate_osd $dir 0 --osd-max-backfills 20 || return 1
local backfills=$(CEPH_ARGS='' ceph --format=json daemon $dir//ceph-osd.0.asok \