ceph/qa/workunits/rbd/rbd_mirror.sh
Mykola Golub 21790484dc rbd-mirror: make image replayer asok commands available when not started
Initially the asok commands were registered only after the image
replayer start (and unregistered on stop) because their names were
built using remote pool and image names, which became known only after
start.

Now the asok commands are registered at image replayer construction,
using the temporary name "remote_pool_name/global_image_id". They are
re-registered using "remote_pool_name/remote_image_name" when the
image replayer is started. Also, the commands are no longer
unregistered when the image replayer is stopped.

Signed-off-by: Mykola Golub <mgolub@mirantis.com>
2016-05-01 18:33:54 +03:00
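
For illustration (the admin socket path and the pool/image names below are
placeholders, not taken from this change), the registered commands can be
inspected and invoked through the daemon's admin socket:

    ceph --admin-daemon /path/to/rbd-mirror.asok help | grep "rbd mirror status"
    ceph --admin-daemon /path/to/rbd-mirror.asok rbd mirror status remote_pool/image1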


#!/bin/sh
#
# rbd_mirror.sh - test rbd-mirror daemon
#
# The script starts two ("local" and "remote") clusters using the mstart.sh script,
# creates a temporary directory (used for cluster configs, daemon logs, admin
# sockets, and temporary files), and launches the rbd-mirror daemon.
#
# There are several env variables useful when troubleshooting a test failure:
#
# RBD_MIRROR_NOCLEANUP - if not empty, don't run the cleanup (stop processes,
# destroy the clusters and remove the temp directory)
# on exit, so it is possible to check the test state
# after failure.
# RBD_MIRROR_TEMDIR - use this path when creating the temporary directory
# (should not exist) instead of running mktemp(1).
#
# The cleanup can be done as a separate step, running the script with
# `cleanup ${RBD_MIRROR_TEMDIR}' arguments.
#
# Note: like other workunit tests, rbd_mirror.sh expects to find the ceph
# binaries in PATH.
#
# Thus a typical troubleshooting session:
#
# From Ceph src dir (CEPH_SRC_PATH), start the test in NOCLEANUP mode and with
# TEMPDIR pointing to a known location:
#
# cd $CEPH_SRC_PATH
# PATH=$CEPH_SRC_PATH:$PATH
# RBD_MIRROR_NOCLEANUP=1 RBD_MIRROR_TEMDIR=/tmp/tmp.rbd_mirror \
# ../qa/workunits/rbd/rbd_mirror.sh
#
# After the test failure cd to TEMPDIR and check the current state:
#
# cd /tmp/tmp.rbd_mirror
# ls
# less rbd-mirror.cluster1_daemon.$pid.log
# ceph --cluster cluster1 -s
# ceph --cluster cluster2 -s
# rbd --cluster cluster2 -p mirror ls
# rbd --cluster cluster2 -p mirror journal status --image test
# ceph --admin-daemon rbd-mirror.cluster1_daemon.cluster1.asok help
# ...
#
# Also you can execute commands (functions) from the script:
#
# cd $CEPH_SRC_PATH
# export RBD_MIRROR_TEMDIR=/tmp/tmp.rbd_mirror
# ../qa/workunits/rbd/rbd_mirror.sh status
# ../qa/workunits/rbd/rbd_mirror.sh stop_mirror cluster1
# ../qa/workunits/rbd/rbd_mirror.sh start_mirror cluster2
# ../qa/workunits/rbd/rbd_mirror.sh flush cluster2
# ...
#
# Eventually, run the cleanup:
#
# cd $CEPH_SRC_PATH
# RBD_MIRROR_TEMDIR=/tmp/tmp.rbd_mirror \
# ../qa/workunits/rbd/rbd_mirror.sh cleanup
#
CLUSTER1=cluster1
CLUSTER2=cluster2
POOL=mirror
SRC_DIR=$(readlink -f $(dirname $0)/../../../src)
TEMPDIR=
# These vars facilitate running this script in an environment with
# ceph installed from packages, like teuthology. These are not defined
# by default.
#
# RBD_MIRROR_USE_EXISTING_CLUSTER - if set, do not start and stop ceph clusters
# RBD_MIRROR_USE_RBD_MIRROR - if set, use an existing instance of rbd-mirror
# running as ceph client $CEPH_ID. If empty,
# this script will start and stop rbd-mirror
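#
# For illustration only, a run against a pre-existing cluster and an already
# running rbd-mirror daemon might look like this (the client id "mirror" and
# the temp dir path are assumptions, not requirements of this script):
#
#   RBD_MIRROR_USE_EXISTING_CLUSTER=1 RBD_MIRROR_USE_RBD_MIRROR=1 \
#   CEPH_ID=mirror RBD_MIRROR_TEMDIR=/tmp/tmp.rbd_mirror \
#   ../qa/workunits/rbd/rbd_mirror.sh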
#
# Functions
#
daemon_asok_file()
{
local local_cluster=$1
local cluster=$2
if [ -n "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
echo $(ceph-conf --cluster $local_cluster --name "client.${CEPH_ID}" 'admin socket')
else
echo "${TEMPDIR}/rbd-mirror.${local_cluster}_daemon.${cluster}.asok"
fi
}
daemon_pid_file()
{
local cluster=$1
if [ -n "${RBD_MIRROR_USE_RBD_MIRROR}" ]; then
echo $(ceph-conf --cluster $cluster --name "client.${CEPH_ID}" 'pid file')
else
echo "${TEMPDIR}/rbd-mirror.${cluster}_daemon.pid"
fi
}
setup()
{
local c
trap cleanup INT TERM EXIT
if [ -n "${RBD_MIRROR_TEMDIR}" ]; then
mkdir "${RBD_MIRROR_TEMDIR}"
TEMPDIR="${RBD_MIRROR_TEMDIR}"
else
TEMPDIR=`mktemp -d`
fi
if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then
cd ${SRC_DIR}
./mstart.sh ${CLUSTER1} -n
./mstart.sh ${CLUSTER2} -n
ln -s $(readlink -f run/${CLUSTER1}/ceph.conf) \
${TEMPDIR}/${CLUSTER1}.conf
ln -s $(readlink -f run/${CLUSTER2}/ceph.conf) \
${TEMPDIR}/${CLUSTER2}.conf
cd ${TEMPDIR}
fi
ceph --cluster ${CLUSTER1} osd pool create ${POOL} 64 64
ceph --cluster ${CLUSTER2} osd pool create ${POOL} 64 64
rbd --cluster ${CLUSTER1} mirror pool enable ${POOL} pool
rbd --cluster ${CLUSTER2} mirror pool enable ${POOL} pool
rbd --cluster ${CLUSTER1} mirror pool peer add ${POOL} ${CLUSTER2}
rbd --cluster ${CLUSTER2} mirror pool peer add ${POOL} ${CLUSTER1}
}
cleanup()
{
test -n "${RBD_MIRROR_NOCLEANUP}" && return
set +e
stop_mirror "${CLUSTER1}"
stop_mirror "${CLUSTER2}"
if [ -z "${RBD_MIRROR_USE_EXISTING_CLUSTER}" ]; then
cd ${SRC_DIR}
./mstop.sh ${CLUSTER1}
./mstop.sh ${CLUSTER2}
else
ceph --cluster ${CLUSTER1} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
ceph --cluster ${CLUSTER2} osd pool rm ${POOL} ${POOL} --yes-i-really-really-mean-it
fi
rm -Rf ${TEMPDIR}
}
start_mirror()
{
local cluster=$1
test -n "${RBD_MIRROR_USE_RBD_MIRROR}" && return
rbd-mirror \
--cluster ${cluster} \
--pid-file=$(daemon_pid_file "${cluster}") \
--log-file=${TEMPDIR}/rbd-mirror.\$cluster_daemon.\$pid.log \
--admin-socket=${TEMPDIR}/rbd-mirror.${cluster}_daemon.\$cluster.asok \
--debug-rbd=30 --debug-journaler=30 \
--debug-rbd_mirror=30 \
--daemonize=true
}
stop_mirror()
{
local cluster=$1
test -n "${RBD_MIRROR_USE_RBD_MIRROR}" && return
local pid
pid=$(cat $(daemon_pid_file "${cluster}") 2>/dev/null) || :
if [ -n "${pid}" ]
then
kill ${pid}
for s in 1 2 4 8 16 32; do
sleep $s
ps auxww | awk -v pid=${pid} '$2 == pid {print; exit 1}' && break
done
ps auxww | awk -v pid=${pid} '$2 == pid {print; exit 1}'
fi
rm -f $(daemon_asok_file "${cluster}" "${CLUSTER1}")
rm -f $(daemon_asok_file "${cluster}" "${CLUSTER2}")
rm -f $(daemon_pid_file "${cluster}")
}
status()
{
local cluster daemon image
for cluster in ${CLUSTER1} ${CLUSTER2}
do
echo "${cluster} status"
ceph --cluster ${cluster} -s
echo
echo "${cluster} ${POOL} images"
rbd --cluster ${cluster} -p ${POOL} ls
echo
echo "${cluster} ${POOL} mirror pool status"
rbd --cluster ${cluster} -p ${POOL} mirror pool status --verbose
echo
for image in `rbd --cluster ${cluster} -p ${POOL} ls 2>/dev/null`
do
echo "image ${image} info"
rbd --cluster ${cluster} -p ${POOL} info ${image}
echo
echo "image ${image} journal status"
rbd --cluster ${cluster} -p ${POOL} journal status --image ${image}
echo
done
done
local ret=0
for cluster in "${CLUSTER1}" "${CLUSTER2}"
do
local pid_file=$(daemon_pid_file ${cluster} )
if [ ! -e ${pid_file} ]
then
echo "${cluster} rbd-mirror not running or unknown" \
"(${pid_file} not exist)"
continue
fi
local pid
pid=$(cat ${pid_file} 2>/dev/null) || :
if [ -z "${pid}" ]
then
echo "${cluster} rbd-mirror not running or unknown" \
"(can't find pid using ${pid_file})"
ret=1
continue
fi
echo "${daemon} rbd-mirror process in ps output:"
if ps auxww |
awk -v pid=${pid} 'NR == 1 {print} $2 == pid {print; exit 1}'
then
echo
echo "${cluster} rbd-mirror not running" \
"(can't find pid $pid in ps output)"
ret=1
continue
fi
echo
local asok_file=$(daemon_asok_file ${cluster} ${cluster})
if [ ! -S "${asok_file}" ]
then
echo "${cluster} rbd-mirror asok is unknown (${asok_file} not exits)"
ret=1
continue
fi
echo "${cluster} rbd-mirror status"
ceph --admin-daemon ${asok_file} rbd mirror status
echo
done
return ${ret}
}
flush()
{
local cluster=$1
local image=$2
local cmd="rbd mirror flush"
if [ -n "${image}" ]
then
cmd="${cmd} ${POOL}/${image}"
fi
local asok_file=$(daemon_asok_file "${cluster}" "${cluster}")
test -S "${asok_file}"
ceph --admin-daemon ${asok_file} ${cmd}
}
test_image_replay_state()
{
local cluster=$1
local image=$2
local test_state=$3
local current_state=stopped
local asok_file=$(daemon_asok_file "${cluster}" "${cluster}")
test -S "${asok_file}"
ceph --admin-daemon ${asok_file} help |
fgrep "\"rbd mirror status ${POOL}/${image}\"" &&
ceph --admin-daemon ${asok_file} rbd mirror status ${POOL}/${image} |
grep -i 'state.*Replaying' &&
current_state=started
test "${test_state}" = "${current_state}"
}
wait_for_image_replay_state()
{
local cluster=$1
local image=$2
local state=$3
local s
# TODO: add a way to force rbd-mirror to update replayers
for s in 1 2 4 8 8 8 8 8 8 8 8 16 16; do
sleep ${s}
test_image_replay_state "${cluster}" "${image}" "${state}" && return 0
done
return 1
}
wait_for_image_replay_started()
{
local cluster=$1
local image=$2
wait_for_image_replay_state "${cluster}" "${image}" started
}
wait_for_image_replay_stopped()
{
local cluster=$1
local image=$2
wait_for_image_replay_state "${cluster}" "${image}" stopped
}
get_position()
{
local cluster=$1
local image=$2
local id_regexp=$3
# Parse line like below, looking for the first position
# [id=, commit_position=[positions=[[object_number=1, tag_tid=3, entry_tid=9], [object_number=0, tag_tid=3, entry_tid=8], [object_number=3, tag_tid=3, entry_tid=7], [object_number=2, tag_tid=3, entry_tid=6]]]]
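# For the example line above, the first position extracted (empty id_regexp,
# i.e. the master client) would be: object_number=1, tag_tid=3, entry_tid=9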
local status_log=${TEMPDIR}/${CLUSTER2}-${POOL}-${image}.status
rbd --cluster ${cluster} -p ${POOL} journal status --image ${image} |
tee ${status_log} >&2
sed -nEe 's/^.*\[id='"${id_regexp}"',.*positions=\[\[([^]]*)\],.*$/\1/p' \
${status_log}
}
get_master_position()
{
local cluster=$1
local image=$2
get_position "${cluster}" "${image}" ''
}
get_mirror_position()
{
local cluster=$1
local image=$2
get_position "${cluster}" "${image}" '..*'
}
wait_for_replay_complete()
{
local local_cluster=$1
local cluster=$2
local image=$3
local s master_pos mirror_pos
for s in 0.2 0.4 0.8 1.6 2 2 4 4 8 8 16 16; do
sleep ${s}
flush "${local_cluster}" "${image}"
master_pos=$(get_master_position "${cluster}" "${image}")
mirror_pos=$(get_mirror_position "${cluster}" "${image}")
test -n "${master_pos}" -a "${master_pos}" = "${mirror_pos}" && return 0
done
return 1
}
test_status_in_pool_dir()
{
local cluster=$1
local image=$2
local state_pattern=$3
local description_pattern=$4
local status_log=${TEMPDIR}/${cluster}-${image}.mirror_status
rbd --cluster ${cluster} -p ${POOL} mirror image status ${image} |
tee ${status_log}
grep "state: .*${state_pattern}" ${status_log}
grep "description: .*${description_pattern}" ${status_log}
}
create_image()
{
local cluster=$1
local image=$2
rbd --cluster ${cluster} -p ${POOL} create --size 128 \
--image-feature exclusive-lock --image-feature journaling ${image}
}
write_image()
{
local cluster=$1
local image=$2
local count=$3
rbd --cluster ${cluster} -p ${POOL} bench-write ${image} \
--io-size 4096 --io-threads 1 --io-total $((4096 * count)) \
--io-pattern rand
}
compare_images()
{
local image=$1
local rmt_export=${TEMPDIR}/${CLUSTER2}-${POOL}-${image}.export
local loc_export=${TEMPDIR}/${CLUSTER1}-${POOL}-${image}.export
rm -f ${rmt_export} ${loc_export}
rbd --cluster ${CLUSTER2} -p ${POOL} export ${image} ${rmt_export}
rbd --cluster ${CLUSTER1} -p ${POOL} export ${image} ${loc_export}
cmp ${rmt_export} ${loc_export}
}
demote_image()
{
local cluster=$1
local image=$2
rbd --cluster=${cluster} mirror image demote ${POOL}/${image}
}
promote_image()
{
local cluster=$1
local image=$2
rbd --cluster=${cluster} mirror image promote ${POOL}/${image}
}
#
# Main
#
if [ "$#" -gt 0 ]
then
if [ -z "${RBD_MIRROR_TEMDIR}" ]
then
echo "RBD_MIRROR_TEMDIR is not set" >&2
exit 1
fi
TEMPDIR="${RBD_MIRROR_TEMDIR}"
cd ${TEMPDIR}
"$@"
exit $?
fi
set -xe
setup
echo "TEST: add image and test replay"
start_mirror ${CLUSTER1}
image=test
create_image ${CLUSTER2} ${image}
wait_for_image_replay_started ${CLUSTER1} ${image}
write_image ${CLUSTER2} ${image} 100
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${image}
test_status_in_pool_dir ${CLUSTER1} ${image} 'up+replaying' 'master_position'
test_status_in_pool_dir ${CLUSTER2} ${image} 'down+unknown'
compare_images ${image}
echo "TEST: stop mirror, add image, start mirror and test replay"
stop_mirror ${CLUSTER1}
image1=test1
create_image ${CLUSTER2} ${image1}
write_image ${CLUSTER2} ${image1} 100
start_mirror ${CLUSTER1}
wait_for_image_replay_started ${CLUSTER1} ${image1}
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${image1}
test_status_in_pool_dir ${CLUSTER1} ${image1} 'up+replaying' 'master_position'
test_status_in_pool_dir ${CLUSTER2} ${image1} 'down+unknown'
compare_images ${image1}
echo "TEST: test the first image is replaying after restart"
write_image ${CLUSTER2} ${image} 100
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${image}
test_status_in_pool_dir ${CLUSTER1} ${image} 'up+replaying' 'master_position'
compare_images ${image}
echo "TEST: failover and failback"
start_mirror ${CLUSTER2}
# failover
demote_image ${CLUSTER2} ${image}
wait_for_image_replay_stopped ${CLUSTER1} ${image}
test_status_in_pool_dir ${CLUSTER1} ${image} 'up+stopped'
test_status_in_pool_dir ${CLUSTER2} ${image} 'up+stopped'
promote_image ${CLUSTER1} ${image}
wait_for_image_replay_started ${CLUSTER2} ${image}
write_image ${CLUSTER1} ${image} 100
wait_for_replay_complete ${CLUSTER2} ${CLUSTER1} ${image}
test_status_in_pool_dir ${CLUSTER1} ${image} 'up+stopped'
test_status_in_pool_dir ${CLUSTER2} ${image} 'up+replaying' 'master_position'
compare_images ${image}
# failback
demote_image ${CLUSTER1} ${image}
wait_for_image_replay_stopped ${CLUSTER2} ${image}
test_status_in_pool_dir ${CLUSTER2} ${image} 'up+stopped'
promote_image ${CLUSTER2} ${image}
wait_for_image_replay_started ${CLUSTER1} ${image}
write_image ${CLUSTER2} ${image} 100
wait_for_replay_complete ${CLUSTER1} ${CLUSTER2} ${image}
test_status_in_pool_dir ${CLUSTER1} ${image} 'up+replaying' 'master_position'
test_status_in_pool_dir ${CLUSTER2} ${image} 'up+stopped'
compare_images ${image}
echo OK