qa: Add tests to validate syncing of images using rbd-mirror

Introduce functional tests to validate that the images under
workloads are correctly mirrored between two clusters using snapshot
based mirroring.

Run workload on a primary image using a krbd or nbd client. Take
mirror snapshots of the image under workload. Unmount the mapped image
and calculate its MD5 checksum before demoting it. After demotion,
wait for the mirror status of the image to be 'up+unknown' in both
the clusters. This is to make sure that the non-primary image in the
other cluster is ready to be promoted. Now promote the non-primary
image in the other cluster. Map the promoted image and calculate its
MD5 checksum. Verify that the checksums of the demoted and promoted
images in the two clusters are the same.

The above test is run as part of two different workunits:
 - a workunit that validates the syncing of multiple mirrored images
   with workloads running on them
 - another workunit that validates the syncing of a single mirrored
   image with a workload running on it, where the image is set as
   primary alternately between the two clusters, as happens during
   failover and failback scenarios.

Fixes: https://tracker.ceph.com/issues/61617
Signed-off-by: Ramana Raja <rraja@redhat.com>
Co-authored-by: Ilya Dryomov <idryomov@redhat.com>
Co-authored-by: Christopher Hoffman <choffman@redhat.com>
This commit is contained in:
Ramana Raja 2023-05-25 16:48:12 +00:00
parent ea3a567f7f
commit b7aae5c3c5
6 changed files with 332 additions and 0 deletions

View File

@ -0,0 +1,13 @@
# QA suite fragment: run the alternate-primary mirror comparison workunit
# with the image mapped through krbd.
overrides:
install:
ceph:
extra_system_packages:
# pv is used by the workunit to rate limit its untar workload
- pv
tasks:
- workunit:
clients:
cluster1.client.mirror:
- rbd/compare_mirror_image_alternate_primary.sh
env:
# device type the script passes to `rbd device map -t`
RBD_DEVICE_TYPE: 'krbd'
timeout: 3h

View File

@ -0,0 +1,15 @@
# QA suite fragment: run the alternate-primary mirror comparison workunit
# with the image mapped through rbd-nbd.
overrides:
install:
ceph:
extra_packages:
- rbd-nbd
extra_system_packages:
# pv is used by the workunit to rate limit its untar workload
- pv
tasks:
- workunit:
clients:
cluster1.client.mirror:
- rbd/compare_mirror_image_alternate_primary.sh
env:
# device type the script passes to `rbd device map -t`
RBD_DEVICE_TYPE: 'nbd'
timeout: 3h

View File

@ -0,0 +1,13 @@
# QA suite fragment: run the multi-image mirror comparison workunit with the
# images mapped through krbd.
overrides:
install:
ceph:
extra_system_packages:
# pv is used by the workunit to rate limit its untar workload
- pv
tasks:
- workunit:
clients:
cluster1.client.mirror:
- rbd/compare_mirror_images.sh
env:
# device type the script passes to `rbd device map -t`
RBD_DEVICE_TYPE: 'krbd'
timeout: 3h

View File

@ -0,0 +1,15 @@
# QA suite fragment: run the multi-image mirror comparison workunit with the
# images mapped through rbd-nbd.
overrides:
install:
ceph:
extra_packages:
- rbd-nbd
extra_system_packages:
# pv is used by the workunit to rate limit its untar workload
- pv
tasks:
- workunit:
clients:
cluster1.client.mirror:
- rbd/compare_mirror_images.sh
env:
# device type the script passes to `rbd device map -t`
RBD_DEVICE_TYPE: 'nbd'
timeout: 3h

View File

@ -0,0 +1,106 @@
#!/bin/bash
#
# Validate snapshot-based mirroring of a single image whose primary role
# alternates between two clusters: run a workload on the mapped primary
# image while taking mirror snapshots, demote it, promote the peer image and
# check that both images have the same MD5 checksum.
#
# Required environment:
#   RBD_DEVICE_TYPE - 'krbd' or 'nbd'; the device type used to map the image

set -ex

# Names used by this script for the test image and its mount point.
IMAGE=image-alternate-primary
MOUNT=test-alternate-primary

# Mirroring mode handed to create_image_and_enable_mirror below.
RBD_MIRROR_MODE=snapshot

# Not referenced directly in this script; presumably consumed by
# rbd_mirror_helpers.sh -- confirm against that file.
MIRROR_IMAGE_MODE=snapshot
MIRROR_POOL_MODE=image
RBD_IMAGE_FEATURES='layering,exclusive-lock,object-map,fast-diff'
RBD_MIRROR_INSTANCES=1
RBD_MIRROR_USE_EXISTING_CLUSTER=1

# Quoted so that a script path containing whitespace still resolves.
. "$(dirname "$0")/rbd_mirror_helpers.sh"
#######################################
# Take a series of mirror snapshots of an image, pausing between them.
# Arguments:
#   $1 - cluster name
#   $2 - pool name
#   $3 - image name
#   $4 - number of snapshots to take (optional, default 30)
#   $5 - seconds to sleep after each snapshot (optional, default 3)
# Returns:
#   status of the last command run (mirror_image_snapshot comes from the
#   sourced helpers)
#######################################
take_mirror_snapshots() {
  local cluster=$1
  local pool=$2
  local image=$3
  local count=${4:-30}
  local interval=${5:-3}
  # keep the counter local so a foreground call cannot clobber the caller's
  # own loop variable
  local i

  for ((i = 1; i <= count; i++)); do
    mirror_image_snapshot "${cluster}" "${pool}" "${image}"
    sleep "${interval}"
  done
}
#######################################
# Run a rate-limited untar workload on a mounted file system for 5 minutes.
# Copies linux-5.4.tar.gz from the current directory to the mount point and
# extracts it there through `pv -L 256K`.
# Arguments:
#   $1 - mount point to write to
# Returns:
#   0 if the workload was still running when the 5 minute timeout hit,
#   1 if it ended for any other reason (finished early or failed)
#######################################
slow_untar_workload() {
  local mountpt=$1
  local ret=0

  cp linux-5.4.tar.gz "${mountpt}"

  # run workload that updates the data and metadata of multiple files on disk.
  # rate limit the workload such that the mirror snapshots can be taken as the
  # contents of the image are progressively changed by the workload.
  #
  # The mount point is handed to the inner shell as $1 rather than being
  # spliced into the command string, so it is never re-parsed as shell code.
  timeout 5m bash -c \
    'zcat "$1/linux-5.4.tar.gz" | pv -L 256K | tar xf - -C "$1"' \
    bash "${mountpt}" || ret=$?

  # timeout exits with 124 when it had to kill the command, which is the
  # only outcome that means I/O was in flight for the whole period
  if ((ret != 124)); then
    echo "Workload completed prematurely"
    return 1
  fi
}
setup

start_mirrors "${CLUSTER1}"
start_mirrors "${CLUSTER2}"

# initial setup: create the mirrored image on CLUSTER1, map it with the
# requested device type and put an ext4 file system on it
create_image_and_enable_mirror "${CLUSTER1}" "${POOL}" "${IMAGE}" \
  "${RBD_MIRROR_MODE}" 10G

if [[ ${RBD_DEVICE_TYPE} == "nbd" ]]; then
  DEV=$(sudo rbd --cluster "${CLUSTER1}" device map -t nbd \
    -o try-netlink "${POOL}/${IMAGE}")
elif [[ ${RBD_DEVICE_TYPE} == "krbd" ]]; then
  DEV=$(sudo rbd --cluster "${CLUSTER1}" device map -t krbd \
    "${POOL}/${IMAGE}")
else
  echo "Unknown RBD_DEVICE_TYPE: ${RBD_DEVICE_TYPE}"
  exit 1
fi

sudo mkfs.ext4 "${DEV}"
mkdir "${MOUNT}"

wget https://download.ceph.com/qa/linux-5.4.tar.gz

# Each iteration runs the workload on the current primary (CLUSTER1), fails
# over to CLUSTER2 and then swaps the two names, so the next iteration runs
# against the newly promoted image.
for i in {1..25}; do
  # create mirror snapshots every few seconds under I/O
  sudo mount "${DEV}" "${MOUNT}"
  sudo chown "$(whoami)" "${MOUNT}"
  # :? aborts instead of expanding to "/*" should MOUNT ever be empty
  rm -rf "${MOUNT:?}"/*

  take_mirror_snapshots "${CLUSTER1}" "${POOL}" "${IMAGE}" &
  SNAP_PID=$!
  slow_untar_workload "${MOUNT}"
  wait "${SNAP_PID}"

  sudo umount "${MOUNT}"

  # calculate hash before demotion of primary image
  DEMOTE_MD5=$(sudo md5sum "${DEV}" | awk '{print $1}')
  sudo rbd --cluster "${CLUSTER1}" device unmap -t "${RBD_DEVICE_TYPE}" \
    "${DEV}"

  demote_image "${CLUSTER1}" "${POOL}" "${IMAGE}"
  # 'up+unknown' on both sides: the non-primary image is ready to be promoted
  wait_for_status_in_pool_dir "${CLUSTER1}" "${POOL}" "${IMAGE}" 'up+unknown'
  wait_for_status_in_pool_dir "${CLUSTER2}" "${POOL}" "${IMAGE}" 'up+unknown'
  promote_image "${CLUSTER2}" "${POOL}" "${IMAGE}"

  # calculate hash after promotion of secondary image
  if [[ ${RBD_DEVICE_TYPE} == "nbd" ]]; then
    DEV=$(sudo rbd --cluster "${CLUSTER2}" device map -t nbd \
      -o try-netlink "${POOL}/${IMAGE}")
  elif [[ ${RBD_DEVICE_TYPE} == "krbd" ]]; then
    DEV=$(sudo rbd --cluster "${CLUSTER2}" device map -t krbd \
      "${POOL}/${IMAGE}")
  fi
  PROMOTE_MD5=$(sudo md5sum "${DEV}" | awk '{print $1}')

  if [[ "${DEMOTE_MD5}" != "${PROMOTE_MD5}" ]]; then
    echo "Mismatch at iteration ${i}: ${DEMOTE_MD5} != ${PROMOTE_MD5}"
    exit 1
  fi

  # swap roles; DEV stays mapped and is mounted by the next iteration
  TEMP=${CLUSTER1}
  CLUSTER1=${CLUSTER2}
  CLUSTER2=${TEMP}
done

# The image promoted in the final iteration is still mapped (on the cluster
# now called CLUSTER1 after the swap); release the device before finishing.
sudo rbd --cluster "${CLUSTER1}" device unmap -t "${RBD_DEVICE_TYPE}" "${DEV}"

echo OK

View File

@ -0,0 +1,170 @@
#!/bin/bash
#
# Validate snapshot-based mirroring of multiple images under load: run a
# workload on each mapped primary image while taking mirror snapshots, demote
# the images, promote their peers on the other cluster and check that each
# demoted/promoted pair has the same MD5 checksum.
#
# Required environment:
#   RBD_DEVICE_TYPE - 'krbd' or 'nbd'; the device type used to map the images

set -ex

# Prefixes for the per-image names and mount points (an index is appended).
IMG_PREFIX=image-primary
MNTPT_PREFIX=test-primary

# Mirroring mode handed to create_image_and_enable_mirror below.
RBD_MIRROR_MODE=snapshot

# Not referenced directly in this script; presumably consumed by
# rbd_mirror_helpers.sh -- confirm against that file.
MIRROR_IMAGE_MODE=snapshot
MIRROR_POOL_MODE=image
RBD_IMAGE_FEATURES='layering,exclusive-lock,object-map,fast-diff'
RBD_MIRROR_INSTANCES=1
RBD_MIRROR_USE_EXISTING_CLUSTER=1

# Quoted so that a script path containing whitespace still resolves.
. "$(dirname "$0")/rbd_mirror_helpers.sh"
#######################################
# Take a series of mirror snapshots of an image, pausing between them.
# Arguments:
#   $1 - cluster name
#   $2 - pool name
#   $3 - image name
#   $4 - number of snapshots to take (optional, default 30)
#   $5 - seconds to sleep after each snapshot (optional, default 3)
# Returns:
#   status of the last command run (mirror_image_snapshot comes from the
#   sourced helpers)
#######################################
take_mirror_snapshots() {
  local cluster=$1
  local pool=$2
  local image=$3
  local count=${4:-30}
  local interval=${5:-3}
  # keep the counter local so a foreground call cannot clobber the caller's
  # own loop variable
  local i

  for ((i = 1; i <= count; i++)); do
    mirror_image_snapshot "${cluster}" "${pool}" "${image}"
    sleep "${interval}"
  done
}
#######################################
# Run a rate-limited untar workload on a mounted file system for 5 minutes.
# Copies linux-5.4.tar.gz from the current directory to the mount point and
# extracts it there through `pv -L 256K`.
# Arguments:
#   $1 - mount point to write to
# Returns:
#   0 if the workload was still running when the 5 minute timeout hit,
#   1 if it ended for any other reason (finished early or failed)
#######################################
slow_untar_workload() {
  local mountpt=$1
  local ret=0

  cp linux-5.4.tar.gz "${mountpt}"

  # run workload that updates the data and metadata of multiple files on disk.
  # rate limit the workload such that the mirror snapshots can be taken as the
  # contents of the image are progressively changed by the workload.
  #
  # The mount point is handed to the inner shell as $1 rather than being
  # spliced into the command string, so it is never re-parsed as shell code.
  timeout 5m bash -c \
    'zcat "$1/linux-5.4.tar.gz" | pv -L 256K | tar xf - -C "$1"' \
    bash "${mountpt}" || ret=$?

  # timeout exits with 124 when it had to kill the command, which is the
  # only outcome that means I/O was in flight for the whole period
  if ((ret != 124)); then
    echo "Workload completed prematurely"
    return 1
  fi
}
#######################################
# Poll `rbd ls` until an image no longer appears in a pool.
# Arguments:
#   $1 - cluster name
#   $2 - pool name
#   $3 - image name
# Returns:
#   0 once the image is absent from the listing,
#   1 if it is still listed after all retries
#######################################
wait_for_image_removal() {
  local cluster=$1
  local pool=$2
  local image=$3
  local delay

  for delay in 1 2 4 8 8 8 8 8 8 8 8 16 16; do
    # -F -x: compare the name literally against whole lines, so a name is
    # neither treated as a regex nor found inside a longer image name
    if ! rbd --cluster "${cluster}" ls "${pool}" \
      | grep -Fxq -- "${image}"; then
      return 0
    fi
    sleep "${delay}"
  done

  echo "image ${pool}/${image} not removed from cluster ${cluster}"
  return 1
}
#######################################
# Fail one image over from CLUSTER1 to CLUSTER2 and verify its contents.
# Unmounts the image, checksums its device, demotes it on CLUSTER1, promotes
# it on CLUSTER2, maps and checksums the promoted image, then compares.
# Globals:
#   DEVS, IMG_PREFIX, MNTPT_PREFIX, CLUSTER1, CLUSTER2, POOL,
#   RBD_DEVICE_TYPE (all read)
# Arguments:
#   $1 - 1-based image index; selects DEVS[$1-1], ${IMG_PREFIX}$1 and
#        ${MNTPT_PREFIX}$1
# Returns:
#   0 if both checksums were computed and are equal, 1 otherwise
#######################################
compare_demoted_promoted_image() {
  local dev=${DEVS[$1-1]}
  local img=${IMG_PREFIX}$1
  local mntpt=${MNTPT_PREFIX}$1
  local demote_md5 promote_md5 md5_out

  sudo umount "${mntpt}"

  # calculate hash before demotion of primary image
  # md5sum is run on its own (not piped into awk) so that a failure is seen
  # here; two failed checksums would otherwise compare equal as empty strings
  if ! md5_out=$(sudo md5sum "${dev}") || [[ -z ${md5_out} ]]; then
    echo "Failed to checksum ${dev} before demoting ${POOL}/${img}"
    return 1
  fi
  demote_md5=${md5_out%% *}

  sudo rbd --cluster "${CLUSTER1}" device unmap -t "${RBD_DEVICE_TYPE}" \
    "${POOL}/${img}"

  demote_image "${CLUSTER1}" "${POOL}" "${img}"
  # 'up+unknown' on both sides: the non-primary image is ready to be promoted
  wait_for_status_in_pool_dir "${CLUSTER1}" "${POOL}" "${img}" 'up+unknown'
  wait_for_status_in_pool_dir "${CLUSTER2}" "${POOL}" "${img}" 'up+unknown'
  promote_image "${CLUSTER2}" "${POOL}" "${img}"

  # calculate hash after promotion of secondary image
  if [[ ${RBD_DEVICE_TYPE} == "nbd" ]]; then
    dev=$(sudo rbd --cluster "${CLUSTER2}" device map -t nbd \
      -o try-netlink "${POOL}/${img}")
  elif [[ ${RBD_DEVICE_TYPE} == "krbd" ]]; then
    dev=$(sudo rbd --cluster "${CLUSTER2}" device map -t krbd \
      "${POOL}/${img}")
  fi
  if ! md5_out=$(sudo md5sum "${dev}") || [[ -z ${md5_out} ]]; then
    echo "Failed to checksum ${dev} after promoting ${POOL}/${img}"
    return 1
  fi
  promote_md5=${md5_out%% *}

  sudo rbd --cluster "${CLUSTER2}" device unmap -t "${RBD_DEVICE_TYPE}" \
    "${dev}"

  if [[ "${demote_md5}" != "${promote_md5}" ]]; then
    echo "Mismatch for image ${POOL}/${img}: ${demote_md5} != ${promote_md5}"
    return 1
  fi
}
setup

start_mirrors "${CLUSTER1}"
start_mirrors "${CLUSTER2}"

wget https://download.ceph.com/qa/linux-5.4.tar.gz

# Each outer iteration creates ten mirrored images on CLUSTER1, runs the
# workload and snapshotting on all of them concurrently, fails every image
# over to CLUSTER2 with a checksum comparison, then removes the images.
for i in {1..10}; do
  DEVS=()
  SNAP_PIDS=()
  COMPARE_PIDS=()
  WORKLOAD_PIDS=()
  RET=0

  for j in {1..10}; do
    IMG=${IMG_PREFIX}${j}
    MNTPT=${MNTPT_PREFIX}${j}

    create_image_and_enable_mirror "${CLUSTER1}" "${POOL}" "${IMG}" \
      "${RBD_MIRROR_MODE}" 10G

    if [[ ${RBD_DEVICE_TYPE} == "nbd" ]]; then
      DEV=$(sudo rbd --cluster "${CLUSTER1}" device map -t nbd \
        -o try-netlink "${POOL}/${IMG}")
    elif [[ ${RBD_DEVICE_TYPE} == "krbd" ]]; then
      DEV=$(sudo rbd --cluster "${CLUSTER1}" device map -t krbd \
        "${POOL}/${IMG}")
    else
      echo "Unknown RBD_DEVICE_TYPE: ${RBD_DEVICE_TYPE}"
      exit 1
    fi
    # quoted so DEVS always gains exactly one entry per image and
    # DEVS[j-1] stays aligned with image j
    DEVS+=("${DEV}")

    sudo mkfs.ext4 "${DEV}"
    mkdir "${MNTPT}"
    sudo mount "${DEV}" "${MNTPT}"
    sudo chown "$(whoami)" "${MNTPT}"

    # create mirror snapshots under I/O every few seconds
    take_mirror_snapshots "${CLUSTER1}" "${POOL}" "${IMG}" &
    SNAP_PIDS+=($!)
    slow_untar_workload "${MNTPT}" &
    WORKLOAD_PIDS+=($!)
  done

  # background jobs do not trip `set -e`; collect each status explicitly
  for pid in "${SNAP_PIDS[@]}"; do
    wait "${pid}" || RET=$?
  done
  if ((RET != 0)); then
    echo "take_mirror_snapshots failed"
    exit 1
  fi

  for pid in "${WORKLOAD_PIDS[@]}"; do
    wait "${pid}" || RET=$?
  done
  if ((RET != 0)); then
    echo "slow_untar_workload failed"
    exit 1
  fi

  for j in {1..10}; do
    compare_demoted_promoted_image "${j}" &
    COMPARE_PIDS+=($!)
  done
  for pid in "${COMPARE_PIDS[@]}"; do
    wait "${pid}" || RET=$?
  done
  if ((RET != 0)); then
    echo "compare_demoted_promoted_image failed"
    exit 1
  fi

  for j in {1..10}; do
    IMG=${IMG_PREFIX}${j}
    # Allow for removal of non-primary image by checking that mirroring
    # image status is "up+replaying"
    wait_for_replaying_status_in_pool_dir "${CLUSTER1}" "${POOL}" "${IMG}"
    remove_image "${CLUSTER2}" "${POOL}" "${IMG}"
    wait_for_image_removal "${CLUSTER1}" "${POOL}" "${IMG}"
    rm -rf "${MNTPT_PREFIX}${j}"
  done
done

echo OK