qa: Expand nvmeof thrasher and add nvmeof_namespaces.yaml job

1. qa/tasks/nvmeof.py: add other methods to stop nvmeof daemons
2. add qa/workunits/rbd/nvmeof_namespace_test.sh, which adds and
   deletes new namespaces. It is run in the nvmeof_namespaces.yaml
   job, where fio runs against the other namespaces in the background.

Signed-off-by: Vallari Agrawal <val.agl002@gmail.com>
This commit is contained in:
Vallari Agrawal 2024-07-29 16:31:12 +05:30
parent 02fe44ac60
commit 58d8be9fd8
No known key found for this signature in database
GPG Key ID: B8139A21B418CCAE
3 changed files with 155 additions and 7 deletions

View File

@ -0,0 +1,40 @@
tasks:
# Deploy an nvmeof gateway service on client.0, backed by RBD images
# in "mypool" (3 subsystems x 20 namespaces).
- nvmeof:
    client: client.0
    gw_image: quay.io/ceph/nvmeof:1.2 # "default" is the image cephadm defaults to; change to test specific nvmeof images, example "latest"
    rbd:
      pool_name: mypool
      image_name_prefix: myimage
    gateway_config:
      subsystems_count: 3
      namespaces_count: 20
      cli_image: quay.io/ceph/nvmeof-cli:1.2

# Block until cephadm reports the nvmeof service as up.
- cephadm.wait_for_service:
    service: nvmeof.mypool

# Configure the subsystems/namespaces from client.2.
- workunit:
    no_coverage_and_limits: true
    clients:
      client.2:
        - rbd/nvmeof_setup_subsystem.sh
    env:
      RBD_POOL: mypool
      RBD_IMAGE_PREFIX: myimage

# Run fio against the existing namespaces (client.2) while client.3
# adds and deletes NEW_NAMESPACES_COUNT extra namespaces per subsystem.
- workunit:
    no_coverage_and_limits: true
    timeout: 30m
    clients:
      client.2:
        - rbd/nvmeof_basic_tests.sh
        - rbd/nvmeof_fio_test.sh --rbd_iostat
      client.3:
        - rbd/nvmeof_basic_tests.sh
        - rbd/nvmeof_namespace_test.sh
    env:
      RBD_POOL: mypool
      IOSTAT_INTERVAL: '10'
      RUNTIME: '600'
      NEW_NAMESPACES_COUNT: '5'

View File

@ -343,6 +343,37 @@ class NvmeofThrasher(Thrasher, Greenlet):
self.log('switch_task: done waiting for the other thrasher')
other_thrasher.switch_thrasher.clear()
def kill_daemon(self, daemon):
    """Stop *daemon* using one of several randomly chosen mechanisms.

    One of three strategies is picked with the thrasher's seeded RNG:

    * ``"ceph_daemon_stop"`` -- ``ceph orch daemon stop <name>``
    * ``"systemctl_stop"``   -- ``daemon.stop()``
    * ``"daemon_remove"``    -- ``ceph orch daemon rm <name>``

    Returns the name of the strategy used, so that
    ``revive_daemon()`` can apply the matching revival step.
    """
    chosen_method = self.rng.choice([
        "ceph_daemon_stop", "systemctl_stop",
        "daemon_remove",
    ])
    full_name = '%s.%s' % (daemon.type_, daemon.id_)
    if chosen_method == "systemctl_stop":
        daemon.stop()
    else:
        # both remaining strategies are "ceph orch daemon <verb> <name>";
        # check_status=False: best-effort, the orch command may race with
        # an already-stopped daemon
        verb = "stop" if chosen_method == "ceph_daemon_stop" else "rm"
        daemon.remote.run(args=[
            "ceph", "orch", "daemon", verb,
            full_name
        ], check_status=False)
    return chosen_method
def revive_daemon(self, daemon, killed_method):
    """Bring a daemon back after ``kill_daemon()``.

    The revival action mirrors how the daemon was stopped:

    * ``"ceph_daemon_stop"`` -> ``ceph orch daemon restart <name>``
    * ``"systemctl_stop"``   -> ``daemon.restart()``
    * ``"daemon_remove"``    -> no action here; presumably cephadm
      redeploys the removed daemon on its own -- confirm.
    """
    if killed_method == "systemctl_stop":
        daemon.restart()
    elif killed_method == "ceph_daemon_stop":
        daemon.remote.run(args=[
            "ceph", "orch", "daemon", "restart",
            '%s.%s' % (daemon.type_, daemon.id_)
        ])
def do_thrash(self):
self.log('start thrashing')
self.log(f'seed: {self.random_seed}, , '\
@ -354,7 +385,7 @@ class NvmeofThrasher(Thrasher, Greenlet):
summary = []
while not self.stopping.is_set():
killed_daemons = []
killed_daemons = defaultdict(list)
weight = 1.0 / len(self.daemons)
count = 0
@ -380,9 +411,10 @@ class NvmeofThrasher(Thrasher, Greenlet):
continue
self.log('kill {label}'.format(label=daemon.id_))
daemon.stop()
# daemon.stop()
kill_method = self.kill_daemon(daemon)
killed_daemons.append(daemon)
killed_daemons[kill_method].append(daemon)
daemons_thrash_history[daemon.id_] += [datetime.now()]
# only thrash max_thrash_daemons amount of daemons
@ -391,7 +423,10 @@ class NvmeofThrasher(Thrasher, Greenlet):
break
if killed_daemons:
summary += ["killed: " + ", ".join([d.id_ for d in killed_daemons])]
iteration_summary = "thrashed- "
for kill_method in killed_daemons:
iteration_summary += (", ".join([d.id_ for d in killed_daemons[kill_method]]) + f" (by {kill_method}); ")
summary += [iteration_summary]
# delay before reviving
revive_delay = self.min_revive_delay
if self.randomize:
@ -405,9 +440,11 @@ class NvmeofThrasher(Thrasher, Greenlet):
self.switch_task()
# revive after thrashing
for daemon in killed_daemons:
self.log('reviving {label}'.format(label=daemon.id_))
daemon.restart()
for kill_method in killed_daemons:
for daemon in killed_daemons[kill_method]:
self.log('reviving {label}'.format(label=daemon.id_))
# daemon.restart()
self.revive_daemon(daemon, kill_method)
# delay before thrashing
thrash_delay = self.min_thrash_delay

View File

@ -0,0 +1,71 @@
#!/bin/bash -xe

# Exercise dynamic namespace add/delete on a running nvmeof deployment.
#
# It's assumed in this test that each subsystem has an equal number
# of namespaces (i.e. NVMEOF_NAMESPACES_COUNT ns per subsystem, as
# exported by /etc/ceph/nvmeof.env). This script adds
# NEW_NAMESPACES_COUNT namespaces (backed by freshly created RBD
# images named test1..testN) to each subsystem, verifies the counts,
# then deletes those new namespaces and verifies again.

source /etc/ceph/nvmeof.env

RBD_POOL="${RBD_POOL:-mypool}"
NEW_IMAGE_SIZE="${RBD_IMAGE_SIZE:-8192}" # 1024*8 (rbd --size: MiB by default)
NEW_NAMESPACES_COUNT="${NEW_NAMESPACES_COUNT:-3}"

# gateway count = commas in the IP list + 1
gateways_count=$(( $(echo "$NVMEOF_GATEWAY_IP_ADDRESSES" | tr -cd ',' | wc -c) + 1 ))
new_images_count=$(( $NVMEOF_SUBSYSTEMS_COUNT * $NEW_NAMESPACES_COUNT))

# Fail (return 1) unless every subsystem reports exactly $1 namespaces.
assert_namespaces_count() {
    expected_count_per_subsys=$1
    # grep -w: match the count as a whole word, otherwise an expected
    # count of e.g. 2 would also match namespace_count values 20, 12, ...
    actual_count=$(sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format json subsystem list |
        grep namespace_count | grep -w $expected_count_per_subsys | wc -l)
    if [ "$actual_count" -ne "$NVMEOF_SUBSYSTEMS_COUNT" ]; then
        # dump the full subsystem list for debugging before failing
        sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format json subsystem list
        echo "Expected count of namespaces not found, expected (per subsystem): $expected_count_per_subsys"
        return 1
    fi
}

# add rbd images to back the new namespaces
for i in $(seq 1 $new_images_count); do
    image_name="test${i}"
    rbd create $RBD_POOL/$image_name --size $NEW_IMAGE_SIZE
done

# add new namespaces, spreading load-balancing groups across gateways
image_index=1
for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
    subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
    for ns in $(seq 1 $NEW_NAMESPACES_COUNT); do
        image="test${image_index}"
        sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT namespace add --subsystem $subsystem_nqn --rbd-pool $RBD_POOL --rbd-image $image --load-balancing-group $(($image_index % $gateways_count + 1))
        ((image_index++))
    done
done

# list namespaces (diagnostic output only)
for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
    subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
    sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format plain namespace list --subsystem $subsystem_nqn
done

# verify namespaces added
expected_count_per_subsys=$(( $NEW_NAMESPACES_COUNT + $NVMEOF_NAMESPACES_COUNT ))
assert_namespaces_count $expected_count_per_subsys

# delete every namespace whose backing image name starts with "test"
for i in $(seq 1 $NVMEOF_SUBSYSTEMS_COUNT); do
    subsystem_nqn="${NVMEOF_SUBSYSTEMS_PREFIX}${i}"
    NSIDs=$(sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT --format json namespace list --subsystem $subsystem_nqn |
        jq -r '.namespaces[] | select(.rbd_image_name | startswith("test")) | .nsid')
    for nsid in $NSIDs; do
        sudo podman run -it $NVMEOF_CLI_IMAGE --server-address $NVMEOF_DEFAULT_GATEWAY_IP_ADDRESS --server-port $NVMEOF_SRPORT namespace del --subsystem $subsystem_nqn --nsid $nsid
    done
done

# verify namespaces deleted (back to the baseline count)
expected_count_per_subsys=$NVMEOF_NAMESPACES_COUNT
assert_namespaces_count $expected_count_per_subsys