Merge pull request #44217 from CongMinYin/fix-pwl-recovery-test

qa/suites/rbd/pwl-cache: ensure recovery is actually tested

Reviewed-by: Ilya Dryomov <idryomov@gmail.com>
Reviewed-by: Deepika Upadhyay <dupadhya@redhat.com>
2025-01-02 09:02:34 +00:00 · 2022-07-16 11:34:55 +02:00 · 2022-07-16 11:34:55 +02:00 · 3d710cda2f
commit 3d710cda2f
parent f43c7a6e5f 2de0574382
2 changed files with 99 additions and 4 deletions
--- a/qa/suites/rbd/pwl-cache/home/7-workloads/recovery.yaml
+++ b/qa/suites/rbd/pwl-cache/home/7-workloads/recovery.yaml
@ -2,9 +2,8 @@ tasks:
 - rbd.create_image:
    client.0:
      image_name: testimage
-      image_size: 10
+      image_size: 10240
      image_format: 2
- exec:
+- rbd_pwl_cache_recovery:
    client.0:
-      - "timeout 10s rbd bench --io-pattern rand --io-type write testimage || true"
-      - "rbd bench --io-type write --io-pattern rand --io-total 32M testimage"
+      image_name: testimage
--- a/qa/tasks/rbd_pwl_cache_recovery.py
+++ b/qa/tasks/rbd_pwl_cache_recovery.py
@ -0,0 +1,96 @@
+"""
+persistent write log cache recovery task
+"""
+import contextlib
+import logging
+import random
+import json
+import time
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+
+DEFAULT_NUM_ITERATIONS = 20
+IO_PATTERNS = ("full-seq", "rand")
+IO_SIZES = ('4K', '16K', '128K', '1024K')
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def thrashes_rbd_bench_on_persistent_cache(ctx, config):
+    """
+    thrashes rbd bench on persistent write log cache.
+    It can test recovery feature of persistent write log cache.
+    """
+    log.info("thrashes rbd bench on persistent write log cache")
+
+    client, client_config = list(config.items())[0]
+    (remote,) = ctx.cluster.only(client).remotes.keys()
+    client_config = client_config if client_config is not None else dict()
+    image_name = client_config.get('image_name', 'testimage')
+    num_iterations = client_config.get('num_iterations', DEFAULT_NUM_ITERATIONS)
+
+    for i in range(num_iterations):
+        log.info("start rbd bench")
+        # rbd bench could not specify the run time so set a large enough test size.
+        remote.run(
+            args=[
+                'rbd', 'bench',
+                '--io-type', 'write',
+                '--io-pattern', random.choice(IO_PATTERNS),
+                '--io-size', random.choice(IO_SIZES),
+                '--io-total', '100G',
+                image_name,
+                ],
+            wait=False,
+        )
+        # Wait a few seconds for the rbd bench process to run
+        # and complete the pwl cache initialization
+        time.sleep(10)
+        log.info("dump cache state when rbd bench running.")
+        remote.sh(['rbd', 'status', image_name, '--format=json'])
+        log.info("sleep...")
+        time.sleep(random.randint(10, 60))
+        log.info("rbd bench crash.")
+        remote.run(
+            args=[
+                'killall', '-9', 'rbd',
+                ],
+            check_status=False,
+        )
+        log.info("wait for watch timeout.")
+        time.sleep(40)
+        log.info("check cache state after crash.")
+        out = remote.sh(['rbd', 'status', image_name, '--format=json'])
+        rbd_status = json.loads(out)
+        assert len(rbd_status['watchers']) == 0
+        assert rbd_status['persistent_cache']['present'] == True
+        assert rbd_status['persistent_cache']['empty'] == False
+        assert rbd_status['persistent_cache']['clean'] == False
+        log.info("check dirty cache file.")
+        remote.run(
+            args=[
+                'test', '-e', rbd_status['persistent_cache']['path'],
+                ]
+        )
+    try:
+        yield
+    finally:
+        log.info("cleanup")
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    This is task for testing persistent write log cache recovery.
+    """
+    assert isinstance(config, dict), \
+            "task rbd_pwl_cache_recovery only supports a dictionary for configuration"
+
+    managers = []
+    config = teuthology.replace_all_with_clients(ctx.cluster, config)
+    managers.append(
+        lambda: thrashes_rbd_bench_on_persistent_cache(ctx=ctx, config=config)
+        )
+
+    with contextutil.nested(*managers):
+        yield