Merge pull request #357 from ceph/wip-10949

tasks/cephfs: clean up core on deliberate crash

Reviewed-by: Greg Farnum <gfarnum@redhat.com>

commit c1bff2ea9e
@@ -6,11 +6,15 @@ Test our tools for recovering the content of damaged journals
 import contextlib
 import json
 import logging
+import os
 from textwrap import dedent
 import time
+from StringIO import StringIO
+import re
 
 from teuthology.orchestra.run import CommandFailedError
 from tasks.cephfs.filesystem import Filesystem, ObjectNotFound, ROOT_INO
 from tasks.cephfs.cephfs_test_case import CephFSTestCase, run_tests
+from teuthology.orchestra import run
 
 log = logging.getLogger(__name__)
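Of the four imports added here, `from StringIO import StringIO` is the Python 2 spelling; it supplies the in-memory buffer the new code passes as `stdout=` to capture remote command output. For reference, a minimal sketch of the same buffer under Python 3, where the class moved (a hypothetical port; this tree targets Python 2):

    # Python 3 location of the same class (hypothetical port of the import
    # above; this code base is Python 2, where StringIO is its own module).
    from io import StringIO

    buf = StringIO()
    buf.write("captured output\n")
    print(buf.getvalue().strip())  # -> captured output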
@@ -213,16 +217,52 @@ class TestJournalRepair(CephFSTestCase):
 
         # See that the second MDS will crash when it starts and tries to
         # acquire rank 1
-        self.fs.mds_restart(active_mds_names[1])
-        crasher = self.fs.mds_daemons[active_mds_names[1]].proc
-
+        crasher_id = active_mds_names[1]
+        self.fs.mds_restart(crasher_id)
         try:
-            crasher.wait()
+            self.fs.mds_daemons[crasher_id].proc.wait()
         except CommandFailedError as e:
-            log.info("MDS '{0}' crashed with status {1} as expected".format(active_mds_names[1], e.exitstatus))
-            self.fs.mds_daemons[active_mds_names[1]].proc = None
+            log.info("MDS '{0}' crashed with status {1} as expected".format(crasher_id, e.exitstatus))
+            self.fs.mds_daemons[crasher_id].proc = None
+
+            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
+            # catch it later and treat it as a failure.
+            p = self.fs.mds_daemons[crasher_id].remote.run(args=[
+                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
+            core_pattern = p.stdout.getvalue().strip()
+            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
+                # We have seen a core_pattern that looks like it's from teuthology's coredump
+                # task, so proceed to clear out the core file
+                log.info("Clearing core from pattern: {0}".format(core_pattern))
+
+                # Determine the PID of the crashed MDS by inspecting the MDSMap, it had
+                # to talk to the mons to get assigned a rank to reach the point of crashing
+                addr = self.fs.mon_manager.get_mds_status(crasher_id)['addr']
+                pid_str = addr.split("/")[1]
+                log.info("Determined crasher PID was {0}".format(pid_str))
+
+                # Substitute PID into core_pattern to get a glob
+                core_glob = core_pattern.replace("%p", pid_str)
+                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens
+
+                # Verify that we see the expected single coredump matching the expected pattern
+                ls_proc = self.fs.mds_daemons[crasher_id].remote.run(args=[
+                    "sudo", "ls", run.Raw(core_glob)
+                ], stdout=StringIO())
+                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
+                log.info("Enumerated cores: {0}".format(cores))
+                self.assertEqual(len(cores), 1)
+
+                log.info("Found core file {0}, deleting it".format(cores[0]))
+
+                self.fs.mds_daemons[crasher_id].remote.run(args=[
+                    "sudo", "rm", "-f", cores[0]
+                ])
+            else:
+                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
+
         else:
-            raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(active_mds_names[1]))
+            raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(crasher_id))
 
         # Now it's crashed, let the MDSMonitor know that it's not coming back
         self.fs.mds_fail(active_mds_names[1])
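The try/except/else around the daemon wait inverts the usual check: CommandFailedError (a nonzero exit) is the expected, passing outcome, and a clean exit trips the RuntimeError. A minimal standalone sketch of that expect-a-crash flow, using subprocess in place of teuthology's remote process handle (all names here are illustrative, not from the test):

    import subprocess

    def expect_crash(cmd):
        # Nonzero exit is the *expected* outcome; a clean exit is the failure.
        try:
            subprocess.check_call(cmd)  # raises CalledProcessError on nonzero exit
        except subprocess.CalledProcessError as e:
            print("crashed with status {0} as expected".format(e.returncode))
        else:
            raise RuntimeError("process did not crash as expected")

    expect_crash(["sh", "-c", "exit 1"])  # stands in for the crashing MDS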
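Fishing the PID out of the MDSMap works because a Ceph entity address has the form `ip:port/nonce`, and for daemons the nonce component is populated from the process's PID, so the map is a reliable place to recover it after the process is gone. A small sketch of that parse with a made-up address:

    def pid_from_addr(addr):
        # "ip:port/nonce" -- the nonce after the slash carries the daemon's
        # PID, which is what the test reads back out of the MDSMap.
        return addr.split("/")[1]

    assert pid_from_addr("192.168.0.10:6800/12345") == "12345"  # hypothetical addr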
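Converting kernel.core_pattern into a glob is the heart of the cleanup: `%p` is pinned to the known PID and every other `%` token (`%t` timestamp, `%e` executable name, and so on) becomes a wildcard. A local sketch of the same transformation and single-core check, using the glob module instead of a remote `sudo ls`; the pattern string is a hypothetical example, not necessarily what internal.coredump configures:

    import glob
    import os
    import re

    def core_glob_for_pid(core_pattern, pid_str):
        # Pin %p to the crasher's PID; wildcard all other % tokens (%t, %e, ...).
        return re.sub("%[a-z]", "*", core_pattern.replace("%p", pid_str))

    pattern = "/tmp/archive/coredump/%t.%p.core"  # hypothetical core_pattern
    core_glob = core_glob_for_pid(pattern, "12345")
    print(core_glob)  # /tmp/archive/coredump/*.12345.core

    cores = glob.glob(core_glob)
    if len(cores) == 1:
        os.remove(cores[0])  # local equivalent of the remote "sudo rm -f"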