From 7ad8ac63fcd228508d2829fb991f74ba1d1e57e6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 26 Feb 2015 16:27:13 +0000 Subject: [PATCH] tasks/cephfs: clean up core on deliberate crash This avoids the internal.coredump task synthesizing a failure during teardown from the core file we left behind. Fixes: #10949 Signed-off-by: John Spray --- tasks/mds_journal_repair.py | 54 ++++++++++++++++++++++++++++++++----- 1 file changed, 47 insertions(+), 7 deletions(-) diff --git a/tasks/mds_journal_repair.py b/tasks/mds_journal_repair.py index b61214ff154..62ed0534d89 100644 --- a/tasks/mds_journal_repair.py +++ b/tasks/mds_journal_repair.py @@ -6,11 +6,15 @@ Test our tools for recovering the content of damaged journals import contextlib import json import logging +import os from textwrap import dedent import time +from StringIO import StringIO +import re from teuthology.orchestra.run import CommandFailedError from tasks.cephfs.filesystem import Filesystem, ObjectNotFound, ROOT_INO from tasks.cephfs.cephfs_test_case import CephFSTestCase, run_tests +from teuthology.orchestra import run log = logging.getLogger(__name__) @@ -213,16 +217,52 @@ class TestJournalRepair(CephFSTestCase): # See that the second MDS will crash when it starts and tries to # acquire rank 1 - self.fs.mds_restart(active_mds_names[1]) - crasher = self.fs.mds_daemons[active_mds_names[1]].proc - + crasher_id = active_mds_names[1] + self.fs.mds_restart(crasher_id) try: - crasher.wait() + self.fs.mds_daemons[crasher_id].proc.wait() except CommandFailedError as e: - log.info("MDS '{0}' crashed with status {1} as expected".format(active_mds_names[1], e.exitstatus)) - self.fs.mds_daemons[active_mds_names[1]].proc = None + log.info("MDS '{0}' crashed with status {1} as expected".format(crasher_id, e.exitstatus)) + self.fs.mds_daemons[crasher_id].proc = None + + # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will + # catch it later and treat it as a failure. 
+ p = self.fs.mds_daemons[crasher_id].remote.run(args=[ + "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO()) + core_pattern = p.stdout.getvalue().strip() + if os.path.dirname(core_pattern): # Non-default core_pattern with a directory in it + # We have seen a core_pattern that looks like it's from teuthology's coredump + # task, so proceed to clear out the core file + log.info("Clearing core from pattern: {0}".format(core_pattern)) + + # Determine the PID of the crashed MDS by inspecting the MDSMap, it had + # to talk to the mons to get assigned a rank to reach the point of crashing + addr = self.fs.mon_manager.get_mds_status(crasher_id)['addr'] + pid_str = addr.split("/")[1] + log.info("Determined crasher PID was {0}".format(pid_str)) + + # Substitute PID into core_pattern to get a glob + core_glob = core_pattern.replace("%p", pid_str) + core_glob = re.sub("%[a-z]", "*", core_glob) # Match all for all other % tokens + + # Verify that we see the expected single coredump matching the expected pattern + ls_proc = self.fs.mds_daemons[crasher_id].remote.run(args=[ + "sudo", "ls", run.Raw(core_glob) + ], stdout=StringIO()) + cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f] + log.info("Enumerated cores: {0}".format(cores)) + self.assertEqual(len(cores), 1) + + log.info("Found core file {0}, deleting it".format(cores[0])) + + self.fs.mds_daemons[crasher_id].remote.run(args=[ + "sudo", "rm", "-f", cores[0] + ]) + else: + log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)") + else: - raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(active_mds_names[1])) + raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(crasher_id)) # Now it's crashed, let the MDSMonitor know that it's not coming back self.fs.mds_fail(active_mds_names[1])