Merge pull request #357 from ceph/wip-10949

tasks/cephfs: clean up core on deliberate crash

Reviewed-by: Greg Farnum <gfarnum@redhat.com>
John Spray committed 2015-03-04 19:14:43 +00:00
commit c1bff2ea9e


@@ -6,11 +6,15 @@ Test our tools for recovering the content of damaged journals
 import contextlib
 import json
 import logging
+import os
 from textwrap import dedent
 import time
+from StringIO import StringIO
+import re
 
 from teuthology.orchestra.run import CommandFailedError
 from tasks.cephfs.filesystem import Filesystem, ObjectNotFound, ROOT_INO
 from tasks.cephfs.cephfs_test_case import CephFSTestCase, run_tests
+from teuthology.orchestra import run
 
 log = logging.getLogger(__name__)
@@ -213,16 +217,52 @@ class TestJournalRepair(CephFSTestCase):
         # See that the second MDS will crash when it starts and tries to
         # acquire rank 1
-        self.fs.mds_restart(active_mds_names[1])
-        crasher = self.fs.mds_daemons[active_mds_names[1]].proc
+        crasher_id = active_mds_names[1]
+        self.fs.mds_restart(crasher_id)
         try:
-            crasher.wait()
+            self.fs.mds_daemons[crasher_id].proc.wait()
         except CommandFailedError as e:
-            log.info("MDS '{0}' crashed with status {1} as expected".format(active_mds_names[1], e.exitstatus))
-            self.fs.mds_daemons[active_mds_names[1]].proc = None
+            log.info("MDS '{0}' crashed with status {1} as expected".format(crasher_id, e.exitstatus))
+            self.fs.mds_daemons[crasher_id].proc = None
+
+            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
+            # catch it later and treat it as a failure.
+            p = self.fs.mds_daemons[crasher_id].remote.run(args=[
+                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
+            core_pattern = p.stdout.getvalue().strip()
+            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
+                # We have seen a core_pattern that looks like it's from teuthology's coredump
+                # task, so proceed to clear out the core file
+                log.info("Clearing core from pattern: {0}".format(core_pattern))
+
+                # Determine the PID of the crashed MDS by inspecting the MDSMap, it had
+                # to talk to the mons to get assigned a rank to reach the point of crashing
+                addr = self.fs.mon_manager.get_mds_status(crasher_id)['addr']
+                pid_str = addr.split("/")[1]
+                log.info("Determined crasher PID was {0}".format(pid_str))
+
+                # Substitute PID into core_pattern to get a glob
+                core_glob = core_pattern.replace("%p", pid_str)
+                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens
+
+                # Verify that we see the expected single coredump matching the expected pattern
+                ls_proc = self.fs.mds_daemons[crasher_id].remote.run(args=[
+                    "sudo", "ls", run.Raw(core_glob)
+                ], stdout=StringIO())
+                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
+                log.info("Enumerated cores: {0}".format(cores))
+                self.assertEqual(len(cores), 1)
+
+                log.info("Found core file {0}, deleting it".format(cores[0]))
+
+                self.fs.mds_daemons[crasher_id].remote.run(args=[
+                    "sudo", "rm", "-f", cores[0]
+                ])
+            else:
+                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
         else:
-            raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(active_mds_names[1]))
+            raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(crasher_id))
 
         # Now it's crashed, let the MDSMonitor know that it's not coming back
         self.fs.mds_fail(active_mds_names[1])
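
Note that the try/except around proc.wait() inverts the usual success test: the restarted MDS is expected to die, so a CommandFailedError (nonzero exit) is the good path and a clean exit is the error. A minimal local sketch of the same pattern, using subprocess in place of teuthology's remote runner (the sh -c 'kill -ABRT $$' child is just a stand-in for a crashing daemon):

    import subprocess

    # Stand-in for the crashing MDS: a shell that aborts itself. With a
    # suitable ulimit and kernel.core_pattern this would also dump core.
    proc = subprocess.Popen(["sh", "-c", "kill -ABRT $$"])
    status = proc.wait()
    if status != 0:
        # Python reports death-by-signal as a negative status (-6 for SIGABRT)
        print("crashed with status {0} as expected".format(status))
    else:
        raise RuntimeError("process did not crash as expected")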
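The PID recovery leans on a Ceph convention: MDSMap entries carry an address of the form <ip>:<port>/<nonce>, and daemons use their own PID as the nonce when they bind, which is why splitting the addr field on "/" yields the crashed process's PID. A sketch of just that parsing step, assuming the daemon-nonce-equals-PID convention the test relies on (the sample address is made up):

    def pid_from_mds_addr(addr):
        # Ceph address strings look like "<ip>:<port>/<nonce>"; for daemons
        # the nonce is the process's PID, so take the field after the slash.
        return int(addr.split("/")[1])

    assert pid_from_mds_addr("192.168.0.10:6804/23512") == 23512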
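kernel.core_pattern is a template, not a path: %p expands to the crashing PID, while other % tokens (%e executable name, %t timestamp, %h hostname, and so on) expand to values the test cannot predict. The cleanup therefore pins %p to the known PID, wildcards everything else, and passes the result to the remote shell unquoted (run.Raw) so the shell performs the glob expansion. A standalone sketch of the translation, with an illustrative pattern rather than anything teuthology actually sets:

    import re

    def core_pattern_to_glob(core_pattern, pid):
        # Pin %p to the PID we care about, then wildcard every other %-token;
        # we only need to match the core file, not reconstruct its exact name.
        glob_expr = core_pattern.replace("%p", str(pid))
        return re.sub("%[a-z]", "*", glob_expr)

    # e.g. a hypothetical pattern "/tmp/cores/%e.%p.core" for PID 12345:
    assert core_pattern_to_glob("/tmp/cores/%e.%p.core", 12345) == "/tmp/cores/*.12345.core"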