Merge pull request #357 from ceph/wip-10949

tasks/cephfs: clean up core on deliberate crash

Reviewed-by: Greg Farnum <gfarnum@redhat.com>

commit c1bff2ea9e
@@ -6,11 +6,15 @@ Test our tools for recovering the content of damaged journals
 import contextlib
 import json
 import logging
+import os
 from textwrap import dedent
 import time
+from StringIO import StringIO
+import re
 
 from teuthology.orchestra.run import CommandFailedError
 from tasks.cephfs.filesystem import Filesystem, ObjectNotFound, ROOT_INO
 from tasks.cephfs.cephfs_test_case import CephFSTestCase, run_tests
+from teuthology.orchestra import run
 
 log = logging.getLogger(__name__)
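Of the four imports added here, `from StringIO import StringIO` is the Python 2 spelling; it supplies the in-memory buffer the new code passes as `stdout=` to capture remote command output. For reference, a minimal sketch of the same buffer under Python 3, where the class moved (a hypothetical port; this tree targets Python 2):

    # Python 3 location of the same class (hypothetical port of the import
    # above; this code base is Python 2, where StringIO is its own module).
    from io import StringIO

    buf = StringIO()
    buf.write("captured output\n")
    print(buf.getvalue().strip())  # -> captured output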
@@ -213,16 +217,52 @@ class TestJournalRepair(CephFSTestCase):
 
         # See that the second MDS will crash when it starts and tries to
         # acquire rank 1
-        self.fs.mds_restart(active_mds_names[1])
-        crasher = self.fs.mds_daemons[active_mds_names[1]].proc
-
+        crasher_id = active_mds_names[1]
+        self.fs.mds_restart(crasher_id)
         try:
-            crasher.wait()
+            self.fs.mds_daemons[crasher_id].proc.wait()
         except CommandFailedError as e:
-            log.info("MDS '{0}' crashed with status {1} as expected".format(active_mds_names[1], e.exitstatus))
-            self.fs.mds_daemons[active_mds_names[1]].proc = None
+            log.info("MDS '{0}' crashed with status {1} as expected".format(crasher_id, e.exitstatus))
+            self.fs.mds_daemons[crasher_id].proc = None
+
+            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
+            # catch it later and treat it as a failure.
+            p = self.fs.mds_daemons[crasher_id].remote.run(args=[
+                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
+            core_pattern = p.stdout.getvalue().strip()
+            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
+                # We have seen a core_pattern that looks like it's from teuthology's coredump
+                # task, so proceed to clear out the core file
+                log.info("Clearing core from pattern: {0}".format(core_pattern))
+
+                # Determine the PID of the crashed MDS by inspecting the MDSMap, it had
+                # to talk to the mons to get assigned a rank to reach the point of crashing
+                addr = self.fs.mon_manager.get_mds_status(crasher_id)['addr']
+                pid_str = addr.split("/")[1]
+                log.info("Determined crasher PID was {0}".format(pid_str))
+
+                # Substitute PID into core_pattern to get a glob
+                core_glob = core_pattern.replace("%p", pid_str)
+                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens
+
+                # Verify that we see the expected single coredump matching the expected pattern
+                ls_proc = self.fs.mds_daemons[crasher_id].remote.run(args=[
+                    "sudo", "ls", run.Raw(core_glob)
+                ], stdout=StringIO())
+                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
+                log.info("Enumerated cores: {0}".format(cores))
+                self.assertEqual(len(cores), 1)
+
+                log.info("Found core file {0}, deleting it".format(cores[0]))
+
+                self.fs.mds_daemons[crasher_id].remote.run(args=[
+                    "sudo", "rm", "-f", cores[0]
+                ])
+            else:
+                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
+
         else:
-            raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(active_mds_names[1]))
+            raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(crasher_id))
 
         # Now it's crashed, let the MDSMonitor know that it's not coming back
         self.fs.mds_fail(active_mds_names[1])
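The try/except/else around the daemon wait inverts the usual check: CommandFailedError (a nonzero exit) is the expected, passing outcome, and a clean exit trips the RuntimeError. A minimal standalone sketch of that expect-a-crash flow, using subprocess in place of teuthology's remote process handle (all names here are illustrative, not from the test):

    import subprocess

    def expect_crash(cmd):
        # Nonzero exit is the *expected* outcome; a clean exit is the failure.
        try:
            subprocess.check_call(cmd)  # raises CalledProcessError on nonzero exit
        except subprocess.CalledProcessError as e:
            print("crashed with status {0} as expected".format(e.returncode))
        else:
            raise RuntimeError("process did not crash as expected")

    expect_crash(["sh", "-c", "exit 1"])  # stands in for the crashing MDS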
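Fishing the PID out of the MDSMap works because a Ceph entity address has the form `ip:port/nonce`, and for daemons the nonce component is populated from the process's PID, so the map is a reliable place to recover it after the process is gone. A small sketch of that parse with a made-up address:

    def pid_from_addr(addr):
        # "ip:port/nonce" -- the nonce after the slash carries the daemon's
        # PID, which is what the test reads back out of the MDSMap.
        return addr.split("/")[1]

    assert pid_from_addr("192.168.0.10:6800/12345") == "12345"  # hypothetical addr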
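Converting kernel.core_pattern into a glob is the heart of the cleanup: `%p` is pinned to the known PID and every other `%` token (`%t` timestamp, `%e` executable name, and so on) becomes a wildcard. A local sketch of the same transformation and single-core check, using the glob module instead of a remote `sudo ls`; the pattern string is a hypothetical example, not necessarily what internal.coredump configures:

    import glob
    import os
    import re

    def core_glob_for_pid(core_pattern, pid_str):
        # Pin %p to the crasher's PID; wildcard all other % tokens (%t, %e, ...).
        return re.sub("%[a-z]", "*", core_pattern.replace("%p", pid_str))

    pattern = "/tmp/archive/coredump/%t.%p.core"  # hypothetical core_pattern
    core_glob = core_glob_for_pid(pattern, "12345")
    print(core_glob)  # /tmp/archive/coredump/*.12345.core

    cores = glob.glob(core_glob)
    if len(cores) == 1:
        os.remove(cores[0])  # local equivalent of the remote "sudo rm -f"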