mirror of
https://github.com/ceph/ceph
synced 2025-03-11 02:39:05 +00:00
tasks: update journal_repair test for 'damaged' state
To track a recent change in master where, instead of crashing on a missing MDSTable object, we'll go into the damaged state. Instead of catching a crash, handle the rank's transition to the damaged state. Leave the crash handling code (unused for the moment) in the Filesystem class in case it's needed elsewhere soon. Signed-off-by: John Spray <john.spray@redhat.com>
This commit is contained in:
parent
bd3ae1eeba
commit
097ccbb5be
@ -2,7 +2,13 @@ import logging
|
||||
import unittest
|
||||
from unittest import case
|
||||
import time
|
||||
import os
|
||||
import re
|
||||
from StringIO import StringIO
|
||||
|
||||
from tasks.cephfs.fuse_mount import FuseMount
|
||||
from teuthology.orchestra import run
|
||||
from teuthology.orchestra.run import CommandFailedError
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@ -189,3 +195,53 @@ class CephFSTestCase(unittest.TestCase):
|
||||
elapsed += period
|
||||
|
||||
log.debug("wait_until_true: success")
|
||||
|
||||
def assert_mds_crash(self, daemon_id):
    """
    Block until the given MDS daemon exits, and assert that it exited
    with a failure (i.e. crashed).  On the crash path, also remove the
    resulting core file so that teuthology's internal.coredump task does
    not later find it and report the run as failed.

    :param daemon_id: identifier of the MDS daemon expected to crash
    :raises AssertionError: if the daemon exits cleanly instead of crashing
    """
    daemon = self.fs.mds_daemons[daemon_id]
    try:
        # wait() raises CommandFailedError on a nonzero exit status,
        # which is exactly what a crash looks like.
        daemon.proc.wait()
    except CommandFailedError as e:
        log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
        daemon.proc = None
    else:
        raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))

    # Go remove the coredump from the crash, otherwise teuthology.internal.coredump
    # will catch it later and treat it as a failure.
    sysctl_proc = daemon.remote.run(args=[
        "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
    core_pattern = sysctl_proc.stdout.getvalue().strip()

    if not os.path.dirname(core_pattern):
        # Default (directory-less) core_pattern: nothing was written anywhere
        # that teuthology would pick up, so there is nothing to clean.
        log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
        return

    # We have seen a core_pattern that looks like it's from teuthology's
    # coredump task, so proceed to clear out the core file
    log.info("Clearing core from pattern: {0}".format(core_pattern))

    # Determine the PID of the crashed MDS by inspecting the MDSMap; it had
    # to talk to the mons to get assigned a rank to reach the point of crashing
    addr = self.fs.mon_manager.get_mds_status(daemon_id)['addr']
    pid_str = addr.split("/")[1]
    log.info("Determined crasher PID was {0}".format(pid_str))

    # Substitute the PID into core_pattern, then wildcard every other
    # % token to get a shell glob matching the core file.
    core_glob = re.sub("%[a-z]", "*", core_pattern.replace("%p", pid_str))

    # Verify that we see the expected single coredump matching the expected pattern
    ls_proc = daemon.remote.run(args=[
        "sudo", "ls", run.Raw(core_glob)
    ], stdout=StringIO())
    cores = [line for line in ls_proc.stdout.getvalue().strip().split("\n") if line]
    log.info("Enumerated cores: {0}".format(cores))
    self.assertEqual(len(cores), 1)

    log.info("Found core file {0}, deleting it".format(cores[0]))
    daemon.remote.run(args=[
        "sudo", "rm", "-f", cores[0]
    ])
|
@ -5,16 +5,12 @@ Test our tools for recovering the content of damaged journals
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from textwrap import dedent
|
||||
import time
|
||||
from StringIO import StringIO
|
||||
import re
|
||||
|
||||
from teuthology.orchestra.run import CommandFailedError
|
||||
from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
|
||||
from tasks.cephfs.cephfs_test_case import CephFSTestCase
|
||||
from teuthology.orchestra import run
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
@ -219,55 +215,22 @@ class TestJournalRepair(CephFSTestCase):
|
||||
|
||||
# See that the second MDS will crash when it starts and tries to
|
||||
# acquire rank 1
|
||||
crasher_id = active_mds_names[1]
|
||||
self.fs.mds_restart(crasher_id)
|
||||
try:
|
||||
self.fs.mds_daemons[crasher_id].proc.wait()
|
||||
except CommandFailedError as e:
|
||||
log.info("MDS '{0}' crashed with status {1} as expected".format(crasher_id, e.exitstatus))
|
||||
self.fs.mds_daemons[crasher_id].proc = None
|
||||
damaged_id = active_mds_names[1]
|
||||
self.fs.mds_restart(damaged_id)
|
||||
|
||||
# Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
|
||||
# catch it later and treat it as a failure.
|
||||
p = self.fs.mds_daemons[crasher_id].remote.run(args=[
|
||||
"sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
|
||||
core_pattern = p.stdout.getvalue().strip()
|
||||
if os.path.dirname(core_pattern): # Non-default core_pattern with a directory in it
|
||||
# We have seen a core_pattern that looks like it's from teuthology's coredump
|
||||
# task, so proceed to clear out the core file
|
||||
log.info("Clearing core from pattern: {0}".format(core_pattern))
|
||||
# The daemon taking the damaged rank should start starting, then
|
||||
# restart back into standby after asking the mon to mark the rank
|
||||
# damaged.
|
||||
def is_marked_damaged():
|
||||
mds_map = self.fs.get_mds_map()
|
||||
return 1 in mds_map['damaged']
|
||||
|
||||
# Determine the PID of the crashed MDS by inspecting the MDSMap, it had
|
||||
# to talk to the mons to get assigned a rank to reach the point of crashing
|
||||
addr = self.fs.mon_manager.get_mds_status(crasher_id)['addr']
|
||||
pid_str = addr.split("/")[1]
|
||||
log.info("Determined crasher PID was {0}".format(pid_str))
|
||||
self.wait_until_true(is_marked_damaged, 60)
|
||||
|
||||
# Substitute PID into core_pattern to get a glob
|
||||
core_glob = core_pattern.replace("%p", pid_str)
|
||||
core_glob = re.sub("%[a-z]", "*", core_glob) # Match all for all other % tokens
|
||||
self.fs.wait_for_state("up:standby", timeout=60, mds_id=damaged_id)
|
||||
|
||||
# Verify that we see the expected single coredump matching the expected pattern
|
||||
ls_proc = self.fs.mds_daemons[crasher_id].remote.run(args=[
|
||||
"sudo", "ls", run.Raw(core_glob)
|
||||
], stdout=StringIO())
|
||||
cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
|
||||
log.info("Enumerated cores: {0}".format(cores))
|
||||
self.assertEqual(len(cores), 1)
|
||||
|
||||
log.info("Found core file {0}, deleting it".format(cores[0]))
|
||||
|
||||
self.fs.mds_daemons[crasher_id].remote.run(args=[
|
||||
"sudo", "rm", "-f", cores[0]
|
||||
])
|
||||
else:
|
||||
log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
|
||||
|
||||
else:
|
||||
raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(crasher_id))
|
||||
|
||||
# Now it's crashed, let the MDSMonitor know that it's not coming back
|
||||
self.fs.mds_fail(active_mds_names[1])
|
||||
self.fs.mds_stop(damaged_id)
|
||||
self.fs.mds_fail(damaged_id)
|
||||
|
||||
# Now give up and go through a disaster recovery procedure
|
||||
self.fs.mds_stop(active_mds_names[0])
|
||||
|
Loading…
Reference in New Issue
Block a user