tasks: update journal_repair test for 'damaged' state

Track the recent change in master where, instead of crashing on a
missing MDSTable object, the MDS goes into the damaged state.

Instead of catching a crash, handle the rank's transition to the
damaged state.  Leave the crash handling code (unused for the moment)
in the Filesystem class in case it's needed elsewhere soon.

Signed-off-by: John Spray <john.spray@redhat.com>
John Spray 2015-03-26 17:15:28 +00:00
parent bd3ae1eeba
commit 097ccbb5be
2 changed files with 68 additions and 49 deletions
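
For context on the behaviour under test: a rank is "damaged" when it appears in the 'damaged' list of the MDS map, which is what the updated test below polls for. As a minimal standalone sketch of the same check outside the teuthology harness (the `ceph mds dump --format=json` invocation and the 'damaged' field name are assumptions based on the CLI of that era, not taken from this commit):

    import json
    import subprocess

    def rank_is_damaged(rank):
        # Fetch the MDS map as JSON and check whether `rank` is listed
        # in its 'damaged' array -- the same field the test inspects
        # via self.fs.get_mds_map().
        out = subprocess.check_output(["ceph", "mds", "dump", "--format=json"])
        return rank in json.loads(out).get("damaged", [])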

tasks/cephfs/cephfs_test_case.py

@@ -2,7 +2,13 @@ import logging
 import unittest
 from unittest import case
 import time
+import os
+import re
+from StringIO import StringIO
 from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
 log = logging.getLogger(__name__)
@@ -189,3 +195,53 @@ class CephFSTestCase(unittest.TestCase):
             elapsed += period
         log.debug("wait_until_true: success")
+
+    def assert_mds_crash(self, daemon_id):
+        """
+        Assert that a particular MDS daemon crashes (block until
+        it does)
+        """
+        try:
+            self.fs.mds_daemons[daemon_id].proc.wait()
+        except CommandFailedError as e:
+            log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
+            self.fs.mds_daemons[daemon_id].proc = None
+
+            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
+            # catch it later and treat it as a failure.
+            p = self.fs.mds_daemons[daemon_id].remote.run(args=[
+                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
+            core_pattern = p.stdout.getvalue().strip()
+            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
+                # We have seen a core_pattern that looks like it's from teuthology's coredump
+                # task, so proceed to clear out the core file
+                log.info("Clearing core from pattern: {0}".format(core_pattern))
+
+                # Determine the PID of the crashed MDS by inspecting the MDSMap: it had
+                # to talk to the mons to get assigned a rank to reach the point of crashing
+                addr = self.fs.mon_manager.get_mds_status(daemon_id)['addr']
+                pid_str = addr.split("/")[1]
+                log.info("Determined crasher PID was {0}".format(pid_str))
+
+                # Substitute PID into core_pattern to get a glob
+                core_glob = core_pattern.replace("%p", pid_str)
+                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens
+
+                # Verify that we see the expected single coredump matching the expected pattern
+                ls_proc = self.fs.mds_daemons[daemon_id].remote.run(args=[
+                    "sudo", "ls", run.Raw(core_glob)
+                ], stdout=StringIO())
+                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
+                log.info("Enumerated cores: {0}".format(cores))
+                self.assertEqual(len(cores), 1)
+
+                log.info("Found core file {0}, deleting it".format(cores[0]))
+                self.fs.mds_daemons[daemon_id].remote.run(args=[
+                    "sudo", "rm", "-f", cores[0]
+                ])
+            else:
+                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
+        else:
+            raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))
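
The core-cleanup logic above hinges on turning kernel.core_pattern into a shell glob. Isolated from the test harness, the transformation looks like this (a standalone sketch, pure string manipulation; the example pattern is hypothetical):

    import re

    def core_pattern_to_glob(core_pattern, pid):
        # Substitute the known PID for %p, then wildcard every other
        # %-token (%e, %t, %h, ...) so the result can be handed to `ls`.
        glob = core_pattern.replace("%p", str(pid))
        return re.sub("%[a-z]", "*", glob)

    # e.g. "/ceph/cores/%e.%p.%t" -> "/ceph/cores/*.1234.*"
    print(core_pattern_to_glob("/ceph/cores/%e.%p.%t", 1234))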

tasks/cephfs/test_journal_repair.py

@@ -5,16 +5,12 @@ Test our tools for recovering the content of damaged journals
 import json
 import logging
-import os
 from textwrap import dedent
 import time
-from StringIO import StringIO
-import re
 from teuthology.orchestra.run import CommandFailedError
 from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
 from tasks.cephfs.cephfs_test_case import CephFSTestCase
-from teuthology.orchestra import run
 log = logging.getLogger(__name__)
@@ -219,55 +215,22 @@ class TestJournalRepair(CephFSTestCase):
         # See that the second MDS will crash when it starts and tries to
         # acquire rank 1
-        crasher_id = active_mds_names[1]
-        self.fs.mds_restart(crasher_id)
-        try:
-            self.fs.mds_daemons[crasher_id].proc.wait()
-        except CommandFailedError as e:
-            log.info("MDS '{0}' crashed with status {1} as expected".format(crasher_id, e.exitstatus))
-            self.fs.mds_daemons[crasher_id].proc = None
+        damaged_id = active_mds_names[1]
+        self.fs.mds_restart(damaged_id)
 
-            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
-            # catch it later and treat it as a failure.
-            p = self.fs.mds_daemons[crasher_id].remote.run(args=[
-                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
-            core_pattern = p.stdout.getvalue().strip()
-            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
-                # We have seen a core_pattern that looks like it's from teuthology's coredump
-                # task, so proceed to clear out the core file
-                log.info("Clearing core from pattern: {0}".format(core_pattern))
+        # The daemon taking the damaged rank should begin starting, then
+        # restart back into standby after asking the mon to mark the rank
+        # damaged.
+        def is_marked_damaged():
+            mds_map = self.fs.get_mds_map()
+            return 1 in mds_map['damaged']
 
-                # Determine the PID of the crashed MDS by inspecting the MDSMap: it had
-                # to talk to the mons to get assigned a rank to reach the point of crashing
-                addr = self.fs.mon_manager.get_mds_status(crasher_id)['addr']
-                pid_str = addr.split("/")[1]
-                log.info("Determined crasher PID was {0}".format(pid_str))
+        self.wait_until_true(is_marked_damaged, 60)
 
-                # Substitute PID into core_pattern to get a glob
-                core_glob = core_pattern.replace("%p", pid_str)
-                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens
+        self.fs.wait_for_state("up:standby", timeout=60, mds_id=damaged_id)
 
-                # Verify that we see the expected single coredump matching the expected pattern
-                ls_proc = self.fs.mds_daemons[crasher_id].remote.run(args=[
-                    "sudo", "ls", run.Raw(core_glob)
-                ], stdout=StringIO())
-                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
-                log.info("Enumerated cores: {0}".format(cores))
-                self.assertEqual(len(cores), 1)
-
-                log.info("Found core file {0}, deleting it".format(cores[0]))
-                self.fs.mds_daemons[crasher_id].remote.run(args=[
-                    "sudo", "rm", "-f", cores[0]
-                ])
-            else:
-                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
-        else:
-            raise RuntimeError("MDS daemon '{0}' did not crash as expected".format(crasher_id))
-
-        # Now it's crashed, let the MDSMonitor know that it's not coming back
-        self.fs.mds_fail(active_mds_names[1])
+        self.fs.mds_stop(damaged_id)
+        self.fs.mds_fail(damaged_id)
 
         # Now give up and go through a disaster recovery procedure
         self.fs.mds_stop(active_mds_names[0])
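
Both waits above go through the CephFSTestCase polling helpers (the tail of wait_until_true is visible in the first file's diff). A minimal standalone sketch of that polling pattern, with the period and timeout semantics assumed rather than copied from the real helper:

    import time

    def wait_until_true(condition, timeout, period=5):
        # Poll `condition` every `period` seconds, raising if it is
        # still false after `timeout` seconds have elapsed.
        elapsed = 0
        while not condition():
            if elapsed >= timeout:
                raise RuntimeError("Timed out after {0} seconds".format(elapsed))
            time.sleep(period)
            elapsed += period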