qa/cephfs: Add test for rebuilding into an alternate metadata pool

Add a test to validate the ability of cephfs-data-scan and friends to
recover metadata from a damaged CephFS installation into a fresh
metadata pool.

cf: http://tracker.ceph.com/issues/15068
cf: http://tracker.ceph.com/issues/15069

Signed-off-by: Douglas Fuller <dfuller@redhat.com>
parent cb86740a5f
commit 37bafff9f4
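For orientation, the alternate-pool recovery flow that the new test drives through the QA Filesystem helpers (data_scan, journal_tool, table_tool, raw_cluster_cmd) corresponds roughly to the command sequence below. This is a condensed, hedged sketch rather than the documented disaster-recovery procedure: the filesystem name "cephfs", the data pool "cephfs_data", and the pg count are illustrative assumptions, while the "recovery"/"recovery-fs" names and the tool flags are taken from the test itself. The test additionally re-initializes and rescans the original filesystem's metadata, which is omitted here for brevity.

    # Hedged sketch only -- "cephfs", "cephfs_data" and the pg count are
    # assumptions; "recovery" / "recovery-fs" and the flags below are the
    # ones used by the test. Run the rebuild steps with the MDS daemons
    # stopped, as the test does.

    # Create a fresh metadata pool and a recovery filesystem that overlays
    # the existing data pool.
    ceph osd pool create recovery 8
    ceph fs flag set enable_multiple true --yes-i-really-mean-it
    ceph fs new recovery-fs recovery cephfs_data --allow-dangerous-metadata-overlay

    # Prime the new metadata pool and reset its per-rank tables.
    cephfs-data-scan init --force-init --filesystem recovery-fs --alternate-pool recovery
    cephfs-table-tool recovery-fs:0 reset session
    cephfs-table-tool recovery-fs:0 reset snap
    cephfs-table-tool recovery-fs:0 reset inode

    # Rebuild metadata from the data pool objects into the alternate pool.
    cephfs-data-scan scan_extents --alternate-pool recovery --filesystem cephfs cephfs_data
    cephfs-data-scan scan_inodes --alternate-pool recovery --filesystem cephfs \
        --force-corrupt --force-init cephfs_data
    cephfs-journal-tool --rank=cephfs:0 event recover_dentries list --alternate-pool recovery

    # Reset the recovery filesystem's journal and clear the damaged flag on rank 0.
    cephfs-journal-tool --rank=recovery-fs:0 journal reset --force
    ceph mds repaired recovery-fs:0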
qa/tasks/cephfs/filesystem.py

@@ -291,9 +291,15 @@ class MDSCluster(CephCluster):
                                              '--yes-i-really-really-mean-it')
         for data_pool in mdsmap['data_pools']:
             data_pool = pool_id_name[data_pool]
-            self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
-                                             data_pool, data_pool,
-                                             '--yes-i-really-really-mean-it')
+            try:
+                self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                                 data_pool, data_pool,
+                                                 '--yes-i-really-really-mean-it')
+            except CommandFailedError as e:
+                if e.exitstatus == 16:  # EBUSY: this data pool is used by
+                    pass                # two filesystems; let the second
+                else:                   # pass delete it
+                    raise
 
     def get_standby_daemons(self):
         return set([s['name'] for s in self.status().get_standbys()])
qa/tasks/cephfs/test_data_scan.py

@@ -294,7 +294,7 @@ class NonDefaultLayout(Workload):
         self._initial_state = self._mount.stat("datafile")
 
     def validate(self):
-        p = self._mount.run_shell(["getfattr", "--only-values", "-n", "ceph.file.layout.object_size", "./datafile"])
+        p = self._mount.run_shell(["sudo", "getfattr", "--only-values", "-n", "ceph.file.layout.object_size", "./datafile"])
 
         # Check we got the layout reconstructed properly
         object_size = int(p.stdout.getvalue().strip())
@@ -312,13 +312,15 @@ class TestDataScan(CephFSTestCase):
         mds_map = self.fs.get_mds_map()
         return rank in mds_map['damaged']
 
-    def _rebuild_metadata(self, workload, workers=1):
+    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
         """
         That when all objects in metadata pool are removed, we can rebuild a metadata pool
         based on the contents of a data pool, and a client can see and read our files.
         """
 
         # First, inject some files
+        other_fs = other_pool + '-fs' if other_pool else None
+
         workload.write()
 
         # Unmount the client and flush the journal: the tool should also cope with
@@ -326,6 +328,23 @@ class TestDataScan(CephFSTestCase):
         self.mount_a.umount_wait()
         workload.flush()
 
+        # Create the alternate pool if requested
+        if other_pool:
+            self.fs.rados(['mkpool', other_pool])
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
+                                                'enable_multiple', 'true',
+                                                '--yes-i-really-mean-it')
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', other_fs,
+                                                other_pool,
+                                                self.fs.get_data_pool_name(),
+                                                '--allow-dangerous-metadata-overlay')
+            self.fs.data_scan(['init', '--force-init', '--filesystem',
+                               other_fs, '--alternate-pool', other_pool])
+            self.fs.mon_manager.raw_cluster_cmd('-s')
+            self.fs.table_tool([other_fs + ":0", "reset", "session"])
+            self.fs.table_tool([other_fs + ":0", "reset", "snap"])
+            self.fs.table_tool([other_fs + ":0", "reset", "inode"])
+
         # Stop the MDS
         self.fs.mds_stop()
         self.fs.mds_fail()
@@ -343,32 +362,63 @@ class TestDataScan(CephFSTestCase):
         self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
                                             '--yes-i-really-mean-it')
 
         # Attempt to start an MDS, see that it goes into damaged state
-        self.fs.mds_restart()
+        if other_pool is None:
+            self.fs.mds_restart()
 
         def get_state(mds_id):
             info = self.mds_cluster.get_mds_info(mds_id)
             return info['state'] if info is not None else None
 
-        self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
-        for mds_id in self.fs.mds_ids:
-            self.wait_until_equal(
-                lambda: get_state(mds_id),
-                "up:standby",
-                timeout=60)
+        if other_pool is None:
+            self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+            for mds_id in self.fs.mds_ids:
+                self.wait_until_equal(
+                    lambda: get_state(mds_id),
+                    "up:standby",
+                    timeout=60)
+
+            self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+            self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+            self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
 
         # Run the recovery procedure
-        self.fs.table_tool(["0", "reset", "session"])
-        self.fs.table_tool(["0", "reset", "snap"])
-        self.fs.table_tool(["0", "reset", "inode"])
         if False:
             with self.assertRaises(CommandFailedError):
                 # Normal reset should fail when no objects are present, we'll use --force instead
                 self.fs.journal_tool(["journal", "reset"])
-        self.fs.journal_tool(["journal", "reset", "--force"])
-        self.fs.data_scan(["init"])
-        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
-        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
+
+        if other_pool:
+            self.fs.mds_stop()
+            self.fs.data_scan(['scan_extents', '--alternate-pool',
+                               other_pool, '--filesystem', self.fs.name,
+                               self.fs.get_data_pool_name()])
+            self.fs.data_scan(['scan_inodes', '--alternate-pool',
+                               other_pool, '--filesystem', self.fs.name,
+                               '--force-corrupt', '--force-init',
+                               self.fs.get_data_pool_name()])
+            self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
+                                  'recover_dentries', 'list',
+                                  '--alternate-pool', other_pool])
+
+            self.fs.data_scan(['init', '--force-init', '--filesystem',
+                               self.fs.name])
+            self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
+                               '--force-corrupt', '--force-init',
+                               self.fs.get_data_pool_name()])
+            self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
+                                  'recover_dentries', 'list'])
+
+            self.fs.journal_tool(['--rank=' + other_fs + ":0", 'journal',
+                                  'reset', '--force'])
+            self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal',
+                                  'reset', '--force'])
+            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
+                                                other_fs + ":0")
+        else:
+            self.fs.journal_tool(["journal", "reset", "--force"])
+            self.fs.data_scan(["init"])
+            self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
+            self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
 
         # Mark the MDS repaired
         self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
@@ -376,10 +426,26 @@ class TestDataScan(CephFSTestCase):
         # Start the MDS
         self.fs.mds_restart()
         self.fs.wait_for_daemons()
+        if other_pool:
+            for mds_id in self.fs.mds_ids:
+                self.wait_until_equal(
+                    lambda: get_state(mds_id),
+                    "up:active",
+                    timeout=60)
+            self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.a',
+                                                'injectargs', '--debug-mds=20')
+            self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.b',
+                                                'injectargs', '--debug-mds=20')
+            self.fs.mon_manager.raw_cluster_cmd('daemon', 'mds.a',
+                                                'scrub_path', '/',
+                                                'recursive', 'repair')
+            self.fs.mon_manager.raw_cluster_cmd('daemon', 'mds.b',
+                                                'scrub_path', '/',
+                                                'recursive', 'repair')
         log.info(str(self.mds_cluster.status()))
 
         # Mount a client
-        self.mount_a.mount()
+        self.mount_a.mount(mount_fs_name=other_fs)
         self.mount_a.wait_until_mounted()
 
         # See that the files are present and correct
@@ -414,7 +480,12 @@ class TestDataScan(CephFSTestCase):
     def test_stashed_layout(self):
         self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))
 
+    def test_rebuild_simple_altpool(self):
+        self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a), other_pool="recovery")
+
     def _dirfrag_keys(self, object_id):
+        self.other_pool = 'recovery'
+        self.other_fs = self.other_pool + '-fs'
         keys_str = self.fs.rados(["listomapkeys", object_id])
         if keys_str:
             return keys_str.split("\n")
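The new test_rebuild_simple_altpool case runs alongside the existing data-scan tests, which are normally driven through teuthology suites. Assuming a tree where the local qa/tasks/vstart_runner.py workflow is available (it may not be at this commit's vintage), one plausible way to exercise the new case against a vstart cluster would be:

    # Hypothetical local invocation; requires a running vstart cluster and
    # teuthology importable on PYTHONPATH. Paths depend on your checkout.
    cd ceph/build
    python ../qa/tasks/vstart_runner.py \
        tasks.cephfs.test_data_scan.TestDataScan.test_rebuild_simple_altpool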