qa: add test for snap format upgrade

Signed-off-by: Patrick Donnelly <pdonnell@redhat.com>
Patrick Donnelly 2018-04-30 14:43:28 -07:00
parent aedd5301dc
commit 91942df5a6
25 changed files with 271 additions and 14 deletions
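The fragments below assemble a teuthology upgrade suite: symlinked cluster, overrides and objectstore pieces plus numbered task yamls that install Luminous, create snapshots, upgrade the cluster, and re-verify the snapshots. As a rough sketch of how such a suite would be scheduled (the suite path, branch, machine type and email are assumptions, not part of this commit):

$ teuthology-suite --suite fs/upgrade/snaps --ceph master \
    --machine-type smithi --email you@example.com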


@@ -0,0 +1 @@
../../../../../cephfs/clusters/3-mds.yaml


@@ -0,0 +1 @@
../../../../cephfs/objectstore-ec/


@@ -0,0 +1 @@
../../../../../cephfs/overrides/debug.yaml


@@ -0,0 +1 @@
../../../../../cephfs/overrides/frag_enable.yaml


@@ -0,0 +1,3 @@
overrides:
  ceph:
    max_mds: 1


@@ -0,0 +1 @@
../../../../../cephfs/overrides/whitelist_health.yaml


@@ -0,0 +1 @@
../../../../../cephfs/overrides/whitelist_wrongly_marked_down.yaml


@@ -0,0 +1,30 @@
meta:
- desc: |
    install ceph/luminous latest
tasks:
- install:
    branch: luminous
- print: "**** done installing luminous"
- ceph:
    log-whitelist:
    - overall HEALTH_
    - \(FS_
    - \(MDS_
    - \(OSD_
    - \(MON_DOWN\)
    - \(CACHE_POOL_
    - \(POOL_
    - \(MGR_DOWN\)
    - \(PG_
    - \(SMALLER_PGP_NUM\)
    - Monitor daemon marked osd
    - Behind on trimming
    - Manager daemon
    conf:
      global:
        mon warn on pool no app: false
- exec:
    osd.0:
    - ceph osd require-osd-release luminous
    - ceph osd set-require-min-compat-client luminous
- print: "**** done ceph"


@@ -0,0 +1,13 @@
tasks:
- ceph-fuse:
- print: "**** done luminous client"
- exec:
    mon.a:
    - ceph fs set cephfs allow_new_snaps true --yes-i-really-mean-it
- workunit:
    timeout: 5m
    cleanup: false
    clients:
      client.0:
      - fs/snaps/snap-hierarchy.sh
- print: "**** done snap hierarchy"


@@ -0,0 +1,12 @@
tasks:
- mds_pre_upgrade:
- print: "**** done mds pre-upgrade sequence"
- install.upgrade:
    mon.a:
    mon.b:
- print: "**** done install.upgrade both hosts"
- ceph.stop: [mds.*]
- ceph.restart:
    daemons: [mon.*, mgr.*, osd.*, mds.*]
    mon-health-to-clog: false
- print: "**** done ceph.restart"


@@ -0,0 +1,10 @@
tasks:
- exec:
    mon.a:
    - ceph status
    - ceph fs dump --format=json-pretty
    - ceph fs set cephfs max_mds 2 && exit 1 || true
- print: "**** confirmed cannot set max_mds=2"
- exec:
    mon.a:
    - ceph fs set cephfs allow_new_snaps true


@@ -0,0 +1,10 @@
tasks:
- install.upgrade:
    client.0:
- print: "**** done install.upgrade on client.0"
- ceph-fuse:
    client.0:
      mounted: false
- ceph-fuse:
    client.0:
- print: "**** done remount client"


@@ -0,0 +1,10 @@
tasks:
- workunit:
    timeout: 5m
    cleanup: false
    env:
      VERIFY: verify
    clients:
      client.0:
      - fs/snaps/snap-hierarchy.sh
- print: "**** done verify snap hierarchy"


@@ -0,0 +1,16 @@
overrides:
  ceph:
    log-whitelist:
    - bad backtrace on inode
tasks:
- cephfs_upgrade_snap:
- print: "**** upgraded snapshot metadata"
- exec:
    mon.a:
    - ceph fs set cephfs max_mds 2
- print: "**** increased max_mds=2"
- sleep:
    duration: 10
- exec:
    mon.a:
    - ceph fs dump | grep '^max_mds.*2'


@@ -0,0 +1 @@
5-client-sanity.yaml


@@ -376,7 +376,7 @@ def cephfs_setup(ctx, config):
all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
num_active = len([r for r in all_roles if is_active_mds(r)])
fs.set_max_mds(num_active)
fs.set_max_mds(config.get('max_mds', num_active))
yield


@@ -728,6 +728,16 @@ class Filesystem(MDSCluster):
return result
def get_rank(self, rank=0, status=None):
if status is None:
status = self.getinfo()
return status.get_rank(self.id, rank)
def get_ranks(self, status=None):
if status is None:
status = self.getinfo()
return status.get_ranks(self.id)
def get_rank_names(self, status=None):
"""
Return MDS daemon names of those daemons holding a rank,
@@ -854,6 +864,10 @@ class Filesystem(MDSCluster):
return self.json_asok(command, 'mds', mds_id)
def rank_asok(self, command, rank=0):
info = self.get_rank(rank=rank)
return self.json_asok(command, 'mds', info['name'])
def read_cache(self, path, depth=None):
cmd = ["dump", "tree", path]
if depth is not None:


@@ -0,0 +1,45 @@
"""
Upgrade cluster snap format.
"""
import logging
import time
from tasks.cephfs.filesystem import Filesystem
log = logging.getLogger(__name__)
def task(ctx, config):
"""
Upgrade CephFS file system snap format.
"""
if config is None:
config = {}
assert isinstance(config, dict), \
'snap-upgrade task only accepts a dict for configuration'
fs = Filesystem(ctx)
mds_map = fs.get_mds_map()
assert(mds_map['max_mds'] == 1)
json = fs.rank_asok(["scrub_path", "/", "force", "recursive", "repair"])
if not json or json['return_code'] == 0:
log.info("scrub / completed")
else:
log.info("scrub / failed: {}".format(json))
json = fs.rank_asok(["scrub_path", "~mdsdir", "force", "recursive", "repair"])
if not json or json['return_code'] == 0:
log.info("scrub ~mdsdir completed")
else:
log.info("scrub / failed: {}".format(json))
for i in range(0, 10):
mds_map = fs.get_mds_map()
if (mds_map['flags'] & (1<<1)) != 0 and (mds_map['flags'] & (1<<4)) != 0:
break
time.sleep(10)
assert((mds_map['flags'] & (1<<1)) != 0) # Test CEPH_MDSMAP_ALLOW_SNAPS
assert((mds_map['flags'] & (1<<4)) != 0) # Test CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
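The scrub calls above go through the MDS admin socket (rank_asok resolves the daemon holding the given rank and forwards to json_asok). For reference, the same repair could be issued by hand; mds.<name> below is a placeholder for whichever daemon holds rank 0:

$ ceph daemon mds.<name> scrub_path / force recursive repair
$ ceph daemon mds.<name> scrub_path '~mdsdir' force recursive repair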


@@ -0,0 +1,56 @@
"""
Prepare MDS cluster for upgrade.
"""
import logging
import time
from tasks.cephfs.filesystem import Filesystem
log = logging.getLogger(__name__)
def task(ctx, config):
"""
Prepare MDS cluster for upgrade.
This task reduces ranks to 1 and stops all standbys.
"""
if config is None:
config = {}
assert isinstance(config, dict), \
'snap-upgrade task only accepts a dict for configuration'
fs = Filesystem(ctx)
status = fs.getinfo()
fs.set_max_mds(1)
status = fs.getinfo()
targets = filter(lambda r: r['rank'] >= 1, fs.get_ranks(status=status))
if len(targets) > 0:
# deactivate mds in decending order
targets = sorted(targets, key=lambda r: r['rank'], reverse=True)
for target in targets:
self.log("deactivating rank %d" % target['rank'])
self.fs.deactivate(target['rank'])
status = self.wait_for_stable()[0]
else:
status = self.wait_for_stable()[0]
assert(fs.get_mds_map(status=status)['max_mds'] == 1)
assert(fs.get_mds_map(status=status)['in'] == [0])
# Stop standbys now to minimize time rank 0 is down in subsequent:
# tasks:
# - ceph.stop: [mds.*]
rank0 = fs.get_rank(rank=0, status=status)
for daemon in ctx.daemons.iter_daemons_of_role('mds', fs.mon_manager.cluster):
if rank0['name'] != daemon.id_:
daemon.stop()
for i in range(1, 10):
time.sleep(5) # time for FSMap to update
status = fs.getinfo()
if len(list(status.get_standbys())) == 0:
break
assert(len(list(status.get_standbys())) == 0)
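For reference, the squeeze to a single rank performed by this task corresponds roughly to the following Luminous-era monitor commands; the file system name matches the yamls above, the rank is a placeholder, and on Luminous ranks still had to be dropped explicitly with ceph mds deactivate:

$ ceph fs set cephfs max_mds 1
$ ceph mds deactivate cephfs:1   # repeat for each rank > 0, highest first
$ ceph fs get cephfs | grep max_mds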


@@ -160,6 +160,7 @@ def task(ctx, config):
refspec = Head()
timeout = config.get('timeout', '3h')
cleanup = config.get('cleanup', True)
log.info('Pulling workunits from ref %s', refspec)
@@ -181,24 +182,28 @@ def task(ctx, config):
created_mountpoint[role] = created_mnt_dir
# Execute any non-all workunits
log.info("timeout={}".format(timeout))
log.info("cleanup={}".format(cleanup))
with parallel() as p:
for role, tests in clients.iteritems():
if role != "all":
p.spawn(_run_tests, ctx, refspec, role, tests,
config.get('env'),
basedir=config.get('basedir','qa/workunits'),
timeout=timeout)
timeout=timeout, cleanup=cleanup)
# Clean up dirs from any non-all workunits
for role, created in created_mountpoint.items():
_delete_dir(ctx, role, created)
if cleanup:
# Clean up dirs from any non-all workunits
for role, created in created_mountpoint.items():
_delete_dir(ctx, role, created)
# Execute any 'all' workunits
if 'all' in clients:
all_tasks = clients["all"]
_spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
config.get('basedir', 'qa/workunits'),
config.get('subdir'), timeout=timeout)
config.get('subdir'), timeout=timeout,
cleanup=cleanup)
def _client_mountpoint(ctx, cluster, id_):
@@ -326,7 +331,7 @@ def _make_scratch_dir(ctx, role, subdir):
return created_mountpoint
def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None):
def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None, cleanup=True):
"""
Make a scratch directory for each client in the cluster, and then for each
test spawn _run_tests() for each role.
@@ -351,12 +356,13 @@ def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=Non
timeout=timeout)
# cleanup the generated client directories
for role, _ in client_remotes.items():
_delete_dir(ctx, role, created_mountpoint[role])
if cleanup:
for role, _ in client_remotes.items():
_delete_dir(ctx, role, created_mountpoint[role])
def _run_tests(ctx, refspec, role, tests, env, basedir,
subdir=None, timeout=None):
subdir=None, timeout=None, cleanup=True):
"""
Run the individual test. Create a scratch directory and then extract the
workunits from git. Make the executables, and then run the tests.
@@ -472,10 +478,11 @@ def _run_tests(ctx, refspec, role, tests, env, basedir,
args=args,
label="workunit test {workunit}".format(workunit=workunit)
)
remote.run(
logger=log.getChild(role),
args=['sudo', 'rm', '-rf', '--', scratch_tmp],
)
if cleanup:
remote.run(
logger=log.getChild(role),
args=['sudo', 'rm', '-rf', '--', scratch_tmp],
)
finally:
log.info('Stopping %s on %s...', tests, role)
remote.run(


@@ -0,0 +1,24 @@
#!/bin/sh

set -ex

# With VERIFY=verify (second pass, after the upgrade) the mkdir/touch steps
# are skipped and only the stat checks run.
# If a directory argument is given and already exists, run inside it.
if [ -d "$1" ]; then
  mkdir -p -- "$1" && cd "$1"
fi
[ "$VERIFY" != verify ] && mkdir 1
[ "$VERIFY" != verify ] && mkdir 1/.snap/first
stat 1/.snap/first
[ "$VERIFY" != verify ] && mkdir 1/2
stat 1/.snap/first/2 && exit 1
[ "$VERIFY" != verify ] && mkdir 1/2/.snap/second
stat 1/2/.snap/second
[ "$VERIFY" != verify ] && touch 1/foo
stat 1/.snap/first/foo && exit 1
[ "$VERIFY" != verify ] && mkdir 1/.snap/third
stat 1/.snap/third/foo || exit 1
[ "$VERIFY" != verify ] && mkdir 1/2/3
[ "$VERIFY" != verify ] && mkdir 1/2/.snap/fourth
stat 1/2/.snap/fourth/3
exit 0
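This workunit runs twice in the suite: once on the Luminous client to create the directories and snapshots, and again after the upgrade with VERIFY=verify (see the verify task yaml above) so that only the stat checks execute. Run by hand from a CephFS mount it looks roughly like this; the mount point is an assumption:

$ cd /mnt/cephfs                      # any CephFS mount
$ ./snap-hierarchy.sh                 # first pass: create the snapshot hierarchy
$ VERIFY=verify ./snap-hierarchy.sh   # second pass: verify it survived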