From 6a788bf203dc07d32f299ce488b054addaae4f75 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Thu, 3 May 2018 13:12:54 -0700
Subject: [PATCH 1/3] qa: add mds deactivation procedure for upgrades

Signed-off-by: Patrick Donnelly
---
 qa/tasks/cephfs/filesystem.py | 95 ++++++++++++++++++++++++++++----------
 qa/tasks/mds_pre_upgrade.py   | 15 +-----
 2 files changed, 71 insertions(+), 39 deletions(-)

diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py
index d22126a9baa..4ff3cc01413 100644
--- a/qa/tasks/cephfs/filesystem.py
+++ b/qa/tasks/cephfs/filesystem.py
@@ -9,6 +9,7 @@ import datetime
 import re
 import errno
 import random
+import traceback
 
 from teuthology.exceptions import CommandFailedError
 from teuthology import misc
@@ -432,6 +433,42 @@ class Filesystem(MDSCluster):
             raise RuntimeError("cannot specify fscid when configuring overlay")
         self.metadata_overlay = overlay
 
+    def deactivate(self, rank):
+        if rank < 0:
+            raise RuntimeError("invalid rank")
+        elif rank == 0:
+            raise RuntimeError("cannot deactivate rank 0")
+        self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank))
+
+    def reach_max_mds(self):
+        # Try to reach rank count == max_mds, up or down (UPGRADE SENSITIVE!)
+        status = self.getinfo()
+        mds_map = self.get_mds_map(status=status)
+        max_mds = mds_map['max_mds']
+
+        count = len(list(self.get_ranks(status=status)))
+        if count > max_mds:
+            try:
+                # deactivate mds in descending order
+                status = self.wait_for_daemons(status=status, skip_max_mds_check=True)
+                while count > max_mds:
+                    targets = sorted(self.get_ranks(status=status), key=lambda r: r['rank'], reverse=True)
+                    target = targets[0]
+                    log.info("deactivating rank %d" % target['rank'])
+                    self.deactivate(target['rank'])
+                    status = self.wait_for_daemons(skip_max_mds_check=True)
+                    count = len(list(self.get_ranks(status=status)))
+            except:
+                # In Mimic, deactivation is done automatically:
+                log.info("Error:\n{}".format(traceback.format_exc()))
+                status = self.wait_for_daemons()
+        else:
+            status = self.wait_for_daemons()
+
+        mds_map = self.get_mds_map(status=status)
+        assert(mds_map['max_mds'] == max_mds)
+        assert(mds_map['in'] == list(range(0, max_mds)))
+
     def set_var(self, var, *args):
         a = map(str, args)
         self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a)
@@ -631,7 +668,7 @@ class Filesystem(MDSCluster):
     def get_usage(self):
         return self._df()['stats']['total_used_bytes']
 
-    def are_daemons_healthy(self, status=None):
+    def are_daemons_healthy(self, status=None, skip_max_mds_check=False):
         """
         Return true if all daemons are in one of active, standby, standby-replay, and
         at least max_mds daemons are in 'active'.
@@ -671,30 +708,34 @@ class Filesystem(MDSCluster):
             active_count, mds_map['max_mds']
         ))
 
-        if active_count > mds_map['max_mds']:
-            log.info("are_daemons_healthy: number of actives is grater than max_mds: {0}".format(mds_map))
-            return False
-        elif active_count == mds_map['max_mds']:
-            # The MDSMap says these guys are active, but let's check they really are
-            for mds_id, mds_status in mds_map['info'].items():
-                if mds_status['state'] == 'up:active':
-                    try:
-                        daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
-                    except CommandFailedError as cfe:
-                        if cfe.exitstatus == errno.EINVAL:
-                            # Old version, can't do this check
-                            continue
-                        else:
-                            # MDS not even running
+        if not skip_max_mds_check:
+            if active_count > mds_map['max_mds']:
+                log.info("are_daemons_healthy: number of actives is greater than max_mds: {0}".format(mds_map))
+                return False
+            elif active_count == mds_map['max_mds']:
+                # The MDSMap says these guys are active, but let's check they really are
+                for mds_id, mds_status in mds_map['info'].items():
+                    if mds_status['state'] == 'up:active':
+                        try:
+                            daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
+                        except CommandFailedError as cfe:
+                            if cfe.exitstatus == errno.EINVAL:
+                                # Old version, can't do this check
+                                continue
+                            else:
+                                # MDS not even running
+                                return False
+
+                        if daemon_status['state'] != 'up:active':
+                            # MDS hasn't taken the latest map yet
                             return False
 
-                    if daemon_status['state'] != 'up:active':
-                        # MDS hasn't taken the latest map yet
-                        return False
-
-            return True
+                return True
+            else:
+                return False
         else:
-            return False
+            log.info("are_daemons_healthy: skipping max_mds check")
+            return True
 
     def get_daemon_names(self, state=None, status=None):
         """
@@ -753,7 +794,7 @@ class Filesystem(MDSCluster):
 
         return result
 
-    def wait_for_daemons(self, timeout=None):
+    def wait_for_daemons(self, timeout=None, skip_max_mds_check=False, status=None):
         """
         Wait until all daemons are healthy
         :return:
@@ -762,10 +803,12 @@
         if timeout is None:
             timeout = DAEMON_WAIT_TIMEOUT
 
+        if status is None:
+            status = self.status()
+
         elapsed = 0
         while True:
-            status = self.status()
-            if self.are_daemons_healthy(status=status):
+            if self.are_daemons_healthy(status=status, skip_max_mds_check=skip_max_mds_check):
                 return status
             else:
                 time.sleep(1)
@@ -775,6 +818,8 @@
                 log.info("status = {0}".format(status))
                 raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
 
+            status = self.status()
+
     def get_lone_mds_id(self):
         """
         Get a single MDS ID: the only one if there is only one
diff --git a/qa/tasks/mds_pre_upgrade.py b/qa/tasks/mds_pre_upgrade.py
index 5193f92eefa..0856d48337c 100644
--- a/qa/tasks/mds_pre_upgrade.py
+++ b/qa/tasks/mds_pre_upgrade.py
@@ -25,20 +25,7 @@ def task(ctx, config):
 
     status = fs.getinfo()
     fs.set_max_mds(1)
-    status = fs.getinfo()
-    targets = filter(lambda r: r['rank'] >= 1, fs.get_ranks(status=status))
-    if len(targets) > 0:
-        # deactivate mds in decending order
-        targets = sorted(targets, key=lambda r: r['rank'], reverse=True)
-        for target in targets:
-            self.log("deactivating rank %d" % target['rank'])
-            self.fs.deactivate(target['rank'])
-            status = self.wait_for_stable()[0]
-    else:
-        status = self.wait_for_stable()[0]
-
-    assert(fs.get_mds_map(status=status)['max_mds'] == 1)
-    assert(fs.get_mds_map(status=status)['in'] == [0])
+    fs.reach_max_mds()
 
     # Stop standbys now to minimize time rank 0 is down in subsequent:
     # tasks:
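
The new Filesystem.reach_max_mds() helper is what upgrade tasks are now
expected to lean on: a task only adjusts max_mds and lets the helper converge
the rank count, whether surplus ranks must be deactivated explicitly
(pre-Mimic) or deactivation happens automatically (Mimic). A minimal sketch of
such a caller, modeled on the updated qa/tasks/mds_pre_upgrade.py above (the
function name and the task plumbing around `fs` are illustrative, not part of
this series):

    # Sketch only: `fs` is assumed to be a qa.tasks.cephfs.filesystem.Filesystem
    # instance, as in mds_pre_upgrade.py.
    def reduce_to_single_rank(fs):
        # Ask for a single active rank before the upgrade begins.
        fs.set_max_mds(1)
        # Converge the rank count to max_mds: deactivates surplus ranks in
        # descending order where that is needed, or simply waits for the
        # daemons to settle where deactivation happens automatically.
        fs.reach_max_mds()
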
From 0b466cb2e6ccdeb56199cb889f24cc590aedff04 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Thu, 3 May 2018 13:15:35 -0700
Subject: [PATCH 2/3] qa: try snap format upgrade with multimds cluster

Fixes: http://tracker.ceph.com/issues/24002
Signed-off-by: Patrick Donnelly
---
 qa/suites/fs/upgrade/snaps/overrides/{+ => %}              | 0
 .../snaps/overrides/{no_multimds.yaml => multimds/no.yaml} | 0
 qa/suites/fs/upgrade/snaps/overrides/multimds/yes.yaml     | 3 +++
 3 files changed, 3 insertions(+)
 rename qa/suites/fs/upgrade/snaps/overrides/{+ => %} (100%)
 rename qa/suites/fs/upgrade/snaps/overrides/{no_multimds.yaml => multimds/no.yaml} (100%)
 create mode 100644 qa/suites/fs/upgrade/snaps/overrides/multimds/yes.yaml

diff --git a/qa/suites/fs/upgrade/snaps/overrides/+ b/qa/suites/fs/upgrade/snaps/overrides/%
similarity index 100%
rename from qa/suites/fs/upgrade/snaps/overrides/+
rename to qa/suites/fs/upgrade/snaps/overrides/%
diff --git a/qa/suites/fs/upgrade/snaps/overrides/no_multimds.yaml b/qa/suites/fs/upgrade/snaps/overrides/multimds/no.yaml
similarity index 100%
rename from qa/suites/fs/upgrade/snaps/overrides/no_multimds.yaml
rename to qa/suites/fs/upgrade/snaps/overrides/multimds/no.yaml
diff --git a/qa/suites/fs/upgrade/snaps/overrides/multimds/yes.yaml b/qa/suites/fs/upgrade/snaps/overrides/multimds/yes.yaml
new file mode 100644
index 00000000000..ecf118d9f86
--- /dev/null
+++ b/qa/suites/fs/upgrade/snaps/overrides/multimds/yes.yaml
@@ -0,0 +1,3 @@
+overrides:
+  ceph:
+    max_mds: 2

From 4d37b0ee8d9594977aeae02c536b5f5fcc187e33 Mon Sep 17 00:00:00 2001
From: Patrick Donnelly
Date: Thu, 3 May 2018 15:07:21 -0700
Subject: [PATCH 3/3] qa: move snap-hierarchy out of snaps workunits

The snapshot hierarchy it leaves behind can't be cleaned up by `rm -rf`,
which breaks workunit cleanup. So, don't run this as part of the normal
snaps tests.

Signed-off-by: Patrick Donnelly
---
 qa/suites/fs/upgrade/snaps/tasks/1-client.yaml        | 2 +-
 qa/suites/fs/upgrade/snaps/tasks/5-client-sanity.yaml | 2 +-
 qa/workunits/fs/{snaps => }/snap-hierarchy.sh         | 0
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename qa/workunits/fs/{snaps => }/snap-hierarchy.sh (100%)

diff --git a/qa/suites/fs/upgrade/snaps/tasks/1-client.yaml b/qa/suites/fs/upgrade/snaps/tasks/1-client.yaml
index e9dea8f4e30..0aa6dcf7877 100644
--- a/qa/suites/fs/upgrade/snaps/tasks/1-client.yaml
+++ b/qa/suites/fs/upgrade/snaps/tasks/1-client.yaml
@@ -9,5 +9,5 @@ tasks:
     cleanup: false
     clients:
       client.0:
-        - fs/snaps/snap-hierarchy.sh
+        - fs/snap-hierarchy.sh
 - print: "**** done snap hierarchy"
diff --git a/qa/suites/fs/upgrade/snaps/tasks/5-client-sanity.yaml b/qa/suites/fs/upgrade/snaps/tasks/5-client-sanity.yaml
index f32a89da452..680e440757a 100644
--- a/qa/suites/fs/upgrade/snaps/tasks/5-client-sanity.yaml
+++ b/qa/suites/fs/upgrade/snaps/tasks/5-client-sanity.yaml
@@ -6,5 +6,5 @@ tasks:
       VERIFY: verify
     clients:
       client.0:
-        - fs/snaps/snap-hierarchy.sh
+        - fs/snap-hierarchy.sh
 - print: "**** done verify snap hierarchy"
diff --git a/qa/workunits/fs/snaps/snap-hierarchy.sh b/qa/workunits/fs/snap-hierarchy.sh
similarity index 100%
rename from qa/workunits/fs/snaps/snap-hierarchy.sh
rename to qa/workunits/fs/snap-hierarchy.sh
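
The cleanup problem described in the last commit message comes from how CephFS
exposes snapshots: they appear under a virtual .snap directory and are removed
by rmdir'ing the snapshot entry, so a plain `rm -rf` of the workunit directory
leaves the snapshot hierarchy behind. A rough sketch of the explicit cleanup
that would otherwise be needed (the helper name and example path are
illustrative; they are not taken from snap-hierarchy.sh):

    import os

    def remove_snapshots(directory):
        # CephFS snapshots are entries under <directory>/.snap; deleting the
        # snapshotted files does not remove them, but rmdir on each entry does.
        snap_dir = os.path.join(directory, ".snap")
        for snap in os.listdir(snap_dir):
            os.rmdir(os.path.join(snap_dir, snap))

    # e.g. remove_snapshots("/mnt/cephfs/client.0/workdir")  # illustrative path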