Merge PR #47732 into main

* refs/pull/47732/head:
	Revert "Merge pull request #47092 from dparmar18/wip-dparmar-cephadm-simple-1"

Reviewed-by: Adam King <adking@redhat.com>
Patrick Donnelly 2022-08-22 20:04:54 -04:00
commit 4d9be7e13b
GPG Key ID: BE69BB7D36E459B4
35 changed files with 28 additions and 333 deletions

View File

@ -99,3 +99,4 @@ of the feature, refer to this link on how to perform it:
https://docs.ceph.com/en/quincy/cephadm/upgrade/#staggered-upgrade
Relevant tracker: https://tracker.ceph.com/issues/55715
Relevant tracker: https://tracker.ceph.com/issues/5614

View File

@ -48,31 +48,6 @@ The automated upgrade process follows Ceph best practices. For example:
Starting the upgrade
====================
.. note::

   `Staggered Upgrade`_ of the mons/mgrs may be necessary to have access
   to this new feature.

Cephadm by default reduces `max_mds` to `1`. This can be disruptive for
large-scale CephFS deployments because the cluster cannot quickly reduce
the number of active MDS daemons to `1`, and a single active MDS cannot
easily handle the load of all clients even for a short time. Therefore, to
upgrade MDS daemons without reducing `max_mds`, set the `fail_fs` option to
`true` (the default is `false`) prior to initiating the upgrade:

.. prompt:: bash #

   ceph config set mgr mgr/orchestrator/fail_fs true

This would:

#. Fail CephFS filesystems, bringing active MDS daemon(s) to the
   `up:standby` state.
#. Upgrade MDS daemons safely.
#. Bring CephFS filesystems back up, moving active MDS daemon(s) from
   `up:standby` to `up:active`.

Before you use cephadm to upgrade Ceph, verify that all hosts are currently
online and that your cluster is healthy by running the following command:

.. prompt:: bash #
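
The removed documentation above boils down to two operator steps: enable `fail_fs`, then start the upgrade and let the orchestrator fail and later rejoin the filesystems. Below is a minimal sketch of driving those two steps from a script, assuming the `ceph` CLI is on `PATH` and the (reverted) `mgr/orchestrator/fail_fs` option exists; the image tag is a placeholder, not taken from this PR.

```python
# Sketch only: enable the reverted fail_fs option, then kick off an upgrade.
# Both commands are taken verbatim from the documentation hunk above; the
# image tag below is a hypothetical example.
import subprocess

def ceph(*args: str) -> None:
    """Run a ceph CLI command and raise if it fails."""
    subprocess.run(["ceph", *args], check=True)

def upgrade_with_fail_fs(image: str) -> None:
    # ceph config set mgr mgr/orchestrator/fail_fs true
    ceph("config", "set", "mgr", "mgr/orchestrator/fail_fs", "true")
    # ceph orch upgrade start --image <image>
    ceph("orch", "upgrade", "start", "--image", image)

if __name__ == "__main__":
    upgrade_with_fail_fs("quay.io/ceph/ceph:v17.2.3")  # placeholder tag
```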

View File

@ -1 +0,0 @@
.qa/cephfs/objectstore-ec/bluestore-bitmap.yaml

View File

@ -1 +0,0 @@
.qa/distros/podman/centos_8.stream_container_tools.yaml

View File

@ -1 +0,0 @@
.qa/cephfs/conf/

View File

@ -1 +0,0 @@
.qa/cephfs/overrides/ignorelist_health.yaml

View File

@ -1 +0,0 @@
.qa/cephfs/overrides/ignorelist_wrongly_marked_down.yaml

View File

@ -1,5 +0,0 @@
overrides:
  ceph:
    conf:
      global:
        mon pg warn min per osd: 0

View File

@ -1,3 +0,0 @@
overrides:
  kclient:
    syntax: 'v1'

View File

@ -1,11 +0,0 @@
roles:
- - host.a
  - client.0
  - osd.0
  - osd.1
  - osd.2
- - host.b
  - client.1
  - osd.3
  - osd.4
  - osd.5

View File

@ -1,32 +0,0 @@
meta:
- desc: |
    setup ceph/pacific

tasks:
- install:
    branch: pacific
    exclude_packages:
      - ceph-volume
- print: "**** done install task..."
- cephadm:
    image: quay.io/ceph/daemon-base:latest-pacific
    roleless: true
    cephadm_branch: pacific
    cephadm_git_url: https://github.com/ceph/ceph
    conf:
      osd:
        # set config option for which cls modules are allowed to be loaded / used
        osd_class_load_list: "*"
        osd_class_default_list: "*"
- print: "**** done end installing pacific cephadm ..."
- cephadm.shell:
    host.a:
      - ceph config set mgr mgr/cephadm/use_repo_digest true --force
- print: "**** done cephadm.shell ceph config set mgr..."
- cephadm.shell:
    host.a:
      - ceph orch status
      - ceph orch ps
      - ceph orch ls
      - ceph orch host ls
      - ceph orch device ls

View File

@ -1,30 +0,0 @@
meta:
- desc: |
    setup ceph/pacific v16.2.4

tasks:
# Disable metrics sending by kclient as it may crash (assert) a v16.2.4 MDS
- pexec:
    clients:
      - sudo modprobe -r ceph
      - sudo modprobe ceph disable_send_metrics=on
- install:
    tag: v16.2.4
    exclude_packages:
      - ceph-volume
- print: "**** done install task..."
- cephadm:
    roleless: true
    image: quay.io/ceph/ceph:v16.2.4
    cephadm_branch: v16.2.4
    cephadm_git_url: https://github.com/ceph/ceph
    # needed for v16.2.4 due to --skip-admin-label
    avoid_pacific_features: true
- print: "**** done starting v16.2.4"
- cephadm.shell:
    host.a:
      - ceph orch status
      - ceph orch ps
      - ceph orch ls
      - ceph orch host ls
      - ceph orch device ls

View File

@ -1,5 +0,0 @@
tasks:
- cephadm.shell:
    host.a:
      - ceph fs volume create cephfs --placement=4
      - ceph fs dump

View File

@ -1,4 +0,0 @@
tasks:
- cephadm.shell:
    host.a:
      - ceph fs set cephfs max_mds 2

View File

@ -1,4 +0,0 @@
tasks:
- cephadm.shell:
    host.a:
      - ceph fs set cephfs allow_standby_replay false

View File

@ -1,4 +0,0 @@
tasks:
- cephadm.shell:
    host.a:
      - ceph fs set cephfs inline_data false

View File

@ -1,4 +0,0 @@
tasks:
- cephadm.shell:
    host.a:
      - ceph fs set cephfs inline_data true --yes-i-really-really-mean-it

View File

@ -1,7 +0,0 @@
tasks:
- cephadm.shell:
    host.a:
      - ceph fs dump
      - ceph --format=json fs dump | jq -e ".filesystems | length == 1"
      - while ! ceph --format=json mds versions | jq -e ". | add == 4"; do sleep 1; done
- fs.pre_upgrade_save:

View File

@ -1,3 +0,0 @@
tasks:
- kclient:
- print: "**** done client"

View File

@ -1,73 +0,0 @@
tasks:
- parallel:
    - upgrade-tasks
    - workload-tasks

upgrade-tasks:
  sequential:
    - cephadm.shell:
        env: [sha1]
        host.a:
          - ceph config set mon mon_warn_on_insecure_global_id_reclaim false --force
          - ceph config set mon mon_warn_on_insecure_global_id_reclaim_allowed false --force
          - ceph config set global log_to_journald false --force
          - ceph orch ps
          - ceph versions
          - ceph -s
          - ceph orch ls
          - ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1
          - ceph orch ps --refresh
          - sleep 300
          - ceph orch ps
          - ceph versions
          - ceph -s
          - ceph versions | jq -e '.mgr | length == 2'
          - ceph mgr fail
          - sleep 180
          - ceph orch daemon redeploy "mgr.$(ceph mgr dump -f json | jq .standbys | jq .[] | jq -r .name)" --image quay.ceph.io/ceph-ci/ceph:$sha1
          - ceph orch ps --refresh
          - sleep 180
          - ceph orch ps
          - ceph versions
          - ceph -s
          - ceph mgr fail
          - sleep 300
          - ceph orch ps
          - ceph versions
          - ceph -s
          - ceph versions | jq -e '.mgr | length == 1'
          - ceph mgr fail
          - sleep 180
          - ceph orch ps
          - ceph versions
          - ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1 --daemon-types mgr
          - while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph orch upgrade status ; sleep 30 ; done
          - ceph versions | jq -e '.mgr | length == 1'
          - ceph versions | jq -e '.mgr | keys' | grep $sha1
          - ceph versions | jq -e '.overall | length == 2'
          - ceph orch upgrade check quay.ceph.io/ceph-ci/ceph:$sha1 | jq -e '.up_to_date | length == 2'
          - ceph orch ps --refresh
          - sleep 180
          - ceph config set mgr mgr/orchestrator/fail_fs true
          - ceph orch upgrade start --image quay.ceph.io/ceph-ci/ceph:$sha1
    - cephadm.shell:
        env: [sha1]
        host.a:
          - while ceph orch upgrade status | jq '.in_progress' | grep true && ! ceph orch upgrade status | jq '.message' | grep Error ; do ceph orch ps ; ceph versions ; ceph fs dump; ceph orch upgrade status ; sleep 30 ; done
          - ceph orch ps
          - ceph versions
          - echo "wait for servicemap items w/ changing names to refresh"
          - sleep 60
          - ceph orch ps
          - ceph health detail
          - ceph orch upgrade status
          - ceph versions
          - ceph versions | jq -e '.overall | length == 1'
          - ceph versions | jq -e '.overall | keys' | grep $sha1

workload-tasks:
  sequential:
    - workunit:
        clients:
          all:
            - suites/fsstress.sh
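
The shell `while ... jq ...` loops in the upgrade-tasks above poll `ceph orch upgrade status` until the upgrade finishes or reports an error. Here is a minimal Python sketch of the same polling logic, assuming the `ceph` CLI is available and emits the same JSON fields (`in_progress`, `message`) the jq pipeline consumes; the 30-second interval matches the YAML.

```python
# Sketch of the polling loop used by the upgrade-tasks shell commands above.
import json
import subprocess
import time

def upgrade_status() -> dict:
    """Return the parsed output of `ceph orch upgrade status`."""
    out = subprocess.run(["ceph", "orch", "upgrade", "status"],
                         check=True, capture_output=True, text=True).stdout
    return json.loads(out)

def wait_for_upgrade(poll_seconds: int = 30) -> None:
    while True:
        status = upgrade_status()
        if "Error" in (status.get("message") or ""):
            raise RuntimeError(f"upgrade failed: {status['message']}")
        if not status.get("in_progress"):
            return
        time.sleep(poll_seconds)
```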

View File

@ -1,5 +0,0 @@
tasks:
- cephadm.shell:
host.a:
- ceph fs dump
- fs.post_upgrade_checks:

View File

@ -11,7 +11,6 @@ log = logging.getLogger(__name__)
# Everything up to CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
CEPH_MDSMAP_ALLOW_STANDBY_REPLAY = (1<<5)
CEPH_MDSMAP_NOT_JOINABLE = (1 << 0)
CEPH_MDSMAP_LAST = CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
UPGRADE_FLAGS_MASK = ((CEPH_MDSMAP_LAST<<1) - 1)
def pre_upgrade_save(ctx, config):
@ -60,35 +59,21 @@ def post_upgrade_checks(ctx, config):
epoch = mdsmap['epoch']
pre_upgrade_epoch = fs_state['epoch']
assert pre_upgrade_epoch < epoch
multiple_max_mds = fs_state['max_mds'] > 1
should_decrease_max_mds = fs_state['max_mds'] > 1
did_decrease_max_mds = False
should_disable_allow_standby_replay = fs_state['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
did_disable_allow_standby_replay = False
did_fail_fs = False
for i in range(pre_upgrade_epoch+1, mdsmap['epoch']):
old_status = mdsc.status(epoch=i)
old_fs = old_status.get_fsmap(fscid)
old_mdsmap = old_fs['mdsmap']
if not multiple_max_mds \
and (old_mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE):
raise RuntimeError('mgr is failing fs when there is only one '
f'rank in epoch {i}.')
if multiple_max_mds \
and (old_mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE) \
and old_mdsmap['max_mds'] == 1:
raise RuntimeError('mgr is failing fs and also reducing '
f'max_mds in epoch {i}')
if old_mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE:
log.debug(f"max_mds not reduced in epoch {i} as fs was failed "
"for carrying out rapid multi-rank mds upgrade")
did_fail_fs = True
if multiple_max_mds and old_mdsmap['max_mds'] == 1:
if should_decrease_max_mds and old_mdsmap['max_mds'] == 1:
log.debug(f"max_mds reduced in epoch {i}")
did_decrease_max_mds = True
if should_disable_allow_standby_replay and not (old_mdsmap['flags'] & CEPH_MDSMAP_ALLOW_STANDBY_REPLAY):
log.debug(f"allow_standby_replay disabled in epoch {i}")
did_disable_allow_standby_replay = True
assert not multiple_max_mds or did_fail_fs or did_decrease_max_mds
assert not should_decrease_max_mds or did_decrease_max_mds
assert not should_disable_allow_standby_replay or did_disable_allow_standby_replay
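
The `post_upgrade_checks` changes above hinge on two mdsmap flag bits: `CEPH_MDSMAP_NOT_JOINABLE` (set while the mgr fails the filesystem for a rapid multi-rank upgrade) and `CEPH_MDSMAP_ALLOW_STANDBY_REPLAY`. A small self-contained sketch of how those bit tests classify an epoch follows; the constants come from the diff, while the helper and the sample maps are illustrative assumptions.

```python
# Illustrative helper showing the flag tests used in post_upgrade_checks.
CEPH_MDSMAP_NOT_JOINABLE = (1 << 0)
CEPH_MDSMAP_ALLOW_STANDBY_REPLAY = (1 << 5)

def classify_epoch(mdsmap: dict) -> str:
    """Describe what the upgrade did to a filesystem in this epoch."""
    if mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE:
        # fail_fs path: the fs was failed instead of shrinking max_mds
        return 'fs failed for rapid multi-rank upgrade'
    if mdsmap['max_mds'] == 1:
        return 'max_mds reduced to 1'
    return 'untouched'

# Hypothetical epochs, one per upgrade strategy:
for sample in ({'flags': CEPH_MDSMAP_NOT_JOINABLE, 'max_mds': 2},
               {'flags': 0, 'max_mds': 1}):
    print(classify_epoch(sample))
```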

View File

@ -2,7 +2,7 @@ import json
import logging
import time
import uuid
from typing import TYPE_CHECKING, Optional, Dict, List, Tuple, Any, cast
from typing import TYPE_CHECKING, Optional, Dict, List, Tuple, Any
import orchestrator
from cephadm.registry import Registry
@ -20,7 +20,6 @@ logger = logging.getLogger(__name__)
# from ceph_fs.h
CEPH_MDSMAP_ALLOW_STANDBY_REPLAY = (1 << 5)
CEPH_MDSMAP_NOT_JOINABLE = (1 << 0)
def normalize_image_digest(digest: str, default_registry: str) -> str:
@ -59,7 +58,6 @@ class UpgradeState:
target_version: Optional[str] = None,
error: Optional[str] = None,
paused: Optional[bool] = None,
fail_fs: bool = False,
fs_original_max_mds: Optional[Dict[str, int]] = None,
fs_original_allow_standby_replay: Optional[Dict[str, bool]] = None,
daemon_types: Optional[List[str]] = None,
@ -78,7 +76,6 @@ class UpgradeState:
self.fs_original_max_mds: Optional[Dict[str, int]] = fs_original_max_mds
self.fs_original_allow_standby_replay: Optional[Dict[str,
bool]] = fs_original_allow_standby_replay
self.fail_fs = fail_fs
self.daemon_types = daemon_types
self.hosts = hosts
self.services = services
@ -92,7 +89,6 @@ class UpgradeState:
'target_id': self.target_id,
'target_digests': self.target_digests,
'target_version': self.target_version,
'fail_fs': self.fail_fs,
'fs_original_max_mds': self.fs_original_max_mds,
'fs_original_allow_standby_replay': self.fs_original_allow_standby_replay,
'error': self.error,
@ -303,8 +299,6 @@ class CephadmUpgrade:
def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None,
hosts: Optional[List[str]] = None, services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
fail_fs_value = cast(bool, self.mgr.get_module_option_ex(
'orchestrator', 'fail_fs', False))
if self.mgr.mode != 'root':
raise OrchestratorError('upgrade is not supported in %s mode' % (
self.mgr.mode))
@ -342,7 +336,6 @@ class CephadmUpgrade:
self.upgrade_state = UpgradeState(
target_name=target_name,
progress_id=str(uuid.uuid4()),
fail_fs=fail_fs_value,
daemon_types=daemon_types,
hosts=hosts,
services=services,
@ -619,43 +612,27 @@ class CephadmUpgrade:
# scale down this filesystem?
if mdsmap["max_mds"] > 1:
if self.upgrade_state.fail_fs:
if not (mdsmap['flags'] & CEPH_MDSMAP_NOT_JOINABLE) and \
len(mdsmap['up']) > 0:
self.mgr.log.info(f'Upgrade: failing fs {fs_name} for '
f'rapid multi-rank mds upgrade')
ret, out, err = self.mgr.check_mon_command({
'prefix': 'fs fail',
'fs_name': fs_name
})
if ret != 0:
continue_upgrade = False
continue
else:
self.mgr.log.info('Upgrade: Scaling down filesystem %s' % (
fs_name
))
if fscid not in self.upgrade_state.fs_original_max_mds:
self.upgrade_state.fs_original_max_mds[fscid] = \
mdsmap['max_mds']
self._save_upgrade_state()
ret, out, err = self.mgr.check_mon_command({
'prefix': 'fs set',
'fs_name': fs_name,
'var': 'max_mds',
'val': '1',
})
continue_upgrade = False
continue
self.mgr.log.info('Upgrade: Scaling down filesystem %s' % (
fs_name
))
if fscid not in self.upgrade_state.fs_original_max_mds:
self.upgrade_state.fs_original_max_mds[fscid] = mdsmap['max_mds']
self._save_upgrade_state()
ret, out, err = self.mgr.check_mon_command({
'prefix': 'fs set',
'fs_name': fs_name,
'var': 'max_mds',
'val': '1',
})
continue_upgrade = False
continue
if not self.upgrade_state.fail_fs:
if not (mdsmap['in'] == [0] and len(mdsmap['up']) <= 1):
self.mgr.log.info(
'Upgrade: Waiting for fs %s to scale down to reach 1 MDS' % (
fs_name))
time.sleep(10)
continue_upgrade = False
continue
if not (mdsmap['in'] == [0] and len(mdsmap['up']) <= 1):
self.mgr.log.info(
'Upgrade: Waiting for fs %s to scale down to reach 1 MDS' % (fs_name))
time.sleep(10)
continue_upgrade = False
continue
if len(mdsmap['up']) == 0:
self.mgr.log.warning(
@ -799,15 +776,7 @@ class CephadmUpgrade:
return False, to_upgrade
if d.daemon_type == 'mds' and self._enough_mds_for_ok_to_stop(d):
# when fail_fs is set to true, all MDS daemons will be moved to
# up:standby state, so Cephadm won't be able to upgrade due to
# this check and will warn with "It is NOT safe to stop
# mds.<daemon_name> at this time: one or more filesystems is
# currently degraded", therefore we bypass this check for that
# case.
assert self.upgrade_state is not None
if not self.upgrade_state.fail_fs \
and not self._wait_for_ok_to_stop(d, known_ok_to_stop):
if not self._wait_for_ok_to_stop(d, known_ok_to_stop):
return False, to_upgrade
to_upgrade.append(d_entry)
@ -953,25 +922,7 @@ class CephadmUpgrade:
def _complete_mds_upgrade(self) -> None:
assert self.upgrade_state is not None
if self.upgrade_state.fail_fs:
for fs in self.mgr.get("fs_map")['filesystems']:
fs_name = fs['mdsmap']['fs_name']
self.mgr.log.info('Upgrade: Setting filesystem '
f'{fs_name} Joinable')
try:
ret, _, err = self.mgr.check_mon_command({
'prefix': 'fs set',
'fs_name': fs_name,
'var': 'joinable',
'val': 'true',
})
except Exception as e:
logger.error("Failed to set fs joinable "
f"true due to {e}")
raise OrchestratorError("Failed to set"
"fs joinable true"
f"due to {e}")
elif self.upgrade_state.fs_original_max_mds:
if self.upgrade_state.fs_original_max_mds:
for fs in self.mgr.get("fs_map")['filesystems']:
fscid = fs["id"]
fs_name = fs['mdsmap']['fs_name']
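
The reverted `_complete_mds_upgrade` branch above restores each filesystem by setting `joinable` back to `true` once its MDS daemons are upgraded. For reference, here is a minimal sketch of that cleanup expressed against the `ceph` CLI instead of the mgr's internal `check_mon_command`; the command names and the `fs dump` JSON layout are taken from the diff, while the CLI wrapper itself is an assumption.

```python
# Sketch of the post-upgrade cleanup the reverted fail_fs branch performed:
# mark every filesystem joinable again after its MDS daemons are upgraded.
import json
import subprocess

def ceph(*args: str) -> str:
    """Run a ceph CLI command and return its stdout."""
    return subprocess.run(["ceph", *args], check=True,
                          capture_output=True, text=True).stdout

def make_filesystems_joinable() -> None:
    fs_map = json.loads(ceph("fs", "dump", "--format=json"))
    for fs in fs_map["filesystems"]:
        fs_name = fs["mdsmap"]["fs_name"]
        # Mirrors: ceph fs set <fs_name> joinable true
        ceph("fs", "set", fs_name, "joinable", "true")

if __name__ == "__main__":
    make_filesystems_joinable()
```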

View File

@ -213,13 +213,7 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
desc='Orchestrator backend',
enum_allowed=['cephadm', 'rook', 'test_orchestrator'],
runtime=True,
),
Option(
'fail_fs',
type='bool',
default=False,
desc='Fail filesystem for rapid multi-rank mds upgrade'
),
)
]
NATIVE_OPTIONS = [] # type: List[dict]
@ -345,9 +339,6 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
def _select_orchestrator(self) -> str:
return cast(str, self.get_module_option("orchestrator"))
def _get_fail_fs_value(self) -> bool:
return bool(self.get_module_option("fail_fs"))
@_cli_write_command('orch host add')
def _add_host(self,
hostname: str,
@ -1493,12 +1484,6 @@ Usage:
self._set_backend('')
assert self._select_orchestrator() is None
self._set_backend(old_orch)
old_fs_fail_value = self._get_fail_fs_value()
self.set_module_option("fail_fs", True)
assert self._get_fail_fs_value() is True
self.set_module_option("fail_fs", False)
assert self._get_fail_fs_value() is False
self.set_module_option("fail_fs", old_fs_fail_value)
e1 = self.remote('selftest', 'remote_from_orchestrator_cli_self_test', "ZeroDivisionError")
try: