From 3e35324dd70771d7f52d5e055d778069897f6b58 Mon Sep 17 00:00:00 2001 From: Adam King Date: Fri, 21 Oct 2022 11:57:53 -0400 Subject: [PATCH 1/7] mgr/cephadm: abort upgrade if there are offline hosts. We won't be able to complete the upgrade if there are offline hosts anyway so we might as well abort immediately. Signed-off-by: Adam King --- src/pybind/mgr/cephadm/module.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index a226e808502..1a523ed9f98 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -2989,7 +2989,9 @@ Then run the following: def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None, host_placement: Optional[str] = None, services: Optional[List[str]] = None, limit: Optional[int] = None) -> str: if self.inventory.get_host_with_state("maintenance"): - raise OrchestratorError("upgrade aborted - you have host(s) in maintenance state") + raise OrchestratorError("Upgrade aborted - you have host(s) in maintenance state") + if self.offline_hosts: + raise OrchestratorError(f"Upgrade aborted - Some host(s) are currently offline: {self.offline_hosts}") if daemon_types is not None and services is not None: raise OrchestratorError('--daemon-types and --services are mutually exclusive') if daemon_types is not None: From bdd8008d246e1c1dc662e3492a3230c2e8c465dc Mon Sep 17 00:00:00 2001 From: Adam King Date: Fri, 21 Oct 2022 12:56:44 -0400 Subject: [PATCH 2/7] mgr/cephadm: introduce HostConnectionError exception type in ssh handling The idea is to be able to know elsewhere that the OrchestratorError we are looking at is specifically one raised due to a failure to connnect to a host. This can hopefully allow for some more precise error handling Signed-off-by: Adam King --- src/pybind/mgr/cephadm/ssh.py | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/pybind/mgr/cephadm/ssh.py b/src/pybind/mgr/cephadm/ssh.py index bdd3ae046dd..8d4fca96583 100644 --- a/src/pybind/mgr/cephadm/ssh.py +++ b/src/pybind/mgr/cephadm/ssh.py @@ -26,6 +26,14 @@ logger = logging.getLogger(__name__) asyncssh_logger = logging.getLogger('asyncssh') asyncssh_logger.propagate = False + +class HostConnectionError(OrchestratorError): + def __init__(self, message: str, hostname: str, addr: str) -> None: + super().__init__(message) + self.hostname = hostname + self.addr = addr + + DEFAULT_SSH_CONFIG = """ Host * User root @@ -106,19 +114,19 @@ class SSHManager: log_content = log_string.getvalue() msg = f"Can't communicate with remote host `{addr}`, possibly because python3 is not installed there or you are missing NOPASSWD in sudoers. {str(e)}" logger.exception(msg) - raise OrchestratorError(msg) + raise HostConnectionError(msg, host, addr) except asyncssh.Error as e: self.mgr.offline_hosts.add(host) log_content = log_string.getvalue() msg = f'Failed to connect to {host} ({addr}). {str(e)}' + '\n' + f'Log: {log_content}' logger.debug(msg) - raise OrchestratorError(msg) + raise HostConnectionError(msg, host, addr) except Exception as e: self.mgr.offline_hosts.add(host) log_content = log_string.getvalue() logger.exception(str(e)) - raise OrchestratorError( - f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}') + raise HostConnectionError( + f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}', host, addr) finally: log_string.flush() asyncssh_logger.removeHandler(ch) @@ -148,7 +156,12 @@ class SSHManager: logger.debug(f'Connection to {host} failed. {str(e)}') await self._reset_con(host) self.mgr.offline_hosts.add(host) - raise OrchestratorError(f'Unable to reach remote host {host}. {str(e)}') + if not addr: + try: + addr = self.mgr.inventory.get_addr(host) + except Exception: + addr = host + raise HostConnectionError(f'Unable to reach remote host {host}. {str(e)}', host, addr) def _rstrip(v: Union[bytes, str, None]) -> str: if not v: From 225cd2d84b9753b985113998f7f3e969794884ed Mon Sep 17 00:00:00 2001 From: Adam King Date: Fri, 21 Oct 2022 13:12:53 -0400 Subject: [PATCH 3/7] mgr/cephadm: raise a better error on connection error during upgrade Right now failures to connect to a host during the upgrade result in a "failed due to an unexpected exception" error. We can do a bit better than that. Fixes: https://tracker.ceph.com/issues/57891 Signed-off-by: Adam King --- src/pybind/mgr/cephadm/upgrade.py | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index b7ad4a8b66e..948336aa559 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -10,6 +10,7 @@ from cephadm.serve import CephadmServe from cephadm.services.cephadmservice import CephadmDaemonDeploySpec from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \ MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES +from cephadm.ssh import HostConnectionError from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service if TYPE_CHECKING: @@ -122,7 +123,8 @@ class CephadmUpgrade: 'UPGRADE_FAILED_PULL', 'UPGRADE_REDEPLOY_DAEMON', 'UPGRADE_BAD_TARGET_VERSION', - 'UPGRADE_EXCEPTION' + 'UPGRADE_EXCEPTION', + 'UPGRADE_OFFLINE_HOST' ] def __init__(self, mgr: "CephadmOrchestrator"): @@ -491,6 +493,14 @@ class CephadmUpgrade: if self.upgrade_state and not self.upgrade_state.paused: try: self._do_upgrade() + except HostConnectionError as e: + self._fail_upgrade('UPGRADE_OFFLINE_HOST', { + 'severity': 'error', + 'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})', + 'count': 1, + 'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'], + }) + return False except Exception as e: self._fail_upgrade('UPGRADE_EXCEPTION', { 'severity': 'error', @@ -1025,6 +1035,18 @@ class CephadmUpgrade: logger.debug('_do_upgrade no state, exiting') return + if self.mgr.offline_hosts: + # offline host(s), on top of potential connection errors when trying to upgrade a daemon + # or pull an image, can cause issues where daemons are never ok to stop. Since evaluating + # whether or not that risk is present for any given offline hosts is a difficult problem, + # it's best to just fail upgrade cleanly so user can address the offline host(s) + + # the HostConnectionError expects a hostname and addr, so let's just take + # one at random. It doesn't really matter which host we say we couldn't reach here. + hostname: str = list(self.mgr.offline_hosts)[0] + addr: str = self.mgr.inventory.get_addr(hostname) + raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr) + target_image = self.target_image target_id = self.upgrade_state.target_id target_digests = self.upgrade_state.target_digests From 29bff27ccfcbaac1b008b03354736c35b629e81a Mon Sep 17 00:00:00 2001 From: Adam King Date: Fri, 21 Oct 2022 14:41:03 -0400 Subject: [PATCH 4/7] mgr/cephadm: update check-host to handle new HostConnectionError exception Otherwise we'll return a message about the host not being found and to check 'ceph orch host ls' when the actual problem is the host being offline Signed-off-by: Adam King --- src/pybind/mgr/cephadm/module.py | 4 ++++ src/pybind/mgr/cephadm/tests/test_ssh.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index 1a523ed9f98..a8e17b1dfbb 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1098,6 +1098,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule, error_ok=True, no_fsid=True)) if code: return 1, '', ('check-host failed:\n' + '\n'.join(err)) + except ssh.HostConnectionError as e: + self.log.exception(f"check-host failed for '{host}' at addr ({e.addr}) due to connection failure: {str(e)}") + return 1, '', ('check-host failed:\n' + + f"Failed to connect to {host} at address ({e.addr}): {str(e)}") except OrchestratorError: self.log.exception(f"check-host failed for '{host}'") return 1, '', ('check-host failed:\n' diff --git a/src/pybind/mgr/cephadm/tests/test_ssh.py b/src/pybind/mgr/cephadm/tests/test_ssh.py index 3282295e5ca..4197d8d7ef0 100644 --- a/src/pybind/mgr/cephadm/tests/test_ssh.py +++ b/src/pybind/mgr/cephadm/tests/test_ssh.py @@ -38,7 +38,7 @@ class TestWithSSH: asyncssh_connect.side_effect = ConnectionLost('reason') code, out, err = cephadm_module.check_host('test') assert out == '' - assert "Host 'test' not found" in err + assert "Failed to connect to test at address (1::4)" in err out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json() assert out == HostSpec('test', '1::4', status='Offline').to_json() From c05e6766633be0894e602e9d2055e6aa0824a4a0 Mon Sep 17 00:00:00 2001 From: Adam King Date: Fri, 21 Oct 2022 16:03:29 -0400 Subject: [PATCH 5/7] mgr/cephadm: unit tests for upgrade offline host scenarios Signed-off-by: Adam King --- src/pybind/mgr/cephadm/tests/test_upgrade.py | 35 +++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/src/pybind/mgr/cephadm/tests/test_upgrade.py b/src/pybind/mgr/cephadm/tests/test_upgrade.py index 608b68f890f..2315511d447 100644 --- a/src/pybind/mgr/cephadm/tests/test_upgrade.py +++ b/src/pybind/mgr/cephadm/tests/test_upgrade.py @@ -5,7 +5,8 @@ import pytest from ceph.deployment.service_spec import PlacementSpec, ServiceSpec from cephadm import CephadmOrchestrator -from cephadm.upgrade import CephadmUpgrade +from cephadm.upgrade import CephadmUpgrade, UpgradeState +from cephadm.ssh import HostConnectionError from orchestrator import OrchestratorError, DaemonDescription from .fixtures import _run_cephadm, wait, with_host, with_service, \ receive_agent_metadata, async_side_effect @@ -34,6 +35,38 @@ def test_upgrade_start(cephadm_module: CephadmOrchestrator): ) == 'Stopped upgrade to image_id' +@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) +def test_upgrade_start_offline_hosts(cephadm_module: CephadmOrchestrator): + with with_host(cephadm_module, 'test'): + with with_host(cephadm_module, 'test2'): + cephadm_module.offline_hosts = set(['test2']) + with pytest.raises(OrchestratorError, match=r"Upgrade aborted - Some host\(s\) are currently offline: {'test2'}"): + cephadm_module.upgrade_start('image_id', None) + cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block + + +@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) +def test_upgrade_daemons_offline_hosts(cephadm_module: CephadmOrchestrator): + with with_host(cephadm_module, 'test'): + with with_host(cephadm_module, 'test2'): + cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0) + with mock.patch("cephadm.serve.CephadmServe._run_cephadm", side_effect=HostConnectionError('connection failure reason', 'test2', '192.168.122.1')): + _to_upgrade = [(DaemonDescription(daemon_type='crash', daemon_id='test2', hostname='test2'), True)] + with pytest.raises(HostConnectionError, match=r"connection failure reason"): + cephadm_module.upgrade._upgrade_daemons(_to_upgrade, 'target_image', ['digest1']) + + +@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) +def test_do_upgrade_offline_hosts(cephadm_module: CephadmOrchestrator): + with with_host(cephadm_module, 'test'): + with with_host(cephadm_module, 'test2'): + cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0) + cephadm_module.offline_hosts = set(['test2']) + with pytest.raises(HostConnectionError, match=r"Host\(s\) were marked offline: {'test2'}"): + cephadm_module.upgrade._do_upgrade() + cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block + + @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @pytest.mark.parametrize("use_repo_digest", [ From 23f523dd0fdf079d2e0b41113c524e7b53ed3c0c Mon Sep 17 00:00:00 2001 From: Adam King Date: Fri, 21 Oct 2022 16:24:19 -0400 Subject: [PATCH 6/7] mgr/cephadm: clear upgrade health error when upgrade is resumed The health error is no longer valid if the upgrade has been removed. If the issue is still present we'll hit it again. Fixes: https://tracker.ceph.com/issues/57891 Signed-off-by: Adam King --- src/pybind/mgr/cephadm/tests/test_upgrade.py | 13 +++++++++++++ src/pybind/mgr/cephadm/upgrade.py | 2 ++ 2 files changed, 15 insertions(+) diff --git a/src/pybind/mgr/cephadm/tests/test_upgrade.py b/src/pybind/mgr/cephadm/tests/test_upgrade.py index 2315511d447..107395b93a1 100644 --- a/src/pybind/mgr/cephadm/tests/test_upgrade.py +++ b/src/pybind/mgr/cephadm/tests/test_upgrade.py @@ -67,6 +67,19 @@ def test_do_upgrade_offline_hosts(cephadm_module: CephadmOrchestrator): cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block +@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) +@mock.patch("cephadm.module.CephadmOrchestrator.remove_health_warning") +def test_upgrade_resume_clear_health_warnings(_rm_health_warning, cephadm_module: CephadmOrchestrator): + with with_host(cephadm_module, 'test'): + with with_host(cephadm_module, 'test2'): + cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0, paused=True) + _rm_health_warning.return_value = None + assert wait(cephadm_module, cephadm_module.upgrade_resume() + ) == 'Resumed upgrade to target_image' + calls_list = [mock.call(alert_id) for alert_id in cephadm_module.upgrade.UPGRADE_ERRORS] + _rm_health_warning.assert_has_calls(calls_list, any_order=True) + + @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}')) @pytest.mark.parametrize("use_repo_digest", [ diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py index 948336aa559..11dc9cfaae3 100644 --- a/src/pybind/mgr/cephadm/upgrade.py +++ b/src/pybind/mgr/cephadm/upgrade.py @@ -469,6 +469,8 @@ class CephadmUpgrade: self.mgr.log.info('Upgrade: Resumed upgrade to %s' % self.target_image) self._save_upgrade_state() self.mgr.event.set() + for alert_id in self.UPGRADE_ERRORS: + self.mgr.remove_health_warning(alert_id) return 'Resumed upgrade to %s' % self.target_image def upgrade_stop(self) -> str: From beea26fc267c338ac4ed58a39535f017f892ea85 Mon Sep 17 00:00:00 2001 From: Adam King Date: Fri, 21 Oct 2022 16:54:31 -0400 Subject: [PATCH 7/7] mgr/cephadm: remove host from offline_hosts list when removing host Otherwise, it could remain in the list and cephadm could think there is an offline host in the cluster when said host has actually already been removed. Signed-off-by: Adam King --- src/pybind/mgr/cephadm/module.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py index a8e17b1dfbb..7ab5f10f39b 100644 --- a/src/pybind/mgr/cephadm/module.py +++ b/src/pybind/mgr/cephadm/module.py @@ -1582,6 +1582,7 @@ Then run the following: self.inventory.rm_host(host) self.cache.rm_host(host) self.ssh.reset_con(host) + self.offline_hosts_remove(host) # if host was in offline host list, we should remove it now. self.event.set() # refresh stray health check self.log.info('Removed host %s' % host) return "Removed {} host '{}'".format('offline' if offline else '', host)