Merge pull request #48592 from adk3798/offline-upgrade

mgr/cephadm: improve offline host handling, mostly around upgrade Reviewed-by: Redouane Kachach <rkachach@redhat.com>
2025-01-03 09:32:43 +00:00 · 2023-01-13 12:01:41 -05:00 · 2023-01-13 12:01:41 -05:00 · 2b8e7a6155
commit 2b8e7a6155
parent 06206df4ea beea26fc26
5 changed files with 99 additions and 9 deletions
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@ -1106,6 +1106,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
                                                                             error_ok=True, no_fsid=True))
            if code:
                return 1, '', ('check-host failed:\n' + '\n'.join(err))
+        except ssh.HostConnectionError as e:
+            self.log.exception(f"check-host failed for '{host}' at addr ({e.addr}) due to connection failure: {str(e)}")
+            return 1, '', ('check-host failed:\n'
+                           + f"Failed to connect to {host} at address ({e.addr}): {str(e)}")
        except OrchestratorError:
            self.log.exception(f"check-host failed for '{host}'")
            return 1, '', ('check-host failed:\n'
@ -1586,6 +1590,7 @@ Then run the following:
        self.inventory.rm_host(host)
        self.cache.rm_host(host)
        self.ssh.reset_con(host)
+        self.offline_hosts_remove(host)  # if host was in offline host list, we should remove it now.
        self.event.set()  # refresh stray health check
        self.log.info('Removed host %s' % host)
        return "Removed {} host '{}'".format('offline' if offline else '', host)
@ -2998,7 +3003,9 @@ Then run the following:
    def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None, host_placement: Optional[str] = None,
                      services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
        if self.inventory.get_host_with_state("maintenance"):
-            raise OrchestratorError("upgrade aborted - you have host(s) in maintenance state")
+            raise OrchestratorError("Upgrade aborted - you have host(s) in maintenance state")
+        if self.offline_hosts:
+            raise OrchestratorError(f"Upgrade aborted - Some host(s) are currently offline: {self.offline_hosts}")
        if daemon_types is not None and services is not None:
            raise OrchestratorError('--daemon-types and --services are mutually exclusive')
        if daemon_types is not None:
--- a/src/pybind/mgr/cephadm/ssh.py
+++ b/src/pybind/mgr/cephadm/ssh.py
@ -26,6 +26,14 @@ logger = logging.getLogger(__name__)
 asyncssh_logger = logging.getLogger('asyncssh')
 asyncssh_logger.propagate = False

+
+class HostConnectionError(OrchestratorError):
+    def __init__(self, message: str, hostname: str, addr: str) -> None:
+        super().__init__(message)
+        self.hostname = hostname
+        self.addr = addr
+
+
 DEFAULT_SSH_CONFIG = """
 Host *
  User root
@ -106,19 +114,19 @@ class SSHManager:
            log_content = log_string.getvalue()
            msg = f"Can't communicate with remote host `{addr}`, possibly because python3 is not installed there or you are missing NOPASSWD in sudoers. {str(e)}"
            logger.exception(msg)
-            raise OrchestratorError(msg)
+            raise HostConnectionError(msg, host, addr)
        except asyncssh.Error as e:
            self.mgr.offline_hosts.add(host)
            log_content = log_string.getvalue()
            msg = f'Failed to connect to {host} ({addr}). {str(e)}' + '\n' + f'Log: {log_content}'
            logger.debug(msg)
-            raise OrchestratorError(msg)
+            raise HostConnectionError(msg, host, addr)
        except Exception as e:
            self.mgr.offline_hosts.add(host)
            log_content = log_string.getvalue()
            logger.exception(str(e))
-            raise OrchestratorError(
-                f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}')
+            raise HostConnectionError(
+                f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}', host, addr)
        finally:
            log_string.flush()
            asyncssh_logger.removeHandler(ch)
@ -150,7 +158,12 @@ class SSHManager:
            logger.debug(f'Connection to {host} failed. {str(e)}')
            await self._reset_con(host)
            self.mgr.offline_hosts.add(host)
-            raise OrchestratorError(f'Unable to reach remote host {host}. {str(e)}')
+            if not addr:
+                try:
+                    addr = self.mgr.inventory.get_addr(host)
+                except Exception:
+                    addr = host
+            raise HostConnectionError(f'Unable to reach remote host {host}. {str(e)}', host, addr)

        def _rstrip(v: Union[bytes, str, None]) -> str:
            if not v:
--- a/src/pybind/mgr/cephadm/tests/test_ssh.py
+++ b/src/pybind/mgr/cephadm/tests/test_ssh.py
@ -38,7 +38,7 @@ class TestWithSSH:
                asyncssh_connect.side_effect = ConnectionLost('reason')
                code, out, err = cephadm_module.check_host('test')
                assert out == ''
-                assert "Host 'test' not found" in err
+                assert "Failed to connect to test at address (1::4)" in err

                out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
                assert out == HostSpec('test', '1::4', status='Offline').to_json()
--- a/src/pybind/mgr/cephadm/tests/test_upgrade.py
+++ b/src/pybind/mgr/cephadm/tests/test_upgrade.py
@ -5,7 +5,8 @@ import pytest

 from ceph.deployment.service_spec import PlacementSpec, ServiceSpec
 from cephadm import CephadmOrchestrator
-from cephadm.upgrade import CephadmUpgrade
+from cephadm.upgrade import CephadmUpgrade, UpgradeState
+from cephadm.ssh import HostConnectionError
 from orchestrator import OrchestratorError, DaemonDescription
 from .fixtures import _run_cephadm, wait, with_host, with_service, \
    receive_agent_metadata, async_side_effect
@ -34,6 +35,51 @@ def test_upgrade_start(cephadm_module: CephadmOrchestrator):
                            ) == 'Stopped upgrade to image_id'


+@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+def test_upgrade_start_offline_hosts(cephadm_module: CephadmOrchestrator):
+    with with_host(cephadm_module, 'test'):
+        with with_host(cephadm_module, 'test2'):
+            cephadm_module.offline_hosts = set(['test2'])
+            with pytest.raises(OrchestratorError, match=r"Upgrade aborted - Some host\(s\) are currently offline: {'test2'}"):
+                cephadm_module.upgrade_start('image_id', None)
+            cephadm_module.offline_hosts = set([])  # so remove_host doesn't fail when leaving the with_host block
+
+
+@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+def test_upgrade_daemons_offline_hosts(cephadm_module: CephadmOrchestrator):
+    with with_host(cephadm_module, 'test'):
+        with with_host(cephadm_module, 'test2'):
+            cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
+            with mock.patch("cephadm.serve.CephadmServe._run_cephadm", side_effect=HostConnectionError('connection failure reason', 'test2', '192.168.122.1')):
+                _to_upgrade = [(DaemonDescription(daemon_type='crash', daemon_id='test2', hostname='test2'), True)]
+                with pytest.raises(HostConnectionError, match=r"connection failure reason"):
+                    cephadm_module.upgrade._upgrade_daemons(_to_upgrade, 'target_image', ['digest1'])
+
+
+@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+def test_do_upgrade_offline_hosts(cephadm_module: CephadmOrchestrator):
+    with with_host(cephadm_module, 'test'):
+        with with_host(cephadm_module, 'test2'):
+            cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
+            cephadm_module.offline_hosts = set(['test2'])
+            with pytest.raises(HostConnectionError, match=r"Host\(s\) were marked offline: {'test2'}"):
+                cephadm_module.upgrade._do_upgrade()
+            cephadm_module.offline_hosts = set([])  # so remove_host doesn't fail when leaving the with_host block
+
+
+@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+@mock.patch("cephadm.module.CephadmOrchestrator.remove_health_warning")
+def test_upgrade_resume_clear_health_warnings(_rm_health_warning, cephadm_module: CephadmOrchestrator):
+    with with_host(cephadm_module, 'test'):
+        with with_host(cephadm_module, 'test2'):
+            cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0, paused=True)
+            _rm_health_warning.return_value = None
+            assert wait(cephadm_module, cephadm_module.upgrade_resume()
+                        ) == 'Resumed upgrade to target_image'
+            calls_list = [mock.call(alert_id) for alert_id in cephadm_module.upgrade.UPGRADE_ERRORS]
+            _rm_health_warning.assert_has_calls(calls_list, any_order=True)
+
+
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
@pytest.mark.parametrize("use_repo_digest",
                         [
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@ -10,6 +10,7 @@ from cephadm.serve import CephadmServe
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
 from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \
    MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES
+from cephadm.ssh import HostConnectionError
 from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service

 if TYPE_CHECKING:
@ -122,7 +123,8 @@ class CephadmUpgrade:
        'UPGRADE_FAILED_PULL',
        'UPGRADE_REDEPLOY_DAEMON',
        'UPGRADE_BAD_TARGET_VERSION',
-        'UPGRADE_EXCEPTION'
+        'UPGRADE_EXCEPTION',
+        'UPGRADE_OFFLINE_HOST'
    ]

    def __init__(self, mgr: "CephadmOrchestrator"):
@ -467,6 +469,8 @@ class CephadmUpgrade:
        self.mgr.log.info('Upgrade: Resumed upgrade to %s' % self.target_image)
        self._save_upgrade_state()
        self.mgr.event.set()
+        for alert_id in self.UPGRADE_ERRORS:
+            self.mgr.remove_health_warning(alert_id)
        return 'Resumed upgrade to %s' % self.target_image

    def upgrade_stop(self) -> str:
@ -491,6 +495,14 @@ class CephadmUpgrade:
        if self.upgrade_state and not self.upgrade_state.paused:
            try:
                self._do_upgrade()
+            except HostConnectionError as e:
+                self._fail_upgrade('UPGRADE_OFFLINE_HOST', {
+                    'severity': 'error',
+                    'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
+                    'count': 1,
+                    'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'],
+                })
+                return False
            except Exception as e:
                self._fail_upgrade('UPGRADE_EXCEPTION', {
                    'severity': 'error',
@ -1025,6 +1037,18 @@ class CephadmUpgrade:
            logger.debug('_do_upgrade no state, exiting')
            return

+        if self.mgr.offline_hosts:
+            # offline host(s), on top of potential connection errors when trying to upgrade a daemon
+            # or pull an image, can cause issues where daemons are never ok to stop. Since evaluating
+            # whether or not that risk is present for any given offline hosts is a difficult problem,
+            # it's best to just fail upgrade cleanly so user can address the offline host(s)
+
+            # the HostConnectionError expects a hostname and addr, so let's just take
+            # one at random. It doesn't really matter which host we say we couldn't reach here.
+            hostname: str = list(self.mgr.offline_hosts)[0]
+            addr: str = self.mgr.inventory.get_addr(hostname)
+            raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr)
+
        target_image = self.target_image
        target_id = self.upgrade_state.target_id
        target_digests = self.upgrade_state.target_digests