From 3e35324dd70771d7f52d5e055d778069897f6b58 Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Fri, 21 Oct 2022 11:57:53 -0400
Subject: [PATCH 1/7] mgr/cephadm: abort upgrade if there are offline hosts.

We won't be able to complete the upgrade if there are offline
hosts anyway so we might as well abort immediately.

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/module.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index a226e808502..1a523ed9f98 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -2989,7 +2989,9 @@ Then run the following:
     def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None, host_placement: Optional[str] = None,
                       services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
         if self.inventory.get_host_with_state("maintenance"):
-            raise OrchestratorError("upgrade aborted - you have host(s) in maintenance state")
+            raise OrchestratorError("Upgrade aborted - you have host(s) in maintenance state")
+        if self.offline_hosts:
+            raise OrchestratorError(f"Upgrade aborted - Some host(s) are currently offline: {self.offline_hosts}")
         if daemon_types is not None and services is not None:
             raise OrchestratorError('--daemon-types and --services are mutually exclusive')
         if daemon_types is not None:

From bdd8008d246e1c1dc662e3492a3230c2e8c465dc Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Fri, 21 Oct 2022 12:56:44 -0400
Subject: [PATCH 2/7] mgr/cephadm: introduce HostConnectionError exception type
 in ssh handling

The idea is to be able to know elsewhere that the OrchestratorError
we are looking at is specifically one raised due to a failure to
connnect to a host. This can hopefully allow for some more
precise error handling

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/ssh.py | 23 ++++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/src/pybind/mgr/cephadm/ssh.py b/src/pybind/mgr/cephadm/ssh.py
index bdd3ae046dd..8d4fca96583 100644
--- a/src/pybind/mgr/cephadm/ssh.py
+++ b/src/pybind/mgr/cephadm/ssh.py
@@ -26,6 +26,14 @@ logger = logging.getLogger(__name__)
 asyncssh_logger = logging.getLogger('asyncssh')
 asyncssh_logger.propagate = False
 
+
+class HostConnectionError(OrchestratorError):
+    def __init__(self, message: str, hostname: str, addr: str) -> None:
+        super().__init__(message)
+        self.hostname = hostname
+        self.addr = addr
+
+
 DEFAULT_SSH_CONFIG = """
 Host *
   User root
@@ -106,19 +114,19 @@ class SSHManager:
             log_content = log_string.getvalue()
             msg = f"Can't communicate with remote host `{addr}`, possibly because python3 is not installed there or you are missing NOPASSWD in sudoers. {str(e)}"
             logger.exception(msg)
-            raise OrchestratorError(msg)
+            raise HostConnectionError(msg, host, addr)
         except asyncssh.Error as e:
             self.mgr.offline_hosts.add(host)
             log_content = log_string.getvalue()
             msg = f'Failed to connect to {host} ({addr}). {str(e)}' + '\n' + f'Log: {log_content}'
             logger.debug(msg)
-            raise OrchestratorError(msg)
+            raise HostConnectionError(msg, host, addr)
         except Exception as e:
             self.mgr.offline_hosts.add(host)
             log_content = log_string.getvalue()
             logger.exception(str(e))
-            raise OrchestratorError(
-                f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}')
+            raise HostConnectionError(
+                f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}', host, addr)
         finally:
             log_string.flush()
             asyncssh_logger.removeHandler(ch)
@@ -148,7 +156,12 @@ class SSHManager:
             logger.debug(f'Connection to {host} failed. {str(e)}')
             await self._reset_con(host)
             self.mgr.offline_hosts.add(host)
-            raise OrchestratorError(f'Unable to reach remote host {host}. {str(e)}')
+            if not addr:
+                try:
+                    addr = self.mgr.inventory.get_addr(host)
+                except Exception:
+                    addr = host
+            raise HostConnectionError(f'Unable to reach remote host {host}. {str(e)}', host, addr)
 
         def _rstrip(v: Union[bytes, str, None]) -> str:
             if not v:

From 225cd2d84b9753b985113998f7f3e969794884ed Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Fri, 21 Oct 2022 13:12:53 -0400
Subject: [PATCH 3/7] mgr/cephadm: raise a better error on connection error
 during upgrade

Right now failures to connect to a host during the upgrade result
in a "failed due to an unexpected exception" error. We can do a bit
better than that.

Fixes: https://tracker.ceph.com/issues/57891

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/upgrade.py | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py
index b7ad4a8b66e..948336aa559 100644
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -10,6 +10,7 @@ from cephadm.serve import CephadmServe
 from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
 from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \
     MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES
+from cephadm.ssh import HostConnectionError
 from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service
 
 if TYPE_CHECKING:
@@ -122,7 +123,8 @@ class CephadmUpgrade:
         'UPGRADE_FAILED_PULL',
         'UPGRADE_REDEPLOY_DAEMON',
         'UPGRADE_BAD_TARGET_VERSION',
-        'UPGRADE_EXCEPTION'
+        'UPGRADE_EXCEPTION',
+        'UPGRADE_OFFLINE_HOST'
     ]
 
     def __init__(self, mgr: "CephadmOrchestrator"):
@@ -491,6 +493,14 @@ class CephadmUpgrade:
         if self.upgrade_state and not self.upgrade_state.paused:
             try:
                 self._do_upgrade()
+            except HostConnectionError as e:
+                self._fail_upgrade('UPGRADE_OFFLINE_HOST', {
+                    'severity': 'error',
+                    'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
+                    'count': 1,
+                    'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'],
+                })
+                return False
             except Exception as e:
                 self._fail_upgrade('UPGRADE_EXCEPTION', {
                     'severity': 'error',
@@ -1025,6 +1035,18 @@ class CephadmUpgrade:
             logger.debug('_do_upgrade no state, exiting')
             return
 
+        if self.mgr.offline_hosts:
+            # offline host(s), on top of potential connection errors when trying to upgrade a daemon
+            # or pull an image, can cause issues where daemons are never ok to stop. Since evaluating
+            # whether or not that risk is present for any given offline hosts is a difficult problem,
+            # it's best to just fail upgrade cleanly so user can address the offline host(s)
+
+            # the HostConnectionError expects a hostname and addr, so let's just take
+            # one at random. It doesn't really matter which host we say we couldn't reach here.
+            hostname: str = list(self.mgr.offline_hosts)[0]
+            addr: str = self.mgr.inventory.get_addr(hostname)
+            raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr)
+
         target_image = self.target_image
         target_id = self.upgrade_state.target_id
         target_digests = self.upgrade_state.target_digests

From 29bff27ccfcbaac1b008b03354736c35b629e81a Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Fri, 21 Oct 2022 14:41:03 -0400
Subject: [PATCH 4/7] mgr/cephadm: update check-host to handle new
 HostConnectionError exception

Otherwise we'll return a message about the host not being found
and to check 'ceph orch host ls' when the actual problem is the
host being offline

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/module.py         | 4 ++++
 src/pybind/mgr/cephadm/tests/test_ssh.py | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index 1a523ed9f98..a8e17b1dfbb 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1098,6 +1098,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
                                                                              error_ok=True, no_fsid=True))
             if code:
                 return 1, '', ('check-host failed:\n' + '\n'.join(err))
+        except ssh.HostConnectionError as e:
+            self.log.exception(f"check-host failed for '{host}' at addr ({e.addr}) due to connection failure: {str(e)}")
+            return 1, '', ('check-host failed:\n'
+                           + f"Failed to connect to {host} at address ({e.addr}): {str(e)}")
         except OrchestratorError:
             self.log.exception(f"check-host failed for '{host}'")
             return 1, '', ('check-host failed:\n'
diff --git a/src/pybind/mgr/cephadm/tests/test_ssh.py b/src/pybind/mgr/cephadm/tests/test_ssh.py
index 3282295e5ca..4197d8d7ef0 100644
--- a/src/pybind/mgr/cephadm/tests/test_ssh.py
+++ b/src/pybind/mgr/cephadm/tests/test_ssh.py
@@ -38,7 +38,7 @@ class TestWithSSH:
                 asyncssh_connect.side_effect = ConnectionLost('reason')
                 code, out, err = cephadm_module.check_host('test')
                 assert out == ''
-                assert "Host 'test' not found" in err
+                assert "Failed to connect to test at address (1::4)" in err
 
                 out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
                 assert out == HostSpec('test', '1::4', status='Offline').to_json()

From c05e6766633be0894e602e9d2055e6aa0824a4a0 Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Fri, 21 Oct 2022 16:03:29 -0400
Subject: [PATCH 5/7] mgr/cephadm: unit tests for upgrade offline host
 scenarios

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/tests/test_upgrade.py | 35 +++++++++++++++++++-
 1 file changed, 34 insertions(+), 1 deletion(-)

diff --git a/src/pybind/mgr/cephadm/tests/test_upgrade.py b/src/pybind/mgr/cephadm/tests/test_upgrade.py
index 608b68f890f..2315511d447 100644
--- a/src/pybind/mgr/cephadm/tests/test_upgrade.py
+++ b/src/pybind/mgr/cephadm/tests/test_upgrade.py
@@ -5,7 +5,8 @@ import pytest
 
 from ceph.deployment.service_spec import PlacementSpec, ServiceSpec
 from cephadm import CephadmOrchestrator
-from cephadm.upgrade import CephadmUpgrade
+from cephadm.upgrade import CephadmUpgrade, UpgradeState
+from cephadm.ssh import HostConnectionError
 from orchestrator import OrchestratorError, DaemonDescription
 from .fixtures import _run_cephadm, wait, with_host, with_service, \
     receive_agent_metadata, async_side_effect
@@ -34,6 +35,38 @@ def test_upgrade_start(cephadm_module: CephadmOrchestrator):
                             ) == 'Stopped upgrade to image_id'
 
 
+@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+def test_upgrade_start_offline_hosts(cephadm_module: CephadmOrchestrator):
+    with with_host(cephadm_module, 'test'):
+        with with_host(cephadm_module, 'test2'):
+            cephadm_module.offline_hosts = set(['test2'])
+            with pytest.raises(OrchestratorError, match=r"Upgrade aborted - Some host\(s\) are currently offline: {'test2'}"):
+                cephadm_module.upgrade_start('image_id', None)
+            cephadm_module.offline_hosts = set([])  # so remove_host doesn't fail when leaving the with_host block
+
+
+@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+def test_upgrade_daemons_offline_hosts(cephadm_module: CephadmOrchestrator):
+    with with_host(cephadm_module, 'test'):
+        with with_host(cephadm_module, 'test2'):
+            cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
+            with mock.patch("cephadm.serve.CephadmServe._run_cephadm", side_effect=HostConnectionError('connection failure reason', 'test2', '192.168.122.1')):
+                _to_upgrade = [(DaemonDescription(daemon_type='crash', daemon_id='test2', hostname='test2'), True)]
+                with pytest.raises(HostConnectionError, match=r"connection failure reason"):
+                    cephadm_module.upgrade._upgrade_daemons(_to_upgrade, 'target_image', ['digest1'])
+
+
+@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+def test_do_upgrade_offline_hosts(cephadm_module: CephadmOrchestrator):
+    with with_host(cephadm_module, 'test'):
+        with with_host(cephadm_module, 'test2'):
+            cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
+            cephadm_module.offline_hosts = set(['test2'])
+            with pytest.raises(HostConnectionError, match=r"Host\(s\) were marked offline: {'test2'}"):
+                cephadm_module.upgrade._do_upgrade()
+            cephadm_module.offline_hosts = set([])  # so remove_host doesn't fail when leaving the with_host block
+
+
 @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
 @pytest.mark.parametrize("use_repo_digest",
                          [

From 23f523dd0fdf079d2e0b41113c524e7b53ed3c0c Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Fri, 21 Oct 2022 16:24:19 -0400
Subject: [PATCH 6/7] mgr/cephadm: clear upgrade health error when upgrade is
 resumed

The health error is no longer valid if the upgrade has been
removed. If the issue is still present we'll hit it again.

Fixes: https://tracker.ceph.com/issues/57891

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/tests/test_upgrade.py | 13 +++++++++++++
 src/pybind/mgr/cephadm/upgrade.py            |  2 ++
 2 files changed, 15 insertions(+)

diff --git a/src/pybind/mgr/cephadm/tests/test_upgrade.py b/src/pybind/mgr/cephadm/tests/test_upgrade.py
index 2315511d447..107395b93a1 100644
--- a/src/pybind/mgr/cephadm/tests/test_upgrade.py
+++ b/src/pybind/mgr/cephadm/tests/test_upgrade.py
@@ -67,6 +67,19 @@ def test_do_upgrade_offline_hosts(cephadm_module: CephadmOrchestrator):
             cephadm_module.offline_hosts = set([])  # so remove_host doesn't fail when leaving the with_host block
 
 
+@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
+@mock.patch("cephadm.module.CephadmOrchestrator.remove_health_warning")
+def test_upgrade_resume_clear_health_warnings(_rm_health_warning, cephadm_module: CephadmOrchestrator):
+    with with_host(cephadm_module, 'test'):
+        with with_host(cephadm_module, 'test2'):
+            cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0, paused=True)
+            _rm_health_warning.return_value = None
+            assert wait(cephadm_module, cephadm_module.upgrade_resume()
+                        ) == 'Resumed upgrade to target_image'
+            calls_list = [mock.call(alert_id) for alert_id in cephadm_module.upgrade.UPGRADE_ERRORS]
+            _rm_health_warning.assert_has_calls(calls_list, any_order=True)
+
+
 @mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
 @pytest.mark.parametrize("use_repo_digest",
                          [
diff --git a/src/pybind/mgr/cephadm/upgrade.py b/src/pybind/mgr/cephadm/upgrade.py
index 948336aa559..11dc9cfaae3 100644
--- a/src/pybind/mgr/cephadm/upgrade.py
+++ b/src/pybind/mgr/cephadm/upgrade.py
@@ -469,6 +469,8 @@ class CephadmUpgrade:
         self.mgr.log.info('Upgrade: Resumed upgrade to %s' % self.target_image)
         self._save_upgrade_state()
         self.mgr.event.set()
+        for alert_id in self.UPGRADE_ERRORS:
+            self.mgr.remove_health_warning(alert_id)
         return 'Resumed upgrade to %s' % self.target_image
 
     def upgrade_stop(self) -> str:

From beea26fc267c338ac4ed58a39535f017f892ea85 Mon Sep 17 00:00:00 2001
From: Adam King <adking@redhat.com>
Date: Fri, 21 Oct 2022 16:54:31 -0400
Subject: [PATCH 7/7] mgr/cephadm: remove host from offline_hosts list when
 removing host

Otherwise, it could remain in the list and cephadm could think
there is an offline host in the cluster when said host has
actually already been removed.

Signed-off-by: Adam King <adking@redhat.com>
---
 src/pybind/mgr/cephadm/module.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/pybind/mgr/cephadm/module.py b/src/pybind/mgr/cephadm/module.py
index a8e17b1dfbb..7ab5f10f39b 100644
--- a/src/pybind/mgr/cephadm/module.py
+++ b/src/pybind/mgr/cephadm/module.py
@@ -1582,6 +1582,7 @@ Then run the following:
         self.inventory.rm_host(host)
         self.cache.rm_host(host)
         self.ssh.reset_con(host)
+        self.offline_hosts_remove(host)  # if host was in offline host list, we should remove it now.
         self.event.set()  # refresh stray health check
         self.log.info('Removed host %s' % host)
         return "Removed {} host '{}'".format('offline' if offline else '', host)