Merge pull request #48592 from adk3798/offline-upgrade

mgr/cephadm: improve offline host handling, mostly around upgrade

Reviewed-by: Redouane Kachach <rkachach@redhat.com>
This commit is contained in:
Adam King 2023-01-13 12:01:41 -05:00 committed by GitHub
commit 2b8e7a6155
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 99 additions and 9 deletions

View File

@ -1106,6 +1106,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
error_ok=True, no_fsid=True))
if code:
return 1, '', ('check-host failed:\n' + '\n'.join(err))
except ssh.HostConnectionError as e:
self.log.exception(f"check-host failed for '{host}' at addr ({e.addr}) due to connection failure: {str(e)}")
return 1, '', ('check-host failed:\n'
+ f"Failed to connect to {host} at address ({e.addr}): {str(e)}")
except OrchestratorError:
self.log.exception(f"check-host failed for '{host}'")
return 1, '', ('check-host failed:\n'
@ -1586,6 +1590,7 @@ Then run the following:
self.inventory.rm_host(host)
self.cache.rm_host(host)
self.ssh.reset_con(host)
self.offline_hosts_remove(host) # if host was in offline host list, we should remove it now.
self.event.set() # refresh stray health check
self.log.info('Removed host %s' % host)
return "Removed {} host '{}'".format('offline' if offline else '', host)
@ -2998,7 +3003,9 @@ Then run the following:
def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None, host_placement: Optional[str] = None,
services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
if self.inventory.get_host_with_state("maintenance"):
raise OrchestratorError("upgrade aborted - you have host(s) in maintenance state")
raise OrchestratorError("Upgrade aborted - you have host(s) in maintenance state")
if self.offline_hosts:
raise OrchestratorError(f"Upgrade aborted - Some host(s) are currently offline: {self.offline_hosts}")
if daemon_types is not None and services is not None:
raise OrchestratorError('--daemon-types and --services are mutually exclusive')
if daemon_types is not None:

View File

@ -26,6 +26,14 @@ logger = logging.getLogger(__name__)
asyncssh_logger = logging.getLogger('asyncssh')
asyncssh_logger.propagate = False
class HostConnectionError(OrchestratorError):
def __init__(self, message: str, hostname: str, addr: str) -> None:
super().__init__(message)
self.hostname = hostname
self.addr = addr
DEFAULT_SSH_CONFIG = """
Host *
User root
@ -106,19 +114,19 @@ class SSHManager:
log_content = log_string.getvalue()
msg = f"Can't communicate with remote host `{addr}`, possibly because python3 is not installed there or you are missing NOPASSWD in sudoers. {str(e)}"
logger.exception(msg)
raise OrchestratorError(msg)
raise HostConnectionError(msg, host, addr)
except asyncssh.Error as e:
self.mgr.offline_hosts.add(host)
log_content = log_string.getvalue()
msg = f'Failed to connect to {host} ({addr}). {str(e)}' + '\n' + f'Log: {log_content}'
logger.debug(msg)
raise OrchestratorError(msg)
raise HostConnectionError(msg, host, addr)
except Exception as e:
self.mgr.offline_hosts.add(host)
log_content = log_string.getvalue()
logger.exception(str(e))
raise OrchestratorError(
f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}')
raise HostConnectionError(
f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}', host, addr)
finally:
log_string.flush()
asyncssh_logger.removeHandler(ch)
@ -150,7 +158,12 @@ class SSHManager:
logger.debug(f'Connection to {host} failed. {str(e)}')
await self._reset_con(host)
self.mgr.offline_hosts.add(host)
raise OrchestratorError(f'Unable to reach remote host {host}. {str(e)}')
if not addr:
try:
addr = self.mgr.inventory.get_addr(host)
except Exception:
addr = host
raise HostConnectionError(f'Unable to reach remote host {host}. {str(e)}', host, addr)
def _rstrip(v: Union[bytes, str, None]) -> str:
if not v:

View File

@ -38,7 +38,7 @@ class TestWithSSH:
asyncssh_connect.side_effect = ConnectionLost('reason')
code, out, err = cephadm_module.check_host('test')
assert out == ''
assert "Host 'test' not found" in err
assert "Failed to connect to test at address (1::4)" in err
out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
assert out == HostSpec('test', '1::4', status='Offline').to_json()

View File

@ -5,7 +5,8 @@ import pytest
from ceph.deployment.service_spec import PlacementSpec, ServiceSpec
from cephadm import CephadmOrchestrator
from cephadm.upgrade import CephadmUpgrade
from cephadm.upgrade import CephadmUpgrade, UpgradeState
from cephadm.ssh import HostConnectionError
from orchestrator import OrchestratorError, DaemonDescription
from .fixtures import _run_cephadm, wait, with_host, with_service, \
receive_agent_metadata, async_side_effect
@ -34,6 +35,51 @@ def test_upgrade_start(cephadm_module: CephadmOrchestrator):
) == 'Stopped upgrade to image_id'
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_upgrade_start_offline_hosts(cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.offline_hosts = set(['test2'])
with pytest.raises(OrchestratorError, match=r"Upgrade aborted - Some host\(s\) are currently offline: {'test2'}"):
cephadm_module.upgrade_start('image_id', None)
cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_upgrade_daemons_offline_hosts(cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
with mock.patch("cephadm.serve.CephadmServe._run_cephadm", side_effect=HostConnectionError('connection failure reason', 'test2', '192.168.122.1')):
_to_upgrade = [(DaemonDescription(daemon_type='crash', daemon_id='test2', hostname='test2'), True)]
with pytest.raises(HostConnectionError, match=r"connection failure reason"):
cephadm_module.upgrade._upgrade_daemons(_to_upgrade, 'target_image', ['digest1'])
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
def test_do_upgrade_offline_hosts(cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
cephadm_module.offline_hosts = set(['test2'])
with pytest.raises(HostConnectionError, match=r"Host\(s\) were marked offline: {'test2'}"):
cephadm_module.upgrade._do_upgrade()
cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
@mock.patch("cephadm.module.CephadmOrchestrator.remove_health_warning")
def test_upgrade_resume_clear_health_warnings(_rm_health_warning, cephadm_module: CephadmOrchestrator):
with with_host(cephadm_module, 'test'):
with with_host(cephadm_module, 'test2'):
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0, paused=True)
_rm_health_warning.return_value = None
assert wait(cephadm_module, cephadm_module.upgrade_resume()
) == 'Resumed upgrade to target_image'
calls_list = [mock.call(alert_id) for alert_id in cephadm_module.upgrade.UPGRADE_ERRORS]
_rm_health_warning.assert_has_calls(calls_list, any_order=True)
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
@pytest.mark.parametrize("use_repo_digest",
[

View File

@ -10,6 +10,7 @@ from cephadm.serve import CephadmServe
from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \
MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES
from cephadm.ssh import HostConnectionError
from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service
if TYPE_CHECKING:
@ -122,7 +123,8 @@ class CephadmUpgrade:
'UPGRADE_FAILED_PULL',
'UPGRADE_REDEPLOY_DAEMON',
'UPGRADE_BAD_TARGET_VERSION',
'UPGRADE_EXCEPTION'
'UPGRADE_EXCEPTION',
'UPGRADE_OFFLINE_HOST'
]
def __init__(self, mgr: "CephadmOrchestrator"):
@ -467,6 +469,8 @@ class CephadmUpgrade:
self.mgr.log.info('Upgrade: Resumed upgrade to %s' % self.target_image)
self._save_upgrade_state()
self.mgr.event.set()
for alert_id in self.UPGRADE_ERRORS:
self.mgr.remove_health_warning(alert_id)
return 'Resumed upgrade to %s' % self.target_image
def upgrade_stop(self) -> str:
@ -491,6 +495,14 @@ class CephadmUpgrade:
if self.upgrade_state and not self.upgrade_state.paused:
try:
self._do_upgrade()
except HostConnectionError as e:
self._fail_upgrade('UPGRADE_OFFLINE_HOST', {
'severity': 'error',
'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
'count': 1,
'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'],
})
return False
except Exception as e:
self._fail_upgrade('UPGRADE_EXCEPTION', {
'severity': 'error',
@ -1025,6 +1037,18 @@ class CephadmUpgrade:
logger.debug('_do_upgrade no state, exiting')
return
if self.mgr.offline_hosts:
# offline host(s), on top of potential connection errors when trying to upgrade a daemon
# or pull an image, can cause issues where daemons are never ok to stop. Since evaluating
# whether or not that risk is present for any given offline hosts is a difficult problem,
# it's best to just fail upgrade cleanly so user can address the offline host(s)
# the HostConnectionError expects a hostname and addr, so let's just take
# one at random. It doesn't really matter which host we say we couldn't reach here.
hostname: str = list(self.mgr.offline_hosts)[0]
addr: str = self.mgr.inventory.get_addr(hostname)
raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr)
target_image = self.target_image
target_id = self.upgrade_state.target_id
target_digests = self.upgrade_state.target_digests