mirror of
https://github.com/ceph/ceph
synced 2025-01-03 09:32:43 +00:00
Merge pull request #48592 from adk3798/offline-upgrade
mgr/cephadm: improve offline host handling, mostly around upgrade Reviewed-by: Redouane Kachach <rkachach@redhat.com>
This commit is contained in:
commit
2b8e7a6155
@ -1106,6 +1106,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
|
||||
error_ok=True, no_fsid=True))
|
||||
if code:
|
||||
return 1, '', ('check-host failed:\n' + '\n'.join(err))
|
||||
except ssh.HostConnectionError as e:
|
||||
self.log.exception(f"check-host failed for '{host}' at addr ({e.addr}) due to connection failure: {str(e)}")
|
||||
return 1, '', ('check-host failed:\n'
|
||||
+ f"Failed to connect to {host} at address ({e.addr}): {str(e)}")
|
||||
except OrchestratorError:
|
||||
self.log.exception(f"check-host failed for '{host}'")
|
||||
return 1, '', ('check-host failed:\n'
|
||||
@ -1586,6 +1590,7 @@ Then run the following:
|
||||
self.inventory.rm_host(host)
|
||||
self.cache.rm_host(host)
|
||||
self.ssh.reset_con(host)
|
||||
self.offline_hosts_remove(host) # if host was in offline host list, we should remove it now.
|
||||
self.event.set() # refresh stray health check
|
||||
self.log.info('Removed host %s' % host)
|
||||
return "Removed {} host '{}'".format('offline' if offline else '', host)
|
||||
@ -2998,7 +3003,9 @@ Then run the following:
|
||||
def upgrade_start(self, image: str, version: str, daemon_types: Optional[List[str]] = None, host_placement: Optional[str] = None,
|
||||
services: Optional[List[str]] = None, limit: Optional[int] = None) -> str:
|
||||
if self.inventory.get_host_with_state("maintenance"):
|
||||
raise OrchestratorError("upgrade aborted - you have host(s) in maintenance state")
|
||||
raise OrchestratorError("Upgrade aborted - you have host(s) in maintenance state")
|
||||
if self.offline_hosts:
|
||||
raise OrchestratorError(f"Upgrade aborted - Some host(s) are currently offline: {self.offline_hosts}")
|
||||
if daemon_types is not None and services is not None:
|
||||
raise OrchestratorError('--daemon-types and --services are mutually exclusive')
|
||||
if daemon_types is not None:
|
||||
|
@ -26,6 +26,14 @@ logger = logging.getLogger(__name__)
|
||||
asyncssh_logger = logging.getLogger('asyncssh')
|
||||
asyncssh_logger.propagate = False
|
||||
|
||||
|
||||
class HostConnectionError(OrchestratorError):
|
||||
def __init__(self, message: str, hostname: str, addr: str) -> None:
|
||||
super().__init__(message)
|
||||
self.hostname = hostname
|
||||
self.addr = addr
|
||||
|
||||
|
||||
DEFAULT_SSH_CONFIG = """
|
||||
Host *
|
||||
User root
|
||||
@ -106,19 +114,19 @@ class SSHManager:
|
||||
log_content = log_string.getvalue()
|
||||
msg = f"Can't communicate with remote host `{addr}`, possibly because python3 is not installed there or you are missing NOPASSWD in sudoers. {str(e)}"
|
||||
logger.exception(msg)
|
||||
raise OrchestratorError(msg)
|
||||
raise HostConnectionError(msg, host, addr)
|
||||
except asyncssh.Error as e:
|
||||
self.mgr.offline_hosts.add(host)
|
||||
log_content = log_string.getvalue()
|
||||
msg = f'Failed to connect to {host} ({addr}). {str(e)}' + '\n' + f'Log: {log_content}'
|
||||
logger.debug(msg)
|
||||
raise OrchestratorError(msg)
|
||||
raise HostConnectionError(msg, host, addr)
|
||||
except Exception as e:
|
||||
self.mgr.offline_hosts.add(host)
|
||||
log_content = log_string.getvalue()
|
||||
logger.exception(str(e))
|
||||
raise OrchestratorError(
|
||||
f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}')
|
||||
raise HostConnectionError(
|
||||
f'Failed to connect to {host} ({addr}): {repr(e)}' + '\n' f'Log: {log_content}', host, addr)
|
||||
finally:
|
||||
log_string.flush()
|
||||
asyncssh_logger.removeHandler(ch)
|
||||
@ -150,7 +158,12 @@ class SSHManager:
|
||||
logger.debug(f'Connection to {host} failed. {str(e)}')
|
||||
await self._reset_con(host)
|
||||
self.mgr.offline_hosts.add(host)
|
||||
raise OrchestratorError(f'Unable to reach remote host {host}. {str(e)}')
|
||||
if not addr:
|
||||
try:
|
||||
addr = self.mgr.inventory.get_addr(host)
|
||||
except Exception:
|
||||
addr = host
|
||||
raise HostConnectionError(f'Unable to reach remote host {host}. {str(e)}', host, addr)
|
||||
|
||||
def _rstrip(v: Union[bytes, str, None]) -> str:
|
||||
if not v:
|
||||
|
@ -38,7 +38,7 @@ class TestWithSSH:
|
||||
asyncssh_connect.side_effect = ConnectionLost('reason')
|
||||
code, out, err = cephadm_module.check_host('test')
|
||||
assert out == ''
|
||||
assert "Host 'test' not found" in err
|
||||
assert "Failed to connect to test at address (1::4)" in err
|
||||
|
||||
out = wait(cephadm_module, cephadm_module.get_hosts())[0].to_json()
|
||||
assert out == HostSpec('test', '1::4', status='Offline').to_json()
|
||||
|
@ -5,7 +5,8 @@ import pytest
|
||||
|
||||
from ceph.deployment.service_spec import PlacementSpec, ServiceSpec
|
||||
from cephadm import CephadmOrchestrator
|
||||
from cephadm.upgrade import CephadmUpgrade
|
||||
from cephadm.upgrade import CephadmUpgrade, UpgradeState
|
||||
from cephadm.ssh import HostConnectionError
|
||||
from orchestrator import OrchestratorError, DaemonDescription
|
||||
from .fixtures import _run_cephadm, wait, with_host, with_service, \
|
||||
receive_agent_metadata, async_side_effect
|
||||
@ -34,6 +35,51 @@ def test_upgrade_start(cephadm_module: CephadmOrchestrator):
|
||||
) == 'Stopped upgrade to image_id'
|
||||
|
||||
|
||||
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
|
||||
def test_upgrade_start_offline_hosts(cephadm_module: CephadmOrchestrator):
|
||||
with with_host(cephadm_module, 'test'):
|
||||
with with_host(cephadm_module, 'test2'):
|
||||
cephadm_module.offline_hosts = set(['test2'])
|
||||
with pytest.raises(OrchestratorError, match=r"Upgrade aborted - Some host\(s\) are currently offline: {'test2'}"):
|
||||
cephadm_module.upgrade_start('image_id', None)
|
||||
cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block
|
||||
|
||||
|
||||
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
|
||||
def test_upgrade_daemons_offline_hosts(cephadm_module: CephadmOrchestrator):
|
||||
with with_host(cephadm_module, 'test'):
|
||||
with with_host(cephadm_module, 'test2'):
|
||||
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
|
||||
with mock.patch("cephadm.serve.CephadmServe._run_cephadm", side_effect=HostConnectionError('connection failure reason', 'test2', '192.168.122.1')):
|
||||
_to_upgrade = [(DaemonDescription(daemon_type='crash', daemon_id='test2', hostname='test2'), True)]
|
||||
with pytest.raises(HostConnectionError, match=r"connection failure reason"):
|
||||
cephadm_module.upgrade._upgrade_daemons(_to_upgrade, 'target_image', ['digest1'])
|
||||
|
||||
|
||||
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
|
||||
def test_do_upgrade_offline_hosts(cephadm_module: CephadmOrchestrator):
|
||||
with with_host(cephadm_module, 'test'):
|
||||
with with_host(cephadm_module, 'test2'):
|
||||
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0)
|
||||
cephadm_module.offline_hosts = set(['test2'])
|
||||
with pytest.raises(HostConnectionError, match=r"Host\(s\) were marked offline: {'test2'}"):
|
||||
cephadm_module.upgrade._do_upgrade()
|
||||
cephadm_module.offline_hosts = set([]) # so remove_host doesn't fail when leaving the with_host block
|
||||
|
||||
|
||||
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
|
||||
@mock.patch("cephadm.module.CephadmOrchestrator.remove_health_warning")
|
||||
def test_upgrade_resume_clear_health_warnings(_rm_health_warning, cephadm_module: CephadmOrchestrator):
|
||||
with with_host(cephadm_module, 'test'):
|
||||
with with_host(cephadm_module, 'test2'):
|
||||
cephadm_module.upgrade.upgrade_state = UpgradeState('target_image', 0, paused=True)
|
||||
_rm_health_warning.return_value = None
|
||||
assert wait(cephadm_module, cephadm_module.upgrade_resume()
|
||||
) == 'Resumed upgrade to target_image'
|
||||
calls_list = [mock.call(alert_id) for alert_id in cephadm_module.upgrade.UPGRADE_ERRORS]
|
||||
_rm_health_warning.assert_has_calls(calls_list, any_order=True)
|
||||
|
||||
|
||||
@mock.patch("cephadm.serve.CephadmServe._run_cephadm", _run_cephadm('{}'))
|
||||
@pytest.mark.parametrize("use_repo_digest",
|
||||
[
|
||||
|
@ -10,6 +10,7 @@ from cephadm.serve import CephadmServe
|
||||
from cephadm.services.cephadmservice import CephadmDaemonDeploySpec
|
||||
from cephadm.utils import ceph_release_to_major, name_to_config_section, CEPH_UPGRADE_ORDER, \
|
||||
MONITORING_STACK_TYPES, CEPH_TYPES, GATEWAY_TYPES
|
||||
from cephadm.ssh import HostConnectionError
|
||||
from orchestrator import OrchestratorError, DaemonDescription, DaemonDescriptionStatus, daemon_type_to_service
|
||||
|
||||
if TYPE_CHECKING:
|
||||
@ -122,7 +123,8 @@ class CephadmUpgrade:
|
||||
'UPGRADE_FAILED_PULL',
|
||||
'UPGRADE_REDEPLOY_DAEMON',
|
||||
'UPGRADE_BAD_TARGET_VERSION',
|
||||
'UPGRADE_EXCEPTION'
|
||||
'UPGRADE_EXCEPTION',
|
||||
'UPGRADE_OFFLINE_HOST'
|
||||
]
|
||||
|
||||
def __init__(self, mgr: "CephadmOrchestrator"):
|
||||
@ -467,6 +469,8 @@ class CephadmUpgrade:
|
||||
self.mgr.log.info('Upgrade: Resumed upgrade to %s' % self.target_image)
|
||||
self._save_upgrade_state()
|
||||
self.mgr.event.set()
|
||||
for alert_id in self.UPGRADE_ERRORS:
|
||||
self.mgr.remove_health_warning(alert_id)
|
||||
return 'Resumed upgrade to %s' % self.target_image
|
||||
|
||||
def upgrade_stop(self) -> str:
|
||||
@ -491,6 +495,14 @@ class CephadmUpgrade:
|
||||
if self.upgrade_state and not self.upgrade_state.paused:
|
||||
try:
|
||||
self._do_upgrade()
|
||||
except HostConnectionError as e:
|
||||
self._fail_upgrade('UPGRADE_OFFLINE_HOST', {
|
||||
'severity': 'error',
|
||||
'summary': f'Upgrade: Failed to connect to host {e.hostname} at addr ({e.addr})',
|
||||
'count': 1,
|
||||
'detail': [f'SSH connection failed to {e.hostname} at addr ({e.addr}): {str(e)}'],
|
||||
})
|
||||
return False
|
||||
except Exception as e:
|
||||
self._fail_upgrade('UPGRADE_EXCEPTION', {
|
||||
'severity': 'error',
|
||||
@ -1025,6 +1037,18 @@ class CephadmUpgrade:
|
||||
logger.debug('_do_upgrade no state, exiting')
|
||||
return
|
||||
|
||||
if self.mgr.offline_hosts:
|
||||
# offline host(s), on top of potential connection errors when trying to upgrade a daemon
|
||||
# or pull an image, can cause issues where daemons are never ok to stop. Since evaluating
|
||||
# whether or not that risk is present for any given offline hosts is a difficult problem,
|
||||
# it's best to just fail upgrade cleanly so user can address the offline host(s)
|
||||
|
||||
# the HostConnectionError expects a hostname and addr, so let's just take
|
||||
# one at random. It doesn't really matter which host we say we couldn't reach here.
|
||||
hostname: str = list(self.mgr.offline_hosts)[0]
|
||||
addr: str = self.mgr.inventory.get_addr(hostname)
|
||||
raise HostConnectionError(f'Host(s) were marked offline: {self.mgr.offline_hosts}', hostname, addr)
|
||||
|
||||
target_image = self.target_image
|
||||
target_id = self.upgrade_state.target_id
|
||||
target_digests = self.upgrade_state.target_digests
|
||||
|
Loading…
Reference in New Issue
Block a user