mirror of
https://github.com/ceph/ceph
synced 2025-01-02 09:02:34 +00:00
mgr/cephadm: force flag for ok-to-stop and ok-to-stop for monitoring stack
Daemons that could cause data loss when stopped will always block. Daemons that would only cause a loss of availability should block, but offer a workaround in the form of a force flag if the user is okay with the service being down. Also implements ok-to-stop for the monitoring stack daemons, which use this system of blocking on availability loss unless the force flag is provided. Signed-off-by: Adam King <adking@redhat.com> Signed-off-by: Daniel-Pivonka <dpivonka@redhat.com>
This commit is contained in:
parent
bfa04adb65
commit
aeeffb07f1
@ -72,6 +72,13 @@ Add and remove hosts::
|
||||
ceph orch host add <hostname> [<addr>] [<labels>...]
|
||||
ceph orch host rm <hostname>
|
||||
|
||||
Place a host in and out of maintenance mode (stops all Ceph daemons on host)::
|
||||
|
||||
ceph orch host maintenance enter <hostname> [--force]
|
||||
ceph orch host maintenance exit <hostname>
|
||||
|
||||
The force flag, when entering maintenance, allows the user to bypass warnings (but not alerts).
|
||||
|
||||
For cephadm, see also :ref:`cephadm-fqdn` and :ref:`cephadm-removing-hosts`.
|
||||
|
||||
Host Specification
|
||||
|
@ -938,7 +938,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
|
||||
Text that is appended to all daemon's ceph.conf.
|
||||
Mainly a workaround, till `config generate-minimal-conf` generates
|
||||
a complete ceph.conf.
|
||||
|
||||
|
||||
Warning: this is a dangerous operation.
|
||||
"""
|
||||
if inbuf:
|
||||
@ -1250,7 +1250,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
|
||||
self.log.info('Removed label %s to host %s' % (label, host))
|
||||
return 'Removed label %s from host %s' % (label, host)
|
||||
|
||||
def _host_ok_to_stop(self, hostname: str) -> Tuple[int, str]:
|
||||
def _host_ok_to_stop(self, hostname: str, force: bool = False) -> Tuple[int, str]:
|
||||
self.log.debug("running host-ok-to-stop checks")
|
||||
daemons = self.cache.get_daemons()
|
||||
daemon_map: Dict[str, List[str]] = defaultdict(lambda: [])
|
||||
@ -1261,12 +1261,28 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
|
||||
if dd.hostname == hostname:
|
||||
daemon_map[dd.daemon_type].append(dd.daemon_id)
|
||||
|
||||
notifications: List[str] = []
|
||||
error_notifications: List[str] = []
|
||||
okay: bool = True
|
||||
for daemon_type, daemon_ids in daemon_map.items():
|
||||
r = self.cephadm_services[daemon_type_to_service(daemon_type)].ok_to_stop(daemon_ids)
|
||||
r = self.cephadm_services[daemon_type_to_service(
|
||||
daemon_type)].ok_to_stop(daemon_ids, force)
|
||||
if r.retval:
|
||||
self.log.error(f'It is NOT safe to stop host {hostname}')
|
||||
return r.retval, r.stderr
|
||||
okay = False
|
||||
# collect error notifications so user can see every daemon causing host
|
||||
# to not be okay to stop
|
||||
error_notifications.append(r.stderr)
|
||||
if r.stdout:
|
||||
# if extra notifications to print for user, add them to notifications list
|
||||
notifications.append(r.stdout)
|
||||
|
||||
if not okay:
|
||||
# at least one daemon is not okay to stop
|
||||
return 1, '\n'.join(error_notifications)
|
||||
|
||||
if notifications:
|
||||
return 0, (f'It is presumed safe to stop host {hostname}. ' +
|
||||
'Note the following:\n\n' + '\n'.join(notifications))
|
||||
return 0, f'It is presumed safe to stop host {hostname}'
|
||||
|
||||
@trivial_completion
|
||||
@ -1301,7 +1317,7 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
|
||||
|
||||
@trivial_completion
|
||||
@host_exists()
|
||||
def enter_host_maintenance(self, hostname: str) -> str:
|
||||
def enter_host_maintenance(self, hostname: str, force: bool = False) -> str:
|
||||
""" Attempt to place a cluster host in maintenance
|
||||
|
||||
Placing a host into maintenance disables the cluster's ceph target in systemd
|
||||
@ -1330,9 +1346,10 @@ class CephadmOrchestrator(orchestrator.Orchestrator, MgrModule,
|
||||
if host_daemons:
|
||||
# daemons on this host, so check the daemons can be stopped
|
||||
# and if so, place the host into maintenance by disabling the target
|
||||
rc, msg = self._host_ok_to_stop(hostname)
|
||||
rc, msg = self._host_ok_to_stop(hostname, force)
|
||||
if rc:
|
||||
raise OrchestratorError(msg, errno=rc)
|
||||
raise OrchestratorError(
|
||||
msg + '\nNote: Warnings can be bypassed with the --force flag', errno=rc)
|
||||
|
||||
# call the host-maintenance function
|
||||
out, _err, _code = CephadmServe(self)._run_cephadm(hostname, cephadmNoImage, "host-maintenance",
|
||||
|
@ -593,7 +593,8 @@ class CephadmServe:
|
||||
def _ok_to_stop(remove_daemon_hosts: Set[orchestrator.DaemonDescription]) -> bool:
|
||||
daemon_ids = [d.daemon_id for d in remove_daemon_hosts]
|
||||
assert None not in daemon_ids
|
||||
r = self.mgr.cephadm_services[service_type].ok_to_stop(cast(List[str], daemon_ids))
|
||||
# setting force flag retains previous behavior, should revisit later.
|
||||
r = self.mgr.cephadm_services[service_type].ok_to_stop(cast(List[str], daemon_ids), force=True)
|
||||
return not r.retval
|
||||
|
||||
while remove_daemon_hosts and not _ok_to_stop(remove_daemon_hosts):
|
||||
|
@ -208,7 +208,7 @@ class CephadmService(metaclass=ABCMeta):
|
||||
except MonCommandFailed as e:
|
||||
logger.warning('Failed to set Dashboard config for %s: %s', service_name, e)
|
||||
|
||||
def ok_to_stop(self, daemon_ids: List[str]) -> HandleCommandResult:
|
||||
def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
|
||||
names = [f'{self.TYPE}.{d_id}' for d_id in daemon_ids]
|
||||
out = f'It is presumed safe to stop {names}'
|
||||
err = f'It is NOT safe to stop {names}'
|
||||
@ -231,6 +231,27 @@ class CephadmService(metaclass=ABCMeta):
|
||||
logger.info(out)
|
||||
return HandleCommandResult(r.retval, out, r.stderr)
|
||||
|
||||
def _enough_daemons_to_stop(self, daemon_type: str, daemon_ids: List[str], service: str, low_limit: int) -> Tuple[bool, str]:
    """Tell whether stopping ``daemon_ids`` leaves too few daemons running.

    Counts the cached daemons of ``daemon_type`` whose ``status`` is 1
    (treated here as "running"), subtracts the number of daemons that are
    about to be stopped and compares the remainder with ``low_limit``.

    :param daemon_type: daemon type used for the cache lookup and for the
        daemon names shown in the "safe to stop" message
    :param daemon_ids: ids of the daemons that are going to be stopped
    :param service: service name inserted into the warning text
    :param low_limit: minimum number of daemons that must keep running
    :return: ``(warn, message)``. ``warn`` is False and ``message`` is a
        "presumed safe" note when at least ``low_limit`` daemons remain;
        otherwise ``warn`` is True and ``message`` is the warning text.
    """
    names = [f'{daemon_type}.{d_id}' for d_id in daemon_ids]
    number_of_running_daemons = sum(
        1 for daemon in self.mgr.cache.get_daemons_by_type(daemon_type) if daemon.status == 1)

    remaining = number_of_running_daemons - len(daemon_ids)
    if remaining >= low_limit:
        return False, f'It is presumed safe to stop {names}'

    # More ids than running daemons may be passed in (e.g. some of them are
    # already down); never report a negative number of daemons to the user.
    num_daemons_left = max(remaining, 0)

    def plural(count: int) -> str:
        return 'daemon' if count == 1 else 'daemons'

    left_count = "no" if num_daemons_left == 0 else num_daemons_left

    out = (f'WARNING: Stopping {len(daemon_ids)} out of {number_of_running_daemons} daemons in {service} service. '
           f'Service will not be operational with {left_count} {plural(num_daemons_left)} left. '
           f'At least {low_limit} {plural(low_limit)} must be running to guarantee service. ')
    return True, out
|
||||
|
||||
def pre_remove(self, daemon: DaemonDescription) -> None:
|
||||
"""
|
||||
Called before the daemon is removed.
|
||||
|
@ -1,7 +1,10 @@
|
||||
import errno
|
||||
import logging
|
||||
import os
|
||||
from typing import List, Any, Tuple, Dict
|
||||
|
||||
from mgr_module import HandleCommandResult
|
||||
|
||||
from orchestrator import DaemonDescription
|
||||
from ceph.deployment.service_spec import AlertManagerSpec
|
||||
from cephadm.services.cephadmservice import CephadmService, CephadmDaemonSpec
|
||||
@ -80,6 +83,12 @@ class GrafanaService(CephadmService):
|
||||
service_url
|
||||
)
|
||||
|
||||
def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
    """Report whether the given Grafana daemons may be stopped.

    Asks ``_enough_daemons_to_stop`` whether at least one daemon of this
    type would remain. A warning blocks the stop with ``-errno.EBUSY``
    unless ``force`` is set; in every other case the result is 0 and the
    message is passed back as the second field of the result.
    """
    should_warn, message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Grafana', 1)
    if force or not should_warn:
        return HandleCommandResult(0, message, None)
    # availability would be lost and the caller did not ask to override
    return HandleCommandResult(-errno.EBUSY, None, message)
|
||||
|
||||
|
||||
class AlertmanagerService(CephadmService):
|
||||
TYPE = 'alertmanager'
|
||||
@ -165,6 +174,12 @@ class AlertmanagerService(CephadmService):
|
||||
service_url
|
||||
)
|
||||
|
||||
def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
    """Report whether the given Alertmanager daemons may be stopped.

    Asks ``_enough_daemons_to_stop`` whether at least one daemon of this
    type would remain. A warning blocks the stop with ``-errno.EBUSY``
    unless ``force`` is set; in every other case the result is 0 and the
    message is passed back as the second field of the result.
    """
    should_warn, message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Alertmanager', 1)
    if force or not should_warn:
        return HandleCommandResult(0, message, None)
    # availability would be lost and the caller did not ask to override
    return HandleCommandResult(-errno.EBUSY, None, message)
|
||||
|
||||
|
||||
class PrometheusService(CephadmService):
|
||||
TYPE = 'prometheus'
|
||||
@ -263,6 +278,12 @@ class PrometheusService(CephadmService):
|
||||
service_url
|
||||
)
|
||||
|
||||
def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
    """Report whether the given Prometheus daemons may be stopped.

    Asks ``_enough_daemons_to_stop`` whether at least one daemon of this
    type would remain. A warning blocks the stop with ``-errno.EBUSY``
    unless ``force`` is set; in every other case the result is 0 and the
    message is passed back as the second field of the result.
    """
    should_warn, message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Prometheus', 1)
    if force or not should_warn:
        return HandleCommandResult(0, message, None)
    # availability would be lost and the caller did not ask to override
    return HandleCommandResult(-errno.EBUSY, None, message)
|
||||
|
||||
|
||||
class NodeExporterService(CephadmService):
|
||||
TYPE = 'node-exporter'
|
||||
@ -274,3 +295,9 @@ class NodeExporterService(CephadmService):
|
||||
def generate_config(self, daemon_spec: CephadmDaemonSpec) -> Tuple[Dict[str, Any], List[str]]:
    """Return an empty dict and an empty list for the given daemon spec.

    Asserts that the spec's daemon type matches this service's ``TYPE``.
    """
    assert self.TYPE == daemon_spec.daemon_type
    config: Dict[str, Any] = {}
    extras: List[str] = []
    return config, extras
|
||||
|
||||
def ok_to_stop(self, daemon_ids: List[str], force: bool = False) -> HandleCommandResult:
    """Always report that the given node-exporter daemons may be stopped.

    Node exporter runs on each host and cannot compromise data, so no
    checks are made and ``force`` is not consulted. The result carries
    retval 0 and a message naming the daemons as ``<TYPE>.<id>``.
    """
    daemon_names = ['{}.{}'.format(self.TYPE, ident) for ident in daemon_ids]
    return HandleCommandResult(0, f'It is presumed safe to stop {daemon_names}', None)
|
||||
|
@ -776,7 +776,7 @@ class TestCephadm(object):
|
||||
match_glob(out, "Scheduled mds.fsname update...")
|
||||
CephadmServe(cephadm_module)._apply_all_services()
|
||||
|
||||
ok_to_stop.assert_called_with([daemon[4:]])
|
||||
ok_to_stop.assert_called_with([daemon[4:]], force=True)
|
||||
|
||||
assert_rm_daemon(cephadm_module, spec.service_name(), 'host1') # verifies ok-to-stop
|
||||
assert_rm_daemon(cephadm_module, spec.service_name(), 'host2')
|
||||
|
@ -7,7 +7,7 @@ from typing import TYPE_CHECKING, Optional, Dict
|
||||
import orchestrator
|
||||
from cephadm.serve import CephadmServe
|
||||
from cephadm.utils import name_to_config_section, CEPH_UPGRADE_ORDER
|
||||
from orchestrator import OrchestratorError, DaemonDescription
|
||||
from orchestrator import OrchestratorError, DaemonDescription, daemon_type_to_service, service_to_daemon_types
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .module import CephadmOrchestrator
|
||||
@ -179,7 +179,9 @@ class CephadmUpgrade:
|
||||
if not self.upgrade_state or self.upgrade_state.paused:
|
||||
return False
|
||||
|
||||
r = self.mgr.cephadm_services[s.daemon_type].ok_to_stop([s.daemon_id])
|
||||
# setting force flag to retain old functionality.
|
||||
r = self.mgr.cephadm_services[daemon_type_to_service(s.daemon_type)].ok_to_stop([
|
||||
s.daemon_id], force=True)
|
||||
|
||||
if not r.retval:
|
||||
logger.info(f'Upgrade: {r.stdout}')
|
||||
|
@ -16,4 +16,4 @@ from ._interface import \
|
||||
DaemonDescription, \
|
||||
OrchestratorEvent, set_exception_subject, \
|
||||
InventoryHost, DeviceLightLoc, \
|
||||
UpgradeStatusSpec
|
||||
UpgradeStatusSpec, daemon_type_to_service, service_to_daemon_types
|
||||
|
@ -836,7 +836,7 @@ class Orchestrator(object):
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def enter_host_maintenance(self, hostname: str) -> Completion:
|
||||
def enter_host_maintenance(self, hostname: str, force: bool = False) -> Completion:
|
||||
"""
|
||||
Place a host in maintenance, stopping daemons and disabling its systemd target
|
||||
"""
|
||||
|
@ -393,11 +393,11 @@ class OrchestratorCli(OrchestratorClientMixin, MgrModule,
|
||||
|
||||
@_cli_write_command(
|
||||
'orch host maintenance enter')
|
||||
def _host_maintenance_enter(self, hostname: str) -> HandleCommandResult:
|
||||
def _host_maintenance_enter(self, hostname: str, force: bool = False) -> HandleCommandResult:
|
||||
"""
|
||||
Prepare a host for maintenance by shutting down and disabling all Ceph daemons (cephadm only)
|
||||
"""
|
||||
completion = self.enter_host_maintenance(hostname)
|
||||
completion = self.enter_host_maintenance(hostname, force=force)
|
||||
self._orchestrator_wait([completion])
|
||||
raise_if_exception(completion)
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user