Merge pull request #26768 from sebastian-philipp/upstream-pull-26684

mgr/orchestrator: device lights

Reviewed-by: Ernesto Puerta <epuertat@redhat.com>
Reviewed-by: Sage Weil <sage@redhat.com>
Reviewed-by: Volker Theile <vtheile@suse.com>
This commit is contained in:
Sebastian Wagner 2019-11-06 11:48:29 +01:00 committed by GitHub
commit 3feda32916
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 194 additions and 13 deletions

View File

@ -182,10 +182,19 @@ Example::
^^^^^^^^^^^^^^^^^^^
::
ceph orchestrator device ident-on <host> <devname>
ceph orchestrator device ident-off <host> <devname>
ceph orchestrator device fault-on <host> <devname>
ceph orchestrator device fault-off <host> <devname>
ceph orchestrator device ident-on <dev_id>
ceph orchestrator device ident-on <dev_name> <host>
ceph orchestrator device fault-on <dev_id>
ceph orchestrator device fault-on <dev_name> <host>
ceph orchestrator device ident-off <dev_id> [--force=true]
ceph orchestrator device ident-off <dev_name> <host> [--force=true]
ceph orchestrator device fault-off <dev_id> [--force=true]
ceph orchestrator device fault-off <dev_name> <host> [--force=true]
where ``dev_id`` is the device id as listed in ``osd metadata``,
``dev_name`` is the name of the device on the system and ``host`` is the host as
returned by ``orchestrator host ls``
ceph orchestrator osd ident-on {primary,journal,db,wal,all} <osd-id>
ceph orchestrator osd ident-off {primary,journal,db,wal,all} <osd-id>

View File

@ -260,6 +260,9 @@ OSD management
.. py:currentmodule:: orchestrator
.. automethod:: Orchestrator.blink_device_light
.. autoclass:: DeviceLightLoc
.. _orchestrator-osd-replace:
OSD Replacement

View File

@ -8,8 +8,8 @@ tasks:
log-whitelist:
- overall HEALTH_
- \(MGR_DOWN\)
- \(MGR_INSIGHTS_WARNING\)
- \(insights_health_check
- \(DEVICE_IDENT_ON\)
- \(DEVICE_FAULT_ON\)
- \(PG_
- replacing it with standby
- No standby daemons available

View File

@ -14,8 +14,11 @@ log = logging.getLogger(__name__)
class TestOrchestratorCli(MgrTestCase):
MGRS_REQUIRED = 1
def _cmd(self, module, *args):
    """Run ``<module> <args...>`` through the mon manager of the test cluster.

    :param module: first word of the command, e.g. ``"osd"`` or ``"device"``
    :param args: remaining command words, passed through unchanged
    :return: whatever ``raw_cluster_cmd`` returns for that command
    """
    mon_manager = self.mgr_cluster.mon_manager
    return mon_manager.raw_cluster_cmd(module, *args)
def _orch_cmd(self, *args):
    """Run an ``orchestrator <args...>`` command against the test cluster.

    Delegates to :meth:`_cmd` so that every command helper goes through
    the same code path.

    :param args: command words following ``orchestrator``
    :return: whatever :meth:`_cmd` returns for that command
    """
    return self._cmd("orchestrator", *args)
def _progress_cmd(self, *args):
return self.mgr_cluster.mon_manager.raw_cluster_cmd("progress", *args)
@ -93,6 +96,30 @@ class TestOrchestratorCli(MgrTestCase):
with self.assertRaises(CommandFailedError):
self._orch_cmd("osd", "create", "notfound:device")
def test_blink_device_light(self):
    """Switch the ident and the fault light of one device on and off again.

    While a light is on, the device must be listed by ``device ls-lights``
    and the matching health check must be raised; after switching it off
    the device must be gone from the listing and health must clear.
    """
    expected_health = {
        'ident': 'DEVICE_IDENT_ON',
        'fault': 'DEVICE_FAULT_ON',
    }

    def _ls_lights(what):
        listing = self._cmd("device", "ls-lights")
        return json.loads(listing)[what]

    metadata = json.loads(self._cmd("osd", "metadata"))
    # Keep only ``device_ids`` values that split into exactly two parts on
    # '='; the second part of the first such value is used as device id.
    split_ids = []
    for osd in metadata:
        parts = osd["device_ids"].split('=')
        if len(parts) == 2:
            split_ids.append(parts)
    _, dev_id = split_ids[0]

    for light in ("ident", "fault"):
        self.assertNotIn(dev_id, _ls_lights(light))

        self._cmd("device", "light", "on", dev_id, light)
        self.assertIn(dev_id, _ls_lights(light))
        self.wait_for_health(expected_health[light], 30)

        self._cmd("device", "light", "off", dev_id, light)
        self.assertNotIn(dev_id, _ls_lights(light))
        self.wait_for_health_clear(30)
def test_mds_add(self):
self._orch_cmd("mds", "add", "service_name")

View File

@ -4,17 +4,15 @@ ceph-mgr orchestrator interface
Please see the ceph-mgr module developer's guide for more information.
"""
import copy
import sys
import time
import fnmatch
from collections import namedtuple
from functools import wraps
import uuid
import string
import random
import datetime
import six
import copy
from mgr_module import MgrModule, PersistentStoreDict
from mgr_util import format_bytes
@ -449,6 +447,17 @@ class Orchestrator(object):
"""
raise NotImplementedError()
def blink_device_light(self, ident_fault, on, locations):
    # type: (str, bool, List[DeviceLightLoc]) -> WriteCompletion
    """
    Ask the orchestrator to switch the ident or the fault LED of devices.

    This base implementation only defines the interface and always raises.

    :param ident_fault: which LED to change, either ``"ident"`` or ``"fault"``
    :param on: ``True`` = on.
    :param locations: the devices to act on.
        See :class:`orchestrator.DeviceLightLoc`
    :raises NotImplementedError: always
    """
    raise NotImplementedError()
def update_mgrs(self, num, hosts):
# type: (int, List[str]) -> WriteCompletion
"""
@ -947,6 +956,19 @@ class InventoryNode(object):
return [cls(item[0], devs(item[1].data)) for item in hosts]
class DeviceLightLoc(namedtuple('DeviceLightLoc', 'host dev')):
    """
    A specific device on a specific host, used as the target when enabling
    or disabling LEDs on devices.

    ``host``: hostname as in :func:`orchestrator.Orchestrator.get_hosts`

    ``dev``: the device on that host.
    NOTE(review): previously documented as ``device_id``,
    e.g. ``ABC1234DEF567-1R1234_ABC8DE0Q``
    (see ``ceph osd metadata | jq '.[].device_ids'``) -- confirm which
    identifier callers actually pass here.
    """
    # No per-instance __dict__; instances stay plain tuples.
    __slots__ = ()
def _mk_orch_methods(cls):
# Needs to be defined outside of for.
# Otherwise meth is always bound to last key

View File

@ -1,14 +1,14 @@
import errno
import json
from functools import wraps
from prettytable import PrettyTable
try:
from typing import Dict, List
from typing import List, Set, Optional
except ImportError:
pass # just for type checking.
from functools import wraps
from ceph.deployment.drive_group import DriveGroupSpec, DriveGroupValidationError, \
DeviceSelection
@ -46,6 +46,120 @@ class OrchestratorCli(orchestrator.OrchestratorClientMixin, MgrModule):
{'name': 'orchestrator'}
]
def __init__(self, *args, **kwargs):
    # Forward all constructor arguments unchanged to the parent classes.
    super(OrchestratorCli, self).__init__(*args, **kwargs)
    # Device ids whose ident / fault light is recorded as switched on.
    # These start out empty and must exist before _load() runs, because
    # _load() only overwrites them when stored state is found.
    self.ident = set() # type: Set[str]
    self.fault = set() # type: Set[str]
    # Restore any stored state first, then publish the health checks
    # that correspond to it.
    self._load()
    self._refresh_health()
def _load(self):
    """Restore the ident/fault device sets from the ``active_devices`` store key.

    When nothing is stored, ``self.ident`` and ``self.fault`` are left
    untouched. A stored value is decoded as JSON; a missing ``ident`` or
    ``fault`` key yields an empty set.
    """
    stored = self.get_store('active_devices')
    if stored:
        state = json.loads(stored)
        self.ident = set(state.get('ident', []))
        self.fault = set(state.get('fault', []))
    message = 'ident {}, fault {}'.format(self.ident, self.fault)
    self.log.debug(message)
def _save(self):
    """Store the ident/fault device sets as JSON under ``active_devices``."""
    state = {'ident': list(self.ident), 'fault': list(self.fault)}
    self.set_store('active_devices', json.dumps(state))
def _refresh_health(self):
    """Publish health checks for every light recorded as switched on.

    Raises ``DEVICE_IDENT_ON`` and/or ``DEVICE_FAULT_ON`` as warnings,
    each with one detail line per affected device. A light type with no
    active devices contributes no check, so an empty dict is published
    when nothing is active.
    """
    lights = (
        ('DEVICE_IDENT_ON', 'ident', self.ident),
        ('DEVICE_FAULT_ON', 'fault', self.fault),
    )
    checks = {}
    for check_name, light, devices in lights:
        if not devices:
            continue
        checks[check_name] = {
            'severity': 'warning',
            'summary': '%d devices have %s light turned on' % (
                len(devices), light),
            # Each check lists its own devices: the fault details must come
            # from the fault set, not from the ident set.
            'detail': ['{} {} light enabled'.format(d, light) for d in devices],
        }
    self.set_health_checks(checks)
def _get_device_locations(self, dev_id):
    # type: (str) -> List[orchestrator.DeviceLightLoc]
    """Return every known location of the device with id ``dev_id``.

    Looks the device up in ``self.get('devices')['devices']`` by its
    ``devid`` and turns each entry of its ``location`` list into a
    :class:`orchestrator.DeviceLightLoc`.

    :param dev_id: device id to look up
    :return: list of locations; empty when no device matches
    """
    return [
        orchestrator.DeviceLightLoc(**location)
        for device in self.get('devices')['devices']
        if device['devid'] == dev_id
        for location in device['location']
    ]
@_read_cli(prefix='device ls-lights',
           desc='List currently active device indicator lights')
def _device_ls(self):
    """Report the recorded ident and fault device ids as indented JSON."""
    lights = {
        'ident': list(self.ident),
        'fault': list(self.fault)
    }
    return HandleCommandResult(stdout=json.dumps(lights, indent=4))
def light_on(self, fault_ident, devid):
    # type: (str, str) -> HandleCommandResult
    """Switch the ident or fault light of a device on.

    :param fault_ident: which light to switch: ``"fault"`` or ``"ident"``
    :param devid: id of the device to act on
    :return: the orchestrator's result as stdout, or an ``ENOENT`` error
        when no location is known for ``devid``
    """
    assert fault_ident in ("fault", "ident")
    locs = self._get_device_locations(devid)
    # _get_device_locations() returns a (possibly empty) list, never None,
    # so an unknown device has to be detected by emptiness. Otherwise it
    # would be recorded as active and the orchestrator would be called
    # with no locations at all.
    if not locs:
        return HandleCommandResult(stderr='device {} not found'.format(devid),
                                   retval=-errno.ENOENT)

    # The light is recorded (and the health check raised) before the
    # orchestrator is asked to switch it on.
    getattr(self, fault_ident).add(devid)
    self._save()
    self._refresh_health()
    completion = self.blink_device_light(fault_ident, True, locs)
    self._orchestrator_wait([completion])
    return HandleCommandResult(stdout=str(completion.result))
def light_off(self, fault_ident, devid, force):
    # type: (str, str, bool) -> HandleCommandResult
    """Switch the ident or fault light of a device off.

    :param fault_ident: which light to switch: ``"fault"`` or ``"ident"``
    :param devid: id of the device to act on
    :param force: also drop ``devid`` from the recorded state when the
        light could not be switched off
    :return: the orchestrator's result as stdout, or an ``ENOENT`` error
        when no location is known for ``devid``
    :raises Exception: whatever the orchestrator call raises is re-raised
    """
    assert fault_ident in ("fault", "ident")

    def _forget():
        # Drop devid from the recorded state, if present, and publish it.
        active = getattr(self, fault_ident)
        if devid in active:
            active.remove(devid)
            self._save()
            self._refresh_health()

    locs = self._get_device_locations(devid)
    # _get_device_locations() returns a (possibly empty) list, never None.
    if not locs:
        # The device is no longer known to Ceph; --force must still be
        # able to clear a stale entry for it.
        if force:
            _forget()
        return HandleCommandResult(stderr='device {} not found'.format(devid),
                                   retval=-errno.ENOENT)

    try:
        completion = self.blink_device_light(fault_ident, False, locs)
        self._orchestrator_wait([completion])
        _forget()
        return HandleCommandResult(stdout=str(completion.result))
    except Exception:
        # There are several reasons the try: block might fail:
        # 1. the device no longer exists
        # 2. the device is no longer known to Ceph
        # 3. the host is not reachable
        if force:
            _forget()
        raise
@_write_cli(prefix='device light',
            cmd_args='name=enable,type=CephChoices,strings=on|off '
                     'name=devid,type=CephString '
                     'name=light_type,type=CephChoices,strings=ident|fault,req=false '
                     'name=force,type=CephBool,req=false',
            desc='Enable or disable the device light. Default type is `ident`\n'
                 'Usage: device light (on|off) <devid> [ident|fault] [--force]')
def _device_light(self, enable, devid, light_type=None, force=False):
    # type: (str, str, Optional[str], bool) -> HandleCommandResult
    """Dispatch ``device light`` to :meth:`light_on` or :meth:`light_off`.

    :param enable: ``'on'`` switches the light on; any other value
        switches it off
    :param devid: id of the device to act on
    :param light_type: ``'ident'`` or ``'fault'``; ``'ident'`` when omitted
    :param force: only passed on when switching off
    """
    if not light_type:
        light_type = 'ident'
    if enable == 'on':
        return self.light_on(light_type, devid)
    return self.light_off(light_type, devid, force)
def _select_orchestrator(self):
return self.get_module_option("orchestrator")

View File

@ -246,6 +246,12 @@ class TestOrchestrator(MgrModule, orchestrator.Orchestrator):
def remove_osds(self, osd_ids, destroy=False):
assert isinstance(osd_ids, list)
@deferred_write("blink_device_light")
def blink_device_light(self, ident_fault, on, locations):
    """Test implementation: check the arguments and return an empty string.

    :param ident_fault: must be ``"ident"`` or ``"fault"``
    :param on: not inspected here
    :param locations: must contain at least one entry
    """
    valid_lights = ("ident", "fault")
    assert ident_fault in valid_lights
    assert len(locations) > 0
    return ''
@deferred_write("service_action")
def service_action(self, action, service_type, service_name=None, service_id=None):
pass