mgr/dashboard: fix: get SMART data from single-daemon device

Return SMART data even when a device is only associated with a single daemon.

Fixes: https://tracker.ceph.com/issues/53858
Signed-off-by: Alfonso Martínez <almartin@redhat.com>
This commit is contained in:
Alfonso Martínez 2022-01-13 15:20:48 +01:00
parent b709091d81
commit 6cd3729e27
2 changed files with 65 additions and 34 deletions

View File

@ -8,7 +8,6 @@ from mgr_module import CommandResult
from mgr_util import get_most_recent_rate, get_time_series_rates
from .. import mgr
from ..exceptions import DashboardException
try:
from typing import Any, Dict, Optional, Union
@ -235,11 +234,7 @@ class CephService(object):
# type: (dict) -> Dict[str, dict]
# Check whether the device is associated with daemons.
if 'daemons' in device and device['daemons']:
dev_smart_data = None
# The daemons associated with the device. Note, the list may
# contain daemons that are 'down' or 'destroyed'.
daemons = device.get('daemons')
dev_smart_data: Dict[str, Any] = {}
# Get a list of all OSD daemons on all hosts that are 'up'
# because SMART data can not be retrieved from daemons that
@ -250,26 +245,27 @@ class CephService(object):
if node.get('status') == 'up'
]
# Finally get the daemons on the host of the given device
# that are 'up'. All daemons on the same host can deliver
# SMART data, thus it is not relevant for us which daemon
# we are using.
daemons = list(set(daemons) & set(osd_daemons_up)) # type: ignore
for daemon in daemons:
# All daemons on the same host can deliver SMART data,
# thus it is not relevant for us which daemon we are using.
# NOTE: the list may contain daemons that are 'down' or 'destroyed'.
for daemon in device['daemons']:
svc_type, svc_id = daemon.split('.')
if 'osd' in svc_type:
if daemon not in osd_daemons_up:
continue
try:
dev_smart_data = CephService.send_command(
svc_type, 'smart', svc_id, devid=device['devid'])
except SendCommandError:
except SendCommandError as error:
logger.warning(str(error))
# Try to retrieve SMART data from another daemon.
continue
elif 'mon' in svc_type:
try:
dev_smart_data = CephService.send_command(
svc_type, 'device query-daemon-health-metrics', who=daemon)
except SendCommandError:
except SendCommandError as error:
logger.warning(str(error))
# Try to retrieve SMART data from another daemon.
continue
else:
@ -280,10 +276,7 @@ class CephService(object):
'[SMART] Error retrieving smartctl data for device ID "%s": %s',
dev_id, dev_data)
break
if dev_smart_data is None:
raise DashboardException(
'Failed to retrieve SMART data for device ID "{}"'.format(
device['devid']))
return dev_smart_data
logger.warning('[SMART] No daemons associated with device ID "%s"',
device['devid'])

View File

@ -109,23 +109,61 @@ def test_get_smart_data(caplog, by, args, log):
@mock.patch.object(CephService, 'send_command')
def test_get_smart_data_from_appropriate_ceph_command(send_command):
def test_get_smart_data_by_device(send_command):
# pylint: disable=protected-access
send_command.side_effect = [
{'nodes': [{'name': 'osd.1', 'status': 'up'}, {'name': 'mon.1', 'status': 'down'}]},
{'fake': {'device': {'name': '/dev/sda'}}}
]
CephService._get_smart_data_by_device({'devid': '1', 'daemons': ['osd.1', 'mon.1']})
send_command.assert_has_calls([mock.call('mon', 'osd tree'),
mock.call('osd', 'smart', '1', devid='1')])
device_id = 'Hitachi_HUA72201_JPW9K0N20D22SE'
osd_tree_payload = {'nodes':
[
{'name': 'osd.1', 'status': 'down'},
{'name': 'osd.2', 'status': 'up'},
{'name': 'osd.3', 'status': 'up'}
]}
health_metrics_payload = {device_id: {'ata_apm': {'enabled': False}}}
side_effect = [osd_tree_payload, health_metrics_payload]
send_command.side_effect = [
{'nodes': [{'name': 'osd.1', 'status': 'down'}, {'name': 'mon.1', 'status': 'up'}]},
{'fake': {'device': {'name': '/dev/sda'}}}
]
CephService._get_smart_data_by_device({'devid': '1', 'daemons': ['osd.1', 'mon.1']})
# Daemons associated: 1 osd down, 2 osd up.
send_command.side_effect = side_effect
smart_data = CephService._get_smart_data_by_device(
{'devid': device_id, 'daemons': ['osd.1', 'osd.2', 'osd.3']})
assert smart_data == health_metrics_payload
send_command.assert_has_calls([mock.call('mon', 'osd tree'),
mock.call('osd', 'smart', '2', devid=device_id)])
# Daemons associated: 1 osd down.
send_command.reset_mock()
send_command.side_effect = [osd_tree_payload]
smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': ['osd.1']})
assert smart_data == {}
send_command.assert_has_calls([mock.call('mon', 'osd tree')])
# Daemons associated: 1 osd down, 1 mon.
send_command.reset_mock()
send_command.side_effect = side_effect
smart_data = CephService._get_smart_data_by_device(
{'devid': device_id, 'daemons': ['osd.1', 'mon.1']})
assert smart_data == health_metrics_payload
send_command.assert_has_calls([mock.call('mon', 'osd tree'),
mock.call('osd', 'smart', '1', devid='1'),
mock.call('mon', 'osd tree'),
mock.call('mon', 'device query-daemon-health-metrics',
who='mon.1')])
# Daemons associated: 1 mon.
send_command.reset_mock()
send_command.side_effect = side_effect
smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': ['mon.1']})
assert smart_data == health_metrics_payload
send_command.assert_has_calls([mock.call('mon', 'osd tree'),
mock.call('mon', 'device query-daemon-health-metrics',
who='mon.1')])
# Daemons associated: 1 other (non-osd, non-mon).
send_command.reset_mock()
send_command.side_effect = [osd_tree_payload]
smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': ['rgw.1']})
assert smart_data == {}
send_command.assert_has_calls([mock.call('mon', 'osd tree')])
# Daemons associated: no daemons.
send_command.reset_mock()
smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': []})
assert smart_data == {}
send_command.assert_has_calls([])