From 6cd3729e2737f9012569cffc6fd69cc5eed287ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alfonso=20Mart=C3=ADnez?= Date: Thu, 13 Jan 2022 15:20:48 +0100 Subject: [PATCH] mgr/dashboard: fix: get SMART data from single-daemon device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Return SMART data even when a device is only associated with a single daemon. Fixes: https://tracker.ceph.com/issues/53858 Signed-off-by: Alfonso Martínez --- .../mgr/dashboard/services/ceph_service.py | 31 ++++----- .../mgr/dashboard/tests/test_ceph_service.py | 68 +++++++++++++++---- 2 files changed, 65 insertions(+), 34 deletions(-) diff --git a/src/pybind/mgr/dashboard/services/ceph_service.py b/src/pybind/mgr/dashboard/services/ceph_service.py index c1e0db2daa0..b261dc6b02b 100644 --- a/src/pybind/mgr/dashboard/services/ceph_service.py +++ b/src/pybind/mgr/dashboard/services/ceph_service.py @@ -8,7 +8,6 @@ from mgr_module import CommandResult from mgr_util import get_most_recent_rate, get_time_series_rates from .. import mgr -from ..exceptions import DashboardException try: from typing import Any, Dict, Optional, Union @@ -235,11 +234,7 @@ class CephService(object): # type: (dict) -> Dict[str, dict] # Check whether the device is associated with daemons. if 'daemons' in device and device['daemons']: - dev_smart_data = None - - # The daemons associated with the device. Note, the list may - # contain daemons that are 'down' or 'destroyed'. - daemons = device.get('daemons') + dev_smart_data: Dict[str, Any] = {} # Get a list of all OSD daemons on all hosts that are 'up' # because SMART data can not be retrieved from daemons that @@ -250,26 +245,27 @@ class CephService(object): if node.get('status') == 'up' ] - # Finally get the daemons on the host of the given device - # that are 'up'. All daemons on the same host can deliver - # SMART data, thus it is not relevant for us which daemon - # we are using. - daemons = list(set(daemons) & set(osd_daemons_up)) # type: ignore - - for daemon in daemons: + # All daemons on the same host can deliver SMART data, + # thus it is not relevant for us which daemon we are using. + # NOTE: the list may contain daemons that are 'down' or 'destroyed'. + for daemon in device['daemons']: svc_type, svc_id = daemon.split('.') if 'osd' in svc_type: + if daemon not in osd_daemons_up: + continue try: dev_smart_data = CephService.send_command( svc_type, 'smart', svc_id, devid=device['devid']) - except SendCommandError: + except SendCommandError as error: + logger.warning(str(error)) # Try to retrieve SMART data from another daemon. continue elif 'mon' in svc_type: try: dev_smart_data = CephService.send_command( svc_type, 'device query-daemon-health-metrics', who=daemon) - except SendCommandError: + except SendCommandError as error: + logger.warning(str(error)) # Try to retrieve SMART data from another daemon. continue else: @@ -280,10 +276,7 @@ class CephService(object): '[SMART] Error retrieving smartctl data for device ID "%s": %s', dev_id, dev_data) break - if dev_smart_data is None: - raise DashboardException( - 'Failed to retrieve SMART data for device ID "{}"'.format( - device['devid'])) + return dev_smart_data logger.warning('[SMART] No daemons associated with device ID "%s"', device['devid']) diff --git a/src/pybind/mgr/dashboard/tests/test_ceph_service.py b/src/pybind/mgr/dashboard/tests/test_ceph_service.py index 2ad9576af80..7d178695c8d 100644 --- a/src/pybind/mgr/dashboard/tests/test_ceph_service.py +++ b/src/pybind/mgr/dashboard/tests/test_ceph_service.py @@ -109,23 +109,61 @@ def test_get_smart_data(caplog, by, args, log): @mock.patch.object(CephService, 'send_command') -def test_get_smart_data_from_appropriate_ceph_command(send_command): +def test_get_smart_data_by_device(send_command): # pylint: disable=protected-access - send_command.side_effect = [ - {'nodes': [{'name': 'osd.1', 'status': 'up'}, {'name': 'mon.1', 'status': 'down'}]}, - {'fake': {'device': {'name': '/dev/sda'}}} - ] - CephService._get_smart_data_by_device({'devid': '1', 'daemons': ['osd.1', 'mon.1']}) - send_command.assert_has_calls([mock.call('mon', 'osd tree'), - mock.call('osd', 'smart', '1', devid='1')]) + device_id = 'Hitachi_HUA72201_JPW9K0N20D22SE' + osd_tree_payload = {'nodes': + [ + {'name': 'osd.1', 'status': 'down'}, + {'name': 'osd.2', 'status': 'up'}, + {'name': 'osd.3', 'status': 'up'} + ]} + health_metrics_payload = {device_id: {'ata_apm': {'enabled': False}}} + side_effect = [osd_tree_payload, health_metrics_payload] - send_command.side_effect = [ - {'nodes': [{'name': 'osd.1', 'status': 'down'}, {'name': 'mon.1', 'status': 'up'}]}, - {'fake': {'device': {'name': '/dev/sda'}}} - ] - CephService._get_smart_data_by_device({'devid': '1', 'daemons': ['osd.1', 'mon.1']}) + # Daemons associated: 1 osd down, 2 osd up. + send_command.side_effect = side_effect + smart_data = CephService._get_smart_data_by_device( + {'devid': device_id, 'daemons': ['osd.1', 'osd.2', 'osd.3']}) + assert smart_data == health_metrics_payload + send_command.assert_has_calls([mock.call('mon', 'osd tree'), + mock.call('osd', 'smart', '2', devid=device_id)]) + + # Daemons associated: 1 osd down. + send_command.reset_mock() + send_command.side_effect = [osd_tree_payload] + smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': ['osd.1']}) + assert smart_data == {} + send_command.assert_has_calls([mock.call('mon', 'osd tree')]) + + # Daemons associated: 1 osd down, 1 mon. + send_command.reset_mock() + send_command.side_effect = side_effect + smart_data = CephService._get_smart_data_by_device( + {'devid': device_id, 'daemons': ['osd.1', 'mon.1']}) + assert smart_data == health_metrics_payload send_command.assert_has_calls([mock.call('mon', 'osd tree'), - mock.call('osd', 'smart', '1', devid='1'), - mock.call('mon', 'osd tree'), mock.call('mon', 'device query-daemon-health-metrics', who='mon.1')]) + + # Daemons associated: 1 mon. + send_command.reset_mock() + send_command.side_effect = side_effect + smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': ['mon.1']}) + assert smart_data == health_metrics_payload + send_command.assert_has_calls([mock.call('mon', 'osd tree'), + mock.call('mon', 'device query-daemon-health-metrics', + who='mon.1')]) + + # Daemons associated: 1 other (non-osd, non-mon). + send_command.reset_mock() + send_command.side_effect = [osd_tree_payload] + smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': ['rgw.1']}) + assert smart_data == {} + send_command.assert_has_calls([mock.call('mon', 'osd tree')]) + + # Daemons associated: no daemons. + send_command.reset_mock() + smart_data = CephService._get_smart_data_by_device({'devid': device_id, 'daemons': []}) + assert smart_data == {} + send_command.assert_has_calls([])