From 2abf4f1fc368ee74637b0ba1c200e03c02883db8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 29 Jan 2021 10:36:39 -0600 Subject: [PATCH 1/2] mgr/devicehealth: only create pool when we have some osds If we create the pool before we have OSDs, the PGs won't be active+clean and we'll raise a health warning that we shouldn't. This will annoy new users deploying a new cluster, and (more importantly?) make qa tests fail while deploying initial clusters due to the health warning. Signed-off-by: Sage Weil --- src/pybind/mgr/devicehealth/module.py | 36 +++++++++++++++++---------- 1 file changed, 23 insertions(+), 13 deletions(-) diff --git a/src/pybind/mgr/devicehealth/module.py b/src/pybind/mgr/devicehealth/module.py index 14b7dfdacfc..36cb182f4f1 100644 --- a/src/pybind/mgr/devicehealth/module.py +++ b/src/pybind/mgr/devicehealth/module.py @@ -221,11 +221,28 @@ class Module(MgrModule): self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name'])) def notify(self, notify_type: str, notify_id: str) -> None: - # create device_health_metrics pool if it doesn't exist if notify_type == "osd_map" and self.enable_monitoring: - if not self.has_device_pool: - self.create_device_pool() - self.has_device_pool = True + # create device_health_metrics pool if it doesn't exist + self.maybe_create_device_pool() + + def have_enough_osds(self) -> bool: + # wait until we have enough OSDs to allow the pool to be healthy + up = 0 + for osd in self.get("osd_map")["osds"]: + if osd["up"]: + up += 1 + + need = cast(int, self.get_ceph_option("osd_pool_default_size")) + return up >= need + + def maybe_create_device_pool(self) -> bool: + if not self.has_device_pool: + if not self.have_enough_osds(): + self.log.warning("Not enough OSDs yet to create monitoring pool") + return False + self.create_device_pool() + self.has_device_pool = True + return True def create_device_pool(self) -> None: self.log.debug('create %s pool' % self.pool_name) @@ -304,16 +321,9 @@ class Module(MgrModule): 
self.event.set() def open_connection(self, create_if_missing: bool = True) -> rados.Ioctx: - osdmap = self.get("osd_map") - assert osdmap is not None - if len(osdmap['osds']) == 0: - return None - if not self.has_device_pool: - if not create_if_missing: + if create_if_missing: + if not self.maybe_create_device_pool(): return None - if self.enable_monitoring: - self.create_device_pool() - self.has_device_pool = True ioctx = self.rados.open_ioctx(self.pool_name) return ioctx From db31d09efa60fad98429b1ec7e0d30188f355035 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 29 Jan 2021 10:40:44 -0600 Subject: [PATCH 2/2] mgr/devicehealth: make CLI commands error when pool doesn't exist This is better than silently failing to do anything. Signed-off-by: Sage Weil --- src/pybind/mgr/devicehealth/module.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/pybind/mgr/devicehealth/module.py b/src/pybind/mgr/devicehealth/module.py index 36cb182f4f1..c1c04896c01 100644 --- a/src/pybind/mgr/devicehealth/module.py +++ b/src/pybind/mgr/devicehealth/module.py @@ -330,7 +330,7 @@ class Module(MgrModule): def scrape_daemon(self, daemon_type: str, daemon_id: str) -> Tuple[int, str, str]: ioctx = self.open_connection() if not ioctx: - return 0, "", "" + return -errno.EAGAIN, "", "device_health_metrics pool not yet available" raw_smart_data = self.do_scrape_daemon(daemon_type, daemon_id) if raw_smart_data: for device, raw_data in raw_smart_data.items(): @@ -345,7 +345,7 @@ class Module(MgrModule): assert osdmap is not None ioctx = self.open_connection() if not ioctx: - return 0, "", "" + return -errno.EAGAIN, "", "device_health_metrics pool not yet available" did_device = {} ids = [] for osd in osdmap['osds']: @@ -379,7 +379,7 @@ class Module(MgrModule): (daemon_type, daemon_id) = daemons[0].split('.') ioctx = self.open_connection() if not ioctx: - return 0, "", "" + return -errno.EAGAIN, "", "device_health_metrics pool not yet available" raw_smart_data = 
self.do_scrape_daemon(daemon_type, daemon_id, devid=devid) if raw_smart_data: