From b70d4a9caae0eb859e10b68f93573d507625d267 Mon Sep 17 00:00:00 2001 From: Kamoltat Date: Wed, 12 Jan 2022 02:41:01 +0000 Subject: [PATCH] pybind/mgr/progress: enforced try and except on accessing event dictionary There is a race condition in which an event gets deleted while the progress module is iterating through the ``events`` dictionary. Without a ``try``/``except`` around the lookup, this raises an unhandled ``KeyError`` and crashes the module. This commit adds a ``try``/``except`` to every part of the code that accesses the ``events`` dictionary. Fixes: https://tracker.ceph.com/issues/53803 Signed-off-by: Kamoltat --- src/pybind/mgr/progress/module.py | 40 ++++++++++++++++++------------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/src/pybind/mgr/progress/module.py b/src/pybind/mgr/progress/module.py index 5f9aa86f647..422aba962a1 100644 --- a/src/pybind/mgr/progress/module.py +++ b/src/pybind/mgr/progress/module.py @@ -539,12 +539,15 @@ class Module(MgrModule): # previous recovery event for that osd if marked == "in": for ev_id in list(self._events): - ev = self._events[ev_id] - if isinstance(ev, PgRecoveryEvent) and osd_id in ev.which_osds: - self.log.info("osd.{0} came back in, cancelling event".format( - osd_id - )) - self._complete(ev) + try: + ev = self._events[ev_id] + if isinstance(ev, PgRecoveryEvent) and osd_id in ev.which_osds: + self.log.info("osd.{0} came back in, cancelling event".format( + osd_id + )) + self._complete(ev) + except KeyError: + self.log.warning("_osd_in_out: ev {0} does not exist".format(ev_id)) if len(affected_pgs) > 0: r_ev = PgRecoveryEvent( @@ -625,16 +628,20 @@ class Module(MgrModule): global_event = False data = self.get("pg_progress") for ev_id in list(self._events): - ev = self._events[ev_id] - # Check for types of events - # we have to update - if isinstance(ev, PgRecoveryEvent): - ev.pg_update(data, self.log) - self.maybe_complete(ev) - elif isinstance(ev, GlobalRecoveryEvent): - 
global_event = True - ev.global_event_update_progress(self.log) - self.maybe_complete(ev) + try: + ev = self._events[ev_id] + # Check for types of events + # we have to update + if isinstance(ev, PgRecoveryEvent): + ev.pg_update(data, self.log) + self.maybe_complete(ev) + elif isinstance(ev, GlobalRecoveryEvent): + global_event = True + ev.global_event_update_progress(self.log) + self.maybe_complete(ev) + except KeyError: + self.log.warning("_process_pg_summary: ev {0} does not exist".format(ev_id)) + continue if not global_event: # If there is no global event @@ -736,6 +743,7 @@ class Module(MgrModule): ev = self._events[ev_id] assert isinstance(ev, RemoteEvent) except KeyError: + # if key doesn't exist we create an event ev = RemoteEvent(ev_id, ev_msg, refs, add_to_ceph_s) self._events[ev_id] = ev self.log.info("update: starting ev {0} ({1})".format(