Merge pull request #40480 from kamoltat/wip-ksirivad-fix-bug-49988

pybind/mgr/progress: Disregard unreported pgs

Reviewed-by: Josh Durgin <jdurgin@redhat.com>
This commit is contained in:
Kefu Chai 2021-06-12 08:37:35 +08:00 committed by GitHub
commit 7513b24aa5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 32 additions and 20 deletions

View File

@ -14,7 +14,7 @@ class TestProgress(MgrTestCase):
# How long we expect to wait at most between taking an OSD out
# and seeing the progress event pop up.
EVENT_CREATION_PERIOD = 5
EVENT_CREATION_PERIOD = 15
WRITE_PERIOD = 30

View File

@ -12,7 +12,7 @@ import threading
import datetime
import uuid
import time
import logging
import json
@ -62,7 +62,6 @@ class Event(object):
# type: () -> bool
return self._add_to_ceph_s
@property
def progress(self):
# type: () -> float
@ -180,6 +179,7 @@ class GhostEvent(Event):
d["failure_message"] = self._failure_message
return d
class GlobalRecoveryEvent(Event):
"""
An event whoese completion is determined by active+clean/total_pg_num
@ -194,20 +194,28 @@ class GlobalRecoveryEvent(Event):
self._active_clean_num = active_clean_num
self._refresh()
def global_event_update_progress(self, pg_dump):
# type: (Dict) -> None
def global_event_update_progress(self, pg_dump, log):
# type: (Dict, logging.Logger) -> None
"Update progress of Global Recovery Event"
pgs = pg_dump['pg_stats']
new_active_clean_num = 0
for pg in pgs:
skipped_pgs = 0
if int(pg['reported_epoch']) < int(self._start_epoch):
continue
for pg in pgs:
# Disregard PGs that are not being reported
# if the states are active+clean. Since it is
# possible that some pgs might not have any movement
# even before the start of the event.
state = pg['state']
states = state.split("+")
if pg['reported_epoch'] < self._start_epoch:
if "active" in states and "clean" in states:
log.debug("Skipping pg {0} since reported_epoch {1} < start_epoch {2}"
.format(pg['pgid'], pg['reported_epoch'], self._start_epoch))
skipped_pgs += 1
continue
if "active" in states and "clean" in states:
new_active_clean_num += 1
@ -218,10 +226,11 @@ class GlobalRecoveryEvent(Event):
# the progress
try:
# Might be that total_pg_num is 0
self._progress = float(new_active_clean_num) / total_pg_num
self._progress = float(new_active_clean_num) / (total_pg_num - skipped_pgs)
except ZeroDivisionError:
self._progress = 0.0
log.debug("Updated progress to %s", self.summary())
self._refresh()
@property
@ -301,7 +310,7 @@ class PgRecoveryEvent(Event):
# FIXME: far more fields getting pythonized than we really care about
# Sanity check to see if there are any missing PGs and to assign
# empty array and dictionary if there hasn't been any recovery
pg_to_state = dict([(p['pgid'], p) for p in raw_pg_stats['pg_stats']]) # type: Dict[str, Any]
pg_to_state = dict((p['pgid'], p) for p in raw_pg_stats['pg_stats']) # type: Dict[str, Any]
if self._original_bytes_recovered is None:
self._original_bytes_recovered = {}
missing_pgs = []
@ -451,7 +460,7 @@ class Module(MgrModule):
super(Module, self).__init__(*args, **kwargs)
self._events = {} # type: Dict[str, Union[RemoteEvent, PgRecoveryEvent, GlobalRecoveryEvent]]
self._completed_events = [] # type: List[GhostEvent]
self._completed_events = [] # type: List[GhostEvent]
self._old_osd_map = None # type: Optional[OSDMap]
@ -538,7 +547,6 @@ class Module(MgrModule):
self.log.warning("{0} PGs affected by osd.{1} being marked {2}".format(
len(affected_pgs), osd_id, marked))
# In the case of the osd coming back in, we might need to cancel
# previous recovery event for that osd
if marked == "in":
@ -596,7 +604,8 @@ class Module(MgrModule):
active_clean_num = 0
for pg in pgs:
state = pg['state']
# TODO right here we can keep track of epoch as well
# and parse it to global_event_update_progress()
states = state.split("+")
if "active" in states and "clean" in states:
@ -607,12 +616,15 @@ class Module(MgrModule):
except ZeroDivisionError:
return
if progress < 1.0:
self.log.warning(("Starting Global Recovery Event,"
"%d pgs not in active + clean state"),
total_pg_num - active_clean_num)
ev = GlobalRecoveryEvent("Global Recovery Event",
refs=[("global","")],
refs=[("global", "")],
add_to_ceph_s=True,
start_epoch=self.get_osdmap().get_epoch(),
active_clean_num=active_clean_num)
ev.global_event_update_progress(pg_dump)
ev.global_event_update_progress(self.get('pg_stats'), self.log)
self._events[ev.id] = ev
def notify(self, notify_type, notify_data):
@ -625,9 +637,9 @@ class Module(MgrModule):
assert old_osdmap
assert self._latest_osdmap
self.log.info("Processing OSDMap change {0}..{1}".format(
old_osdmap.get_epoch(), self._latest_osdmap.get_epoch()
))
self.log.info(("Processing OSDMap change %d..%d"),
old_osdmap.get_epoch(), self._latest_osdmap.get_epoch())
self._osdmap_changed(old_osdmap, self._latest_osdmap)
elif notify_type == "pg_summary":
# if there are no events we will skip this here to avoid
@ -647,7 +659,7 @@ class Module(MgrModule):
self.maybe_complete(ev)
elif isinstance(ev, GlobalRecoveryEvent):
global_event = True
ev.global_event_update_progress(data)
ev.global_event_update_progress(data, self.log)
self.maybe_complete(ev)
if not global_event: