pybind/mgr/progress: disable pg recovery event by default

The progress module disabled the pg recovery event by default
since the event is expensive and has interrupted other serviceis
when there is OSDs being marked in/out from the the cluster.

To turn the event on manually:

ceph config set mgr mgr/progress/allow_pg_recovery_event true

Updated qa/tasks/mgr/test_progress.py to enable
the pg recovery event when testing the progress module.

Signed-off-by: Kamoltat <ksirivad@redhat.com>
This commit is contained in:
Kamoltat 2022-01-14 02:44:16 +00:00
parent 6c0c75fa17
commit f06da20dff
4 changed files with 71 additions and 1 deletions

View File

@ -126,6 +126,13 @@
https://docs.ceph.com/en/latest/mgr/telemetry/
* MGR: The progress module disables the pg recovery event by default
since the event is expensive and has interrupted other service when
there are OSDs being marked in/out from the the cluster. However,
the user may still enable this event anytime. For more details, see:
https://docs.ceph.com/en/latest/mgr/progress/
>=16.0.0
--------
* mgr/nfs: ``nfs`` module is moved out of volumes plugin. Prior using the

View File

@ -45,3 +45,14 @@ Clear all ongoing and completed events:
.. prompt:: bash #
ceph progress clear
PG Recovery Event
-----------------
An event for each PG affected by recovery event can be shown in
`ceph progress` This is completely optional, and disabled by default
due to CPU overheard:
.. prompt:: bash #
ceph config set mgr mgr/progress/allow_pg_recovery_event true

View File

@ -280,6 +280,11 @@ class TestProgress(MgrTestCase):
self.mgr_cluster.mon_manager.raw_cluster_cmd(
'osd', 'in', str(osd['osd']))
# Unset allow_pg_recovery_event in case it's set to true
self.mgr_cluster.mon_manager.raw_cluster_cmd(
'config', 'set', 'mgr',
'mgr/progress/allow_pg_recovery_event', 'false')
super(TestProgress, self).tearDown()
def test_osd_healthy_recovery(self):
@ -288,6 +293,10 @@ class TestProgress(MgrTestCase):
placement, and we wait for the PG to get healthy in its new
locations.
"""
self.mgr_cluster.mon_manager.raw_cluster_cmd(
'config', 'set', 'mgr',
'mgr/progress/allow_pg_recovery_event', 'true')
ev = self._simulate_failure()
# Wait for progress event to ultimately reach completion
@ -301,6 +310,10 @@ class TestProgress(MgrTestCase):
progress event to be correctly marked complete once there
is no more data to move.
"""
self.mgr_cluster.mon_manager.raw_cluster_cmd(
'config', 'set', 'mgr',
'mgr/progress/allow_pg_recovery_event', 'true')
ev = self._simulate_failure()
self.mgr_cluster.mon_manager.remove_pool(self.POOL)
@ -317,6 +330,10 @@ class TestProgress(MgrTestCase):
It should create another event for when osd is marked in
and cancel the one that is still ongoing.
"""
self.mgr_cluster.mon_manager.raw_cluster_cmd(
'config', 'set', 'mgr',
'mgr/progress/allow_pg_recovery_event', 'true')
ev1 = self._simulate_failure()
ev2 = self._simulate_back_in([0], ev1)
@ -336,6 +353,9 @@ class TestProgress(MgrTestCase):
coming in from other module, however, once it is turned
back, on creating an event should be working as it is.
"""
self.mgr_cluster.mon_manager.raw_cluster_cmd(
'config', 'set', 'mgr',
'mgr/progress/allow_pg_recovery_event', 'true')
pool_size = 3
self._setup_pool(size=pool_size)
@ -378,3 +398,26 @@ class TestProgress(MgrTestCase):
check_fn=lambda: self._is_inprogress_or_complete(ev1['id']),
timeout=self.RECOVERY_PERIOD)
self.assertTrue(self._is_quiet())
def test_default_progress_test(self):
"""
progress module disabled the event of pg recovery event
by default, we test this to see if this holds true
"""
pool_size = 3
self._setup_pool(size=pool_size)
self._write_some_data(self.WRITE_PERIOD)
with self.recovery_backfill_disabled():
self.mgr_cluster.mon_manager.raw_cluster_cmd(
'osd', 'out', '0')
time.sleep(self.EVENT_CREATION_PERIOD/2)
with self.recovery_backfill_disabled():
self.mgr_cluster.mon_manager.raw_cluster_cmd(
'osd', 'in', '0')
time.sleep(self.EVENT_CREATION_PERIOD/2)
self.assertEqual(self._osd_in_out_events_count(), 0)

View File

@ -450,6 +450,13 @@ class Module(MgrModule):
'enabled',
default=True,
type='bool',
),
Option(
'allow_pg_recovery_event',
default=False,
type='bool',
desc='allow the module to show pg recovery progress',
runtime=True
)
]
@ -476,6 +483,7 @@ class Module(MgrModule):
self.max_completed_events = 0
self.sleep_interval = 0
self.enabled = True
self.allow_pg_recovery_event = False
def config_notify(self):
for opt in self.MODULE_OPTIONS:
@ -718,7 +726,8 @@ class Module(MgrModule):
self._dirty = False
if self.enabled:
self._process_osdmap()
if self.allow_pg_recovery_event:
self._process_osdmap()
self._process_pg_summary()
self._shutdown.wait(timeout=self.sleep_interval)