osd/: differentiate priority for PGRecovery[Context]

PGs with degraded objects should be higher priority.

Signed-off-by: Samuel Just <sjust@redhat.com>
2025-02-24 11:37:37 +00:00 · 2023-04-06 00:04:05 -07:00 · 2023-04-06 00:04:05 -07:00 · a1f0ccf74a
commit a1f0ccf74a
parent 69a17bdb0f
6 changed files with 42 additions and 21 deletions
--- a/src/osd/OSD.cc
+++ b/src/osd/OSD.cc
@@ -1697,7 +1697,8 @@ void OSDService::enqueue_front(OpSchedulerItem&& qi)
 void OSDService::queue_recovery_context(
  PG *pg,
  GenContext<ThreadPool::TPHandle&> *c,
-  uint64_t cost)
+  uint64_t cost,
+  int priority)
 {
  epoch_t e = get_osdmap_epoch();

@@ -1717,7 +1718,7 @@ void OSDService::queue_recovery_context(
  enqueue_back(
    OpSchedulerItem(
      unique_ptr<OpSchedulerItem::OpQueueable>(
-	new PGRecoveryContext(pg->get_pgid(), c, e)),
+	new PGRecoveryContext(pg->get_pgid(), c, e, priority)),
      cost_for_queue,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
@@ -2063,7 +2064,8 @@ void OSDService::_queue_for_recovery(
 	new PGRecovery(
 	  p.pg->get_pgid(),
 	  p.epoch_queued,
-          reserved_pushes)),
+          reserved_pushes,
+	  p.priority)),
      cost_for_queue,
      cct->_conf->osd_recovery_priority,
      ceph_clock_now(),
@@ -9390,7 +9392,7 @@ unsigned OSDService::get_target_pg_log_entries() const
 }

 void OSD::do_recovery(
-  PG *pg, epoch_t queued, uint64_t reserved_pushes,
+  PG *pg, epoch_t queued, uint64_t reserved_pushes, int priority,
  ThreadPool::TPHandle &handle)
 {
  uint64_t started = 0;
@@ -9407,13 +9409,14 @@ void OSD::do_recovery(
    std::lock_guard l(service.sleep_lock);
    if (recovery_sleep > 0 && service.recovery_needs_sleep) {
      PGRef pgref(pg);
-      auto recovery_requeue_callback = new LambdaContext([this, pgref, queued, reserved_pushes](int r) {
+      auto recovery_requeue_callback = new LambdaContext(
+	[this, pgref, queued, reserved_pushes, priority](int r) {
        dout(20) << "do_recovery wake up at "
                 << ceph_clock_now()
 	         << ", re-queuing recovery" << dendl;
 	std::lock_guard l(service.sleep_lock);
        service.recovery_needs_sleep = false;
-        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes);
+        service.queue_recovery_after_sleep(pgref.get(), queued, reserved_pushes, priority);
      });

      // This is true for the first recovery op and when the previous recovery op
--- a/src/osd/OSD.h
+++ b/src/osd/OSD.h
@@ -509,7 +509,8 @@ public:
  AsyncReserver<spg_t, Finisher> snap_reserver;
  void queue_recovery_context(PG *pg,
                              GenContext<ThreadPool::TPHandle&> *c,
-                              uint64_t cost);
+                              uint64_t cost,
+			      int priority);
  void queue_for_snap_trim(PG *pg);
  void queue_for_scrub(PG* pg, Scrub::scrub_prio_t with_priority);

@@ -589,6 +590,7 @@ private:
    const epoch_t epoch_queued;
    PGRef pg;
    const uint64_t cost_per_object;
+    const int priority;
  };
  std::list<pg_awaiting_throttle_t> awaiting_throttle;

@@ -651,25 +653,31 @@ public:
  unsigned get_target_pg_log_entries() const;

  // delayed pg activation
-  void queue_for_recovery(PG *pg, uint64_t cost_per_object) {
+  void queue_for_recovery(
+    PG *pg, uint64_t cost_per_object,
+    int priority) {
    std::lock_guard l(recovery_lock);

    if (pg->is_forced_recovery_or_backfill()) {
      awaiting_throttle.emplace_front(
        pg_awaiting_throttle_t{
-          pg->get_osdmap()->get_epoch(), pg, cost_per_object});
+          pg->get_osdmap()->get_epoch(), pg, cost_per_object, priority});
    } else {
      awaiting_throttle.emplace_back(
        pg_awaiting_throttle_t{
-          pg->get_osdmap()->get_epoch(), pg, cost_per_object});
+          pg->get_osdmap()->get_epoch(), pg, cost_per_object, priority});
    }
    _maybe_queue_recovery();
  }
-  void queue_recovery_after_sleep(PG *pg, epoch_t queued, uint64_t reserved_pushes) {
+  void queue_recovery_after_sleep(
+    PG *pg, epoch_t queued, uint64_t reserved_pushes,
+    int priority) {
    std::lock_guard l(recovery_lock);
    // Send cost as 1 in pg_awaiting_throttle_t below. The cost is ignored
    // as this path is only applicable for WeightedPriorityQueue scheduler.
-    _queue_for_recovery(pg_awaiting_throttle_t{queued, pg, 1}, reserved_pushes);
+    _queue_for_recovery(
+      pg_awaiting_throttle_t{queued, pg, 1, priority},
+      reserved_pushes);
  }

  void queue_check_readable(spg_t spgid,
@@ -1875,6 +1883,7 @@ protected:

  // -- pg recovery --
  void do_recovery(PG *pg, epoch_t epoch_queued, uint64_t pushes_reserved,
+		   int priority,
 		   ThreadPool::TPHandle &handle);


--- a/src/osd/PG.cc
+++ b/src/osd/PG.cc
@@ -429,7 +429,9 @@ void PG::queue_recovery()
 	1, // ensure objects is non-negative and non-zero
 	info.stats.stats.sum.num_objects));
    uint64_t cost_per_object = std::max<uint64_t>(num_bytes / num_objects, 1);
-    osd->queue_for_recovery(this, cost_per_object);
+    osd->queue_for_recovery(
+      this, cost_per_object, recovery_state.get_recovery_op_priority()
+    );
  }
 }

--- a/src/osd/PrimaryLogPG.cc
+++ b/src/osd/PrimaryLogPG.cc
@@ -526,7 +526,9 @@ void PrimaryLogPG::schedule_recovery_work(
  GenContext<ThreadPool::TPHandle&> *c,
  uint64_t cost)
 {
-  osd->queue_recovery_context(this, c, cost);
+  osd->queue_recovery_context(
+    this, c, cost,
+    recovery_state.get_recovery_op_priority());
 }

 void PrimaryLogPG::replica_clear_repop_obc(
--- a/src/osd/scheduler/OpSchedulerItem.cc
+++ b/src/osd/scheduler/OpSchedulerItem.cc
@@ -224,7 +224,7 @@ void PGRecovery::run(
  PGRef& pg,
  ThreadPool::TPHandle &handle)
 {
-  osd->do_recovery(pg.get(), epoch_queued, reserved_pushes, handle);
+  osd->do_recovery(pg.get(), epoch_queued, reserved_pushes, priority, handle);
  pg->unlock();
 }

--- a/src/osd/scheduler/OpSchedulerItem.h
+++ b/src/osd/scheduler/OpSchedulerItem.h
@@ -502,14 +502,17 @@ class PGScrubChunkIsFree : public PGScrubItem {
 class PGRecovery : public PGOpQueueable {
  epoch_t epoch_queued;
  uint64_t reserved_pushes;
+  int priority;
 public:
  PGRecovery(
    spg_t pg,
    epoch_t epoch_queued,
-    uint64_t reserved_pushes)
+    uint64_t reserved_pushes,
+    int priority)
    : PGOpQueueable(pg),
      epoch_queued(epoch_queued),
-      reserved_pushes(reserved_pushes) {}
+      reserved_pushes(reserved_pushes),
+      priority(priority) {}
  std::ostream &print(std::ostream &rhs) const final {
    return rhs << "PGRecovery(pgid=" << get_pgid()
 	       << " epoch_queued=" << epoch_queued
@@ -522,18 +525,20 @@ public:
  void run(
    OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
  op_scheduler_class get_scheduler_class() const final {
-    return op_scheduler_class::background_recovery;
+    return priority_to_scheduler_class(priority);
  }
 };

 class PGRecoveryContext : public PGOpQueueable {
  std::unique_ptr<GenContext<ThreadPool::TPHandle&>> c;
  epoch_t epoch;
+  int priority;
 public:
  PGRecoveryContext(spg_t pgid,
-		    GenContext<ThreadPool::TPHandle&> *c, epoch_t epoch)
+		    GenContext<ThreadPool::TPHandle&> *c, epoch_t epoch,
+		    int priority)
    : PGOpQueueable(pgid),
-      c(c), epoch(epoch) {}
+      c(c), epoch(epoch), priority(priority) {}
  std::ostream &print(std::ostream &rhs) const final {
    return rhs << "PGRecoveryContext(pgid=" << get_pgid()
 	       << " c=" << c.get() << " epoch=" << epoch
@@ -542,7 +547,7 @@ public:
  void run(
    OSD *osd, OSDShard *sdata, PGRef& pg, ThreadPool::TPHandle &handle) final;
  op_scheduler_class get_scheduler_class() const final {
-    return op_scheduler_class::background_recovery;
+    return priority_to_scheduler_class(priority);
  }
 };