osd/osd_types: use appropriate cost value for PullOp

See included comments -- previous values did not account for object
size.  This causes problems for mclock, which is much more strict
in how it interprets costs.

Fixes: https://tracker.ceph.com/issues/58607
Signed-off-by: Samuel Just <sjust@redhat.com>
Signed-off-by: Sridhar Seshasayee <sseshasa@redhat.com>
Author: Sridhar Seshasayee <sseshasa@redhat.com>
Date:   2023-02-03 11:06:06 +05:30
Commit: 7dc3024355 (parent: ee26df6d56)
2 changed files with 20 additions and 2 deletions
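
For context, here is a minimal standalone sketch of the cost difference this patch makes for a single PullOp. The config values (osd_push_per_object_cost = 1000, osd_recovery_max_chunk = 8 MiB) and the example object (4 MiB total, 1 MiB already recovered) are illustrative assumptions, not values taken from the diff below.

// Illustrative sketch only -- not part of the commit.
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
  // Assumed config values, chosen only for illustration.
  const uint64_t osd_push_per_object_cost = 1000;      // flat per-object cost
  const uint64_t osd_recovery_max_chunk   = 8 << 20;   // 8 MiB recovery chunk

  // Hypothetical object being pulled: 4 MiB total, 1 MiB already recovered.
  const uint64_t object_size       = 4 << 20;
  const uint64_t data_recovered_to = 1 << 20;

  // Old cost: a fixed value per message, independent of the object's size.
  const uint64_t old_cost = osd_push_per_object_cost + osd_recovery_max_chunk;

  // New cost under mclock_scheduler: the estimated remaining data to
  // recover, clamped to [1, osd_recovery_max_chunk], so small objects are
  // no longer billed as if they were full-sized chunks.
  const uint64_t remaining = object_size - data_recovered_to;
  const uint64_t new_cost  =
    std::clamp<uint64_t>(remaining, 1, osd_recovery_max_chunk);

  std::cout << "old cost: " << old_cost << " bytes\n";  // 8389608
  std::cout << "new cost: " << new_cost << " bytes\n";  // 3145728 (3 MiB)
  return 0;
}

The point of the change is that the cost mclock sees stays roughly proportional to the data a PullOp will actually move, rather than a fixed full-chunk charge per message.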

src/osd/osd_types.cc

@@ -15,6 +15,7 @@
  *
  */
+#include <algorithm>
 #include <list>
 #include <map>
 #include <ostream>
@@ -6807,8 +6808,20 @@ ostream& operator<<(ostream& out, const PullOp &op)
 
 uint64_t PullOp::cost(CephContext *cct) const
 {
-  return cct->_conf->osd_push_per_object_cost +
-    cct->_conf->osd_recovery_max_chunk;
+  if (cct->_conf->osd_op_queue == "mclock_scheduler") {
+    return std::clamp<uint64_t>(
+      recovery_progress.estimate_remaining_data_to_recover(recovery_info),
+      1,
+      cct->_conf->osd_recovery_max_chunk);
+  } else {
+    /* We retain this legacy behavior for WeightedPriorityQueue. It seems to
+     * require very large costs for several messages in order to do any
+     * meaningful amount of throttling. This branch should be removed after
+     * Reef.
+     */
+    return cct->_conf->osd_push_per_object_cost +
+      cct->_conf->osd_recovery_max_chunk;
+  }
 }
 
 // -- PushOp --

src/osd/osd_types.h

@@ -6028,6 +6028,11 @@ struct ObjectRecoveryProgress {
       omap_complete;
   }
 
+  uint64_t estimate_remaining_data_to_recover(const ObjectRecoveryInfo& info) const {
+    // Overestimates in case of clones, but avoids traversing copy_subset
+    return info.size - data_recovered_to;
+  }
+
   static void generate_test_instances(std::list<ObjectRecoveryProgress*>& o);
   void encode(ceph::buffer::list &bl) const;
   void decode(ceph::buffer::list::const_iterator &bl);