diff --git a/doc/rados/operations/pools.rst b/doc/rados/operations/pools.rst index 662b34ff31a..98bb27babd9 100644 --- a/doc/rados/operations/pools.rst +++ b/doc/rados/operations/pools.rst @@ -601,8 +601,10 @@ You may set values for the following keys: ``recovery_priority`` -:Description: When a value is set it will boost the computed reservation priority - by this amount. This value should be less than 30. +:Description: When a value is set it will increase or decrease the computed + reservation priority. This value must be in the range -10 to + 10. Use a negative priority for less important pools so they + have lower priority than any new pools. :Type: Integer :Default: ``0`` diff --git a/qa/standalone/osd/osd-backfill-prio.sh b/qa/standalone/osd/osd-backfill-prio.sh index 6491cd19efd..2a69ba12d03 100755 --- a/qa/standalone/osd/osd-backfill-prio.sh +++ b/qa/standalone/osd/osd-backfill-prio.sh @@ -30,8 +30,8 @@ function run() { export objects=50 export poolprefix=test export FORCE_PRIO="254" # See OSD_BACKFILL_PRIORITY_FORCED - export DEGRADED_PRIO="140" # See OSD_BACKFILL_DEGRADED_PRIORITY_BASE - export NORMAL_PRIO="100" # See OSD_BACKFILL_PRIORITY_BASE + export DEGRADED_PRIO="150" # See OSD_BACKFILL_DEGRADED_PRIORITY_BASE + 10 + export NORMAL_PRIO="110" # See OSD_BACKFILL_PRIORITY_BASE + 10 local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} for func in $funcs ; do diff --git a/qa/standalone/osd/osd-recovery-prio.sh b/qa/standalone/osd/osd-recovery-prio.sh index 9b017800f67..d246dda66db 100755 --- a/qa/standalone/osd/osd-recovery-prio.sh +++ b/qa/standalone/osd/osd-recovery-prio.sh @@ -29,7 +29,7 @@ function run() { export objects=200 export poolprefix=test export FORCE_PRIO="255" # See OSD_RECOVERY_PRIORITY_FORCED - export NORMAL_PRIO="180" # See OSD_RECOVERY_PRIORITY_BASE + export NORMAL_PRIO="190" # See OSD_RECOVERY_PRIORITY_BASE + 10 local funcs=${@:-$(set | sed -n -e 's/^\(TEST_[0-9a-z_]*\) .*/\1/p')} for func in $funcs ; do diff --git a/qa/workunits/cephtool/test.sh b/qa/workunits/cephtool/test.sh index e9610526d99..47da1dcb1d1 100755 --- a/qa/workunits/cephtool/test.sh +++ b/qa/workunits/cephtool/test.sh @@ -2037,10 +2037,12 @@ function test_mon_osd_pool_set() ceph osd pool get $TEST_POOL_GETSET recovery_priority | expect_false grep '.' ceph osd pool set $TEST_POOL_GETSET recovery_priority 5 ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: 5' + ceph osd pool set $TEST_POOL_GETSET recovery_priority -5 + ceph osd pool get $TEST_POOL_GETSET recovery_priority | grep 'recovery_priority: -5' ceph osd pool set $TEST_POOL_GETSET recovery_priority 0 ceph osd pool get $TEST_POOL_GETSET recovery_priority | expect_false grep '.' - expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority -1 - expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority 30 + expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority -11 + expect_false ceph osd pool set $TEST_POOL_GETSET recovery_priority 11 ceph osd pool get $TEST_POOL_GETSET recovery_op_priority | expect_false grep '.' ceph osd pool set $TEST_POOL_GETSET recovery_op_priority 5 diff --git a/src/mon/OSDMonitor.cc b/src/mon/OSDMonitor.cc index e5d031c9a37..58a1096a1aa 100644 --- a/src/mon/OSDMonitor.cc +++ b/src/mon/OSDMonitor.cc @@ -652,6 +652,11 @@ void OSDMonitor::on_active() if (mon->is_leader()) { mon->clog->debug() << "osdmap " << osdmap; + if (!priority_convert) { + // Only do this once at start-up + convert_pool_priorities(); + priority_convert = true; + } } else { list<MonOpRequestRef> ls; take_all_failures(ls); @@ -7620,12 +7625,9 @@ int OSDMonitor::prepare_command_pool_set(const cmdmap_t& cmdmap, ss << "error parsing int value '" << val << "': " << interr; return -EINVAL; } - if (n < 0) { - ss << "pool recovery_priority can not be negative"; - return -EINVAL; - } else if (n >= 30) { - ss << "pool recovery_priority should be less than 30 due to " - << "Ceph internal implementation restrictions"; + if (n > OSD_POOL_PRIORITY_MAX || n < OSD_POOL_PRIORITY_MIN) { + ss << "pool recovery_priority must be between " << OSD_POOL_PRIORITY_MIN + << " and " << OSD_POOL_PRIORITY_MAX; return -EINVAL; } } else if (var == "pg_autoscale_bias") { @@ -13178,3 +13180,55 @@ void OSDMonitor::_pool_op_reply(MonOpRequestRef op, ret, epoch, get_last_committed(), blp); mon->send_reply(op, reply); } + +void OSDMonitor::convert_pool_priorities(void) +{ + pool_opts_t::key_t key = pool_opts_t::get_opt_desc("recovery_priority").key; + int64_t max_prio = 0; + int64_t min_prio = 0; + for (const auto &i : osdmap.get_pools()) { + const auto &pool = i.second; + + if (pool.opts.is_set(key)) { + int64_t prio; + pool.opts.get(key, &prio); + if (prio > max_prio) + max_prio = prio; + if (prio < min_prio) + min_prio = prio; + } + } + if (max_prio <= OSD_POOL_PRIORITY_MAX && min_prio >= OSD_POOL_PRIORITY_MIN) { + dout(20) << __func__ << " nothing to fix" << dendl; + return; + } + // Current pool priorities exceeds new maximum + for (const auto &i : osdmap.get_pools()) { + const auto pool_id = i.first; + pg_pool_t pool = i.second; + + int64_t prio = 0; + pool.opts.get(key, &prio); + int64_t n; + + if (prio > 0 && max_prio > OSD_POOL_PRIORITY_MAX) { // Likely scenario + // Scaled priority range 0 to OSD_POOL_PRIORITY_MAX + n = (float)prio / max_prio * OSD_POOL_PRIORITY_MAX; + } else if (prio < 0 && min_prio < OSD_POOL_PRIORITY_MIN) { + // Scaled priority range OSD_POOL_PRIORITY_MIN to 0 + n = (float)prio / min_prio * OSD_POOL_PRIORITY_MIN; + } else { + continue; + } + if (n == 0) { + pool.opts.unset(key); + } else { + pool.opts.set(key, static_cast<int64_t>(n)); + } + dout(10) << __func__ << " pool " << pool_id + << " recovery_priority adjusted " + << prio << " to " << n << dendl; + pool.last_change = pending_inc.epoch; + pending_inc.new_pools[pool_id] = pool; + } +} diff --git a/src/mon/OSDMonitor.h b/src/mon/OSDMonitor.h index 9055ec45260..4968a2d6270 100644 --- a/src/mon/OSDMonitor.h +++ b/src/mon/OSDMonitor.h @@ -220,6 +220,7 @@ public: set<int> pending_metadata_rm; map<int, failure_info_t> failure_info; map<int,utime_t> down_pending_out; // osd down -> out + bool priority_convert = false; map<int,double> osd_weight; @@ -701,6 +702,7 @@ public: pending_inc.new_flags &= ~flag; } } + void convert_pool_priorities(void); }; #endif diff --git a/src/osd/PG.cc b/src/osd/PG.cc index b4ad94181d6..ba5a2245134 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -2453,11 +2453,22 @@ bool PG::set_force_backfill(bool b) return did; } -inline int PG::clamp_recovery_priority(int priority) +int PG::clamp_recovery_priority(int priority, int pool_recovery_priority) { static_assert(OSD_RECOVERY_PRIORITY_MIN < OSD_RECOVERY_PRIORITY_MAX, "Invalid priority range"); static_assert(OSD_RECOVERY_PRIORITY_MIN >= 0, "Priority range must match unsigned type"); + // User can't set this too high anymore, but might be a legacy value + if (pool_recovery_priority > OSD_POOL_PRIORITY_MAX) + pool_recovery_priority = OSD_POOL_PRIORITY_MAX; + if (pool_recovery_priority < OSD_POOL_PRIORITY_MIN) + pool_recovery_priority = OSD_POOL_PRIORITY_MIN; + // Shift range from min to max to 0 to max - min + pool_recovery_priority += (0 - OSD_POOL_PRIORITY_MIN); + ceph_assert(pool_recovery_priority >= 0 && pool_recovery_priority <= (OSD_POOL_PRIORITY_MAX - OSD_POOL_PRIORITY_MIN)); + + priority += pool_recovery_priority; + // Clamp to valid range if (priority > OSD_RECOVERY_PRIORITY_MAX) { return OSD_RECOVERY_PRIORITY_MAX; @@ -2485,7 +2496,7 @@ unsigned PG::get_recovery_priority() int64_t pool_recovery_priority = 0; pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority); - ret = clamp_recovery_priority(pool_recovery_priority + ret); + ret = clamp_recovery_priority(ret, pool_recovery_priority); } dout(20) << __func__ << " recovery priority is " << ret << dendl; return static_cast<unsigned>(ret); @@ -2516,7 +2527,7 @@ unsigned PG::get_backfill_priority() int64_t pool_recovery_priority = 0; pool.info.opts.get(pool_opts_t::RECOVERY_PRIORITY, &pool_recovery_priority); - ret = clamp_recovery_priority(pool_recovery_priority + ret); + ret = clamp_recovery_priority(ret, pool_recovery_priority); } dout(20) << __func__ << " backfill priority is " << ret << dendl; diff --git a/src/osd/PG.h b/src/osd/PG.h index a8d9d745fc6..5a423452cb5 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -1432,7 +1432,7 @@ protected: bool needs_backfill() const; /// clip calculated priority to reasonable range - inline int clamp_recovery_priority(int priority); + int clamp_recovery_priority(int prio, int pool_recovery_prio); /// get log recovery reservation priority unsigned get_recovery_priority(); /// get backfill reservation priority diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 85cdd89014b..b329bbdd779 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -68,6 +68,10 @@ #define CEPH_OSD_FEATURE_INCOMPAT_RECOVERY_DELETES CompatSet::Feature(16, "deletes in missing set") +/// pool priority range set by user +#define OSD_POOL_PRIORITY_MAX 10 +#define OSD_POOL_PRIORITY_MIN -OSD_POOL_PRIORITY_MAX + /// min recovery priority for MBackfillReserve #define OSD_RECOVERY_PRIORITY_MIN 0