Merge pull request #39140 from sseshasa/wip-dmclock-profiles-improvements

osd: Refinements to mclock built-in profiles implementation.

Reviewed-by: Josh Durgin <jdurgin@redhat.com>
Reviewed-by: Sunny Kumar <sunkumar@redhat.com>
This commit is contained in:
Neha Ojha 2021-02-24 14:18:04 -08:00 committed by GitHub
commit afb6b8d109
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 248 additions and 205 deletions

View File

@@ -3081,7 +3081,7 @@ std::vector<Option> get_global_options() {
.set_flag(Option::FLAG_RUNTIME),
Option("osd_mclock_profile", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("balanced")
.set_default("high_client_ops")
.set_enum_allowed( { "balanced", "high_recovery_ops", "high_client_ops", "custom" } )
.set_description("Which mclock profile to use")
.set_long_description("This option specifies the mclock profile to enable - one among the set of built-in profiles or a custom profile. Only considered for osd_op_queue = mclock_scheduler")

View File

@@ -16,7 +16,6 @@
#include <memory>
#include <functional>
#include "include/stringify.h"
#include "osd/scheduler/mClockScheduler.h"
#include "common/dout.h"
@@ -46,18 +45,10 @@ mClockScheduler::mClockScheduler(CephContext *cct,
{
cct->_conf.add_observer(this);
ceph_assert(num_shards > 0);
// Set default blocksize and cost for all op types.
for (op_type_t op_type = op_type_t::client_op;
op_type <= op_type_t::bg_pg_delete;
op_type = op_type_t(static_cast<size_t>(op_type) + 1)) {
client_cost_infos[op_type] = 4 * 1024;
client_scaled_cost_infos[op_type] = 1;
}
set_max_osd_capacity();
set_osd_mclock_cost_per_io();
mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
set_client_allocations();
enable_mclock_profile();
set_mclock_profile();
enable_mclock_profile_settings();
client_registry.update_from_config(cct->_conf);
}
@@ -119,6 +110,8 @@ void mClockScheduler::set_max_osd_capacity()
cct->_conf.get_val<double>("osd_mclock_max_capacity_iops_ssd");
}
}
// Set max osd bandwidth across all shards (at 4KiB blocksize)
max_osd_bandwidth = max_osd_capacity * 4 * 1024;
// Set per op-shard iops limit
max_osd_capacity /= num_shards;
}
@@ -139,45 +132,156 @@ void mClockScheduler::set_osd_mclock_cost_per_io()
}
}
void mClockScheduler::set_client_allocations()
void mClockScheduler::set_mclock_profile()
{
// Set profile specific client capacity allocations
if (mclock_profile == "balanced") {
double capacity = std::round(0.5 * max_osd_capacity);
client_allocs[op_scheduler_class::client] = capacity;
client_allocs[op_scheduler_class::background_recovery] = capacity;
} else if (mclock_profile == "high_recovery_ops") {
client_allocs[op_scheduler_class::client] =
std::round(0.25 * max_osd_capacity);
client_allocs[op_scheduler_class::background_recovery] =
std::round(0.75 * max_osd_capacity);
} else if (mclock_profile == "high_client_ops") {
client_allocs[op_scheduler_class::client] =
std::round(0.75 * max_osd_capacity);
client_allocs[op_scheduler_class::background_recovery] =
std::round(0.25 * max_osd_capacity);
} else {
ceph_assert("Invalid mclock profile" == 0);
return;
}
mclock_profile = cct->_conf.get_val<std::string>("osd_mclock_profile");
}
double mClockScheduler::get_client_allocation(op_type_t op_type)
std::string mClockScheduler::get_mclock_profile()
{
double default_allocation = 1.0;
switch (op_type) {
case op_type_t::client_op:
return client_allocs[op_scheduler_class::client];
case op_type_t::bg_recovery:
return client_allocs[op_scheduler_class::background_recovery];
default:
// TODO for other op types.
return default_allocation;
}
return mclock_profile;
}
void mClockScheduler::enable_mclock_profile()
// Set the per-class (reservation, weight, limit) allocations used by the
// built-in "balanced" mclock profile. Values are derived from
// max_osd_capacity (the per-op-shard IOPS limit) and stored in
// client_allocs; set_profile_config() later publishes them to the
// osd_mclock_scheduler_* config options.
void mClockScheduler::set_balanced_profile_allocations()
{
// Client Allocation:
// reservation: 40% | weight: 1 | limit: 100% |
// Background Recovery Allocation:
// reservation: 40% | weight: 1 | limit: 150% |
// Background Best Effort Allocation:
// reservation: 20% | weight: 2 | limit: max |
// Client
uint64_t client_res = static_cast<uint64_t>(
std::round(0.40 * max_osd_capacity));
uint64_t client_lim = static_cast<uint64_t>(
std::round(max_osd_capacity));
// NOTE(review): default_min is used where the table above says weight 1 —
// confirm against the constant's definition.
uint64_t client_wgt = default_min;
// Background Recovery
uint64_t rec_res = static_cast<uint64_t>(
std::round(0.40 * max_osd_capacity));
uint64_t rec_lim = static_cast<uint64_t>(
std::round(1.5 * max_osd_capacity));
uint64_t rec_wgt = default_min;
// Background Best Effort
uint64_t best_effort_res = static_cast<uint64_t>(
std::round(0.20 * max_osd_capacity));
uint64_t best_effort_lim = default_max;
uint64_t best_effort_wgt = 2;
// Set the allocations for the mclock clients
client_allocs[
static_cast<size_t>(op_scheduler_class::client)].update(
client_res,
client_wgt,
client_lim);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_recovery)].update(
rec_res,
rec_wgt,
rec_lim);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
best_effort_res,
best_effort_wgt,
best_effort_lim);
}
// Set the per-class (reservation, weight, limit) allocations used by the
// built-in "high_recovery_ops" profile: background recovery gets the
// largest reservation (60%) and a higher weight, while client ops are
// capped at 80% of max_osd_capacity. Results are stored in client_allocs
// and published to config via set_profile_config().
void mClockScheduler::set_high_recovery_ops_profile_allocations()
{
// Client Allocation:
// reservation: 30% | weight: 1 | limit: 80% |
// Background Recovery Allocation:
// reservation: 60% | weight: 2 | limit: 200% |
// Background Best Effort Allocation:
// reservation: 1 | weight: 2 | limit: max |
// Client
uint64_t client_res = static_cast<uint64_t>(
std::round(0.30 * max_osd_capacity));
uint64_t client_lim = static_cast<uint64_t>(
std::round(0.80 * max_osd_capacity));
uint64_t client_wgt = default_min;
// Background Recovery
uint64_t rec_res = static_cast<uint64_t>(
std::round(0.60 * max_osd_capacity));
uint64_t rec_lim = static_cast<uint64_t>(
std::round(2.0 * max_osd_capacity));
uint64_t rec_wgt = 2;
// Background Best Effort
// (reservation of default_min matches "reservation: 1" in the table above)
uint64_t best_effort_res = default_min;
uint64_t best_effort_lim = default_max;
uint64_t best_effort_wgt = 2;
// Set the allocations for the mclock clients
client_allocs[
static_cast<size_t>(op_scheduler_class::client)].update(
client_res,
client_wgt,
client_lim);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_recovery)].update(
rec_res,
rec_wgt,
rec_lim);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
best_effort_res,
best_effort_wgt,
best_effort_lim);
}
// Set the per-class (reservation, weight, limit) allocations used by the
// built-in "high_client_ops" profile (the new default, per the header's
// mclock_profile initializer): client ops get the largest reservation
// (50%), a higher weight, and no limit, while background recovery is
// capped at 100% of max_osd_capacity. Results are stored in client_allocs
// and published to config via set_profile_config().
void mClockScheduler::set_high_client_ops_profile_allocations()
{
// Client Allocation:
// reservation: 50% | weight: 2 | limit: max |
// Background Recovery Allocation:
// reservation: 25% | weight: 1 | limit: 100% |
// Background Best Effort Allocation:
// reservation: 25% | weight: 2 | limit: max |
// Client
uint64_t client_res = static_cast<uint64_t>(
std::round(0.50 * max_osd_capacity));
uint64_t client_wgt = 2;
uint64_t client_lim = default_max;
// Background Recovery
uint64_t rec_res = static_cast<uint64_t>(
std::round(0.25 * max_osd_capacity));
uint64_t rec_lim = static_cast<uint64_t>(
std::round(max_osd_capacity));
uint64_t rec_wgt = default_min;
// Background Best Effort
uint64_t best_effort_res = static_cast<uint64_t>(
std::round(0.25 * max_osd_capacity));
uint64_t best_effort_lim = default_max;
uint64_t best_effort_wgt = 2;
// Set the allocations for the mclock clients
client_allocs[
static_cast<size_t>(op_scheduler_class::client)].update(
client_res,
client_wgt,
client_lim);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_recovery)].update(
rec_res,
rec_wgt,
rec_lim);
client_allocs[
static_cast<size_t>(op_scheduler_class::background_best_effort)].update(
best_effort_res,
best_effort_wgt,
best_effort_lim);
}
void mClockScheduler::enable_mclock_profile_settings()
{
// Nothing to do for "custom" profile
if (mclock_profile == "custom") {
@@ -186,92 +290,54 @@ void mClockScheduler::enable_mclock_profile()
// Set mclock and ceph config options for the chosen profile
if (mclock_profile == "balanced") {
set_balanced_profile_config();
set_balanced_profile_allocations();
} else if (mclock_profile == "high_recovery_ops") {
set_high_recovery_ops_profile_config();
set_high_recovery_ops_profile_allocations();
} else if (mclock_profile == "high_client_ops") {
set_high_client_ops_profile_config();
set_high_client_ops_profile_allocations();
} else {
ceph_assert("Invalid choice of mclock profile" == 0);
return;
}
// Set the mclock config parameters
set_profile_config();
// Set recovery specific Ceph options
set_global_recovery_options();
}
std::string mClockScheduler::get_mclock_profile()
void mClockScheduler::set_profile_config()
{
return mclock_profile;
}
void mClockScheduler::set_balanced_profile_config()
{
double client_lim = get_client_allocation(op_type_t::client_op);
double rec_lim = get_client_allocation(op_type_t::bg_recovery);
int client_wgt = 10;
ClientAllocs client = client_allocs[
static_cast<size_t>(op_scheduler_class::client)];
ClientAllocs rec = client_allocs[
static_cast<size_t>(op_scheduler_class::background_recovery)];
ClientAllocs best_effort = client_allocs[
static_cast<size_t>(op_scheduler_class::background_best_effort)];
// Set external client params
cct->_conf.set_val(
"osd_mclock_scheduler_client_res", stringify(default_min));
cct->_conf.set_val(
"osd_mclock_scheduler_client_wgt", stringify(client_wgt));
cct->_conf.set_val(
"osd_mclock_scheduler_client_lim", stringify(client_lim));
cct->_conf.set_val("osd_mclock_scheduler_client_res",
std::to_string(client.res));
cct->_conf.set_val("osd_mclock_scheduler_client_wgt",
std::to_string(client.wgt));
cct->_conf.set_val("osd_mclock_scheduler_client_lim",
std::to_string(client.lim));
// Set background recovery client params
cct->_conf.set_val(
"osd_mclock_scheduler_background_recovery_res", stringify(default_min));
cct->_conf.set_val(
"osd_mclock_scheduler_background_recovery_wgt", stringify(default_min));
cct->_conf.set_val(
"osd_mclock_scheduler_background_recovery_lim", stringify(rec_lim));
}
cct->_conf.set_val("osd_mclock_scheduler_background_recovery_res",
std::to_string(rec.res));
cct->_conf.set_val("osd_mclock_scheduler_background_recovery_wgt",
std::to_string(rec.wgt));
cct->_conf.set_val("osd_mclock_scheduler_background_recovery_lim",
std::to_string(rec.lim));
void mClockScheduler::set_high_recovery_ops_profile_config()
{
double client_lim = get_client_allocation(op_type_t::client_op);
double rec_lim = get_client_allocation(op_type_t::bg_recovery);
int rec_wgt = 10;
// Set external client params
cct->_conf.set_val(
"osd_mclock_scheduler_client_res", stringify(default_min));
cct->_conf.set_val(
"osd_mclock_scheduler_client_wgt", stringify(default_min));
cct->_conf.set_val(
"osd_mclock_scheduler_client_lim", stringify(client_lim));
// Set background recovery client params
cct->_conf.set_val(
"osd_mclock_scheduler_background_recovery_res", stringify(default_min));
cct->_conf.set_val(
"osd_mclock_scheduler_background_recovery_wgt", stringify(rec_wgt));
cct->_conf.set_val(
"osd_mclock_scheduler_background_recovery_lim", stringify(rec_lim));
}
void mClockScheduler::set_high_client_ops_profile_config()
{
double client_lim = get_client_allocation(op_type_t::client_op);
double rec_lim = get_client_allocation(op_type_t::bg_recovery);
int client_wgt = 10;
// Set external client params
cct->_conf.set_val(
"osd_mclock_scheduler_client_res", stringify(default_min));
cct->_conf.set_val(
"osd_mclock_scheduler_client_wgt", stringify(client_wgt));
cct->_conf.set_val(
"osd_mclock_scheduler_client_lim", stringify(client_lim));
// Set background recovery client params
cct->_conf.set_val(
"osd_mclock_scheduler_background_recovery_res", stringify(default_min));
cct->_conf.set_val(
"osd_mclock_scheduler_background_recovery_wgt", stringify(default_min));
cct->_conf.set_val(
"osd_mclock_scheduler_background_recovery_lim", stringify(rec_lim));
// Set background best effort client params
cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_res",
std::to_string(best_effort.res));
cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_wgt",
std::to_string(best_effort.wgt));
cct->_conf.set_val("osd_mclock_scheduler_background_best_effort_lim",
std::to_string(best_effort.lim));
}
void mClockScheduler::set_global_recovery_options()
@@ -279,32 +345,23 @@ void mClockScheduler::set_global_recovery_options()
// Set high value for recovery max active and max backfill
int rec_max_active = 1000;
int max_backfills = 1000;
cct->_conf.set_val("osd_recovery_max_active", stringify(rec_max_active));
cct->_conf.set_val("osd_max_backfills", stringify(max_backfills));
cct->_conf.set_val("osd_recovery_max_active", std::to_string(rec_max_active));
cct->_conf.set_val("osd_max_backfills", std::to_string(max_backfills));
// Disable recovery sleep
cct->_conf.set_val("osd_recovery_sleep", stringify(0));
cct->_conf.set_val("osd_recovery_sleep_hdd", stringify(0));
cct->_conf.set_val("osd_recovery_sleep_ssd", stringify(0));
cct->_conf.set_val("osd_recovery_sleep_hybrid", stringify(0));
cct->_conf.set_val("osd_recovery_sleep", std::to_string(0));
cct->_conf.set_val("osd_recovery_sleep_hdd", std::to_string(0));
cct->_conf.set_val("osd_recovery_sleep_ssd", std::to_string(0));
cct->_conf.set_val("osd_recovery_sleep_hybrid", std::to_string(0));
// Apply the changes
cct->_conf.apply_changes(nullptr);
}
int mClockScheduler::calc_scaled_cost(op_type_t op_type, int cost)
int mClockScheduler::calc_scaled_cost(int cost)
{
double client_alloc = get_client_allocation(op_type);
if (client_alloc == 1.0) {
// Client not yet supported, return default cost.
return 1;
}
// Calculate bandwidth from max osd capacity (at 4KiB blocksize).
double max_osd_bandwidth = max_osd_capacity * num_shards * 4 * 1024;
// Calculate scaled cost based on item cost
double scaled_cost = (cost / max_osd_bandwidth) * client_alloc;
// Calculate scaled cost in msecs based on item cost
int scaled_cost = std::floor((cost / max_osd_bandwidth) * 1000);
// Scale the cost down by an additional cost factor if specified
// to account for different device characteristics (hdd, ssd).
@@ -314,45 +371,7 @@ int mClockScheduler::calc_scaled_cost(op_type_t op_type, int cost)
scaled_cost *= osd_mclock_cost_per_io_msec / 1000.0;
}
return std::floor(scaled_cost);
}
bool mClockScheduler::maybe_update_client_cost_info(
op_type_t op_type, int new_cost)
{
int capped_item_cost = 4 * 1024 * 1024;
if (new_cost == 0) {
return false;
}
// The mclock params represented in terms of the per-osd capacity
// are scaled up or down according to the cost associated with
// item cost and updated within the dmclock server.
int cur_cost = client_cost_infos[op_type];
// Note: Cap the scaling of item cost to ~4MiB as the tag increments
// beyond this point are too long causing performance issues. This may
// need to be in place until benchmark data is available or a better
// scaling model can be put in place. This is a TODO.
if (new_cost >= capped_item_cost) {
new_cost = capped_item_cost;
}
bool cost_changed =
((new_cost >= (cur_cost << 1)) || (cur_cost >= (new_cost << 1)));
if (cost_changed) {
client_cost_infos[op_type] = new_cost;
// Update client scaled cost info
int scaled_cost = std::max(calc_scaled_cost(op_type, new_cost), 1);
if (scaled_cost != client_scaled_cost_infos[op_type]) {
client_scaled_cost_infos[op_type] = scaled_cost;
return true;
}
}
return false;
return std::max(scaled_cost, 1);
}
void mClockScheduler::dump(ceph::Formatter &f) const
@@ -362,18 +381,13 @@ void mClockScheduler::dump(ceph::Formatter &f) const
void mClockScheduler::enqueue(OpSchedulerItem&& item)
{
auto id = get_scheduler_id(item);
auto op_type = item.get_op_type();
int cost = client_scaled_cost_infos[op_type];
// Re-calculate the scaled cost for the client if the item cost changed
if (maybe_update_client_cost_info(op_type, item.get_cost())) {
cost = client_scaled_cost_infos[op_type];
}
// TODO: move this check into OpSchedulerItem, handle backwards compat
if (op_scheduler_class::immediate == item.get_scheduler_class()) {
if (op_scheduler_class::immediate == id.class_id) {
immediate.push_front(std::move(item));
} else {
int cost = calc_scaled_cost(item.get_cost());
// Add item to scheduler queue
scheduler.add_request(
std::move(item),
id,
@@ -448,12 +462,15 @@ void mClockScheduler::handle_conf_change(
changed.count("osd_mclock_max_capacity_iops_hdd") ||
changed.count("osd_mclock_max_capacity_iops_ssd")) {
set_max_osd_capacity();
enable_mclock_profile();
client_registry.update_from_config(conf);
if (mclock_profile != "custom") {
enable_mclock_profile_settings();
client_registry.update_from_config(conf);
}
}
if (changed.count("osd_mclock_profile")) {
enable_mclock_profile();
set_mclock_profile();
if (mclock_profile != "custom") {
enable_mclock_profile_settings();
client_registry.update_from_config(conf);
}
}
@@ -469,4 +486,9 @@ void mClockScheduler::handle_conf_change(
}
}
// Unregister this scheduler as a config observer (it was registered in the
// constructor) so the conf system does not call back into a destroyed object.
mClockScheduler::~mClockScheduler()
{
cct->_conf.remove_observer(this);
}
}

View File

@@ -38,7 +38,6 @@ constexpr uint64_t default_max = 999999;
using client_id_t = uint64_t;
using profile_id_t = uint64_t;
using op_type_t = OpSchedulerItem::OpQueueable::op_type_t;
struct client_profile_id_t {
client_id_t client_id;
@@ -68,11 +67,34 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
const uint32_t num_shards;
bool is_rotational;
double max_osd_capacity;
double max_osd_bandwidth;
uint64_t osd_mclock_cost_per_io_msec;
std::string mclock_profile = "balanced";
std::map<op_scheduler_class, double> client_allocs;
std::map<op_type_t, int> client_cost_infos;
std::map<op_type_t, int> client_scaled_cost_infos;
std::string mclock_profile = "high_client_ops";
// Per-class mclock allocation tuple: reservation, weight and limit
// (expressed in IOPS, derived from max_osd_capacity by the
// set_*_profile_allocations() methods).
struct ClientAllocs {
uint64_t res;
uint64_t wgt;
uint64_t lim;
ClientAllocs(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
update(_res, _wgt, _lim);
}
// Overwrite all three values in one call.
inline void update(uint64_t _res, uint64_t _wgt, uint64_t _lim) {
res = _res;
wgt = _wgt;
lim = _lim;
}
};
// One entry per op_scheduler_class, indexed by the enum value cast to
// size_t. The size assumes `client` is the last enumerator — confirm
// against the op_scheduler_class definition.
std::array<
ClientAllocs,
static_cast<size_t>(op_scheduler_class::client) + 1
> client_allocs = {
// Placeholder, get replaced with configured values
ClientAllocs(1, 1, 1), // background_recovery
ClientAllocs(1, 1, 1), // background_best_effort
ClientAllocs(1, 1, 1), // immediate (not used)
ClientAllocs(1, 1, 1) // client
};
class ClientRegistry {
std::array<
crimson::dmclock::ClientInfo,
@@ -115,6 +137,7 @@ class mClockScheduler : public OpScheduler, md_config_obs_t {
public:
mClockScheduler(CephContext *cct, uint32_t num_shards, bool is_rotational);
~mClockScheduler() override;
// Set the max osd capacity in iops
void set_max_osd_capacity();
@@ -122,35 +145,32 @@
// Set the cost per io for the osd
void set_osd_mclock_cost_per_io();
// Set the mclock related config params based on the profile
void enable_mclock_profile();
// Set the mclock profile type to enable
void set_mclock_profile();
// Get the active mclock profile
std::string get_mclock_profile();
// Set client capacity allocations based on profile
void set_client_allocations();
// Set "balanced" profile allocations
void set_balanced_profile_allocations();
// Get client allocation
double get_client_allocation(op_type_t op_type);
// Set "high_recovery_ops" profile allocations
void set_high_recovery_ops_profile_allocations();
// Set "balanced" profile parameters
void set_balanced_profile_config();
// Set "high_client_ops" profile allocations
void set_high_client_ops_profile_allocations();
// Set "high_recovery_ops" profile parameters
void set_high_recovery_ops_profile_config();
// Set the mclock related config params based on the profile
void enable_mclock_profile_settings();
// Set "high_client_ops" profile parameters
void set_high_client_ops_profile_config();
// Set mclock config parameter based on allocations
void set_profile_config();
// Set recovery specific Ceph settings for profiles
void set_global_recovery_options();
// Calculate scale cost per item
int calc_scaled_cost(op_type_t op_type, int cost);
// Update mclock client cost info
bool maybe_update_client_cost_info(op_type_t op_type, int new_cost);
int calc_scaled_cost(int cost);
// Enqueue op in the back of the regular queue
void enqueue(OpSchedulerItem &&item) final;

View File

@@ -93,7 +93,7 @@ TEST_F(mClockSchedulerTest, TestEmpty) {
for (unsigned i = 100; i < 105; i+=2) {
q.enqueue(create_item(i, client1, op_scheduler_class::client));
std::this_thread::sleep_for(std::chrono::milliseconds(1));
std::this_thread::sleep_for(std::chrono::microseconds(1));
}
ASSERT_FALSE(q.empty());
@@ -126,7 +126,7 @@ TEST_F(mClockSchedulerTest, TestSingleClientOrderedEnqueueDequeue) {
for (unsigned i = 100; i < 105; ++i) {
q.enqueue(create_item(i, client1, op_scheduler_class::client));
std::this_thread::sleep_for(std::chrono::milliseconds(1));
std::this_thread::sleep_for(std::chrono::microseconds(1));
}
auto r = get_item(q.dequeue());
@@ -150,6 +150,7 @@ TEST_F(mClockSchedulerTest, TestMultiClientOrderedEnqueueDequeue) {
for (unsigned i = 0; i < NUM; ++i) {
for (auto &&c: {client1, client2, client3}) {
q.enqueue(create_item(i, c));
std::this_thread::sleep_for(std::chrono::microseconds(1));
}
}