diff --git a/doc/dev/bluestore-codel.rst b/doc/dev/bluestore-codel.rst new file mode 100644 index 00000000000..4aa0154456b --- /dev/null +++ b/doc/dev/bluestore-codel.rst @@ -0,0 +1,41 @@ +============================================ +BlueStore Bufferbloat Mitigation Using CoDel +============================================ + + +Introduction +------------ +Bufferbloat happens when a frontend buffer too much data to a backend. +This can introduce latency spikes to the backend and compromise the +request schedulability of the frontend. + +BlueStore has the bufferbloat problem due to its large queue. All +write requests are submitted immediately to BlueStore to achieve high +performance. However, this can compromise request schedulability in OSD. +As a solution, the CoDel algorithm is implemented in the BlueStore as +an admission control system to control the amount of transaction +submitted to BlueStore. This mechanism will negatively impact the +throughput of BlueStore. However, a tradeoff parameter has been introduced +to control BlueStore throughput loss versus BlueStore latency decrease. + +Configurations +-------------- +CoDel can be enabled using "*bluestore_codel*" config. The other important +config that needs to be set is "*bluestore_codel_throughput_latency_tradeoff*". +This config adjust the tradeoff between BlueStore throughput loss and +BlueStore latency decrease. This parameter defines the amount of throughput +loss in MB/s for one ms decrease in BlueStore latency. For example, a value +of 5 means that we are willing to lose maximum of 5 MB/s of throughput for +every 1 ms decrease in BlueStore latency. + +Experiments +----------- +For measuring the impact of BlueStore CoDel on BlueStore, we measured the +transaction latency inside the BlueStore (BlueStore latency) and BlueStore +throughput. We compared this measurements with measurements from Vanilla BlueStore. +These experiments shows that: + +1. The BlueStore CoDel can decrease the BlueStore latency by small and controllable +impact on throughput. +2. The BlueStore CoDel can react to workload changes to keep the desired tradeoff +between latency and throughput. diff --git a/qa/objectstore_debug/bluestore-codel.yaml b/qa/objectstore_debug/bluestore-codel.yaml new file mode 100644 index 00000000000..40f6fc084e1 --- /dev/null +++ b/qa/objectstore_debug/bluestore-codel.yaml @@ -0,0 +1,45 @@ +overrides: + thrashosds: + bdev_inject_crash: 2 + bdev_inject_crash_probability: .5 + ceph: + fs: xfs + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + bluestore allocator: bitmap + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + # this doesn't work with failures bc the log writes are not atomic across the two backends + # bluestore bluefs env mirror: true + bdev enable discard: true + bdev async discard: true + bluestore codel: true + ceph-deploy: + fs: xfs + bluestore: yes + conf: + osd: + osd objectstore: bluestore + bluestore block size: 96636764160 + debug bluestore: 20 + debug bluefs: 20 + debug rocksdb: 10 + bluestore fsck on mount: true + # lower the full ratios since we can fill up a 100gb osd so quickly + mon osd full ratio: .9 + mon osd backfillfull_ratio: .85 + mon osd nearfull ratio: .8 + osd failsafe full ratio: .95 + bdev enable discard: true + bdev async discard: true + bluestore codel: true + diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in index e4e271a1b12..5183360b730 100644 --- a/src/common/options/global.yaml.in +++ b/src/common/options/global.yaml.in @@ -6312,3 +6312,72 @@ options: default: 0 services: - mgr +- name: bluestore_codel + type: bool + level: advanced + desc: enable/disable bluestore SlowFastCodel + default: false + with_legacy: true +- name: bluestore_codel_throughput_latency_tradeoff + type: float + level: advanced + desc: adjust the tradeoff between throughput and bluestore latency in SlowFastCodel + long_desc: This parameter defines the amount of throughput loss (MB/s) for one ms + decrease in bluestore latency. (a value of 5 means that we are willing to lose + maximum of 5 MB/s of throughput for every 1 ms decrease in bluestore latency) + default: 5 + with_legacy: true +- name: bluestore_codel_initial_target_latency + type: float + level: advanced + desc: initial target latency for SlowFastCodel in ms + default: 5.0 + with_legacy: true +- name: bluestore_codel_slow_interval + type: float + level: advanced + desc: the interval of slow loop in SlowFastCodel in ms (this parameter should be larger that 'bluestore_codel_fast_interval') + default: 500.0 + with_legacy: true +- name: bluestore_codel_fast_interval + type: float + level: advanced + desc: the interval of the fast loop in SlowFastCodel in ms + default: 50.0 + with_legacy: true +- name: bluestore_codel_min_target_latency + type: float + level: advanced + desc: the minimum possible target latency in SlowFastCodel in ms + default: 1.0 + with_legacy: true +- name: bluestore_codel_max_target_latency + type: float + level: advanced + desc: the maximum possible target latency in SlowFastCodel in ms + default: 1000.0 + with_legacy: true +- name: bluestore_codel_initial_budget_bytes + type: size + level: advanced + desc: the initial bluestore throttle budget in SlowFastCodel + default: 100_K + with_legacy: true +- name: bluestore_codel_min_budget_bytes + type: size + level: advanced + desc: the minimum bluestore throttle budget in SlowFastCodel + default: 100_K + with_legacy: true +- name: bluestore_codel_budget_increment_bytes + type: size + level: advanced + desc: the increment size for opening the bluestore throttle in SlowFastCodel + default: 10_K + with_legacy: true +- name: bluestore_codel_regression_history_size + type: int + level: advanced + desc: number of the slow interval throughput and latency samples that SlowFastCodel keeps for regression + default: 100 + with_legacy: true diff --git a/src/common/regression_utils.h b/src/common/regression_utils.h new file mode 100644 index 00000000000..8f2182949b7 --- /dev/null +++ b/src/common/regression_utils.h @@ -0,0 +1,127 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#pragma once + +#include +#include +#include +#include + +#define Z_P 2.33 // z score for 99th percentile + + +namespace ceph { + /*** + * Calculate the inverse of a 2x2 matrix. + * @param matrix& m, an square 2x2 matrix + * @return the inverse of the m (m^-1) + */ + static boost::numeric::ublas::matrix + matrix_inverse(boost::numeric::ublas::matrix &m) { + assert(m.size1() == m.size2() && + "Can only calculate the inverse of square matrices"); + assert(m.size1() == 2 && m.size2() == 2 && "Only for 2x2 matrices"); + boost::numeric::ublas::matrix m_inverse(2, 2); + const double a = m(0, 0); + const double b = m(0, 1); + const double c = m(1, 0); + const double d = m(1, 1); + const double determinant = 1.0 / ((a * d) - (b * c)); + m_inverse(0, 0) = d * determinant; + m_inverse(0, 1) = -b * determinant; + m_inverse(1, 0) = -c * determinant; + m_inverse(1, 1) = a * determinant; + return m_inverse; + } + + /*** + * Find a logarithmic function in form of "y = a + b * ln(x)" which fits + * the given points (x_values and y_values). + * @param std::vector x_values, x values for sample points + * @param std::vector y_values, y values for sample points + * @param double theta[2], holds the a and b as output (theta[0] = a and theta[1] = b) + */ + static void regression( + const std::vector &x_values, + const std::vector &y_values, + double theta[2]) { + assert(x_values.size() == y_values.size() && + "x and y values vectors should have a same size."); + const int n = x_values.size(); + + boost::numeric::ublas::matrix y_m(n, 1); + for (int i = 0; i < n; i++) { + y_m(i, 0) = y_values[i]; + } + + boost::numeric::ublas::scalar_matrix sm(n, 2, 1); + boost::numeric::ublas::matrix x_new_m(sm); + for (int i = 0; i < n; i++) { + x_new_m(i, 0) = 1; + x_new_m(i, 1) = std::log(x_values[i]); + } + boost::numeric::ublas::matrix x_new_trans_m = boost::numeric::ublas::trans( + x_new_m); + boost::numeric::ublas::matrix x_new_trans_dot_x_new_m = boost::numeric::ublas::prod( + x_new_trans_m, x_new_m); + boost::numeric::ublas::matrix temp_1_m = matrix_inverse( + x_new_trans_dot_x_new_m); + boost::numeric::ublas::matrix temp_2_m = boost::numeric::ublas::prod( + x_new_trans_m, y_m); + boost::numeric::ublas::matrix theta_m = boost::numeric::ublas::prod( + temp_1_m, temp_2_m); + theta[0] = theta_m(0, 0); + theta[1] = theta_m(1, 0); + } + + /*** + * Finds the x location on a fitted logarithmic curve on sample points where + * the slope is equal to target_slope + * @param x_values, x values for sample points + * @param y_values, y values for sample points + * @param target_slope, the slope that we are looking for + * @return the x location where the slope of the curve is target_slope + */ + static double find_slope_on_curve( + const std::vector &x_values, + const std::vector &y_values, + double target_slope) { + assert(x_values.size() == y_values.size() && + "x and y values vectors should have a same size."); + assert(target_slope != 0 && + "The target slope of zero will result to a inf x, try a nonzero value."); + assert(target_slope >= 0 && + "The target slope for a logarithmic function should be positive."); + double theta[2]; // theta[0] + theta[1] * ln(x) + regression(x_values, y_values, + theta); // find the logarithmic function using regression + double target_x = theta[1] / + target_slope; // find the x where the slope is close to target_slope + return target_x; + } + + /*** + * Finds the mu and std parameters of the lognormal distribution from its mode + * and x boundaries. + * @param mode, the mode of the distribution. + * @param min_x, x lower boundary of distribution (zero percentile) + * @param max_x, x upper boundary of distribution (99th percentile) + * @param params, holds the calculated distribution parameters (mu and std) as + * output (params[0] = mu and params[1] = std) + */ + static void + find_log_normal_dist_params(double mode, double min_x, double max_x, + double params[2]) { + assert(min_x < max_x && "The min_x should be smaller than max_x"); + assert(mode >= min_x && mode < max_x && + "The mode should be between min_x and max_x"); + double max_x_normalized = max_x - min_x; + double mode_normalized = mode - min_x; + double std_dev = (-Z_P + std::sqrt( + Z_P * Z_P + 4 * std::log(max_x_normalized) - + 4 * std::log(mode_normalized))) / 2; + double mu = std::log(max_x_normalized) - Z_P * std_dev; + params[0] = mu; + params[1] = std_dev; + } +} diff --git a/src/crimson/os/alienstore/CMakeLists.txt b/src/crimson/os/alienstore/CMakeLists.txt index f006ba33a85..9d59225c79b 100644 --- a/src/crimson/os/alienstore/CMakeLists.txt +++ b/src/crimson/os/alienstore/CMakeLists.txt @@ -55,6 +55,7 @@ set(alien_store_srcs ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStore.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/simple_bitmap.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/bluestore_types.cc + ${PROJECT_SOURCE_DIR}/src/os/bluestore/BlueStoreSlowFastCoDel.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/fastbmap_allocator_impl.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/FreelistManager.cc ${PROJECT_SOURCE_DIR}/src/os/bluestore/HybridAllocator.cc diff --git a/src/os/CMakeLists.txt b/src/os/CMakeLists.txt index 204a29fea8c..9bb6be0db3d 100644 --- a/src/os/CMakeLists.txt +++ b/src/os/CMakeLists.txt @@ -29,6 +29,7 @@ if(WITH_BLUESTORE) bluestore/BlueStore.cc bluestore/simple_bitmap.cc bluestore/bluestore_types.cc + bluestore/BlueStoreSlowFastCoDel.cc bluestore/fastbmap_allocator_impl.cc bluestore/FreelistManager.cc bluestore/StupidAllocator.cc diff --git a/src/os/bluestore/BlueStore.cc b/src/os/bluestore/BlueStore.cc index 86062f290f0..a780cf92578 100644 --- a/src/os/bluestore/BlueStore.cc +++ b/src/os/bluestore/BlueStore.cc @@ -4601,6 +4601,15 @@ BlueStore::BlueStore(CephContext *cct, _init_logger(); cct->_conf.add_observer(this); set_cache_shards(1); + if ( cct->_conf->bluestore_codel) { + codel = std::make_unique( + cct, [this](int64_t x) mutable { + this->throttle.reset_kv_throttle_max(x); + }, + [this]() mutable { + return this->throttle.get_kv_throttle_current(); + }); + } } BlueStore::~BlueStore() @@ -4666,6 +4675,17 @@ const char **BlueStore::get_tracked_conf_keys() const "bluestore_warn_on_no_per_pool_omap", "bluestore_warn_on_no_per_pg_omap", "bluestore_max_defer_interval", + "bluestore_codel", + "bluestore_codel_slow_interval", + "bluestore_codel_fast_interval", + "bluestore_codel_initial_target_latency", + "bluestore_codel_min_target_latency", + "bluestore_codel_max_target_latency", + "bluestore_codel_throughput_latency_tradeoff", + "bluestore_codel_initial_budget_bytes", + "bluestore_codel_min_budget_bytes", + "bluestore_codel_budget_increment_bytes", + "bluestore_codel_regression_history_size", NULL }; return KEYS; @@ -4724,6 +4744,9 @@ void BlueStore::handle_conf_change(const ConfigProxy& conf, changed.count("bluestore_throttle_deferred_bytes") || changed.count("bluestore_throttle_trace_rate")) { throttle.reset_throttle(conf); + if (codel) { + codel->reset_bluestore_budget(); + } } if (changed.count("bluestore_max_defer_interval")) { if (bdev) { @@ -4736,6 +4759,21 @@ void BlueStore::handle_conf_change(const ConfigProxy& conf, changed.count("osd_memory_expected_fragmentation")) { _update_osd_memory_options(); } + if (changed.count("bluestore_codel") || + changed.count("bluestore_codel_slow_interval") || + changed.count("bluestore_codel_fast_interval") || + changed.count("bluestore_codel_initial_target_latency") || + changed.count("bluestore_codel_min_target_latency") || + changed.count("bluestore_codel_max_target_latency") || + changed.count("bluestore_codel_throughput_latency_tradeoff") || + changed.count("bluestore_codel_initial_budget_bytes") || + changed.count("bluestore_codel_min_budget_bytes") || + changed.count("bluestore_codel_budget_increment_bytes") || + changed.count("bluestore_codel_regression_history_size")) { + if (codel) { + codel->on_config_changed(cct); + } + } } void BlueStore::_set_compression() @@ -12561,6 +12599,9 @@ void BlueStore::_txc_state_proc(TransContext *txc) case TransContext::STATE_KV_DONE: throttle.log_state_latency(*txc, logger, l_bluestore_state_kv_done_lat); + if (codel) { + codel->update_from_txc_info(txc->txc_state_proc_start, txc->bytes); + } if (txc->deferred_txn) { txc->set_state(TransContext::STATE_DEFERRED_QUEUED); _deferred_queue(txc); @@ -14032,6 +14073,7 @@ int BlueStore::queue_transactions( logger->inc(l_bluestore_txc); // execute (start) + txc->txc_state_proc_start = mono_clock::now(); _txc_state_proc(txc); if (bdev->is_smr()) { diff --git a/src/os/bluestore/BlueStore.h b/src/os/bluestore/BlueStore.h index 0f804595ebb..e35e0bd4c34 100644 --- a/src/os/bluestore/BlueStore.h +++ b/src/os/bluestore/BlueStore.h @@ -51,6 +51,7 @@ #include "bluestore_types.h" #include "BlueFS.h" #include "common/EventTrace.h" +#include "BlueStoreSlowFastCoDel.h" #ifdef WITH_BLKIN #include "common/zipkin_trace.h" @@ -1724,6 +1725,7 @@ public: uint64_t seq = 0; ceph::mono_clock::time_point start; ceph::mono_clock::time_point last_stamp; + ceph::mono_clock::time_point txc_state_proc_start; uint64_t last_nid = 0; ///< if non-zero, highest new nid we allocated uint64_t last_blobid = 0; ///< if non-zero, highest new blobid we allocated @@ -1900,8 +1902,16 @@ public: trace_period_mcs = rate > 0 ? floor((1/rate) * 1000000.0) : 0; #endif } + int64_t get_kv_throttle_current() { + return throttle_bytes.get_current(); + } + void reset_kv_throttle_max(int64_t m) { + throttle_bytes.reset_max(m); + } } throttle; + std::unique_ptr codel; + typedef boost::intrusive::list< TransContext, boost::intrusive::member_hook< diff --git a/src/os/bluestore/BlueStoreSlowFastCoDel.cc b/src/os/bluestore/BlueStoreSlowFastCoDel.cc new file mode 100644 index 00000000000..e85a934ec82 --- /dev/null +++ b/src/os/bluestore/BlueStoreSlowFastCoDel.cc @@ -0,0 +1,272 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#include "BlueStoreSlowFastCoDel.h" + +#include "common/regression_utils.h" + +BlueStoreSlowFastCoDel::BlueStoreSlowFastCoDel( + CephContext *_cct, + std::function _bluestore_budget_reset_callback, + std::function _get_kv_throttle_current) : + fast_timer(_cct, fast_timer_lock), + slow_timer(_cct, slow_timer_lock), + bluestore_budget_reset_callback(_bluestore_budget_reset_callback), + get_kv_throttle_current(_get_kv_throttle_current) { + on_config_changed(_cct); +} + +BlueStoreSlowFastCoDel::~BlueStoreSlowFastCoDel() { + { + std::lock_guard l1{fast_timer_lock}; + fast_timer.cancel_all_events(); + fast_timer.shutdown(); + } + + { + std::lock_guard l2{slow_timer_lock}; + slow_timer.cancel_all_events(); + slow_timer.shutdown(); + } + + regression_throughput_history.clear(); + regression_target_latency_history.clear(); +} + +void BlueStoreSlowFastCoDel::update_from_txc_info( + ceph::mono_clock::time_point txc_start_time, + uint64_t txc_bytes) { + std::lock_guard l(register_lock); + ceph::mono_clock::time_point now = ceph::mono_clock::now(); + int64_t latency = std::chrono::nanoseconds(now - txc_start_time).count(); + + if (activated && max_queue_length < get_kv_throttle_current()) { + max_queue_length = get_kv_throttle_current(); + } + if (min_latency == INITIAL_LATENCY_VALUE || latency < min_latency) { + min_latency = latency; + } + slow_interval_txc_cnt++; + slow_interval_registered_bytes += txc_bytes; +} + +void BlueStoreSlowFastCoDel::on_min_latency_violation() { + if (target_latency > 0) { + double diff = (double) (target_latency - min_latency); + auto error_ratio = std::abs(diff) / min_latency; + if (error_ratio > 0.5) { + error_ratio = 0.5; + } + bluestore_budget = std::max(bluestore_budget * (1 - error_ratio), + min_bluestore_budget * 1.0); + } +} + +void BlueStoreSlowFastCoDel::on_no_violation() { + if (bluestore_budget < max_queue_length * 1.5) { + bluestore_budget = bluestore_budget + bluestore_budget_increment; + } +} + +void BlueStoreSlowFastCoDel::on_config_changed(CephContext *cct) { + { + std::lock_guard l(register_lock); + + activated = cct->_conf->bluestore_codel; + target_slope = cct->_conf->bluestore_codel_throughput_latency_tradeoff; + slow_interval = ((int64_t) cct->_conf->bluestore_codel_slow_interval) * + 1000 * 1000; + initial_fast_interval = ((int64_t) + cct->_conf->bluestore_codel_fast_interval) * 1000 * 1000; + initial_target_latency = ((int64_t) + cct->_conf->bluestore_codel_initial_target_latency) * 1000 * 1000; + min_target_latency = ((int64_t) + cct->_conf->bluestore_codel_min_target_latency) * 1000 * 1000; + max_target_latency = ((int64_t) + cct->_conf->bluestore_codel_max_target_latency) * 1000 * 1000; + initial_bluestore_budget = cct->_conf->bluestore_codel_initial_budget_bytes; + min_bluestore_budget = cct->_conf->bluestore_codel_min_budget_bytes; + bluestore_budget_increment = + cct->_conf->bluestore_codel_budget_increment_bytes; + regression_history_size = + cct->_conf->bluestore_codel_regression_history_size; + + bluestore_budget = initial_bluestore_budget; + min_bluestore_budget = initial_bluestore_budget; + max_queue_length = min_bluestore_budget; + fast_interval = initial_fast_interval; + target_latency = initial_target_latency; + min_latency = INITIAL_LATENCY_VALUE; + slow_interval_registered_bytes = 0; + regression_throughput_history.clear(); + regression_target_latency_history.clear(); + slow_interval_start = ceph::mono_clock::zero(); + } + + { + std::lock_guard l1{fast_timer_lock}; + fast_timer.cancel_all_events(); + fast_timer.init(); + } + _fast_interval_process(); + { + std::lock_guard l2{slow_timer_lock}; + slow_timer.cancel_all_events(); + slow_timer.init(); + } + _slow_interval_process(); +} + +void BlueStoreSlowFastCoDel::reset_bluestore_budget() { + if (activated) { + bluestore_budget = std::max(min_bluestore_budget, bluestore_budget); + bluestore_budget_reset_callback(bluestore_budget); + } +} + +void BlueStoreSlowFastCoDel::_fast_interval_process() { + std::lock_guard l(register_lock); + if (target_latency != INITIAL_LATENCY_VALUE && + min_latency != INITIAL_LATENCY_VALUE) { + if (activated) { + if (_check_latency_violation()) { + // min latency violation + violation_count++; + _update_interval(); + on_min_latency_violation(); // handle the violation + } else { + // no latency violation + violation_count = 0; + fast_interval = initial_fast_interval; + on_no_violation(); + } + bluestore_budget = std::max(min_bluestore_budget, bluestore_budget); + bluestore_budget_reset_callback(bluestore_budget); + } + + // reset interval + min_latency = INITIAL_LATENCY_VALUE; + + on_fast_interval_finished(); + } + + auto codel_ctx = new LambdaContext( + [this](int r) { + _fast_interval_process(); + }); + auto interval_duration = std::chrono::nanoseconds(fast_interval); + fast_timer.add_event_after(interval_duration, codel_ctx); +} + +void BlueStoreSlowFastCoDel::_slow_interval_process() { + std::lock_guard l(register_lock); + ceph::mono_clock::time_point now = ceph::mono_clock::now(); + if (activated && !ceph::mono_clock::is_zero(slow_interval_start) + && slow_interval_txc_cnt > 0) { + double time_sec = nanosec_to_sec( + std::chrono::nanoseconds(now - slow_interval_start).count()); + + double slow_interval_throughput = + (slow_interval_registered_bytes * 1.0) / time_sec; + slow_interval_throughput = slow_interval_throughput / (1024.0 * 1024.0); + regression_target_latency_history.push_back( + nanosec_to_millisec(target_latency)); + regression_throughput_history.push_back(slow_interval_throughput); + if (regression_target_latency_history.size() > regression_history_size) { + regression_target_latency_history.erase( + regression_target_latency_history.begin()); + regression_throughput_history.erase( + regression_throughput_history.begin()); + } + std::vector targets; + std::vector throughputs; + double target_ms = nanosec_to_millisec(initial_target_latency); + // If there is sufficient number of points, use the regression to find the + // target_ms. Otherwise, target_ms will be initial_target_latency + if (regression_target_latency_history.size() >= regression_history_size) { + target_ms = ceph::find_slope_on_curve( + regression_target_latency_history, + regression_throughput_history, + target_slope); + } + + target_latency_without_noise = millisec_to_nanosec(target_ms); + target_latency_without_noise = std::max(target_latency_without_noise, + min_target_latency); + target_latency_without_noise = std::min(target_latency_without_noise, + max_target_latency); + target_ms = nanosec_to_millisec(target_latency_without_noise); + + // add log_normal noise + unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + std::default_random_engine generator(seed); + double dist_params[2]; + double rnd_std_dev = 5; + ceph::find_log_normal_dist_params( + target_ms, + nanosec_to_millisec(min_target_latency), + target_ms * rnd_std_dev, + dist_params); + std::lognormal_distribution distribution(dist_params[0], + dist_params[1]); + + target_latency = millisec_to_nanosec(distribution(generator)); + target_latency += min_target_latency; + + if (target_latency < millisec_to_nanosec(target_ms)) { + std::uniform_real_distribution<> distr(0, 0.5); + target_latency = target_latency + + (target_latency - millisec_to_nanosec(target_ms)) * + distr(generator); + } + + if (target_latency != INITIAL_LATENCY_VALUE) { + target_latency = std::max(target_latency, min_target_latency); + target_latency = std::min(target_latency, max_target_latency); + } + + on_slow_interval_finished(); + } + + slow_interval_start = ceph::mono_clock::now(); + slow_interval_registered_bytes = 0; + slow_interval_txc_cnt = 0; + max_queue_length = min_bluestore_budget; + + auto codel_ctx = new LambdaContext( + [this](int r) { + _slow_interval_process(); + }); + auto interval_duration = std::chrono::nanoseconds(slow_interval); + slow_timer.add_event_after(interval_duration, codel_ctx); +} + + +/** +* check if the min latency violate the target +* @return true if min latency violate the target, false otherwise +*/ +bool BlueStoreSlowFastCoDel::_check_latency_violation() { + if (target_latency != INITIAL_LATENCY_VALUE && + min_latency != INITIAL_LATENCY_VALUE) { + if (min_latency > target_latency) { + return true; + } + } + return false; +} + +void BlueStoreSlowFastCoDel::_update_interval() { + auto sqrt = (int) std::round(std::sqrt(violation_count)); + fast_interval = initial_fast_interval / sqrt; + if (fast_interval <= 0) { + fast_interval = 1000; + } +} + +int64_t BlueStoreSlowFastCoDel::get_bluestore_budget() { + return bluestore_budget; +} + +int64_t BlueStoreSlowFastCoDel::get_target_latency() { + return target_latency; +} diff --git a/src/os/bluestore/BlueStoreSlowFastCoDel.h b/src/os/bluestore/BlueStoreSlowFastCoDel.h new file mode 100644 index 00000000000..242260f00b4 --- /dev/null +++ b/src/os/bluestore/BlueStoreSlowFastCoDel.h @@ -0,0 +1,128 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- + +#pragma once + +#include + +#include "include/Context.h" +#include "common/Timer.h" +#include "common/ceph_time.h" + +class BlueStoreSlowFastCoDel { +public: + BlueStoreSlowFastCoDel( + CephContext *_cct, + std::function _bluestore_budget_reset_callback, + std::function _get_kv_throttle_current); + + virtual ~BlueStoreSlowFastCoDel(); + + void on_config_changed(CephContext *cct); + + void reset_bluestore_budget(); + + void update_from_txc_info( + ceph::mono_clock::time_point txc_start_time, + uint64_t txc_bytes); + + int64_t get_bluestore_budget(); + + int64_t get_target_latency(); + + bool is_activated(); + +protected: + static const int64_t INITIAL_LATENCY_VALUE = -1; + + /* config values */ + // Config value 'bluestore_codel',true if SlowFastCodel is activated + bool activated = false; + // Config value 'bluestore_codel_fast_interval', Initial interval for fast loop + int64_t initial_fast_interval = INITIAL_LATENCY_VALUE; + // Config value 'bluestore_codel_initial_target_latency', Initial target latency + // to start the algorithm + int64_t initial_target_latency = INITIAL_LATENCY_VALUE; + // Config value 'bluestore_codel_slow_interval', the interval for the slow loop + int64_t slow_interval = INITIAL_LATENCY_VALUE; + // Config value 'bluestore_codel_min_target_latency', min possible value for target + int64_t min_target_latency = INITIAL_LATENCY_VALUE; // in ns + // Config value 'bluestore_codel_max_target_latency', max possible value for target + int64_t max_target_latency = INITIAL_LATENCY_VALUE; // in ns + // Config value 'bluestore_codel_throughput_latency_tradeoff', define the + // tradeoff between throughput and latency (MB/s loss for every 1ms latency drop) + double target_slope = 5; + // Config value 'bluestore_codel_regression_history_size', regression history size + int64_t regression_history_size = 100; + // Config value 'bluestore_codel_min_budget_bytes', the minimum bluestore + // throttle budget + int64_t min_bluestore_budget = 102400; + // Config value 'bluestore_codel_initial_budget_bytes', the initial bluestore + // throttle budget + int64_t initial_bluestore_budget = 102400; + // Config value 'bluestore_codel_budget_increment_bytes', the increment size + // for opening the bluestore throttle + int64_t bluestore_budget_increment = 102400; + + /* internal state variables */ + // current interval for the fast loop + int64_t fast_interval = INITIAL_LATENCY_VALUE; + // current target latency that fast loop is using + int64_t target_latency = INITIAL_LATENCY_VALUE; + int64_t target_latency_without_noise = INITIAL_LATENCY_VALUE; + // min latency in the current fast interval + int64_t min_latency = INITIAL_LATENCY_VALUE; + int64_t violation_count = 0; + ceph::mutex fast_timer_lock = ceph::make_mutex("CoDel::fast_timer_lock"); + ceph::mutex slow_timer_lock = ceph::make_mutex("CoDel::slow_timer_lock"); + ceph::mutex register_lock = ceph::make_mutex("CoDel::register_lock"); + SafeTimer fast_timer; // fast loop timer + SafeTimer slow_timer; // slow loop timer + // marks the start of the current slow interval + ceph::mono_clock::time_point slow_interval_start = ceph::mono_clock::zero(); + // amount of bytes that has been processed in current slow interval + int64_t slow_interval_registered_bytes = 0; + // number of transactions that has been processed in current slow interval + int64_t slow_interval_txc_cnt = 0; + // target latency history for regression + std::vector regression_target_latency_history; + // throughput history for regression + std::vector regression_throughput_history; + int64_t bluestore_budget = 102400; // current bluestore throttle budget + // maximum amount of inflight data in current slow interval + int64_t max_queue_length = 102400; + std::function bluestore_budget_reset_callback; + std::function get_kv_throttle_current; + + void on_min_latency_violation(); + + void on_no_violation(); + + virtual void on_fast_interval_finished() {} + + virtual void on_slow_interval_finished() {} + +private: + + bool _check_latency_violation(); + + void _update_interval(); + + void _fast_interval_process(); + + void _slow_interval_process(); + + template + double millisec_to_nanosec(T ms) { + return ms * 1000.0 * 1000.0; + } + + template + double nanosec_to_millisec(T ns) { + return ns / (1000.0 * 1000.0); + } + + template + double nanosec_to_sec(T ns) { + return ns / (1000.0 * 1000.0 * 1000.0); + } +};