Merge pull request #25697 from Aran85/fix-onode-trim

os/bluestore: submit deferred writes more aggressively when onode trim is skipped

Reviewed-by: xie xingguo <xie.xingguo@zte.com.cn>
Reviewed-by: Igor Fedotov <ifedotov@suse.com>
This commit is contained in:
Kefu Chai 2019-08-23 13:15:27 +08:00 committed by GitHub
commit 263a78c3dd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 37 additions and 0 deletions

View File

@ -1035,6 +1035,7 @@ OPTION(bluestore_freelist_blocks_per_key, OPT_INT)
OPTION(bluestore_bitmapallocator_blocks_per_zone, OPT_INT) // must be power of 2 aligned, e.g., 512, 1024, 2048...
OPTION(bluestore_bitmapallocator_span_size, OPT_INT) // must be power of 2 aligned, e.g., 512, 1024, 2048...
OPTION(bluestore_max_deferred_txc, OPT_U64)
// OPT_FLOAT, not OPT_U64: this option is declared Option::TYPE_FLOAT in
// options.cc and is read with get_val<double>() in _set_max_defer_interval();
// the legacy type must agree or the legacy config member truncates the value.
OPTION(bluestore_max_defer_interval, OPT_FLOAT)
OPTION(bluestore_rocksdb_options, OPT_STR)
OPTION(bluestore_fsck_on_mount, OPT_BOOL)
OPTION(bluestore_fsck_on_mount_deep, OPT_BOOL)

View File

@ -4700,6 +4700,10 @@ std::vector<Option> get_global_options() {
.set_default(32)
.set_description("Max transactions with deferred writes that can accumulate before we force flush deferred writes"),
Option("bluestore_max_defer_interval", Option::TYPE_FLOAT, Option::LEVEL_ADVANCED)
.set_default(3)
.set_description("max duration to force deferred submit"),
Option("bluestore_rocksdb_options", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("compression=kNoCompression,max_write_buffer_number=4,min_write_buffer_number_to_merge=1,recycle_log_file_num=4,write_buffer_size=268435456,writable_file_max_buffer_size=0,compaction_readahead_size=2097152,max_background_compactions=2")
.set_description("Rocksdb options"),

View File

@ -3759,12 +3759,14 @@ void *BlueStore::MempoolThread::entry()
utime_t next_balance = ceph_clock_now();
utime_t next_resize = ceph_clock_now();
utime_t next_deferred_force_submit = ceph_clock_now();
bool interval_stats_trim = false;
while (!stop) {
// Before we trim, check and see if it's time to rebalance/resize.
double autotune_interval = store->cache_autotune_interval;
double resize_interval = store->osd_memory_cache_resize_interval;
double max_defer_interval = store->max_defer_interval;
if (autotune_interval > 0 && next_balance < ceph_clock_now()) {
_adjust_cache_settings();
@ -3787,6 +3789,16 @@ void *BlueStore::MempoolThread::entry()
next_resize += resize_interval;
}
if (max_defer_interval > 0 &&
next_deferred_force_submit < ceph_clock_now()) {
if (store->get_deferred_last_submitted() + max_defer_interval <
ceph_clock_now()) {
store->deferred_try_submit();
}
next_deferred_force_submit = ceph_clock_now();
next_deferred_force_submit += max_defer_interval/3;
}
// Now Resize the shards
_resize_shards(interval_stats_trim);
interval_stats_trim = false;
@ -4136,6 +4148,7 @@ const char **BlueStore::get_tracked_conf_keys() const
"bluestore_no_per_pool_stats_tolerance",
"bluestore_warn_on_legacy_statfs",
"bluestore_warn_on_no_per_pool_omap",
"bluestore_max_defer_interval",
NULL
};
return KEYS;
@ -4199,6 +4212,11 @@ void BlueStore::handle_conf_change(const ConfigProxy& conf,
throttle_deferred_bytes.reset_max(
conf->bluestore_throttle_bytes + conf->bluestore_throttle_deferred_bytes);
}
if (changed.count("bluestore_max_defer_interval")) {
if (bdev) {
_set_max_defer_interval();
}
}
}
void BlueStore::_set_compression()
@ -4774,6 +4792,7 @@ int BlueStore::_open_bdev(bool create)
block_mask = ~(block_size - 1);
block_size_order = ctz(block_size);
ceph_assert(block_size == 1u << block_size_order);
_set_max_defer_interval();
// and set cache_size based on device type
r = _set_cache_sizes();
if (r < 0) {
@ -11195,6 +11214,8 @@ void BlueStore::deferred_try_submit()
dout(20) << __func__ << " osr " << osr << " has no pending" << dendl;
}
}
deferred_last_submitted = ceph_clock_now();
}
void BlueStore::_deferred_submit_unlock(OpSequencer *osr)

View File

@ -152,6 +152,10 @@ public:
void _set_compression();
void _set_throttle_params();
int _set_cache_sizes();
// Refresh the cached max_defer_interval from the current value of the
// "bluestore_max_defer_interval" config option; the mempool thread reads
// the cached copy on each loop iteration.
void _set_max_defer_interval() {
  max_defer_interval = cct->_conf.get_val<double>("bluestore_max_defer_interval");
}
class TransContext;
@ -1812,6 +1816,7 @@ private:
int deferred_queue_size = 0; ///< num txc's queued across all osrs
atomic_int deferred_aggressive = {0}; ///< aggressive wakeup of kv thread
Finisher deferred_finisher, finisher;
utime_t deferred_last_submitted = utime_t();
KVSyncThread kv_sync_thread;
ceph::mutex kv_lock = ceph::make_mutex("BlueStore::kv_lock");
@ -1893,6 +1898,7 @@ private:
double osd_memory_expected_fragmentation = 0; ///< expected memory fragmentation
uint64_t osd_memory_cache_min = 0; ///< Min memory to assign when autotuning cache
double osd_memory_cache_resize_interval = 0; ///< Time to wait between cache resizing
double max_defer_interval = 0; ///< Time to wait between last deferred submit
typedef map<uint64_t, volatile_statfs> osd_pools_map;
@ -2105,6 +2111,11 @@ private:
bool create);
public:
// Return the timestamp recorded by the last deferred_try_submit() pass.
// Reads deferred_last_submitted under deferred_lock; NOTE(review): the
// writer in deferred_try_submit() is assumed to hold the same lock —
// confirm against the full function body.
utime_t get_deferred_last_submitted() {
  std::lock_guard guard(deferred_lock);
  const utime_t last = deferred_last_submitted;
  return last;
}
static int _write_bdev_label(CephContext* cct,
string path, bluestore_bdev_label_t label);
static int _read_bdev_label(CephContext* cct, string path,