From 40f05b971f5a8064cf9819f80fc3bbf21d5206da Mon Sep 17 00:00:00 2001
From: Kefu Chai
Date: Wed, 2 Jun 2021 15:57:04 +0800
Subject: [PATCH 1/2] os/bluestore/AvlAllocator: introduce
 bluestore_avl_alloc_ff_max_search_count

so AvlAllocator can switch from first-fit mode to best-fit mode without
walking through the whole space map tree. On a highly fragmented system,
iterating the whole tree can significantly hurt the performance of fast
storage.

The idea comes from OpenZFS's metaslab allocator.

Signed-off-by: Kefu Chai
---
 src/common/options/global.yaml.in |  6 ++++
 src/os/bluestore/AvlAllocator.cc  | 50 +++++++++++++++++--------------
 src/os/bluestore/AvlAllocator.h   |  7 ++++-
 3 files changed, 39 insertions(+), 24 deletions(-)

diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
index 7a69fee070f..05d7d38234a 100644
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -5029,6 +5029,12 @@ options:
   - hdd
   - ssd
   with_legacy: true
+- name: bluestore_avl_alloc_ff_max_search_count
+  type: uint
+  level: dev
+  desc: Search for this many ranges in first-fit mode before switching over
+    to best-fit mode. 0 to iterate through all ranges for the required chunk.
+  default: 100
 - name: bluestore_avl_alloc_bf_threshold
   type: uint
   level: dev
diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc
index f3b721e8765..1d463b31a55 100644
--- a/src/os/bluestore/AvlAllocator.cc
+++ b/src/os/bluestore/AvlAllocator.cc
@@ -34,6 +34,7 @@ uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
                                          uint64_t align)
 {
   const auto compare = range_tree.key_comp();
+  uint32_t search_count = 0;
   auto rs_start = range_tree.lower_bound(range_t{*cursor, size}, compare);
   for (auto rs = rs_start; rs != range_tree.end(); ++rs) {
     uint64_t offset = p2roundup(rs->start, align);
@@ -41,6 +42,9 @@ uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
       *cursor = offset + size;
       return offset;
     }
+    if (max_search_count > 0 && ++search_count > max_search_count) {
+      return -1ULL;
+    }
   }
   if (*cursor == 0) {
     // If we already started from beginning, don't bother with searching from beginning
@@ -53,6 +57,9 @@ uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
       *cursor = offset + size;
       return offset;
     }
+    if (max_search_count > 0 && ++search_count > max_search_count) {
+      return -1ULL;
+    }
   }
   return -1ULL;
 }
@@ -240,13 +247,27 @@ int AvlAllocator::_allocate(
 
   const int free_pct = num_free * 100 / device_size;
   uint64_t start = 0;
-  /*
-   * If we're running low on space switch to using the size
-   * sorted AVL tree (best-fit).
-   */
+  // If we're running low on space, look up a range in the size-sorted tree
+  // (best-fit) instead of searching from the area pointed to by the cursor.
   if (force_range_size_alloc ||
       max_size < range_size_alloc_threshold ||
       free_pct < range_size_alloc_free_pct) {
+    start = -1ULL;
+  } else {
+    /*
+     * Find the largest power of 2 block size that evenly divides the
+     * requested size. This is used to try to allocate blocks with similar
+     * alignment from the same area (i.e. same cursor bucket) but it does
+     * not guarantee that other allocations sizes may exist in the same
+     * region.
+     */
+    uint64_t align = size & -size;
+    ceph_assert(align != 0);
+    uint64_t* cursor = &lbas[cbits(align) - 1];
+    start = _pick_block_after(cursor, size, unit);
+    dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
+  }
+  if (start == -1ULL) {
     do {
       start = _pick_block_fits(size, unit);
       dout(20) << __func__ << " best fit=" << start << " size=" << size << dendl;
@@ -257,25 +278,6 @@ int AvlAllocator::_allocate(
       // that large block due to misaligned extents
       size = p2align(size >> 1, unit);
     } while (size >= unit);
-  } else {
-    do {
-      /*
-       * Find the largest power of 2 block size that evenly divides the
-       * requested size. This is used to try to allocate blocks with similar
-       * alignment from the same area (i.e. same cursor bucket) but it does
-       * not guarantee that other allocations sizes may exist in the same
-       * region.
-       */
-      uint64_t* cursor = &lbas[cbits(size) - 1];
-      start = _pick_block_after(cursor, size, unit);
-      dout(20) << __func__ << " first fit=" << start << " size=" << size << dendl;
-      if (start != uint64_t(-1ULL)) {
-        break;
-      }
-      // try to collect smaller extents as we could fail to retrieve
-      // that large block due to misaligned extents
-      size = p2align(size >> 1, unit);
-    } while (size >= unit);
   }
   if (start == -1ULL) {
     return -ENOSPC;
@@ -328,6 +330,8 @@ AvlAllocator::AvlAllocator(CephContext* cct,
     cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
   range_size_alloc_free_pct(
     cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
+  max_search_count(
+    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_ff_max_search_count")),
   range_count_cap(max_mem / sizeof(range_seg_t)),
   cct(cct)
 {}
diff --git a/src/os/bluestore/AvlAllocator.h b/src/os/bluestore/AvlAllocator.h
index bb4c44ea0a5..f813fa41c82 100644
--- a/src/os/bluestore/AvlAllocator.h
+++ b/src/os/bluestore/AvlAllocator.h
@@ -158,7 +158,12 @@ private:
    * switch to using best-fit allocations.
    */
   int range_size_alloc_free_pct = 0;
-
+  /*
+   * Maximum number of segments to check in first-fit mode; without this
+   * limit, a fragmented device can see lots of iterations and _block_picker()
+   * becomes the performance limiting factor on high-performance storage.
+   */
+  const uint32_t max_search_count;
   /*
    * Max amount of range entries allowed. 0 - unlimited
    */
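
To see what patch 1 does in isolation: below is a minimal sketch, not BlueStore code, of a first-fit scan that gives up after a bounded number of candidates so the caller can fall back to best-fit. The free_range struct, the std::set ordered by offset, and the find_first_fit() helper are simplified stand-ins for range_seg_t, the AVL range tree, and _pick_block_after(); only the give-up-after-N-candidates logic mirrors the patch.

// A minimal sketch, not BlueStore code: first-fit over an offset-sorted set
// of free ranges that gives up after max_search_count candidates so the
// caller can fall back to best-fit. free_range and find_first_fit() are
// simplified stand-ins for the real BlueStore types.
#include <cstdint>
#include <iostream>
#include <optional>
#include <set>

struct free_range {
  uint64_t start;
  uint64_t end;   // one past the last free byte
  bool operator<(const free_range& other) const { return start < other.start; }
};

std::optional<uint64_t> find_first_fit(const std::set<free_range>& ranges,
                                       uint64_t size,
                                       uint32_t max_search_count)
{
  uint32_t search_count = 0;
  for (const auto& rs : ranges) {
    if (rs.end - rs.start >= size) {
      return rs.start;              // first range large enough wins
    }
    if (max_search_count > 0 && ++search_count > max_search_count) {
      return std::nullopt;          // budget exhausted: caller tries best-fit
    }
  }
  return std::nullopt;
}

int main()
{
  std::set<free_range> ranges{{0, 0x1000}, {0x3000, 0x4000}, {0x100000, 0x200000}};
  if (auto offset = find_first_fit(ranges, 0x8000, 100)) {
    std::cout << "first fit at 0x" << std::hex << *offset << std::endl;
  } else {
    std::cout << "no first-fit hit within the search budget" << std::endl;
  }
}

With the default bluestore_avl_alloc_ff_max_search_count of 100, a scan over a badly fragmented region inspects at most 100 candidate extents before the allocator falls back to the size-sorted (best-fit) lookup.
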
From 5a26875049d13130ffe5954428da0e1b9750359f Mon Sep 17 00:00:00 2001
From: Kefu Chai
Date: Tue, 1 Jun 2021 19:14:33 +0800
Subject: [PATCH 2/2] os/bluestore/AvlAllocator: introduce
 bluestore_avl_alloc_ff_max_search_bytes

so AvlAllocator can switch from first-fit mode to best-fit mode without
walking through the whole space map tree. On a highly fragmented system,
iterating the whole tree can significantly hurt the performance of fast
storage.

The idea comes from OpenZFS's metaslab allocator.

Signed-off-by: Kefu Chai
---
 src/common/options/global.yaml.in |  6 ++++++
 src/os/bluestore/AvlAllocator.cc  | 10 ++++++++++
 src/os/bluestore/AvlAllocator.h   |  6 ++++++
 3 files changed, 22 insertions(+)

diff --git a/src/common/options/global.yaml.in b/src/common/options/global.yaml.in
index 05d7d38234a..1dd0b57d467 100644
--- a/src/common/options/global.yaml.in
+++ b/src/common/options/global.yaml.in
@@ -5035,6 +5035,12 @@ options:
   desc: Search for this many ranges in first-fit mode before switching over
     to best-fit mode. 0 to iterate through all ranges for the required chunk.
   default: 100
+- name: bluestore_avl_alloc_ff_max_search_bytes
+  type: size
+  level: dev
+  desc: Maximum distance to search in first-fit mode before switching over
+    to best-fit mode. 0 to iterate through all ranges for the required chunk.
+  default: 16_M
 - name: bluestore_avl_alloc_bf_threshold
   type: uint
   level: dev
diff --git a/src/os/bluestore/AvlAllocator.cc b/src/os/bluestore/AvlAllocator.cc
index 1d463b31a55..e7a9befef05 100644
--- a/src/os/bluestore/AvlAllocator.cc
+++ b/src/os/bluestore/AvlAllocator.cc
@@ -35,6 +35,7 @@ uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
 {
   const auto compare = range_tree.key_comp();
   uint32_t search_count = 0;
+  uint64_t search_bytes = 0;
   auto rs_start = range_tree.lower_bound(range_t{*cursor, size}, compare);
   for (auto rs = rs_start; rs != range_tree.end(); ++rs) {
     uint64_t offset = p2roundup(rs->start, align);
@@ -45,6 +46,10 @@ uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
     if (max_search_count > 0 && ++search_count > max_search_count) {
       return -1ULL;
     }
+    if (search_bytes = rs->start - rs_start->start;
+        max_search_bytes > 0 && search_bytes > max_search_bytes) {
+      return -1ULL;
+    }
   }
   if (*cursor == 0) {
     // If we already started from beginning, don't bother with searching from beginning
@@ -60,6 +65,9 @@ uint64_t AvlAllocator::_pick_block_after(uint64_t *cursor,
     if (max_search_count > 0 && ++search_count > max_search_count) {
       return -1ULL;
     }
+    if (max_search_bytes > 0 && search_bytes + rs->start > max_search_bytes) {
+      return -1ULL;
+    }
   }
   return -1ULL;
 }
@@ -332,6 +340,8 @@ AvlAllocator::AvlAllocator(CephContext* cct,
     cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
   max_search_count(
     cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_ff_max_search_count")),
+  max_search_bytes(
+    cct->_conf.get_val<Option::size_t>("bluestore_avl_alloc_ff_max_search_bytes")),
   range_count_cap(max_mem / sizeof(range_seg_t)),
   cct(cct)
 {}
diff --git a/src/os/bluestore/AvlAllocator.h b/src/os/bluestore/AvlAllocator.h
index f813fa41c82..3779a670294 100644
--- a/src/os/bluestore/AvlAllocator.h
+++ b/src/os/bluestore/AvlAllocator.h
@@ -164,6 +164,12 @@ private:
    * becomes the performance limiting factor on high-performance storage.
    */
   const uint32_t max_search_count;
+  /*
+   * Maximum distance to search forward from the last offset; without this
+   * limit, a fragmented device can see lots of iterations and _block_picker()
+   * becomes the performance limiting factor on high-performance storage.
+   */
+  const uint32_t max_search_bytes;
   /*
    * Max amount of range entries allowed. 0 - unlimited
   */
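
Patch 2 layers a byte-distance budget on top of the candidate-count budget from patch 1. The following is again a minimal sketch and not BlueStore code: a start-sorted std::vector and the bounded_first_fit() helper stand in for the range tree and _pick_block_after(). The distance walked is measured from the first candidate's offset, mirroring rs->start - rs_start->start in the patch, and whichever budget runs out first ends the first-fit scan.

// A minimal sketch, not BlueStore code: bounded first-fit with both budgets.
// ranges must be sorted by start offset; free_range is a stand-in type.
#include <cstdint>
#include <optional>
#include <vector>

struct free_range {
  uint64_t start;
  uint64_t end;   // one past the last free byte
};

std::optional<uint64_t> bounded_first_fit(const std::vector<free_range>& ranges,
                                          uint64_t size,
                                          uint32_t max_search_count,  // e.g. 100
                                          uint64_t max_search_bytes)  // e.g. 16 MiB
{
  if (ranges.empty()) {
    return std::nullopt;
  }
  uint32_t search_count = 0;
  const uint64_t scan_origin = ranges.front().start;
  for (const auto& rs : ranges) {
    if (rs.end - rs.start >= size) {
      return rs.start;              // found a fit, stay in first-fit mode
    }
    if (max_search_count > 0 && ++search_count > max_search_count) {
      return std::nullopt;          // too many candidates inspected
    }
    if (max_search_bytes > 0 && rs.start - scan_origin > max_search_bytes) {
      return std::nullopt;          // walked too far from the scan origin
    }
  }
  return std::nullopt;              // caller falls back to best-fit
}

Returning std::nullopt here corresponds to _pick_block_after() returning -1ULL, after which _allocate() retries via _pick_block_fits() on the size-sorted tree, as shown in the first patch.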