mirror of https://github.com/ceph/ceph
synced 2025-04-01 00:26:47 +00:00

os/bluestore: implement avl-extent-based allocator

Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
Signed-off-by: Kefu Chai <kchai@redhat.com>

parent de50fcdf79
commit adbc79a8ac
@@ -4698,7 +4698,7 @@ std::vector<Option> get_global_options() {
 
     Option("bluestore_allocator", Option::TYPE_STR, Option::LEVEL_ADVANCED)
     .set_default("bitmap")
-    .set_enum_allowed({"bitmap", "stupid"})
+    .set_enum_allowed({"bitmap", "stupid", "avl"})
     .set_description("Allocator policy")
    .set_long_description("Allocator to use for bluestore. Stupid should only be used for testing."),
@@ -4944,6 +4944,13 @@ std::vector<Option> get_global_options() {
     .set_description("Enforces specific hw profile settings")
     .set_long_description("'hdd' enforces settings intended for BlueStore above a rotational drive. 'ssd' enforces settings intended for BlueStore above a solid drive. 'default' - using settings for the actual hardware."),
 
+    Option("bluestore_avl_alloc_bf_threshold", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(131072)
+    .set_description(""),
+
+    Option("bluestore_avl_alloc_bf_free_pct", Option::TYPE_UINT, Option::LEVEL_DEV)
+    .set_default(4)
+    .set_description(""),
 
     // -----------------------------------------
     // kstore
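The two DEV-level options added here become range_size_alloc_threshold and range_size_alloc_free_pct in the new allocator (see the AvlAllocator constructor further down). A minimal sketch of the policy switch they drive, paraphrased by the editor from _allocate() in AvlAllocator.cc; the standalone use_best_fit() helper and its parameter names are hypothetical, not part of the commit:

#include <cstdint>

// Returns true when the allocator should fall back to the size-sorted
// (best-fit) tree instead of scanning the offset-sorted (first-fit) tree.
// num_total must be non-zero (it is the device size).
bool use_best_fit(bool force_range_size_alloc,
                  uint64_t max_size,            // largest free segment
                  uint64_t num_free,            // free bytes
                  uint64_t num_total,           // device bytes
                  uint64_t threshold = 131072,  // bluestore_avl_alloc_bf_threshold
                  int free_pct_min = 4)         // bluestore_avl_alloc_bf_free_pct
{
  const int free_pct = num_free * 100 / num_total;
  return force_range_size_alloc ||
         max_size < threshold ||
         free_pct < free_pct_min;
}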
@@ -32,6 +32,7 @@ if(WITH_BLUESTORE)
     bluestore/FreelistManager.cc
     bluestore/StupidAllocator.cc
     bluestore/BitmapAllocator.cc
+    bluestore/AvlAllocator.cc
   )
 endif(WITH_BLUESTORE)
@@ -4,6 +4,7 @@
 #include "Allocator.h"
 #include "StupidAllocator.h"
 #include "BitmapAllocator.h"
+#include "AvlAllocator.h"
 #include "common/debug.h"
 #include "common/admin_socket.h"
 
 #define dout_subsys ceph_subsys_bluestore

@@ -107,6 +108,8 @@ Allocator *Allocator::create(CephContext* cct, string type,
     alloc = new StupidAllocator(cct, name, block_size);
   } else if (type == "bitmap") {
     alloc = new BitmapAllocator(cct, size, block_size, name);
+  } else if (type == "avl") {
+    return new AvlAllocator(cct, size, block_size, name);
   }
   if (alloc == nullptr) {
     lderr(cct) << "Allocator::" << __func__ << " unknown alloc type "
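Note that, unlike the other branches, the new "avl" case returns immediately instead of assigning to alloc, so it bypasses the unknown-type handling that follows. A hedged usage sketch of the factory (editor's illustration; the make_avl_allocator wrapper and the size constants are assumptions, and the trailing create() parameters are inferred from the constructor calls visible in the hunk above):

#include "os/bluestore/Allocator.h"

// Illustrative only: request the new AVL allocator by name through the
// factory, assuming a valid CephContext* supplied by the caller.
Allocator* make_avl_allocator(CephContext* cct) {
  const int64_t device_size = 1LL << 30;  // 1 GiB of managed space
  const int64_t block_size = 4096;        // minimum allocation unit
  return Allocator::create(cct, "avl", device_size, block_size, "block");
}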
src/os/bluestore/AvlAllocator.cc (new executable file, 336 lines)
@@ -0,0 +1,336 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#include "AvlAllocator.h"

#include <limits>

#include "common/config_proxy.h"
#include "common/debug.h"

#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
#undef dout_prefix
#define dout_prefix *_dout << "AvlAllocator "

MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc);

namespace {
// a light-weight "range_seg_t", which is only used as the key when searching
// in range_tree and range_size_tree
struct range_t {
  uint64_t start;
  uint64_t end;
};
}

/*
 * This is a helper function that can be used by the allocator to find
 * a suitable block to allocate. This will search the specified AVL
 * tree looking for a block that matches the specified criteria.
 */
template<class Tree>
uint64_t AvlAllocator::_block_picker(const Tree& t,
                                     uint64_t *cursor,
                                     uint64_t size,
                                     uint64_t align)
{
  const auto compare = t.key_comp();
  for (auto rs = t.lower_bound(range_t{*cursor, size}, compare);
       rs != t.end(); ++rs) {
    uint64_t offset = p2roundup(rs->start, align);
    if (offset + size <= rs->end) {
      *cursor = offset + size;
      return offset;
    }
  }
  /*
   * If we know we've searched the whole tree (*cursor == 0), give up.
   * Otherwise, reset the cursor to the beginning and try again.
   */
  if (*cursor == 0) {
    return -1ULL;
  }
  *cursor = 0;
  return _block_picker(t, cursor, size, align);
}
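// Editor's illustration (not part of the commit): with free segments
// [0x0000,0x1000) and [0x3000,0x8000), a call with size=0x2000,
// align=0x1000 and *cursor=0 skips the first segment (0x0 + 0x2000
// exceeds 0x1000), fits the second at offset 0x3000, and leaves
// *cursor = 0x5000 so the next same-sized request resumes after it.
// If a scan that started from a non-zero cursor finds nothing, the
// cursor resets to 0 and the tree is searched once more from the start.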

namespace {
struct dispose_rs {
  void operator()(range_seg_t* p)
  {
    delete p;
  }
};
}

void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
{
  assert(size != 0);

  uint64_t end = start + size;

  auto rs_after = range_tree.upper_bound(range_t{start, end},
                                         range_tree.key_comp());

  /* Make sure we don't overlap with either of our neighbors */
  auto rs_before = range_tree.end();
  if (rs_after != range_tree.begin()) {
    rs_before = std::prev(rs_after);
  }

  bool merge_before = (rs_before != range_tree.end() && rs_before->end == start);
  bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);

  if (merge_before && merge_after) {
    range_size_tree.erase(*rs_before);
    range_size_tree.erase(*rs_after);
    rs_after->start = rs_before->start;
    range_tree.erase_and_dispose(rs_before, dispose_rs{});
    range_size_tree.insert(*rs_after);
  } else if (merge_before) {
    range_size_tree.erase(*rs_before);
    rs_before->end = end;
    range_size_tree.insert(*rs_before);
  } else if (merge_after) {
    range_size_tree.erase(*rs_after);
    rs_after->start = start;
    range_size_tree.insert(*rs_after);
  } else {
    auto new_rs = new range_seg_t{start, end};
    range_tree.insert_before(rs_after, *new_rs);
    range_size_tree.insert(*new_rs);
  }
  num_free += size;
}
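// Editor's illustration (not part of the commit): freeing [0x1000,0x2000)
// while [0x0000,0x1000) and [0x2000,0x3000) are already free takes the
// merge_before && merge_after branch, collapsing all three ranges into a
// single segment [0x0000,0x3000) in both trees.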

void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
{
  uint64_t end = start + size;

  assert(size != 0);
  assert(size <= num_free);

  auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp());
  /* Make sure we completely overlap with someone */
  assert(rs != range_tree.end());
  assert(rs->start <= start);
  assert(rs->end >= end);

  bool left_over = (rs->start != start);
  bool right_over = (rs->end != end);

  range_size_tree.erase(*rs);

  if (left_over && right_over) {
    auto new_seg = new range_seg_t{end, rs->end};
    rs->end = start;
    range_tree.insert(rs, *new_seg);
    range_size_tree.insert(*new_seg);
    range_size_tree.insert(*rs);
  } else if (left_over) {
    rs->end = start;
    range_size_tree.insert(*rs);
  } else if (right_over) {
    rs->start = end;
    range_size_tree.insert(*rs);
  } else {
    range_tree.erase_and_dispose(rs, dispose_rs{});
  }
  assert(num_free >= size);
  num_free -= size;
}
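// Editor's illustration (not part of the commit): carving [0x2000,0x3000)
// out of a free segment [0x1000,0x5000) takes the left_over && right_over
// branch: the existing node is trimmed to [0x1000,0x2000) and a new node
// [0x3000,0x5000) is inserted, so one free segment becomes two.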

int AvlAllocator::_allocate(
  uint64_t size,
  uint64_t unit,
  uint64_t *offset,
  uint64_t *length)
{
  std::lock_guard l(lock);
  uint64_t max_size = 0;
  if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
    max_size = p->end - p->start;
  }

  bool force_range_size_alloc = false;
  if (max_size < size) {
    if (max_size < unit) {
      return -ENOSPC;
    }
    size = p2align(max_size, unit);
    assert(size > 0);
    force_range_size_alloc = true;
  }
  /*
   * Find the largest power of 2 block size that evenly divides the
   * requested size. This is used to try to allocate blocks with similar
   * alignment from the same area (i.e. same cursor bucket) but it does
   * not guarantee that other allocation sizes may exist in the same
   * region.
   */
  const uint64_t align = size & -size;
  assert(align != 0);
  uint64_t *cursor = &lbas[cbits(align) - 1];

  const int free_pct = num_free * 100 / num_total;
  uint64_t start = 0;
  /*
   * If we're running low on space, switch to using the size-sorted
   * AVL tree (best-fit).
   */
  if (force_range_size_alloc ||
      max_size < range_size_alloc_threshold ||
      free_pct < range_size_alloc_free_pct) {
    *cursor = 0;
    start = _block_picker(range_size_tree, cursor, size, unit);
  } else {
    start = _block_picker(range_tree, cursor, size, unit);
  }
  if (start == -1ULL) {
    return -ENOSPC;
  }

  _remove_from_tree(start, size);

  *offset = start;
  *length = size;
  return 0;
}
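// Editor's illustration (not part of the commit): size & -size isolates
// the lowest set bit of size, e.g. size=0x30000 gives align=0x10000, so
// this request shares the cursor bucket lbas[cbits(0x10000) - 1] with
// every other allocation whose size is an odd multiple of 64 KiB.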

AvlAllocator::AvlAllocator(CephContext* cct,
                           int64_t device_size,
                           int64_t block_size,
                           const std::string& name) :
  Allocator(name),
  num_total(device_size),
  block_size(block_size),
  range_size_alloc_threshold(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
  range_size_alloc_free_pct(
    cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
  cct(cct)
{}

int64_t AvlAllocator::allocate(
  uint64_t want,
  uint64_t unit,
  uint64_t max_alloc_size,
  int64_t hint, // unused, for now!
  PExtentVector* extents)
{
  ldout(cct, 10) << __func__ << std::hex
                 << " want 0x" << want
                 << " unit 0x" << unit
                 << " max_alloc_size 0x" << max_alloc_size
                 << " hint 0x" << hint
                 << std::dec << dendl;
  assert(isp2(unit));
  assert(want % unit == 0);

  if (max_alloc_size == 0) {
    max_alloc_size = want;
  }
  if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
      max_alloc_size >= cap) {
    max_alloc_size = cap;
  }

  uint64_t allocated = 0;
  while (allocated < want) {
    uint64_t offset, length;
    int r = _allocate(std::min(max_alloc_size, want - allocated),
                      unit, &offset, &length);
    if (r < 0) {
      // Allocation failed.
      break;
    }
    extents->emplace_back(offset, length);
    allocated += length;
  }
  return allocated ? allocated : -ENOSPC;
}
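// Editor's illustration (not part of the commit): with want=0x100000 and
// max_alloc_size=0x10000, the loop issues one _allocate() call per extent
// (sixteen if each returns a full 64 KiB). A partial result is still
// returned as a positive byte count; only a failure on the very first
// extent yields -ENOSPC.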

void AvlAllocator::release(const interval_set<uint64_t>& release_set)
{
  std::lock_guard l(lock);
  for (auto p = release_set.begin(); p != release_set.end(); ++p) {
    const auto offset = p.get_start();
    const auto length = p.get_len();
    ldout(cct, 10) << __func__ << std::hex
                   << " offset 0x" << offset
                   << " length 0x" << length
                   << std::dec << dendl;
    _add_to_tree(offset, length);
  }
}

uint64_t AvlAllocator::get_free()
{
  std::lock_guard l(lock);
  return num_free;
}

double AvlAllocator::get_fragmentation()
{
  std::lock_guard l(lock);
  auto free_blocks = p2align(num_free, block_size) / block_size;
  if (free_blocks <= 1) {
    return .0;
  }
  return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
}
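// Editor's illustration (not part of the commit): the metric is 0.0 when
// all free blocks form a single segment and approaches 1.0 when every
// free block sits in its own segment; e.g. 4 free blocks split across
// 2 segments give (2 - 1) / (4 - 1) = 0.33.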

void AvlAllocator::dump()
{
  std::lock_guard l(lock);
  ldout(cct, 0) << __func__ << " range_tree: " << dendl;
  for (auto& rs : range_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec
                  << dendl;
  }

  ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
  for (auto& rs : range_size_tree) {
    ldout(cct, 0) << std::hex
                  << "0x" << rs.start << "~" << rs.end
                  << std::dec
                  << dendl;
  }
}

void AvlAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
{
  for (auto& rs : range_tree) {
    notify(rs.start, rs.end - rs.start);
  }
}

void AvlAllocator::init_add_free(uint64_t offset, uint64_t length)
{
  std::lock_guard l(lock);
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  _add_to_tree(offset, length);
}

void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length)
{
  std::lock_guard l(lock);
  ldout(cct, 10) << __func__ << std::hex
                 << " offset 0x" << offset
                 << " length 0x" << length
                 << std::dec << dendl;
  _remove_from_tree(offset, length);
}

void AvlAllocator::shutdown()
{
  std::lock_guard l(lock);
  range_size_tree.clear();
  range_tree.clear_and_dispose(dispose_rs{});
}
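A hedged end-to-end sketch of the allocator's lifecycle (editor's illustration: it mirrors the public API above, but the avl_allocator_demo wrapper, the sizes, and the CephContext acquisition are assumptions, not part of the commit):

#include "os/bluestore/AvlAllocator.h"

void avl_allocator_demo(CephContext* cct) {
  AvlAllocator alloc(cct, 1 << 30 /* device size */, 4096 /* block size */,
                     "demo");
  alloc.init_add_free(0, 1 << 30);       // seed the free tree

  PExtentVector extents;
  int64_t got = alloc.allocate(1 << 20,  // want 1 MiB
                               4096,     // unit
                               1 << 16,  // max extent: 64 KiB
                               0,        // hint (currently unused)
                               &extents);
  // 'got' is the number of bytes actually allocated (or -ENOSPC).

  interval_set<uint64_t> to_release;
  for (auto& e : extents) {
    to_release.insert(e.offset, e.length);
  }
  alloc.release(to_release);             // return everything to the tree
  alloc.shutdown();
}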
src/os/bluestore/AvlAllocator.h (new executable file, 136 lines)
@@ -0,0 +1,136 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab

#pragma once

#include <mutex>
#include <boost/intrusive/avl_set.hpp>

#include "Allocator.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"

struct range_seg_t {
  MEMPOOL_CLASS_HELPERS();  ///< memory monitoring
  uint64_t start;  ///< starting offset of this segment
  uint64_t end;    ///< ending offset (non-inclusive)

  range_seg_t(uint64_t start, uint64_t end)
    : start{start},
      end{end}
  {}
  // Tree is sorted by offset, greater offsets at the end of the tree.
  struct before_t {
    template<typename KeyLeft, typename KeyRight>
    bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const {
      return lhs.end <= rhs.start;
    }
  };
  boost::intrusive::avl_set_member_hook<> offset_hook;

  // Tree is sorted by size, larger sizes at the end of the tree.
  struct shorter_t {
    template<typename KeyType>
    bool operator()(const range_seg_t& lhs, const KeyType& rhs) const {
      auto lhs_size = lhs.end - lhs.start;
      auto rhs_size = rhs.end - rhs.start;
      if (lhs_size < rhs_size) {
        return true;
      } else if (lhs_size > rhs_size) {
        return false;
      } else {
        return lhs.start < rhs.start;
      }
    }
  };
  boost::intrusive::avl_set_member_hook<> size_hook;
};
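// Editor's note (not part of the commit): the two member hooks let a single
// heap-allocated range_seg_t be linked into both the offset-sorted avl_set
// and the size-sorted avl_multiset at once, with no extra node allocations.
// This is also why _add_to_tree/_remove_from_tree always erase a node from
// range_size_tree before mutating its start/end and then re-insert it.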

class AvlAllocator final : public Allocator {
public:
  AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
               const std::string& name);
  int64_t allocate(
    uint64_t want,
    uint64_t unit,
    uint64_t max_alloc_size,
    int64_t hint,
    PExtentVector *extents) final;
  void release(const interval_set<uint64_t>& release_set) final;
  uint64_t get_free() final;
  double get_fragmentation() final;

  void dump() final;
  void dump(std::function<void(uint64_t offset, uint64_t length)> notify) final;
  void init_add_free(uint64_t offset, uint64_t length) final;
  void init_rm_free(uint64_t offset, uint64_t length) final;
  void shutdown() final;

private:
  template<class Tree>
  uint64_t _block_picker(const Tree& t, uint64_t *cursor, uint64_t size,
                         uint64_t align);
  void _add_to_tree(uint64_t start, uint64_t size);
  void _remove_from_tree(uint64_t start, uint64_t size);
  int _allocate(
    uint64_t size,
    uint64_t unit,
    uint64_t *offset,
    uint64_t *length);

  using range_tree_t =
    boost::intrusive::avl_set<
      range_seg_t,
      boost::intrusive::compare<range_seg_t::before_t>,
      boost::intrusive::member_hook<
        range_seg_t,
        boost::intrusive::avl_set_member_hook<>,
        &range_seg_t::offset_hook>>;
  range_tree_t range_tree;  ///< main range tree
  /*
   * The range_size_tree should always contain the
   * same number of segments as the range_tree.
   * The only difference is that the range_size_tree
   * is ordered by segment sizes.
   */
  using range_size_tree_t =
    boost::intrusive::avl_multiset<
      range_seg_t,
      boost::intrusive::compare<range_seg_t::shorter_t>,
      boost::intrusive::member_hook<
        range_seg_t,
        boost::intrusive::avl_set_member_hook<>,
        &range_seg_t::size_hook>>;
  range_size_tree_t range_size_tree;

  const int64_t num_total;    ///< device size
  const uint64_t block_size;  ///< block size
  uint64_t num_free = 0;      ///< total bytes in freelist

  /*
   * This value defines the number of elements in the ms_lbas array.
   * The value of 64 was chosen as it covers all power of 2 buckets
   * up to UINT64_MAX.
   * This is the equivalent of highest-bit of UINT64_MAX.
   */
  static constexpr unsigned MAX_LBAS = 64;
  uint64_t lbas[MAX_LBAS] = {0};

  /*
   * Minimum size which forces the dynamic allocator to change
   * its allocation strategy. Once the allocator cannot satisfy
   * an allocation of this size then it switches to using a more
   * aggressive strategy (i.e. search by size rather than offset).
   */
  uint64_t range_size_alloc_threshold = 0;
  /*
   * The minimum free space, in percent, which must be available
   * in the allocator to continue allocations in a first-fit fashion.
   * Once the allocator's free space drops below this level we dynamically
   * switch to using best-fit allocations.
   */
  int range_size_alloc_free_pct = 0;

  CephContext* cct;
  std::mutex lock;
};
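For readers unfamiliar with boost::intrusive, a minimal self-contained sketch of the same two-hook pattern used by range_seg_t (editor's illustration; the seg, offset_less, and size_less names are hypothetical, not part of the commit):

#include <boost/intrusive/avl_set.hpp>
#include <cassert>
#include <cstdint>

namespace bi = boost::intrusive;

struct seg {
  uint64_t start, end;
  bi::avl_set_member_hook<> by_offset;  // hook for the offset-sorted tree
  bi::avl_set_member_hook<> by_size;    // hook for the size-sorted tree
};

struct offset_less {
  bool operator()(const seg& a, const seg& b) const { return a.start < b.start; }
};
struct size_less {
  bool operator()(const seg& a, const seg& b) const {
    return a.end - a.start < b.end - b.start;
  }
};

using offset_tree = bi::avl_set<seg, bi::compare<offset_less>,
    bi::member_hook<seg, bi::avl_set_member_hook<>, &seg::by_offset>>;
using size_tree = bi::avl_multiset<seg, bi::compare<size_less>,
    bi::member_hook<seg, bi::avl_set_member_hook<>, &seg::by_size>>;

int main() {
  seg s{0x1000, 0x3000, {}, {}};
  offset_tree ot;
  size_tree st;
  ot.insert(s);  // the same object is linked into
  st.insert(s);  // both trees at once
  assert(ot.size() == 1 && st.size() == 1);
  st.erase(st.iterator_to(s));  // must unlink before the object dies
  ot.erase(ot.iterator_to(s));
}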