os/bluestore: implement avl-extent-based allocator

Signed-off-by: xie xingguo <xie.xingguo@zte.com.cn>
Signed-off-by: Kefu Chai <kchai@redhat.com>
xie xingguo 2017-10-07 13:18:31 +08:00 committed by Kefu Chai
parent de50fcdf79
commit adbc79a8ac
5 changed files with 484 additions and 1 deletion

src/common/options.cc

@@ -4698,7 +4698,7 @@ std::vector<Option> get_global_options() {
Option("bluestore_allocator", Option::TYPE_STR, Option::LEVEL_ADVANCED)
.set_default("bitmap")
.set_enum_allowed({"bitmap", "stupid"})
.set_enum_allowed({"bitmap", "stupid", "avl"})
.set_description("Allocator policy")
.set_long_description("Allocator to use for bluestore. Stupid should only be used for testing."),
@@ -4944,6 +4944,13 @@ std::vector<Option> get_global_options() {
.set_description("Enforces specific hw profile settings")
.set_long_description("'hdd' enforces settings intended for BlueStore above a rotational drive. 'ssd' enforces settings intended for BlueStore above a solid drive. 'default' - using settings for the actual hardware."),
Option("bluestore_avl_alloc_bf_threshold", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(131072)
.set_description(""),
Option("bluestore_avl_alloc_bf_free_pct", Option::TYPE_UINT, Option::LEVEL_DEV)
.set_default(4)
.set_description(""),
// -----------------------------------------
// kstore
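
The two new dev-level options above feed the best-fit switch in AvlAllocator::_allocate further down: bluestore_avl_alloc_bf_threshold is the largest-free-segment size below which the allocator abandons first-fit, and bluestore_avl_alloc_bf_free_pct is the free-space percentage below which it does the same. A minimal ceph.conf sketch using the defaults from this hunk ([osd] is assumed here as the usual home for bluestore settings):

[osd]
bluestore_allocator = avl
bluestore_avl_alloc_bf_threshold = 131072
bluestore_avl_alloc_bf_free_pct = 4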

src/os/CMakeLists.txt

@@ -32,6 +32,7 @@ if(WITH_BLUESTORE)
bluestore/FreelistManager.cc
bluestore/StupidAllocator.cc
bluestore/BitmapAllocator.cc
bluestore/AvlAllocator.cc
)
endif(WITH_BLUESTORE)

src/os/bluestore/Allocator.cc

@@ -4,6 +4,7 @@
#include "Allocator.h"
#include "StupidAllocator.h"
#include "BitmapAllocator.h"
#include "AvlAllocator.h"
#include "common/debug.h"
#include "common/admin_socket.h"
#define dout_subsys ceph_subsys_bluestore
@@ -107,6 +108,8 @@ Allocator *Allocator::create(CephContext* cct, string type,
alloc = new StupidAllocator(cct, name, block_size);
} else if (type == "bitmap") {
alloc = new BitmapAllocator(cct, size, block_size, name);
} else if (type == "avl") {
return new AvlAllocator(cct, size, block_size, name);
}
if (alloc == nullptr) {
lderr(cct) << "Allocator::" << __func__ << " unknown alloc type "

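With the factory hunk above in place, callers pick the new allocator by name. A hedged usage sketch, assuming a valid CephContext* cct and the create(cct, type, size, block_size, name) parameter order implied by the constructor calls above:

#include "os/bluestore/Allocator.h"

// Hypothetical: request an AVL allocator over a 1 GiB device with a
// 4 KiB allocation unit; an unknown type name trips the lderr/nullptr
// error path shown above.
Allocator* alloc = Allocator::create(cct, "avl", 1ULL << 30, 4096, "osd0-block");
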
src/os/bluestore/AvlAllocator.cc (new executable file, 336 lines)

@@ -0,0 +1,336 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#include "AvlAllocator.h"
#include <limits>
#include "common/config_proxy.h"
#include "common/debug.h"
#define dout_context cct
#define dout_subsys ceph_subsys_bluestore
#undef dout_prefix
#define dout_prefix *_dout << "AvlAllocator "
MEMPOOL_DEFINE_OBJECT_FACTORY(range_seg_t, range_seg_t, bluestore_alloc);
namespace {
// a light-weight "range_seg_t", which is only used as the key when searching in
// range_tree and range_size_tree
struct range_t {
uint64_t start;
uint64_t end;
};
}
/*
* This is a helper function that can be used by the allocator to find
* a suitable block to allocate. This will search the specified AVL
* tree looking for a block that matches the specified criteria.
*/
template<class Tree>
uint64_t AvlAllocator::_block_picker(const Tree& t,
uint64_t *cursor,
uint64_t size,
uint64_t align)
{
const auto compare = t.key_comp();
for (auto rs = t.lower_bound(range_t{*cursor, size}, compare);
rs != t.end(); ++rs) {
uint64_t offset = p2roundup(rs->start, align);
if (offset + size <= rs->end) {
*cursor = offset + size;
return offset;
}
}
/*
* If we know we've searched the whole tree (*cursor == 0), give up.
* Otherwise, reset the cursor to the beginning and try again.
*/
if (*cursor == 0) {
return -1ULL;
}
*cursor = 0;
return _block_picker(t, cursor, size, align);
}
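/*
 * An illustrative trace of the cursor ("rotor") behavior, with hypothetical
 * values: suppose the offset tree holds [0x0000,0x2000) and [0x5000,0x7000),
 * and _block_picker(range_tree, &cursor, 0x2000, 0x2000) runs with
 * cursor == 0x6000. lower_bound lands on [0x5000,0x7000), but
 * p2roundup(0x5000, 0x2000) + 0x2000 == 0x8000 overshoots the segment end,
 * so the scan falls off the tree, resets the cursor to 0, retries, and
 * returns 0x0000, leaving the cursor at 0x2000 for the next request in the
 * same power-of-two bucket.
 */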
namespace {
struct dispose_rs {
void operator()(range_seg_t* p)
{
delete p;
}
};
}
void AvlAllocator::_add_to_tree(uint64_t start, uint64_t size)
{
assert(size != 0);
uint64_t end = start + size;
auto rs_after = range_tree.upper_bound(range_t{start, end},
range_tree.key_comp());
/* Make sure we don't overlap with either of our neighbors */
auto rs_before = range_tree.end();
if (rs_after != range_tree.begin()) {
rs_before = std::prev(rs_after);
}
bool merge_before = (rs_before != range_tree.end() && rs_before->end == start);
bool merge_after = (rs_after != range_tree.end() && rs_after->start == end);
if (merge_before && merge_after) {
range_size_tree.erase(*rs_before);
range_size_tree.erase(*rs_after);
rs_after->start = rs_before->start;
range_tree.erase_and_dispose(rs_before, dispose_rs{});
range_size_tree.insert(*rs_after);
} else if (merge_before) {
range_size_tree.erase(*rs_before);
rs_before->end = end;
range_size_tree.insert(*rs_before);
} else if (merge_after) {
range_size_tree.erase(*rs_after);
rs_after->start = start;
range_size_tree.insert(*rs_after);
} else {
auto new_rs = new range_seg_t{start, end};
range_tree.insert_before(rs_after, *new_rs);
range_size_tree.insert(*new_rs);
}
num_free += size;
}
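/*
 * An illustrative call sequence for the coalescing cases above, starting
 * from an empty tree (hypothetical values; init_add_free below forwards
 * here):
 *   _add_to_tree(0x0000, 0x1000);  // no neighbors  -> new segment [0x0000,0x1000)
 *   _add_to_tree(0x2000, 0x1000);  // no neighbors  -> new segment [0x2000,0x3000)
 *   _add_to_tree(0x1000, 0x1000);  // touches both  -> one segment [0x0000,0x3000)
 * Merging with only one neighbor extends that node in place; it is removed
 * from the size tree first and re-inserted because its size key changes.
 */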
void AvlAllocator::_remove_from_tree(uint64_t start, uint64_t size)
{
uint64_t end = start + size;
assert(size != 0);
assert(size <= num_free);
auto rs = range_tree.find(range_t{start, end}, range_tree.key_comp());
/* Make sure we completely overlap with someone */
assert(rs != range_tree.end());
assert(rs->start <= start);
assert(rs->end >= end);
bool left_over = (rs->start != start);
bool right_over = (rs->end != end);
range_size_tree.erase(*rs);
if (left_over && right_over) {
auto new_seg = new range_seg_t{end, rs->end};
rs->end = start;
range_tree.insert(rs, *new_seg);
range_size_tree.insert(*new_seg);
range_size_tree.insert(*rs);
} else if (left_over) {
rs->end = start;
range_size_tree.insert(*rs);
} else if (right_over) {
rs->start = end;
range_size_tree.insert(*rs);
} else {
range_tree.erase_and_dispose(rs, dispose_rs{});
}
assert(num_free >= size);
num_free -= size;
}
int AvlAllocator::_allocate(
uint64_t size,
uint64_t unit,
uint64_t *offset,
uint64_t *length)
{
std::lock_guard l(lock);
uint64_t max_size = 0;
if (auto p = range_size_tree.rbegin(); p != range_size_tree.rend()) {
max_size = p->end - p->start;
}
bool force_range_size_alloc = false;
if (max_size < size) {
if (max_size < unit) {
return -ENOSPC;
}
size = p2align(max_size, unit);
assert(size > 0);
force_range_size_alloc = true;
}
/*
* Find the largest power of 2 block size that evenly divides the
* requested size. This is used to try to allocate blocks with similar
* alignment from the same area (i.e. the same cursor bucket), but it does
* not guarantee that allocations of other sizes will not occur in the same
* region.
*/
const uint64_t align = size & -size;
assert(align != 0);
uint64_t *cursor = &lbas[cbits(align) - 1];
const int free_pct = num_free * 100 / num_total;
uint64_t start = 0;
/*
* If we're running low on space switch to using the size
* sorted AVL tree (best-fit).
*/
if (force_range_size_alloc ||
max_size < range_size_alloc_threshold ||
free_pct < range_size_alloc_free_pct) {
*cursor = 0;
start = _block_picker(range_size_tree, cursor, size, unit);
} else {
start = _block_picker(range_tree, cursor, size, unit);
}
if (start == -1ULL) {
return -ENOSPC;
}
_remove_from_tree(start, size);
*offset = start;
*length = size;
return 0;
}
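/*
 * A worked example of the cursor-bucket math above (hypothetical values):
 * for size == 0x30000, align = size & -size == 0x10000, the largest power
 * of two dividing the size. Assuming cbits() returns the 1-based index of
 * the highest set bit, cbits(0x10000) - 1 == 16, so every request whose
 * size is an odd multiple of 64 KiB shares the cursor lbas[16].
 */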
AvlAllocator::AvlAllocator(CephContext* cct,
int64_t device_size,
int64_t block_size,
const std::string& name) :
Allocator(name),
num_total(device_size),
block_size(block_size),
range_size_alloc_threshold(
cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_threshold")),
range_size_alloc_free_pct(
cct->_conf.get_val<uint64_t>("bluestore_avl_alloc_bf_free_pct")),
cct(cct)
{}
int64_t AvlAllocator::allocate(
uint64_t want,
uint64_t unit,
uint64_t max_alloc_size,
int64_t hint, // unused, for now!
PExtentVector* extents)
{
ldout(cct, 10) << __func__ << std::hex
<< " want 0x" << want
<< " unit 0x" << unit
<< " max_alloc_size 0x" << max_alloc_size
<< " hint 0x" << hint
<< std::dec << dendl;
assert(isp2(unit));
assert(want % unit == 0);
if (max_alloc_size == 0) {
max_alloc_size = want;
}
if (constexpr auto cap = std::numeric_limits<decltype(bluestore_pextent_t::length)>::max();
max_alloc_size >= cap) {
max_alloc_size = cap;
}
uint64_t allocated = 0;
while (allocated < want) {
uint64_t offset, length;
int r = _allocate(std::min(max_alloc_size, want - allocated),
unit, &offset, &length);
if (r < 0) {
// Allocation failed.
break;
}
extents->emplace_back(offset, length);
allocated += length;
}
return allocated ? allocated : -ENOSPC;
}
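/*
 * A hedged usage sketch of the entry point above (hypothetical names;
 * assumes free space was registered via init_add_free):
 *   PExtentVector extents;
 *   int64_t got = alloc.allocate(0x8000, 0x1000, 0x4000, 0, &extents);
 *   // on success got == 0x8000, split across at least two extents of
 *   // at most 0x4000 bytes each, per the max_alloc_size clamp above
 */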
void AvlAllocator::release(const interval_set<uint64_t>& release_set)
{
std::lock_guard l(lock);
for (auto p = release_set.begin(); p != release_set.end(); ++p) {
const auto offset = p.get_start();
const auto length = p.get_len();
ldout(cct, 10) << __func__ << std::hex
<< " offset 0x" << offset
<< " length 0x" << length
<< std::dec << dendl;
_add_to_tree(offset, length);
}
}
uint64_t AvlAllocator::get_free()
{
std::lock_guard l(lock);
return num_free;
}
double AvlAllocator::get_fragmentation()
{
std::lock_guard l(lock);
auto free_blocks = p2align(num_free, block_size) / block_size;
if (free_blocks <= 1) {
return .0;
}
return (static_cast<double>(range_tree.size() - 1) / (free_blocks - 1));
}
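/*
 * An illustration of the metric above: with the free space spread over S
 * segments covering B free blocks, this returns (S - 1) / (B - 1); e.g.
 * 8 free blocks in one contiguous segment give (1-1)/(8-1) == 0.0, while
 * the same 8 blocks in 8 single-block segments give (8-1)/(8-1) == 1.0.
 */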
void AvlAllocator::dump()
{
std::lock_guard l(lock);
ldout(cct, 0) << __func__ << " range_tree: " << dendl;
for (auto& rs : range_tree) {
ldout(cct, 0) << std::hex
<< "0x" << rs.start << "~" << rs.end
<< std::dec
<< dendl;
}
ldout(cct, 0) << __func__ << " range_size_tree: " << dendl;
for (auto& rs : range_size_tree) {
ldout(cct, 0) << std::hex
<< "0x" << rs.start << "~" << rs.end
<< std::dec
<< dendl;
}
}
void AvlAllocator::dump(std::function<void(uint64_t offset, uint64_t length)> notify)
{
for (auto& rs : range_tree) {
notify(rs.start, rs.end - rs.start);
}
}
void AvlAllocator::init_add_free(uint64_t offset, uint64_t length)
{
std::lock_guard l(lock);
ldout(cct, 10) << __func__ << std::hex
<< " offset 0x" << offset
<< " length 0x" << length
<< std::dec << dendl;
_add_to_tree(offset, length);
}
void AvlAllocator::init_rm_free(uint64_t offset, uint64_t length)
{
std::lock_guard l(lock);
ldout(cct, 10) << __func__ << std::hex
<< " offset 0x" << offset
<< " length 0x" << length
<< std::dec << dendl;
_remove_from_tree(offset, length);
}
void AvlAllocator::shutdown()
{
std::lock_guard l(lock);
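// Intrusive containers never own their elements: clearing the size tree
// only unlinks the shared nodes; the offset tree then both unlinks and
// frees them via the disposer.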
range_size_tree.clear();
range_tree.clear_and_dispose(dispose_rs{});
}

src/os/bluestore/AvlAllocator.h (new executable file, 136 lines)

@@ -0,0 +1,136 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#pragma once
#include <mutex>
#include <boost/intrusive/avl_set.hpp>
#include "Allocator.h"
#include "os/bluestore/bluestore_types.h"
#include "include/mempool.h"
struct range_seg_t {
MEMPOOL_CLASS_HELPERS(); ///< memory monitoring
uint64_t start; ///< starting offset of this segment
uint64_t end; ///< ending offset (non-inclusive)
range_seg_t(uint64_t start, uint64_t end)
: start{start},
end{end}
{}
// Tree is sorted by offset, greater offsets at the end of the tree.
struct before_t {
template<typename KeyLeft, typename KeyRight>
bool operator()(const KeyLeft& lhs, const KeyRight& rhs) const {
return lhs.end <= rhs.start;
}
};
boost::intrusive::avl_set_member_hook<> offset_hook;
// Tree is sorted by size, larger sizes at the end of the tree.
struct shorter_t {
template<typename KeyType>
bool operator()(const range_seg_t& lhs, const KeyType& rhs) const {
auto lhs_size = lhs.end - lhs.start;
auto rhs_size = rhs.end - rhs.start;
if (lhs_size < rhs_size) {
return true;
} else if (lhs_size > rhs_size) {
return false;
} else {
return lhs.start < rhs.start;
}
}
};
boost::intrusive::avl_set_member_hook<> size_hook;
};
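// Note: before_t makes any two overlapping ranges compare "equivalent"
// (neither sorts before the other), which is what lets tree lookups with a
// range_t key land on the segment overlapping it, e.g. (hypothetical):
//   range_seg_t a{0x0000, 0x2000}, b{0x1000, 0x3000};
//   assert(!range_seg_t::before_t{}(a, b) && !range_seg_t::before_t{}(b, a));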
class AvlAllocator final : public Allocator {
public:
AvlAllocator(CephContext* cct, int64_t device_size, int64_t block_size,
const std::string& name);
int64_t allocate(
uint64_t want,
uint64_t unit,
uint64_t max_alloc_size,
int64_t hint,
PExtentVector *extents) final;
void release(const interval_set<uint64_t>& release_set) final;
uint64_t get_free() final;
double get_fragmentation() final;
void dump() final;
void dump(std::function<void(uint64_t offset, uint64_t length)> notify) final;
void init_add_free(uint64_t offset, uint64_t length) final;
void init_rm_free(uint64_t offset, uint64_t length) final;
void shutdown() final;
private:
template<class Tree>
uint64_t _block_picker(const Tree& t, uint64_t *cursor, uint64_t size,
uint64_t align);
void _add_to_tree(uint64_t start, uint64_t size);
void _remove_from_tree(uint64_t start, uint64_t size);
int _allocate(
uint64_t size,
uint64_t unit,
uint64_t *offset,
uint64_t *length);
using range_tree_t =
boost::intrusive::avl_set<
range_seg_t,
boost::intrusive::compare<range_seg_t::before_t>,
boost::intrusive::member_hook<
range_seg_t,
boost::intrusive::avl_set_member_hook<>,
&range_seg_t::offset_hook>>;
range_tree_t range_tree; ///< main range tree
/*
* The range_size_tree should always contain the
* same number of segments as the range_tree.
* The only difference is that the range_size_tree
* is ordered by segment sizes.
*/
using range_size_tree_t =
boost::intrusive::avl_multiset<
range_seg_t,
boost::intrusive::compare<range_seg_t::shorter_t>,
boost::intrusive::member_hook<
range_seg_t,
boost::intrusive::avl_set_member_hook<>,
&range_seg_t::size_hook>>;
range_size_tree_t range_size_tree;
const int64_t num_total; ///< device size
const uint64_t block_size; ///< block size
uint64_t num_free = 0; ///< total bytes in freelist
/*
* This value defines the number of elements in the lbas array.
* The value of 64 was chosen as it covers all power of 2 buckets
* up to UINT64_MAX.
* This is the equivalent of the highest bit of UINT64_MAX.
*/
static constexpr unsigned MAX_LBAS = 64;
uint64_t lbas[MAX_LBAS] = {0};
/*
* Minimum size which forces the dynamic allocator to change
* its allocation strategy. Once the allocator cannot satisfy
* an allocation of this size, it switches to using a more
* aggressive strategy (i.e. search by size rather than offset).
*/
uint64_t range_size_alloc_threshold = 0;
/*
* The minimum free space, in percent, which must be available
* in the allocator to continue allocations in a first-fit fashion.
* Once the allocator's free space drops below this level we dynamically
* switch to using best-fit allocations.
*/
int range_size_alloc_free_pct = 0;
CephContext* cct;
std::mutex lock;
};
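
Taken together, the new allocator can be exercised end to end with a short smoke test. A hedged sketch, not part of the commit: the function name is hypothetical and a valid CephContext (e.g. from a test harness) is assumed:

#include <cassert>
#include "os/bluestore/AvlAllocator.h"

// Hypothetical smoke test: carve an extent out of a 1 MiB device with
// 4 KiB blocks, then give it back and check that everything coalesces.
void avl_allocator_smoke_test(CephContext* cct)
{
  AvlAllocator alloc(cct, 1 << 20, 4096, "smoke");
  alloc.init_add_free(0, 1 << 20);              // whole device starts free

  PExtentVector extents;
  int64_t got = alloc.allocate(0x8000, 0x1000, 0, 0, &extents);
  assert(got == 0x8000);                        // one first-fit 32 KiB extent

  interval_set<uint64_t> released;
  for (auto& e : extents) {
    released.insert(e.offset, e.length);
  }
  alloc.release(released);                      // merges back into one segment
  assert(alloc.get_free() == 1 << 20);
  alloc.shutdown();
}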