Merge pull request #44235 from xxhdx1985126/wip-onode-omap-hint-optimization

crimson/os/seastore: avoid onode/omap laddr hint conflicts as much as possible

Reviewed-by: Samuel Just <sjust@redhat.com>
Reviewed-by: Yingxin Cheng <yingxin.cheng@intel.com>
Reviewed-by: Chunmei Liu <chunmei.liu@intel.com>
This commit is contained in:
Samuel Just 2021-12-14 00:10:31 -08:00 committed by GitHub
commit 686398f742
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 112 additions and 43 deletions

View File

@ -65,3 +65,13 @@ options:
level: dev
desc: The record fullness threshold to flush a journal batch
default: 0.95
# Logical address space reserved per object for its data. SeaStore reads
# this value into max_object_size, and FLTreeOnodeManager reads it as
# default_data_reservation.
- name: seastore_default_max_object_size
type: uint
level: dev
desc: default logical address space reservation for seastore objects' data
# 16777216 == 16 << 20
default: 16777216
# Logical address space reserved per object for its metadata.
# FLTreeOnodeManager reads this value as default_metadata_range, which
# Onode::get_metadata_hint() uses as the modulus for its random offset.
- name: seastore_default_object_metadata_reservation
type: uint
level: dev
desc: default logical address space reservation for seastore objects' metadata
# 16777216 == 16 << 20
default: 16777216

View File

@ -10,8 +10,11 @@
#include "crimson/common/log.h"
#include "crimson/common/errorator.h"
// INTR_FUT_DEBUG(FMT_MSG, ...)
//
// Trace-level logging helper. When NDEBUG is not defined it forwards the
// format string and arguments to crimson::get_logger(ceph_subsys_).trace().
// When NDEBUG is defined it expands to nothing, so the arguments are not
// evaluated at all -- do not pass expressions with side effects.
#ifndef NDEBUG
// `##__VA_ARGS__` drops the preceding comma when no variadic arguments are
// supplied (GNU extension).
#define INTR_FUT_DEBUG(FMT_MSG, ...) crimson::get_logger(ceph_subsys_).trace(FMT_MSG, ##__VA_ARGS__)
#else
#define INTR_FUT_DEBUG(FMT_MSG, ...)
#endif
// The interrupt condition generally works this way:
//

View File

@ -15,16 +15,6 @@ namespace {
}
namespace crimson::os::seastore {
/**
* MAX_OBJECT_SIZE
*
* For now, we allocate a fixed region of laddr space of size MAX_OBJECT_SIZE
* for any object. In the future, once we have the ability to remap logical
* mappings (necessary for clone), we'll add the ability to grow and shrink
* these regions and remove this assumption.
*/
static constexpr extent_len_t MAX_OBJECT_SIZE = 16<<20;
// Asserts that x is a multiple of the transaction manager's block size.
// The expansion refers to a variable named `ctx` (with a `tm` member), so
// the macro can only be used where such a `ctx` is in scope.
#define assert_aligned(x) ceph_assert(((x)%ctx.tm.get_block_size()) == 0)
using context_t = ObjectDataHandler::context_t;
@ -261,9 +251,9 @@ ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
extent_len_t size)
{
LOG_PREFIX(ObjectDataHandler::prepare_data_reservation);
ceph_assert(size <= MAX_OBJECT_SIZE);
ceph_assert(size <= max_object_size);
if (!object_data.is_null()) {
ceph_assert(object_data.get_reserved_data_len() == MAX_OBJECT_SIZE);
ceph_assert(object_data.get_reserved_data_len() == max_object_size);
DEBUGT("reservation present: {}~{}",
ctx.t,
object_data.get_reserved_data_base(),
@ -272,14 +262,14 @@ ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
} else {
DEBUGT("reserving: {}~{}",
ctx.t,
ctx.onode.get_hint(),
MAX_OBJECT_SIZE);
ctx.onode.get_data_hint(),
max_object_size);
return ctx.tm.reserve_region(
ctx.t,
ctx.onode.get_hint(),
MAX_OBJECT_SIZE
).si_then([&object_data](auto pin) {
ceph_assert(pin->get_length() == MAX_OBJECT_SIZE);
ctx.onode.get_data_hint(),
max_object_size
).si_then([max_object_size=max_object_size, &object_data](auto pin) {
ceph_assert(pin->get_length() == max_object_size);
object_data.update_reserved(
pin->get_laddr(),
pin->get_length());

View File

@ -50,6 +50,8 @@ class ObjectDataHandler {
public:
using base_iertr = TransactionManager::base_iertr;
// Builds a handler whose max_object_size member is set to mos.
ObjectDataHandler(uint32_t mos) : max_object_size(mos) {}
struct context_t {
TransactionManager &tm;
Transaction &t;
@ -104,6 +106,16 @@ private:
context_t ctx,
object_data_t &object_data,
extent_len_t size);
private:
/**
* max_object_size
*
* For now, we allocate a fixed region of laddr space of size max_object_size
* for any object. In the future, once we have the ability to remap logical
* mappings (necessary for clone), we'll add the ability to grow and shrink
* these regions and remove this assumption.
*/
const uint32_t max_object_size = 0;
};
}

View File

@ -52,12 +52,29 @@ class Onode : public boost::intrusive_ref_counter<
Onode,
boost::thread_unsafe_counter>
{
protected:
virtual laddr_t get_hint() const = 0;
const uint32_t default_metadata_offset = 0;
const uint32_t default_metadata_range = 0;
public:
/**
 * Onode
 *
 * @param ddr stored as default_metadata_offset, i.e. the distance added to
 *            get_hint() before any metadata hint is produced
 * @param dmr stored as default_metadata_range, i.e. the modulus applied to
 *            the random component of a metadata hint
 */
Onode(uint32_t ddr, uint32_t dmr)
  : default_metadata_offset{ddr}, default_metadata_range{dmr} {}
virtual const onode_layout_t &get_layout() const = 0;
virtual onode_layout_t &get_mutable_layout(Transaction &t) = 0;
virtual ~Onode() = default;
virtual laddr_t get_hint() const = 0;
/**
 * get_metadata_hint
 *
 * Returns a logical address hint for this object's metadata:
 * get_hint() + default_metadata_offset + r, where r is a pseudo-random
 * value in [0, default_metadata_range) drawn from std::rand().
 *
 * A fresh random value is drawn on every call, so two calls on the same
 * onode may return different hints.
 *
 * Both default_metadata_offset and default_metadata_range are expected to
 * be non-zero; this is asserted in builds where assert() is active.
 */
laddr_t get_metadata_hint() const {
  assert(default_metadata_offset);
  assert(default_metadata_range);
  // assert() compiles away under NDEBUG. Without this guard a zero range
  // would reach the modulo below, which is undefined behaviour (integer
  // division by zero). Fall back to a hint with no random component.
  if (default_metadata_range == 0) {
    return get_hint() + default_metadata_offset;
  }
  const uint32_t jitter =
    static_cast<uint32_t>(std::rand()) % default_metadata_range;
  return get_hint() + default_metadata_offset + jitter;
}
// Returns the hint for this object's data region, which is get_hint()
// unchanged (no offset and no random component).
laddr_t get_data_hint() const {
return get_hint();
}
};

View File

@ -27,7 +27,10 @@ FLTreeOnodeManager::get_onode_ret FLTreeOnodeManager::get_onode(
DEBUGT("no entry for {}", trans, hoid);
return crimson::ct_error::enoent::make();
}
auto val = OnodeRef(new FLTreeOnode(cursor.value()));
auto val = OnodeRef(new FLTreeOnode(
default_data_reservation,
default_metadata_range,
cursor.value()));
return get_onode_iertr::make_ready_future<OnodeRef>(
val
);
@ -43,10 +46,13 @@ FLTreeOnodeManager::get_or_create_onode(
return tree.insert(
trans, hoid,
OnodeTree::tree_value_config_t{sizeof(onode_layout_t)}
).si_then([&trans, &hoid, FNAME](auto p)
).si_then([this, &trans, &hoid, FNAME](auto p)
-> get_or_create_onode_ret {
auto [cursor, created] = std::move(p);
auto val = OnodeRef(new FLTreeOnode(cursor.value()));
auto val = OnodeRef(new FLTreeOnode(
default_data_reservation,
default_metadata_range,
cursor.value()));
if (created) {
DEBUGT("created onode for entry for {}", trans, hoid);
val->get_mutable_layout(trans) = onode_layout_t{};

View File

@ -37,7 +37,14 @@ struct FLTreeOnode final : Onode, Value {
FLTreeOnode& operator=(const FLTreeOnode&) = delete;
template <typename... T>
FLTreeOnode(T&&... args) : Value(std::forward<T>(args)...) {}
FLTreeOnode(uint32_t ddr, uint32_t dmr, T&&... args)
: Onode(ddr, dmr),
Value(std::forward<T>(args)...) {}
// Forwards every argument to the Value base and initialises the Onode base
// with a metadata offset and range of 0.
// NOTE(review): Onode::get_metadata_hint() asserts that both values are
// non-zero, so objects built through this overload presumably must not call
// it -- confirm against callers.
template <typename... T>
FLTreeOnode(T&&... args)
: Onode(0, 0),
Value(std::forward<T>(args)...) {}
struct Recorder : public ValueDeltaRecorder {
Recorder(bufferlist &bl) : ValueDeltaRecorder(bl) {}
@ -102,12 +109,23 @@ struct FLTreeOnode final : Onode, Value {
using OnodeTree = Btree<FLTreeOnode>;
using crimson::common::get_conf;
class FLTreeOnodeManager : public crimson::os::seastore::OnodeManager {
OnodeTree tree;
uint32_t default_data_reservation = 0;
uint32_t default_metadata_offset = 0;
uint32_t default_metadata_range = 0;
public:
FLTreeOnodeManager(TransactionManager &tm) :
tree(NodeExtentManager::create_seastore(tm)) {}
tree(NodeExtentManager::create_seastore(tm)),
default_data_reservation(
get_conf<uint64_t>("seastore_default_max_object_size")),
default_metadata_offset(default_data_reservation),
default_metadata_range(
get_conf<uint64_t>("seastore_default_object_metadata_reservation"))
{}
mkfs_ret mkfs(Transaction &t) {
return tree.mkfs(t);

View File

@ -68,6 +68,8 @@ public:
}
};
using crimson::common::get_conf;
SeaStore::SeaStore(
const std::string& root,
MDStoreRef mdstore,
@ -80,7 +82,9 @@ SeaStore::SeaStore(
segment_manager(std::move(sm)),
transaction_manager(std::move(tm)),
collection_manager(std::move(cm)),
onode_manager(std::move(om))
onode_manager(std::move(om)),
max_object_size(
get_conf<uint64_t>("seastore_default_max_object_size"))
{
register_metrics();
}
@ -459,7 +463,7 @@ SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::read(
size - offset :
std::min(size - offset, len);
return ObjectDataHandler().read(
return ObjectDataHandler(max_object_size).read(
ObjectDataHandler::context_t{
*transaction_manager,
t,
@ -509,7 +513,7 @@ SeaStore::get_attr_errorator::future<ceph::bufferlist> SeaStore::get_attr(
}
return _omap_get_value(
t,
layout.xattr_root.get(onode.get_hint()),
layout.xattr_root.get(onode.get_metadata_hint()),
name);
}
).handle_error(crimson::ct_error::input_output_error::handle([FNAME] {
@ -607,7 +611,8 @@ SeaStore::omap_get_values(
"omap_get_values",
op_type_t::OMAP_GET_VALUES,
[this, keys](auto &t, auto &onode) {
omap_root_t omap_root = onode.get_layout().omap_root.get(onode.get_hint());
omap_root_t omap_root = onode.get_layout().omap_root.get(
onode.get_metadata_hint());
return _omap_get_values(
t,
std::move(omap_root),
@ -685,7 +690,7 @@ SeaStore::_omap_list_ret SeaStore::_omap_list(
const std::optional<std::string>& start,
OMapManager::omap_list_config_t config) const
{
auto root = omap_root.get(onode.get_hint());
auto root = omap_root.get(onode.get_metadata_hint());
if (root.is_null()) {
return seastar::make_ready_future<_omap_list_bare_ret>(
true, omap_values_t{}
@ -1068,7 +1073,7 @@ SeaStore::tm_ret SeaStore::_write(
return seastar::do_with(
std::move(_bl),
[=, &ctx, &onode](auto &bl) {
return ObjectDataHandler().write(
return ObjectDataHandler(max_object_size).write(
ObjectDataHandler::context_t{
*transaction_manager,
*ctx.transaction,
@ -1089,13 +1094,13 @@ SeaStore::_omap_set_kvs(
{
return seastar::do_with(
BtreeOMapManager(*transaction_manager),
omap_root.get(onode->get_hint()),
omap_root.get(onode->get_metadata_hint()),
[&, keys=std::move(kvs)](auto &omap_manager, auto &root) {
tm_iertr::future<> maybe_create_root =
!root.is_null() ?
tm_iertr::now() :
omap_manager.initialize_omap(
t, onode->get_hint()
t, onode->get_metadata_hint()
).si_then([&root](auto new_root) {
root = new_root;
});
@ -1146,13 +1151,13 @@ SeaStore::tm_ret SeaStore::_omap_rmkeys(
{
LOG_PREFIX(SeaStore::_omap_rmkeys);
DEBUGT("{} {} keys", *ctx.transaction, *onode, keys.size());
auto omap_root = onode->get_layout().omap_root.get(onode->get_hint());
auto omap_root = onode->get_layout().omap_root.get(onode->get_metadata_hint());
if (omap_root.is_null()) {
return seastar::now();
} else {
return seastar::do_with(
BtreeOMapManager(*transaction_manager),
onode->get_layout().omap_root.get(onode->get_hint()),
onode->get_layout().omap_root.get(onode->get_metadata_hint()),
std::move(keys),
[&ctx, &onode](
auto &omap_manager,
@ -1198,7 +1203,7 @@ SeaStore::tm_ret SeaStore::_truncate(
LOG_PREFIX(SeaStore::_truncate);
DEBUGT("onode={} size={}", *ctx.transaction, *onode, size);
onode->get_mutable_layout(*ctx.transaction).size = size;
return ObjectDataHandler().truncate(
return ObjectDataHandler(max_object_size).truncate(
ObjectDataHandler::context_t{
*transaction_manager,
*ctx.transaction,

View File

@ -305,6 +305,7 @@ private:
TransactionManagerRef transaction_manager;
CollectionManagerRef collection_manager;
OnodeManagerRef onode_manager;
const uint32_t max_object_size = 0;
using tm_iertr = TransactionManager::base_iertr;
using tm_ret = tm_iertr::future<>;

View File

@ -31,15 +31,15 @@ struct onode_item_t {
void initialize(Transaction& t, Onode& value) const {
auto& layout = value.get_mutable_layout(t);
layout.size = size;
layout.omap_root.update(omap_root_t(id, cnt_modify, value.get_hint()));
layout.omap_root.update(omap_root_t(id, cnt_modify, value.get_metadata_hint()));
validate(value);
}
void validate(Onode& value) const {
auto& layout = value.get_layout();
ceph_assert(laddr_t(layout.size) == laddr_t{size});
ceph_assert(layout.omap_root.get(value.get_hint()).addr == id);
ceph_assert(layout.omap_root.get(value.get_hint()).depth == cnt_modify);
ceph_assert(layout.omap_root.get(value.get_metadata_hint()).addr == id);
ceph_assert(layout.omap_root.get(value.get_metadata_hint()).depth == cnt_modify);
}
void modify(Transaction& t, Onode& value) {

View File

@ -11,6 +11,10 @@ using namespace crimson;
using namespace crimson::os;
using namespace crimson::os::seastore;
// Test-local sizing constants, each equal to 16 << 20 (16777216).
// Declared as typed constexpr values instead of object-like macros so they
// obey scoping rules and show up in the debugger. The type stays `int`,
// matching the type the former macro expressions had, so every existing use
// converts exactly as before.
constexpr int MAX_OBJECT_SIZE = 16 << 20;
constexpr int DEFAULT_OBJECT_DATA_RESERVATION = 16 << 20;
constexpr int DEFAULT_OBJECT_METADATA_RESERVATION = 16 << 20;
namespace {
[[maybe_unused]] seastar::logger& logger() {
return crimson::get_logger(ceph_subsys_test);
@ -22,6 +26,7 @@ class TestOnode final : public Onode {
bool dirty = false;
public:
// Passes ddr and dmr straight through to the Onode base constructor.
TestOnode(uint32_t ddr, uint32_t dmr) : Onode(ddr, dmr) {}
const onode_layout_t &get_layout() const final {
return layout;
}
@ -58,7 +63,7 @@ struct object_data_handler_test_t:
offset,
len));
with_trans_intr(t, [&](auto &t) {
return ObjectDataHandler().write(
return ObjectDataHandler(MAX_OBJECT_SIZE).write(
ObjectDataHandler::context_t{
*tm,
t,
@ -81,7 +86,7 @@ struct object_data_handler_test_t:
0,
size - offset);
with_trans_intr(t, [&](auto &t) {
return ObjectDataHandler().truncate(
return ObjectDataHandler(MAX_OBJECT_SIZE).truncate(
ObjectDataHandler::context_t{
*tm,
t,
@ -100,7 +105,7 @@ struct object_data_handler_test_t:
void read(Transaction &t, objaddr_t offset, extent_len_t len) {
bufferlist bl = with_trans_intr(t, [&](auto &t) {
return ObjectDataHandler().read(
return ObjectDataHandler(MAX_OBJECT_SIZE).read(
ObjectDataHandler::context_t{
*tm,
t,
@ -132,7 +137,9 @@ struct object_data_handler_test_t:
}
seastar::future<> set_up_fut() final {
onode = new TestOnode{};
onode = new TestOnode(
DEFAULT_OBJECT_DATA_RESERVATION,
DEFAULT_OBJECT_METADATA_RESERVATION);
known_contents = buffer::create(4<<20 /* 4MB */);
memset(known_contents.c_str(), 0, known_contents.length());
size = 0;