mirror of
https://github.com/ceph/ceph
synced 2025-03-25 11:48:05 +00:00
Merge pull request #44235 from xxhdx1985126/wip-onode-omap-hint-optimization
crimson/os/seastore: avoid onode/omap laddr hint conflicts as much as possible. Reviewed-by: Samuel Just <sjust@redhat.com>; Reviewed-by: Yingxin Cheng <yingxin.cheng@intel.com>; Reviewed-by: Chunmei Liu <chunmei.liu@intel.com>
This commit is contained in:
commit
686398f742
@ -65,3 +65,13 @@ options:
|
||||
level: dev
|
||||
desc: The record fullness threshold to flush a journal batch
|
||||
default: 0.95
|
||||
- name: seastore_default_max_object_size
|
||||
type: uint
|
||||
level: dev
|
||||
desc: default logical address space reservation for seastore objects' data
|
||||
default: 16777216
|
||||
- name: seastore_default_object_metadata_reservation
|
||||
type: uint
|
||||
level: dev
|
||||
desc: default logical address space reservation for seastore objects' metadata
|
||||
default: 16777216
|
||||
|
@ -10,8 +10,11 @@
|
||||
|
||||
#include "crimson/common/log.h"
|
||||
#include "crimson/common/errorator.h"
|
||||
|
||||
// INTR_FUT_DEBUG(fmt, args...): trace-level logging helper.
// When NDEBUG is defined the macro expands to nothing; otherwise it forwards
// the format string and its arguments to the crimson logger for ceph_subsys_.
#ifdef NDEBUG
#define INTR_FUT_DEBUG(FMT_MSG, ...)
#else
#define INTR_FUT_DEBUG(FMT_MSG, ...) crimson::get_logger(ceph_subsys_).trace(FMT_MSG, ##__VA_ARGS__)
#endif
|
||||
|
||||
// The interrupt condition generally works this way:
|
||||
//
|
||||
|
@ -15,16 +15,6 @@ namespace {
|
||||
}
|
||||
|
||||
namespace crimson::os::seastore {
|
||||
|
||||
/**
|
||||
* MAX_OBJECT_SIZE
|
||||
*
|
||||
* For now, we allocate a fixed region of laddr space of size MAX_OBJECT_SIZE
|
||||
* for any object. In the future, once we have the ability to remap logical
|
||||
* mappings (necessary for clone), we'll add the ability to grow and shrink
|
||||
* these regions and remove this assumption.
|
||||
*/
|
||||
static constexpr extent_len_t MAX_OBJECT_SIZE = 16<<20;
|
||||
#define assert_aligned(x) ceph_assert(((x)%ctx.tm.get_block_size()) == 0)
|
||||
|
||||
using context_t = ObjectDataHandler::context_t;
|
||||
@ -261,9 +251,9 @@ ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
|
||||
extent_len_t size)
|
||||
{
|
||||
LOG_PREFIX(ObjectDataHandler::prepare_data_reservation);
|
||||
ceph_assert(size <= MAX_OBJECT_SIZE);
|
||||
ceph_assert(size <= max_object_size);
|
||||
if (!object_data.is_null()) {
|
||||
ceph_assert(object_data.get_reserved_data_len() == MAX_OBJECT_SIZE);
|
||||
ceph_assert(object_data.get_reserved_data_len() == max_object_size);
|
||||
DEBUGT("reservation present: {}~{}",
|
||||
ctx.t,
|
||||
object_data.get_reserved_data_base(),
|
||||
@ -272,14 +262,14 @@ ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
|
||||
} else {
|
||||
DEBUGT("reserving: {}~{}",
|
||||
ctx.t,
|
||||
ctx.onode.get_hint(),
|
||||
MAX_OBJECT_SIZE);
|
||||
ctx.onode.get_data_hint(),
|
||||
max_object_size);
|
||||
return ctx.tm.reserve_region(
|
||||
ctx.t,
|
||||
ctx.onode.get_hint(),
|
||||
MAX_OBJECT_SIZE
|
||||
).si_then([&object_data](auto pin) {
|
||||
ceph_assert(pin->get_length() == MAX_OBJECT_SIZE);
|
||||
ctx.onode.get_data_hint(),
|
||||
max_object_size
|
||||
).si_then([max_object_size=max_object_size, &object_data](auto pin) {
|
||||
ceph_assert(pin->get_length() == max_object_size);
|
||||
object_data.update_reserved(
|
||||
pin->get_laddr(),
|
||||
pin->get_length());
|
||||
|
@ -50,6 +50,8 @@ class ObjectDataHandler {
|
||||
public:
|
||||
using base_iertr = TransactionManager::base_iertr;
|
||||
|
||||
// Builds a handler whose max_object_size (the fixed per-object laddr
// reservation length this handler asserts against) is set to `mos`.
ObjectDataHandler(uint32_t mos) : max_object_size(mos) {}
|
||||
|
||||
struct context_t {
|
||||
TransactionManager &tm;
|
||||
Transaction &t;
|
||||
@ -104,6 +106,16 @@ private:
|
||||
context_t ctx,
|
||||
object_data_t &object_data,
|
||||
extent_len_t size);
|
||||
private:
|
||||
/**
|
||||
* max_object_size
|
||||
*
|
||||
* For now, we allocate a fixed region of laddr space of size max_object_size
|
||||
* for any object. In the future, once we have the ability to remap logical
|
||||
* mappings (necessary for clone), we'll add the ability to grow and shrink
|
||||
* these regions and remove this assumption.
|
||||
*/
|
||||
const uint32_t max_object_size = 0;
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -52,12 +52,29 @@ class Onode : public boost::intrusive_ref_counter<
|
||||
Onode,
|
||||
boost::thread_unsafe_counter>
|
||||
{
|
||||
protected:
|
||||
virtual laddr_t get_hint() const = 0;
|
||||
const uint32_t default_metadata_offset = 0;
|
||||
const uint32_t default_metadata_range = 0;
|
||||
public:
|
||||
// Stores the two parameters consumed by get_metadata_hint():
//   ddr -> default_metadata_offset (added to get_hint())
//   dmr -> default_metadata_range  (modulus applied to the random displacement)
Onode(uint32_t ddr, uint32_t dmr)
|
||||
: default_metadata_offset(ddr),
|
||||
default_metadata_range(dmr)
|
||||
{}
|
||||
|
||||
virtual const onode_layout_t &get_layout() const = 0;
|
||||
virtual onode_layout_t &get_mutable_layout(Transaction &t) = 0;
|
||||
virtual ~Onode() = default;
|
||||
virtual laddr_t get_hint() const = 0;
|
||||
|
||||
/**
 * get_metadata_hint
 *
 * Returns a logical-address hint for this object's metadata:
 * get_hint() + default_metadata_offset + a pseudo-random displacement in
 * [0, default_metadata_range) drawn from std::rand(). Because of the random
 * term, successive calls may return different values.
 *
 * Both defaults are expected to be non-zero and are asserted in debug
 * builds. When assert() is compiled out, a zero range would make the modulo
 * undefined, so in that case no displacement is added.
 */
laddr_t get_metadata_hint() const {
  assert(default_metadata_offset);
  assert(default_metadata_range);
  const laddr_t base = get_hint() + default_metadata_offset;
  if (default_metadata_range == 0) {
    // Only reachable with assertions disabled: `x % 0` is undefined behaviour.
    return base;
  }
  return base +
    (static_cast<uint32_t>(std::rand()) % default_metadata_range);
}
|
||||
// Hint for placing this object's data: the value of get_hint(), unmodified.
laddr_t get_data_hint() const {
|
||||
return get_hint();
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
@ -27,7 +27,10 @@ FLTreeOnodeManager::get_onode_ret FLTreeOnodeManager::get_onode(
|
||||
DEBUGT("no entry for {}", trans, hoid);
|
||||
return crimson::ct_error::enoent::make();
|
||||
}
|
||||
auto val = OnodeRef(new FLTreeOnode(cursor.value()));
|
||||
auto val = OnodeRef(new FLTreeOnode(
|
||||
default_data_reservation,
|
||||
default_metadata_range,
|
||||
cursor.value()));
|
||||
return get_onode_iertr::make_ready_future<OnodeRef>(
|
||||
val
|
||||
);
|
||||
@ -43,10 +46,13 @@ FLTreeOnodeManager::get_or_create_onode(
|
||||
return tree.insert(
|
||||
trans, hoid,
|
||||
OnodeTree::tree_value_config_t{sizeof(onode_layout_t)}
|
||||
).si_then([&trans, &hoid, FNAME](auto p)
|
||||
).si_then([this, &trans, &hoid, FNAME](auto p)
|
||||
-> get_or_create_onode_ret {
|
||||
auto [cursor, created] = std::move(p);
|
||||
auto val = OnodeRef(new FLTreeOnode(cursor.value()));
|
||||
auto val = OnodeRef(new FLTreeOnode(
|
||||
default_data_reservation,
|
||||
default_metadata_range,
|
||||
cursor.value()));
|
||||
if (created) {
|
||||
DEBUGT("created onode for entry for {}", trans, hoid);
|
||||
val->get_mutable_layout(trans) = onode_layout_t{};
|
||||
|
@ -37,7 +37,14 @@ struct FLTreeOnode final : Onode, Value {
|
||||
FLTreeOnode& operator=(const FLTreeOnode&) = delete;
|
||||
|
||||
template <typename... T>
|
||||
FLTreeOnode(T&&... args) : Value(std::forward<T>(args)...) {}
|
||||
FLTreeOnode(uint32_t ddr, uint32_t dmr, T&&... args)
|
||||
: Onode(ddr, dmr),
|
||||
Value(std::forward<T>(args)...) {}
|
||||
|
||||
// Constructor without placement parameters: forwards every argument to Value
// and initialises the Onode base with (0, 0).
// NOTE(review): Onode::get_metadata_hint() asserts that both values are
// non-zero, so instances built through this overload presumably must not
// request a metadata hint — confirm against callers.
template <typename... T>
|
||||
FLTreeOnode(T&&... args)
|
||||
: Onode(0, 0),
|
||||
Value(std::forward<T>(args)...) {}
|
||||
|
||||
struct Recorder : public ValueDeltaRecorder {
|
||||
Recorder(bufferlist &bl) : ValueDeltaRecorder(bl) {}
|
||||
@ -102,12 +109,23 @@ struct FLTreeOnode final : Onode, Value {
|
||||
|
||||
using OnodeTree = Btree<FLTreeOnode>;
|
||||
|
||||
using crimson::common::get_conf;
|
||||
|
||||
class FLTreeOnodeManager : public crimson::os::seastore::OnodeManager {
|
||||
OnodeTree tree;
|
||||
|
||||
uint32_t default_data_reservation = 0;
|
||||
uint32_t default_metadata_offset = 0;
|
||||
uint32_t default_metadata_range = 0;
|
||||
public:
|
||||
FLTreeOnodeManager(TransactionManager &tm) :
|
||||
tree(NodeExtentManager::create_seastore(tm)) {}
|
||||
tree(NodeExtentManager::create_seastore(tm)),
|
||||
default_data_reservation(
|
||||
get_conf<uint64_t>("seastore_default_max_object_size")),
|
||||
default_metadata_offset(default_data_reservation),
|
||||
default_metadata_range(
|
||||
get_conf<uint64_t>("seastore_default_object_metadata_reservation"))
|
||||
{}
|
||||
|
||||
mkfs_ret mkfs(Transaction &t) {
|
||||
return tree.mkfs(t);
|
||||
|
@ -68,6 +68,8 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
using crimson::common::get_conf;
|
||||
|
||||
SeaStore::SeaStore(
|
||||
const std::string& root,
|
||||
MDStoreRef mdstore,
|
||||
@ -80,7 +82,9 @@ SeaStore::SeaStore(
|
||||
segment_manager(std::move(sm)),
|
||||
transaction_manager(std::move(tm)),
|
||||
collection_manager(std::move(cm)),
|
||||
onode_manager(std::move(om))
|
||||
onode_manager(std::move(om)),
|
||||
max_object_size(
|
||||
get_conf<uint64_t>("seastore_default_max_object_size"))
|
||||
{
|
||||
register_metrics();
|
||||
}
|
||||
@ -459,7 +463,7 @@ SeaStore::read_errorator::future<ceph::bufferlist> SeaStore::read(
|
||||
size - offset :
|
||||
std::min(size - offset, len);
|
||||
|
||||
return ObjectDataHandler().read(
|
||||
return ObjectDataHandler(max_object_size).read(
|
||||
ObjectDataHandler::context_t{
|
||||
*transaction_manager,
|
||||
t,
|
||||
@ -509,7 +513,7 @@ SeaStore::get_attr_errorator::future<ceph::bufferlist> SeaStore::get_attr(
|
||||
}
|
||||
return _omap_get_value(
|
||||
t,
|
||||
layout.xattr_root.get(onode.get_hint()),
|
||||
layout.xattr_root.get(onode.get_metadata_hint()),
|
||||
name);
|
||||
}
|
||||
).handle_error(crimson::ct_error::input_output_error::handle([FNAME] {
|
||||
@ -607,7 +611,8 @@ SeaStore::omap_get_values(
|
||||
"omap_get_values",
|
||||
op_type_t::OMAP_GET_VALUES,
|
||||
[this, keys](auto &t, auto &onode) {
|
||||
omap_root_t omap_root = onode.get_layout().omap_root.get(onode.get_hint());
|
||||
omap_root_t omap_root = onode.get_layout().omap_root.get(
|
||||
onode.get_metadata_hint());
|
||||
return _omap_get_values(
|
||||
t,
|
||||
std::move(omap_root),
|
||||
@ -685,7 +690,7 @@ SeaStore::_omap_list_ret SeaStore::_omap_list(
|
||||
const std::optional<std::string>& start,
|
||||
OMapManager::omap_list_config_t config) const
|
||||
{
|
||||
auto root = omap_root.get(onode.get_hint());
|
||||
auto root = omap_root.get(onode.get_metadata_hint());
|
||||
if (root.is_null()) {
|
||||
return seastar::make_ready_future<_omap_list_bare_ret>(
|
||||
true, omap_values_t{}
|
||||
@ -1068,7 +1073,7 @@ SeaStore::tm_ret SeaStore::_write(
|
||||
return seastar::do_with(
|
||||
std::move(_bl),
|
||||
[=, &ctx, &onode](auto &bl) {
|
||||
return ObjectDataHandler().write(
|
||||
return ObjectDataHandler(max_object_size).write(
|
||||
ObjectDataHandler::context_t{
|
||||
*transaction_manager,
|
||||
*ctx.transaction,
|
||||
@ -1089,13 +1094,13 @@ SeaStore::_omap_set_kvs(
|
||||
{
|
||||
return seastar::do_with(
|
||||
BtreeOMapManager(*transaction_manager),
|
||||
omap_root.get(onode->get_hint()),
|
||||
omap_root.get(onode->get_metadata_hint()),
|
||||
[&, keys=std::move(kvs)](auto &omap_manager, auto &root) {
|
||||
tm_iertr::future<> maybe_create_root =
|
||||
!root.is_null() ?
|
||||
tm_iertr::now() :
|
||||
omap_manager.initialize_omap(
|
||||
t, onode->get_hint()
|
||||
t, onode->get_metadata_hint()
|
||||
).si_then([&root](auto new_root) {
|
||||
root = new_root;
|
||||
});
|
||||
@ -1146,13 +1151,13 @@ SeaStore::tm_ret SeaStore::_omap_rmkeys(
|
||||
{
|
||||
LOG_PREFIX(SeaStore::_omap_rmkeys);
|
||||
DEBUGT("{} {} keys", *ctx.transaction, *onode, keys.size());
|
||||
auto omap_root = onode->get_layout().omap_root.get(onode->get_hint());
|
||||
auto omap_root = onode->get_layout().omap_root.get(onode->get_metadata_hint());
|
||||
if (omap_root.is_null()) {
|
||||
return seastar::now();
|
||||
} else {
|
||||
return seastar::do_with(
|
||||
BtreeOMapManager(*transaction_manager),
|
||||
onode->get_layout().omap_root.get(onode->get_hint()),
|
||||
onode->get_layout().omap_root.get(onode->get_metadata_hint()),
|
||||
std::move(keys),
|
||||
[&ctx, &onode](
|
||||
auto &omap_manager,
|
||||
@ -1198,7 +1203,7 @@ SeaStore::tm_ret SeaStore::_truncate(
|
||||
LOG_PREFIX(SeaStore::_truncate);
|
||||
DEBUGT("onode={} size={}", *ctx.transaction, *onode, size);
|
||||
onode->get_mutable_layout(*ctx.transaction).size = size;
|
||||
return ObjectDataHandler().truncate(
|
||||
return ObjectDataHandler(max_object_size).truncate(
|
||||
ObjectDataHandler::context_t{
|
||||
*transaction_manager,
|
||||
*ctx.transaction,
|
||||
|
@ -305,6 +305,7 @@ private:
|
||||
TransactionManagerRef transaction_manager;
|
||||
CollectionManagerRef collection_manager;
|
||||
OnodeManagerRef onode_manager;
|
||||
const uint32_t max_object_size = 0;
|
||||
|
||||
using tm_iertr = TransactionManager::base_iertr;
|
||||
using tm_ret = tm_iertr::future<>;
|
||||
|
@ -31,15 +31,15 @@ struct onode_item_t {
|
||||
void initialize(Transaction& t, Onode& value) const {
|
||||
auto& layout = value.get_mutable_layout(t);
|
||||
layout.size = size;
|
||||
layout.omap_root.update(omap_root_t(id, cnt_modify, value.get_hint()));
|
||||
layout.omap_root.update(omap_root_t(id, cnt_modify, value.get_metadata_hint()));
|
||||
validate(value);
|
||||
}
|
||||
|
||||
void validate(Onode& value) const {
|
||||
auto& layout = value.get_layout();
|
||||
ceph_assert(laddr_t(layout.size) == laddr_t{size});
|
||||
ceph_assert(layout.omap_root.get(value.get_hint()).addr == id);
|
||||
ceph_assert(layout.omap_root.get(value.get_hint()).depth == cnt_modify);
|
||||
ceph_assert(layout.omap_root.get(value.get_metadata_hint()).addr == id);
|
||||
ceph_assert(layout.omap_root.get(value.get_metadata_hint()).depth == cnt_modify);
|
||||
}
|
||||
|
||||
void modify(Transaction& t, Onode& value) {
|
||||
|
@ -11,6 +11,10 @@ using namespace crimson;
|
||||
using namespace crimson::os;
|
||||
using namespace crimson::os::seastore;
|
||||
|
||||
#define MAX_OBJECT_SIZE (16<<20)
|
||||
#define DEFAULT_OBJECT_DATA_RESERVATION (16<<20)
|
||||
#define DEFAULT_OBJECT_METADATA_RESERVATION (16<<20)
|
||||
|
||||
namespace {
|
||||
[[maybe_unused]] seastar::logger& logger() {
|
||||
return crimson::get_logger(ceph_subsys_test);
|
||||
@ -22,6 +26,7 @@ class TestOnode final : public Onode {
|
||||
bool dirty = false;
|
||||
|
||||
public:
|
||||
// Forwards ddr and dmr unchanged to the Onode base-class constructor.
TestOnode(uint32_t ddr, uint32_t dmr) : Onode(ddr, dmr) {}
|
||||
const onode_layout_t &get_layout() const final {
|
||||
return layout;
|
||||
}
|
||||
@ -58,7 +63,7 @@ struct object_data_handler_test_t:
|
||||
offset,
|
||||
len));
|
||||
with_trans_intr(t, [&](auto &t) {
|
||||
return ObjectDataHandler().write(
|
||||
return ObjectDataHandler(MAX_OBJECT_SIZE).write(
|
||||
ObjectDataHandler::context_t{
|
||||
*tm,
|
||||
t,
|
||||
@ -81,7 +86,7 @@ struct object_data_handler_test_t:
|
||||
0,
|
||||
size - offset);
|
||||
with_trans_intr(t, [&](auto &t) {
|
||||
return ObjectDataHandler().truncate(
|
||||
return ObjectDataHandler(MAX_OBJECT_SIZE).truncate(
|
||||
ObjectDataHandler::context_t{
|
||||
*tm,
|
||||
t,
|
||||
@ -100,7 +105,7 @@ struct object_data_handler_test_t:
|
||||
|
||||
void read(Transaction &t, objaddr_t offset, extent_len_t len) {
|
||||
bufferlist bl = with_trans_intr(t, [&](auto &t) {
|
||||
return ObjectDataHandler().read(
|
||||
return ObjectDataHandler(MAX_OBJECT_SIZE).read(
|
||||
ObjectDataHandler::context_t{
|
||||
*tm,
|
||||
t,
|
||||
@ -132,7 +137,9 @@ struct object_data_handler_test_t:
|
||||
}
|
||||
|
||||
seastar::future<> set_up_fut() final {
|
||||
onode = new TestOnode{};
|
||||
onode = new TestOnode(
|
||||
DEFAULT_OBJECT_DATA_RESERVATION,
|
||||
DEFAULT_OBJECT_METADATA_RESERVATION);
|
||||
known_contents = buffer::create(4<<20 /* 4MB */);
|
||||
memset(known_contents.c_str(), 0, known_contents.length());
|
||||
size = 0;
|
||||
|
Loading…
Reference in New Issue
Block a user