Merge pull request #44912 from xxhdx1985126/wip-fixed-kv-btree

crimson/os/seastore: extract fixed kv btree implementation out of lba manager

Reviewed-by: Samuel Just <sjust@redhat.com>
Samuel Just 2022-03-15 12:45:07 -07:00 committed by GitHub
commit 95e7ce9305
18 changed files with 2903 additions and 2696 deletions
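The change extracts the btree machinery from lba_manager/btree into generic, key-parameterized code under crimson/os/seastore/btree, leaving the LBA manager as one instantiation of it. A rough sketch of the resulting shape (the template parameter names here are illustrative; the actual instantiation appears verbatim in btree_lba_manager.h below):

// Generic fixed-layout btree over an integer-like node key type (sketch).
template <typename node_key_t, typename node_val_t,
          typename internal_node_t, typename leaf_node_t,
          size_t node_size>
class FixedKVBtree;

// The LBA tree becomes just one instantiation:
using LBABtree = FixedKVBtree<
  laddr_t, lba_map_val_t, LBAInternalNode,
  LBALeafNode, LBA_BLOCK_SIZE>;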


@@ -11,8 +11,6 @@ set(crimson_seastore_srcs
lba_manager.cc
segment_cleaner.cc
lba_manager/btree/btree_lba_manager.cc
lba_manager/btree/btree_range_pin.cc
lba_manager/btree/lba_btree.cc
lba_manager/btree/lba_btree_node.cc
omap_manager.cc
omap_manager/btree/btree_omap_manager.cc


@@ -0,0 +1,475 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#pragma once
#include <boost/intrusive/set.hpp>
#include "crimson/common/log.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/seastore_types.h"
namespace crimson::os::seastore {
template <typename bound_t>
struct fixed_kv_node_meta_t {
bound_t begin = 0;
bound_t end = 0;
depth_t depth = 0;
bool is_parent_of(const fixed_kv_node_meta_t &other) const {
return (depth == other.depth + 1) &&
(begin <= other.begin) &&
(end > other.begin);
}
std::pair<fixed_kv_node_meta_t, fixed_kv_node_meta_t> split_into(bound_t pivot) const {
return std::make_pair(
fixed_kv_node_meta_t{begin, pivot, depth},
fixed_kv_node_meta_t{pivot, end, depth});
}
static fixed_kv_node_meta_t merge_from(
const fixed_kv_node_meta_t &lhs, const fixed_kv_node_meta_t &rhs) {
ceph_assert(lhs.depth == rhs.depth);
return fixed_kv_node_meta_t{lhs.begin, rhs.end, lhs.depth};
}
static std::pair<fixed_kv_node_meta_t, fixed_kv_node_meta_t>
rebalance(const fixed_kv_node_meta_t &lhs, const fixed_kv_node_meta_t &rhs, bound_t pivot) {
ceph_assert(lhs.depth == rhs.depth);
return std::make_pair(
fixed_kv_node_meta_t{lhs.begin, pivot, lhs.depth},
fixed_kv_node_meta_t{pivot, rhs.end, lhs.depth});
}
bool is_root() const {
return begin == 0 && end == L_ADDR_MAX;
}
};
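// Editor's sketch (illustration only, not part of the original patch):
// split_into() and merge_from() are inverses for adjacent siblings of
// equal depth, and a parent covers a child iff it sits exactly one level
// up and its [begin, end) contains the child's begin.
inline void fixed_kv_node_meta_example() {
  fixed_kv_node_meta_t<laddr_t> meta{0, 100, 1};
  auto [left, right] = meta.split_into(50);   // {0,50,1} and {50,100,1}
  ceph_assert(left.is_parent_of(fixed_kv_node_meta_t<laddr_t>{0, 10, 0}));
  auto merged = fixed_kv_node_meta_t<laddr_t>::merge_from(left, right);
  ceph_assert(merged.begin == 0 && merged.end == 100 && merged.depth == 1);
}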
template <typename bound_t>
inline std::ostream &operator<<(
std::ostream &lhs,
const fixed_kv_node_meta_t<bound_t> &rhs)
{
return lhs << "btree_node_meta_t("
<< "begin=" << rhs.begin
<< ", end=" << rhs.end
<< ", depth=" << rhs.depth
<< ")";
}
/**
* fixed_kv_node_meta_le_t
*
* On disk layout for fixed_kv_node_meta_t
*/
template <typename bound_le_t>
struct fixed_kv_node_meta_le_t {
bound_le_t begin = bound_le_t(0);
bound_le_t end = bound_le_t(0);
depth_le_t depth = init_depth_le(0);
fixed_kv_node_meta_le_t() = default;
fixed_kv_node_meta_le_t(
const fixed_kv_node_meta_le_t<bound_le_t> &) = default;
explicit fixed_kv_node_meta_le_t(
const fixed_kv_node_meta_t<typename bound_le_t::orig_type> &val)
: begin(ceph_le64(val.begin)),
end(ceph_le64(val.end)),
depth(init_depth_le(val.depth)) {}
operator fixed_kv_node_meta_t<typename bound_le_t::orig_type>() const {
return fixed_kv_node_meta_t<typename bound_le_t::orig_type>{
begin, end, depth };
}
};
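// Editor's note (illustration only, not part of the original patch): the
// _le_t mirror exists so the meta can be embedded directly in the on-disk
// node layout. Conversion is a simple round trip, assuming a bound_le_t
// (e.g. a little-endian laddr type) exposing orig_type as the template
// expects:
//
//   fixed_kv_node_meta_t<laddr_t> m{0, 100, 1};
//   fixed_kv_node_meta_le_t<laddr_le_t> le{m};  // host -> LE for disk
//   fixed_kv_node_meta_t<laddr_t> back = le;    // LE -> host on read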
/**
* btree_range_pin_t
*
* Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set
* hook, the fixed_kv_node_meta_t representing the key range covered by a node,
* and extent and ref members intended to hold a reference when the extent
* should be pinned.
*/
template <typename T>
class btree_pin_set_t;
template <typename node_bound_t>
class btree_range_pin_t : public boost::intrusive::set_base_hook<> {
friend class btree_pin_set_t<node_bound_t>;
fixed_kv_node_meta_t<node_bound_t> range;
btree_pin_set_t<node_bound_t> *pins = nullptr;
// We need to be able to remember extent without holding a reference,
// but we can do it more compactly -- TODO
CachedExtent *extent = nullptr;
CachedExtentRef ref;
using index_t = boost::intrusive::set<btree_range_pin_t>;
static auto get_tuple(const fixed_kv_node_meta_t<node_bound_t> &meta) {
return std::make_tuple(-meta.depth, meta.begin);
}
void acquire_ref() {
ref = CachedExtentRef(extent);
}
void drop_ref() {
ref.reset();
}
public:
btree_range_pin_t() = default;
btree_range_pin_t(CachedExtent *extent)
: extent(extent) {}
btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent)
: range(rhs.range), extent(extent) {}
bool has_ref() const {
return !!ref;
}
bool is_root() const {
return range.is_root();
}
void set_range(const fixed_kv_node_meta_t<node_bound_t> &nrange) {
range = nrange;
}
void set_extent(CachedExtent *nextent) {
ceph_assert(!extent);
extent = nextent;
}
CachedExtent &get_extent() {
assert(extent);
return *extent;
}
bool has_ref() {
return !!ref;
}
void take_pin(btree_range_pin_t &other)
{
ceph_assert(other.extent);
if (other.pins) {
other.pins->replace_pin(*this, other);
pins = other.pins;
other.pins = nullptr;
if (other.has_ref()) {
other.drop_ref();
acquire_ref();
}
}
}
friend bool operator<(
const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
return get_tuple(lhs.range) < get_tuple(rhs.range);
}
friend bool operator>(
const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
return get_tuple(lhs.range) > get_tuple(rhs.range);
}
friend bool operator==(
const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
return get_tuple(lhs.range) == get_tuple(rhs.range);
}
struct meta_cmp_t {
bool operator()(
const btree_range_pin_t &lhs, const fixed_kv_node_meta_t<node_bound_t> &rhs) const {
return get_tuple(lhs.range) < get_tuple(rhs);
}
bool operator()(
const fixed_kv_node_meta_t<node_bound_t> &lhs, const btree_range_pin_t &rhs) const {
return get_tuple(lhs) < get_tuple(rhs.range);
}
};
friend std::ostream &operator<<(
std::ostream &lhs,
const btree_range_pin_t<node_bound_t> &rhs) {
return lhs << "btree_range_pin_t("
<< "begin=" << rhs.range.begin
<< ", end=" << rhs.range.end
<< ", depth=" << rhs.range.depth
<< ", extent=" << rhs.extent
<< ")";
}
template <typename>
friend class BtreeNodePin;
~btree_range_pin_t()
{
ceph_assert(!pins == !is_linked());
ceph_assert(!ref);
if (pins) {
crimson::get_logger(ceph_subsys_seastore_lba
).debug("{}: removing {}", __func__, *this);
pins->remove_pin(*this, true);
}
extent = nullptr;
}
};
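// Editor's note (illustration only, not part of the original patch): the
// intrusive set is keyed by the tuple (-depth, begin), so pins of equal
// depth form one contiguous band ordered by begin. maybe_get_parent()
// below relies on exactly that: it bumps depth by one, takes upper_bound,
// and steps back one element, landing on the depth+1 pin with the
// greatest begin <= the child's begin; is_parent_of() then verifies that
// the candidate's range actually covers the child.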
/**
* btree_pin_set_t
*
* Ensures that for every cached node, all parent btree nodes required
* to map it are present in cache. Relocating these nodes can
* therefore be done without further reads or cache space.
*
* Contains a btree_range_pin_t for every clean or dirty btree node
* or LogicalCachedExtent instance in cache at any point in time.
* For any btree node, the contained btree_range_pin_t will hold
* a reference to that node pinning it in cache as long as that
* node has children in the set. This invariant can be violated
* only by calling retire_extent and is repaired by calling
* check_parent synchronously after adding any new extents.
*/
template <typename node_bound_t>
class btree_pin_set_t {
friend class btree_range_pin_t<node_bound_t>;
using pins_t = typename btree_range_pin_t<node_bound_t>::index_t;
pins_t pins;
/// Removes pin from set optionally checking whether parent has other children
void remove_pin(btree_range_pin_t<node_bound_t> &pin, bool do_check_parent)
{
crimson::get_logger(ceph_subsys_seastore_lba).debug("{}: {}", __func__, pin);
ceph_assert(pin.is_linked());
ceph_assert(pin.pins);
ceph_assert(!pin.ref);
pins.erase(pin);
pin.pins = nullptr;
if (do_check_parent) {
check_parent(pin);
}
}
void replace_pin(
btree_range_pin_t<node_bound_t> &to,
btree_range_pin_t<node_bound_t> &from)
{
pins.replace_node(pins.iterator_to(from), to);
}
/// Returns parent pin if exists
btree_range_pin_t<node_bound_t> *maybe_get_parent(
const fixed_kv_node_meta_t<node_bound_t> &meta)
{
auto cmeta = meta;
cmeta.depth++;
auto iter = pins.upper_bound(
cmeta,
typename btree_range_pin_t<node_bound_t>::meta_cmp_t());
if (iter == pins.begin()) {
return nullptr;
} else {
--iter;
if (iter->range.is_parent_of(meta)) {
return &*iter;
} else {
return nullptr;
}
}
}
/// Returns earliest child pin if exist
const btree_range_pin_t<node_bound_t>
*maybe_get_first_child(const fixed_kv_node_meta_t<node_bound_t> &meta) const
{
if (meta.depth == 0) {
return nullptr;
}
auto cmeta = meta;
cmeta.depth--;
auto iter = pins.lower_bound(
cmeta,
typename btree_range_pin_t<node_bound_t>::meta_cmp_t());
if (iter == pins.end()) {
return nullptr;
} else if (meta.is_parent_of(iter->range)) {
return &*iter;
} else {
return nullptr;
}
}
/// Releases pin if it has no children
void release_if_no_children(btree_range_pin_t<node_bound_t> &pin)
{
ceph_assert(pin.is_linked());
if (maybe_get_first_child(pin.range) == nullptr) {
pin.drop_ref();
}
}
public:
/// Adds pin to set, assumes set is consistent
void add_pin(btree_range_pin_t<node_bound_t> &pin)
{
ceph_assert(!pin.is_linked());
ceph_assert(!pin.pins);
ceph_assert(!pin.ref);
auto [prev, inserted] = pins.insert(pin);
if (!inserted) {
crimson::get_logger(ceph_subsys_seastore_lba).error(
"{}: unable to add {} ({}), found {} ({})",
__func__,
pin,
*(pin.extent),
*prev,
*(prev->extent));
ceph_assert(0 == "impossible");
return;
}
pin.pins = this;
if (!pin.is_root()) {
auto *parent = maybe_get_parent(pin.range);
ceph_assert(parent);
if (!parent->has_ref()) {
crimson::get_logger(ceph_subsys_seastore_lba
).debug("{}: acquiring parent {}", __func__,
static_cast<void*>(parent));
parent->acquire_ref();
} else {
crimson::get_logger(ceph_subsys_seastore_lba).debug(
"{}: parent has ref {}", __func__,
static_cast<void*>(parent));
}
}
if (maybe_get_first_child(pin.range) != nullptr) {
crimson::get_logger(ceph_subsys_seastore_lba).debug(
"{}: acquiring self {}", __func__, pin);
pin.acquire_ref();
}
}
/**
* retire/check_parent
*
* See BtreeLBAManager::complete_transaction.
* retire removes the specified pin from the set, but does not
* check parents. After any new extents are added to the set,
* the caller is required to call check_parent to restore the
* invariant.
*/
void retire(btree_range_pin_t<node_bound_t> &pin)
{
pin.drop_ref();
remove_pin(pin, false);
}
void check_parent(btree_range_pin_t<node_bound_t> &pin)
{
auto parent = maybe_get_parent(pin.range);
if (parent) {
crimson::get_logger(ceph_subsys_seastore_lba
).debug("{}: releasing parent {}", __func__, *parent);
release_if_no_children(*parent);
}
}
template <typename F>
void scan(F &&f) {
for (auto &i : pins) {
std::invoke(f, i);
}
}
~btree_pin_set_t() {
ceph_assert(pins.empty());
}
};
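// Editor's sketch (hypothetical usage, not part of the original patch):
// the retire/check_parent contract during transaction completion.
//
//   pin_set.retire(get_pin(*retired_extent)); // drop ref, skip parent check
//   // ...add pins for all freshly committed extents...
//   pin_set.add_pin(get_pin(*fresh_extent));
//   // ...then repair the invariant for each pin touched:
//   pin_set.check_parent(get_pin(*fresh_extent));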
template <typename key_t>
class BtreeNodePin : public PhysicalNodePin<key_t> {
/**
* parent
*
* populated until link_extent is called to ensure cache residence
* until add_pin is called.
*/
CachedExtentRef parent;
paddr_t paddr;
btree_range_pin_t<key_t> pin;
public:
BtreeNodePin() = default;
BtreeNodePin(
CachedExtentRef parent,
paddr_t paddr,
fixed_kv_node_meta_t<key_t> &&meta)
: parent(parent), paddr(paddr) {
pin.set_range(std::move(meta));
}
btree_range_pin_t<key_t>& get_range_pin() {
return pin;
}
CachedExtentRef get_parent() {
return parent;
}
void set_parent(CachedExtentRef pin) {
parent = pin;
}
void link_extent(LogicalCachedExtent *ref) final {
pin.set_extent(ref);
}
extent_len_t get_length() const final {
ceph_assert(pin.range.end > pin.range.begin);
return pin.range.end - pin.range.begin;
}
paddr_t get_paddr() const final {
return paddr;
}
key_t get_key() const final {
return pin.range.begin;
}
PhysicalNodePinRef<key_t> duplicate() const final {
auto ret = std::unique_ptr<BtreeNodePin<key_t>>(
new BtreeNodePin<key_t>);
ret->pin.set_range(pin.range);
ret->paddr = paddr;
ret->parent = parent;
return ret;
}
void take_pin(PhysicalNodePin<key_t> &opin) final {
pin.take_pin(static_cast<BtreeNodePin<key_t>&>(opin).pin);
}
bool has_been_invalidated() const final {
return parent->has_been_invalidated();
}
};
}

File diff suppressed because it is too large


@@ -0,0 +1,448 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#pragma once
#include <sys/mman.h>
#include <memory>
#include <string.h>
#include "include/buffer.h"
#include "crimson/common/fixed_kv_node_layout.h"
#include "crimson/common/errorator.h"
#include "crimson/os/seastore/lba_manager.h"
#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/btree/btree_range_pin.h"
#include "crimson/os/seastore/btree/fixed_kv_btree.h"
namespace crimson::os::seastore {
/**
* FixedKVNode
*
* Base class enabling recursive lookup between internal and leaf nodes.
*/
template <typename node_key_t>
struct FixedKVNode : CachedExtent {
using FixedKVNodeRef = TCachedExtentRef<FixedKVNode>;
btree_range_pin_t<node_key_t> pin;
FixedKVNode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {}
FixedKVNode(const FixedKVNode &rhs)
: CachedExtent(rhs), pin(rhs.pin, this) {}
virtual fixed_kv_node_meta_t<node_key_t> get_node_meta() const = 0;
virtual ~FixedKVNode() = default;
void on_delta_write(paddr_t record_block_offset) final {
// All in-memory relative addrs are necessarily record-relative
assert(get_prior_instance());
pin.take_pin(get_prior_instance()->template cast<FixedKVNode>()->pin);
resolve_relative_addrs(record_block_offset);
}
void on_initial_write() final {
// All in-memory relative addrs are necessarily block-relative
resolve_relative_addrs(get_paddr());
}
void on_clean_read() final {
// From initial write of block, relative addrs are necessarily block-relative
resolve_relative_addrs(get_paddr());
}
virtual void resolve_relative_addrs(paddr_t base) = 0;
};
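// Editor's note (illustration only, not part of the original patch): the
// three hooks above differ only in the base handed to
// resolve_relative_addrs(). A freshly written node holds block-relative
// addrs, so on_initial_write() and on_clean_read() resolve against the
// node's own paddr; a mutated node's delta lands inside a journal record,
// so on_delta_write() resolves against the record's base. Either way a
// relative value such as +0x1000 becomes base + 0x1000 via
// paddr_t::add_relative().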
/**
* FixedKVInternalNode
*
* Abstracts operations on and layout of internal nodes for the
* LBA Tree.
*/
template <
size_t CAPACITY,
typename NODE_KEY,
typename NODE_KEY_LE,
size_t node_size,
typename node_type_t>
struct FixedKVInternalNode
: FixedKVNode<NODE_KEY>,
common::FixedKVNodeLayout<
CAPACITY,
fixed_kv_node_meta_t<NODE_KEY>,
fixed_kv_node_meta_le_t<NODE_KEY_LE>,
NODE_KEY, NODE_KEY_LE,
paddr_t, paddr_le_t> {
using Ref = TCachedExtentRef<node_type_t>;
using node_layout_t =
common::FixedKVNodeLayout<
CAPACITY,
fixed_kv_node_meta_t<NODE_KEY>,
fixed_kv_node_meta_le_t<NODE_KEY_LE>,
NODE_KEY,
NODE_KEY_LE,
paddr_t,
paddr_le_t>;
using internal_const_iterator_t = typename node_layout_t::const_iterator;
using internal_iterator_t = typename node_layout_t::iterator;
template <typename... T>
FixedKVInternalNode(T&&... t) :
FixedKVNode<NODE_KEY>(std::forward<T>(t)...),
node_layout_t(this->get_bptr().c_str()) {}
virtual ~FixedKVInternalNode() {}
fixed_kv_node_meta_t<NODE_KEY> get_node_meta() const {
return this->get_meta();
}
typename node_layout_t::delta_buffer_t delta_buffer;
typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() {
return this->is_mutation_pending()
? &delta_buffer : nullptr;
}
CachedExtentRef duplicate_for_write() override {
assert(delta_buffer.empty());
return CachedExtentRef(new node_type_t(*this));
};
void update(
internal_const_iterator_t iter,
paddr_t addr) {
return this->journal_update(
iter,
this->maybe_generate_relative(addr),
maybe_get_delta_buffer());
}
void insert(
internal_const_iterator_t iter,
NODE_KEY pivot,
paddr_t addr) {
return this->journal_insert(
iter,
pivot,
this->maybe_generate_relative(addr),
maybe_get_delta_buffer());
}
void remove(internal_const_iterator_t iter) {
return this->journal_remove(
iter,
maybe_get_delta_buffer());
}
void replace(
internal_const_iterator_t iter,
NODE_KEY pivot,
paddr_t addr) {
return this->journal_replace(
iter,
pivot,
this->maybe_generate_relative(addr),
maybe_get_delta_buffer());
}
std::tuple<Ref, Ref, NODE_KEY>
make_split_children(op_context_t<NODE_KEY> c) {
auto left = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
auto right = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
auto pivot = this->split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
return std::make_tuple(
left,
right,
pivot);
}
Ref make_full_merge(
op_context_t<NODE_KEY> c,
Ref &right) {
auto replacement = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
replacement->merge_from(*this, *right->template cast<node_type_t>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
}
std::tuple<Ref, Ref, NODE_KEY>
make_balanced(
op_context_t<NODE_KEY> c,
Ref &_right,
bool prefer_left) {
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
auto pivot = this->balance_into_new_nodes(
*this,
right,
prefer_left,
*replacement_left,
*replacement_right);
replacement_left->pin.set_range(replacement_left->get_meta());
replacement_right->pin.set_range(replacement_right->get_meta());
return std::make_tuple(
replacement_left,
replacement_right,
pivot);
}
/**
* Internal relative addresses on read or in memory prior to commit
* are either record or block relative depending on whether this
* physical node is_initial_pending() or just is_pending().
*
* User passes appropriate base depending on lifecycle and
* resolve_relative_addrs fixes up relative internal references
* based on base.
*/
void resolve_relative_addrs(paddr_t base)
{
LOG_PREFIX(FixedKVInternalNode::resolve_relative_addrs);
for (auto i: *this) {
if (i->get_val().is_relative()) {
auto updated = base.add_relative(i->get_val());
SUBTRACE(seastore_lba_details, "{} -> {}", i->get_val(), updated);
i->set_val(updated);
}
}
}
void node_resolve_vals(
internal_iterator_t from,
internal_iterator_t to) const {
if (this->is_initial_pending()) {
for (auto i = from; i != to; ++i) {
if (i->get_val().is_relative()) {
assert(i->get_val().is_block_relative());
i->set_val(this->get_paddr().add_relative(i->get_val()));
}
}
}
}
void node_unresolve_vals(
internal_iterator_t from,
internal_iterator_t to) const {
if (this->is_initial_pending()) {
for (auto i = from; i != to; ++i) {
if (i->get_val().is_relative()) {
assert(i->get_val().is_record_relative());
i->set_val(i->get_val() - this->get_paddr());
}
}
}
}
std::ostream &print_detail(std::ostream &out) const
{
return out << ", size=" << this->get_size()
<< ", meta=" << this->get_meta();
}
ceph::bufferlist get_delta() {
ceph::buffer::ptr bptr(delta_buffer.get_bytes());
delta_buffer.copy_out(bptr.c_str(), bptr.length());
ceph::bufferlist bl;
bl.push_back(bptr);
return bl;
}
void apply_delta_and_adjust_crc(
paddr_t base, const ceph::bufferlist &_bl) {
assert(_bl.length());
ceph::bufferlist bl = _bl;
bl.rebuild();
typename node_layout_t::delta_buffer_t buffer;
buffer.copy_in(bl.front().c_str(), bl.front().length());
buffer.replay(*this);
this->set_last_committed_crc(this->get_crc32c());
resolve_relative_addrs(base);
}
constexpr static size_t get_min_capacity() {
return (node_layout_t::get_capacity() - 1) / 2;
}
bool at_max_capacity() const {
assert(this->get_size() <= node_layout_t::get_capacity());
return this->get_size() == node_layout_t::get_capacity();
}
bool at_min_capacity() const {
assert(this->get_size() >= (get_min_capacity() - 1));
return this->get_size() <= get_min_capacity();
}
bool below_min_capacity() const {
assert(this->get_size() >= (get_min_capacity() - 1));
return this->get_size() < get_min_capacity();
}
};
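// Editor's note (worked example, not part of the original patch): with
// get_min_capacity() == (capacity - 1) / 2, a node with CAPACITY == 255
// must keep at least 127 entries. at_min_capacity() (size <= 127) marks
// it as a merge/rebalance candidate, below_min_capacity() (size < 127)
// means the invariant is violated, and the asserts tolerate a dip to 126,
// presumably only transiently while a removal is applied.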
template <
size_t CAPACITY,
typename NODE_KEY,
typename NODE_KEY_LE,
typename VAL,
typename VAL_LE,
size_t node_size,
typename node_type_t>
struct FixedKVLeafNode
: FixedKVNode<NODE_KEY>,
common::FixedKVNodeLayout<
CAPACITY,
fixed_kv_node_meta_t<NODE_KEY>,
fixed_kv_node_meta_le_t<NODE_KEY_LE>,
NODE_KEY, NODE_KEY_LE,
VAL, VAL_LE> {
using Ref = TCachedExtentRef<node_type_t>;
using node_layout_t =
common::FixedKVNodeLayout<
CAPACITY,
fixed_kv_node_meta_t<NODE_KEY>,
fixed_kv_node_meta_le_t<NODE_KEY_LE>,
NODE_KEY,
NODE_KEY_LE,
VAL,
VAL_LE>;
using internal_const_iterator_t = typename node_layout_t::const_iterator;
template <typename... T>
FixedKVLeafNode(T&&... t) :
FixedKVNode<NODE_KEY>(std::forward<T>(t)...),
node_layout_t(this->get_bptr().c_str()) {}
virtual ~FixedKVLeafNode() {}
fixed_kv_node_meta_t<NODE_KEY> get_node_meta() const {
return this->get_meta();
}
typename node_layout_t::delta_buffer_t delta_buffer;
virtual typename node_layout_t::delta_buffer_t *maybe_get_delta_buffer() {
return this->is_mutation_pending() ? &delta_buffer : nullptr;
}
CachedExtentRef duplicate_for_write() override {
assert(delta_buffer.empty());
return CachedExtentRef(new node_type_t(*this));
};
virtual void update(
internal_const_iterator_t iter,
VAL val) = 0;
virtual internal_const_iterator_t insert(
internal_const_iterator_t iter,
NODE_KEY addr,
VAL val) = 0;
virtual void remove(internal_const_iterator_t iter) = 0;
std::tuple<Ref, Ref, NODE_KEY>
make_split_children(op_context_t<NODE_KEY> c) {
auto left = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
auto right = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
auto pivot = this->split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
return std::make_tuple(
left,
right,
pivot);
}
Ref make_full_merge(
op_context_t<NODE_KEY> c,
Ref &right) {
auto replacement = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
replacement->merge_from(*this, *right->template cast<node_type_t>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
}
std::tuple<Ref, Ref, NODE_KEY>
make_balanced(
op_context_t<NODE_KEY> c,
Ref &_right,
bool prefer_left) {
ceph_assert(_right->get_type() == this->get_type());
auto &right = *_right->template cast<node_type_t>();
auto replacement_left = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
auto replacement_right = c.cache.template alloc_new_extent<node_type_t>(
c.trans, node_size);
auto pivot = this->balance_into_new_nodes(
*this,
right,
prefer_left,
*replacement_left,
*replacement_right);
replacement_left->pin.set_range(replacement_left->get_meta());
replacement_right->pin.set_range(replacement_right->get_meta());
return std::make_tuple(
replacement_left,
replacement_right,
pivot);
}
ceph::bufferlist get_delta() {
ceph::buffer::ptr bptr(delta_buffer.get_bytes());
delta_buffer.copy_out(bptr.c_str(), bptr.length());
ceph::bufferlist bl;
bl.push_back(bptr);
return bl;
}
void apply_delta_and_adjust_crc(
paddr_t base, const ceph::bufferlist &_bl) {
assert(_bl.length());
ceph::bufferlist bl = _bl;
bl.rebuild();
typename node_layout_t::delta_buffer_t buffer;
buffer.copy_in(bl.front().c_str(), bl.front().length());
buffer.replay(*this);
this->set_last_committed_crc(this->get_crc32c());
this->resolve_relative_addrs(base);
}
constexpr static size_t get_min_capacity() {
return (node_layout_t::get_capacity() - 1) / 2;
}
bool at_max_capacity() const {
assert(this->get_size() <= node_layout_t::get_capacity());
return this->get_size() == node_layout_t::get_capacity();
}
bool at_min_capacity() const {
assert(this->get_size() >= (get_min_capacity() - 1));
return this->get_size() <= get_min_capacity();
}
bool below_min_capacity() const {
assert(this->get_size() >= (get_min_capacity() - 1));
return this->get_size() < get_min_capacity();
}
};
} // namespace crimson::os::seastore
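Both node flavours share the same delta discipline: mutators journal into delta_buffer only while the extent is mutation-pending, get_delta() snapshots that buffer for the journal record, and replay feeds it back through the layout. A minimal sketch of the replay side (the helper name and ref type are hypothetical; only apply_delta_and_adjust_crc comes from the header above):

// Sketch: re-apply a journaled delta to a node re-read from disk.
template <typename NodeRef>
void replay_node_delta(
  NodeRef node,                   // node as last written to disk
  paddr_t record_base,            // base paddr of the record holding the delta
  const ceph::bufferlist &delta)  // blob previously produced by get_delta()
{
  // copies the delta in, replays each journaled op against the layout,
  // refreshes the committed crc, then resolves record-relative addrs
  node->apply_delta_and_adjust_crc(record_base, delta);
}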


@@ -81,7 +81,7 @@ std::ostream &LogicalCachedExtent::print_detail(std::ostream &out) const
std::ostream &operator<<(std::ostream &out, const LBAPin &rhs)
{
return out << "LBAPin(" << rhs.get_laddr() << "~" << rhs.get_length()
return out << "LBAPin(" << rhs.get_key() << "~" << rhs.get_length()
<< "->" << rhs.get_paddr();
}


@@ -666,20 +666,30 @@ private:
};
class LogicalCachedExtent;
class LBAPin;
using LBAPinRef = std::unique_ptr<LBAPin>;
class LBAPin {
template <typename key_t>
class PhysicalNodePin;
template <typename key_t>
using PhysicalNodePinRef = std::unique_ptr<PhysicalNodePin<key_t>>;
template <typename key_t>
class PhysicalNodePin {
public:
virtual void link_extent(LogicalCachedExtent *ref) = 0;
virtual void take_pin(LBAPin &pin) = 0;
virtual void take_pin(PhysicalNodePin<key_t> &pin) = 0;
virtual extent_len_t get_length() const = 0;
virtual paddr_t get_paddr() const = 0;
virtual laddr_t get_laddr() const = 0;
virtual LBAPinRef duplicate() const = 0;
virtual key_t get_key() const = 0;
virtual PhysicalNodePinRef<key_t> duplicate() const = 0;
virtual bool has_been_invalidated() const = 0;
virtual ~LBAPin() {}
virtual ~PhysicalNodePin() {}
};
using LBAPin = PhysicalNodePin<laddr_t>;
using LBAPinRef = PhysicalNodePinRef<laddr_t>;
std::ostream &operator<<(std::ostream &out, const LBAPin &rhs);
using lba_pin_list_t = std::list<LBAPinRef>;
@@ -756,7 +766,7 @@ public:
void set_pin(LBAPinRef &&npin) {
assert(!pin);
pin = std::move(npin);
laddr = pin->get_laddr();
laddr = pin->get_key();
pin->link_extent(this);
}


@@ -9,7 +9,6 @@
#include "include/buffer.h"
#include "crimson/os/seastore/lba_manager/btree/btree_lba_manager.h"
#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
#include "crimson/os/seastore/lba_manager/btree/lba_btree.h"
#include "crimson/os/seastore/logging.h"
SET_SUBSYS(seastore_lba);
@@ -20,6 +19,22 @@ SET_SUBSYS(seastore_lba);
* - TRACE: read operations, DEBUG details
*/
namespace crimson::os::seastore {
template<>
Transaction::tree_stats_t& get_tree_stats<
crimson::os::seastore::lba_manager::btree::LBABtree>(Transaction &t) {
return t.get_lba_tree_stats();
}
template<>
phy_tree_root_t& get_phy_tree_root<
crimson::os::seastore::lba_manager::btree::LBABtree>(root_t &r) {
return r.lba_root;
}
}
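The specializations above are the hooks through which the generic tree locates its per-transaction stats and its root within root_t. The with_btree* helpers themselves are now free functions templated on the tree type and taking the cache explicitly (the old private members are deleted from BtreeLBAManager further down); a sketch of the typical call shape, condensed from get_mappings below:

return with_btree_state<LBABtree, lba_pin_list_t>(
  cache,
  c,
  [c, offset, length](auto &btree, auto &ret) {
    // walk [offset, offset + length), appending one pin per mapping to ret
    return LBABtree::iterate_repeat(/* see get_mappings below */);
  });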
namespace crimson::os::seastore::lba_manager::btree {
BtreeLBAManager::mkfs_ret BtreeLBAManager::mkfs(
@ -46,7 +61,8 @@ BtreeLBAManager::get_mappings(
LOG_PREFIX(BtreeLBAManager::get_mappings);
TRACET("{}~{}", t, offset, length);
auto c = get_context(t);
return with_btree_state<lba_pin_list_t>(
return with_btree_state<LBABtree, lba_pin_list_t>(
cache,
c,
[c, offset, length, FNAME](auto &btree, auto &ret) {
return LBABtree::iterate_repeat(
@ -104,7 +120,8 @@ BtreeLBAManager::get_mapping(
LOG_PREFIX(BtreeLBAManager::get_mapping);
TRACET("{}", t, offset);
auto c = get_context(t);
return with_btree_ret<LBAPinRef>(
return with_btree_ret<LBABtree, LBAPinRef>(
cache,
c,
[FNAME, c, offset](auto &btree) {
return btree.lower_bound(
@ -147,7 +164,8 @@ BtreeLBAManager::alloc_extent(
auto c = get_context(t);
++stats.num_alloc_extents;
auto lookup_attempts = stats.num_alloc_extents_iter_nexts;
return with_btree_state<state_t>(
return crimson::os::seastore::with_btree_state<LBABtree, state_t>(
cache,
c,
hint,
[this, FNAME, c, hint, len, addr, lookup_attempts, &t](auto &btree, auto &state) {
@@ -210,13 +228,13 @@ static bool is_lba_node(const CachedExtent &e)
return is_lba_node(e.get_type());
}
btree_range_pin_t &BtreeLBAManager::get_pin(CachedExtent &e)
btree_range_pin_t<laddr_t> &BtreeLBAManager::get_pin(CachedExtent &e)
{
if (is_lba_node(e)) {
return e.cast<LBANode>()->pin;
} else if (e.is_logical()) {
return static_cast<BtreeLBAPin &>(
e.cast<LogicalCachedExtent>()->get_pin()).pin;
e.cast<LogicalCachedExtent>()->get_pin()).get_range_pin();
} else {
ceph_abort_msg("impossible");
}
@@ -280,23 +298,57 @@ void BtreeLBAManager::complete_transaction(
}
}
BtreeLBAManager::base_iertr::future<> _init_cached_extent(
op_context_t<laddr_t> c,
const CachedExtentRef &e,
LBABtree &btree,
bool &ret)
{
if (e->is_logical()) {
auto logn = e->cast<LogicalCachedExtent>();
return btree.lower_bound(
c,
logn->get_laddr()
).si_then([e, c, logn, &ret](auto iter) {
LOG_PREFIX(BtreeLBAManager::init_cached_extent);
if (!iter.is_end() &&
iter.get_key() == logn->get_laddr() &&
iter.get_val().paddr == logn->get_paddr()) {
logn->set_pin(iter.get_pin());
ceph_assert(iter.get_val().len == e->get_length());
if (c.pins) {
c.pins->add_pin(
static_cast<BtreeLBAPin&>(logn->get_pin()).get_range_pin());
}
DEBUGT("logical extent {} live", c.trans, *logn);
ret = true;
} else {
DEBUGT("logical extent {} not live", c.trans, *logn);
ret = false;
}
});
} else {
return btree.init_cached_extent(c, e
).si_then([&ret](bool is_alive) {
ret = is_alive;
});
}
}
BtreeLBAManager::init_cached_extent_ret BtreeLBAManager::init_cached_extent(
Transaction &t,
CachedExtentRef e)
{
LOG_PREFIX(BtreeLBAManager::init_cached_extent);
TRACET("{}", t, *e);
return seastar::do_with(bool(), [this, e, FNAME, &t](bool& ret) {
return seastar::do_with(bool(), [this, e, &t](bool &ret) {
auto c = get_context(t);
return with_btree(c, [c, e, &ret](auto &btree) {
return btree.init_cached_extent(c, e
).si_then([&ret](bool is_alive) {
ret = is_alive;
});
}).si_then([&ret, e, FNAME, c] {
DEBUGT("is_alive={} -- {}", c.trans, ret, *e);
return ret;
});
return with_btree<LBABtree>(cache, c, [c, e, &ret](auto &btree)
-> base_iertr::future<> {
LOG_PREFIX(BtreeLBAManager::init_cached_extent);
DEBUGT("extent {}", c.trans, *e);
return _init_cached_extent(c, e, btree, ret);
}).si_then([&ret] { return ret; });
});
}
@@ -310,7 +362,8 @@ BtreeLBAManager::scan_mappings_ret BtreeLBAManager::scan_mappings(
DEBUGT("begin: {}, end: {}", t, begin, end);
auto c = get_context(t);
return with_btree(
return with_btree<LBABtree>(
cache,
c,
[c, f=std::move(f), begin, end](auto &btree) mutable {
return LBABtree::iterate_repeat(
@@ -341,7 +394,8 @@ BtreeLBAManager::scan_mapped_space_ret BtreeLBAManager::scan_mapped_space(
return seastar::do_with(
std::move(f),
[this, c](auto &visitor) {
return with_btree(
return with_btree<LBABtree>(
cache,
c,
[c, &visitor](auto &btree) {
return LBABtree::iterate_repeat(
@@ -377,10 +431,11 @@ BtreeLBAManager::rewrite_extent_ret BtreeLBAManager::rewrite_extent(
if (is_lba_node(*extent)) {
DEBUGT("rewriting lba extent -- {}", t, *extent);
auto c = get_context(t);
return with_btree(
return with_btree<LBABtree>(
cache,
c,
[c, extent](auto &btree) mutable {
return btree.rewrite_lba_extent(c, extent);
return btree.rewrite_extent(c, extent);
});
} else {
DEBUGT("skip non lba extent -- {}", t, *extent);
@@ -433,7 +488,8 @@ BtreeLBAManager::get_physical_extent_if_live(
t, type, laddr, addr, len);
ceph_assert(is_lba_node(type));
auto c = get_context(t);
return with_btree_ret<CachedExtentRef>(
return with_btree_ret<LBABtree, CachedExtentRef>(
cache,
c,
[c, type, addr, laddr, len](auto &btree) {
if (type == extent_types_t::LADDR_INTERNAL) {
@@ -509,7 +565,8 @@ BtreeLBAManager::_update_mapping_ret BtreeLBAManager::_update_mapping(
update_func_t &&f)
{
auto c = get_context(t);
return with_btree_ret<lba_map_val_t>(
return with_btree_ret<LBABtree, lba_map_val_t>(
cache,
c,
[f=std::move(f), c, addr](auto &btree) mutable {
return btree.lower_bound(


@ -15,16 +15,23 @@
#include "common/interval_map.h"
#include "crimson/osd/exceptions.h"
#include "crimson/os/seastore/btree/fixed_kv_btree.h"
#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/lba_manager.h"
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/segment_manager.h"
#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
#include "crimson/os/seastore/lba_manager/btree/lba_btree.h"
#include "crimson/os/seastore/btree/btree_range_pin.h"
namespace crimson::os::seastore::lba_manager::btree {
using LBABtree = FixedKVBtree<
laddr_t, lba_map_val_t, LBAInternalNode,
LBALeafNode, LBA_BLOCK_SIZE>;
using BtreeLBAPin = BtreeNodePin<laddr_t>;
/**
* BtreeLBAManager
*
@@ -84,6 +91,14 @@ public:
void complete_transaction(
Transaction &t) final;
/**
* init_cached_extent
*
* Checks whether e is live (reachable from lba tree) and drops or initializes
* accordingly.
*
* Returns whether e is live.
*/
init_cached_extent_ret init_cached_extent(
Transaction &t,
CachedExtentRef e) final;
@@ -117,8 +132,8 @@ public:
void add_pin(LBAPin &pin) final {
auto *bpin = reinterpret_cast<BtreeLBAPin*>(&pin);
pin_set.add_pin(bpin->pin);
bpin->parent = nullptr;
pin_set.add_pin(bpin->get_range_pin());
bpin->set_parent(nullptr);
}
~BtreeLBAManager();
@@ -126,84 +141,21 @@ private:
SegmentManager &segment_manager;
Cache &cache;
btree_pin_set_t pin_set;
btree_pin_set_t<laddr_t> pin_set;
struct {
uint64_t num_alloc_extents = 0;
uint64_t num_alloc_extents_iter_nexts = 0;
} stats;
op_context_t get_context(Transaction &t) {
return op_context_t{cache, t, &pin_set};
op_context_t<laddr_t> get_context(Transaction &t) {
return op_context_t<laddr_t>{cache, t, &pin_set};
}
static btree_range_pin_t &get_pin(CachedExtent &e);
static btree_range_pin_t<laddr_t> &get_pin(CachedExtent &e);
seastar::metrics::metric_group metrics;
void register_metrics();
template <typename F, typename... Args>
auto with_btree(
op_context_t c,
F &&f) {
return cache.get_root(
c.trans
).si_then([this, c, f=std::forward<F>(f)](RootBlockRef croot) mutable {
return seastar::do_with(
LBABtree(croot->get_root().lba_root),
[this, c, croot, f=std::move(f)](auto &btree) mutable {
return f(
btree
).si_then([this, c, croot, &btree] {
if (btree.is_root_dirty()) {
auto mut_croot = cache.duplicate_for_write(
c.trans, croot
)->cast<RootBlock>();
mut_croot->get_root().lba_root = btree.get_root_undirty();
}
return base_iertr::now();
});
});
});
}
template <typename State, typename F>
auto with_btree_state(
op_context_t c,
State &&init,
F &&f) {
return seastar::do_with(
std::forward<State>(init),
[this, c, f=std::forward<F>(f)](auto &state) mutable {
(void)this; // silence incorrect clang warning about capture
return with_btree(c, [&state, f=std::move(f)](auto &btree) mutable {
return f(btree, state);
}).si_then([&state] {
return seastar::make_ready_future<State>(std::move(state));
});
});
}
template <typename State, typename F>
auto with_btree_state(
op_context_t c,
F &&f) {
return with_btree_state<State, F>(c, State{}, std::forward<F>(f));
}
template <typename Ret, typename F>
auto with_btree_ret(
op_context_t c,
F &&f) {
return with_btree_state<Ret>(
c,
[f=std::forward<F>(f)](auto &btree, auto &ret) mutable {
return f(
btree
).si_then([&ret](auto &&_ret) {
ret = std::move(_ret);
});
});
}
/**
* update_refcount


@@ -1,155 +0,0 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
#include "crimson/os/seastore/logging.h"
SET_SUBSYS(seastore_lba);
namespace crimson::os::seastore::lba_manager::btree {
void btree_range_pin_t::take_pin(btree_range_pin_t &other)
{
ceph_assert(other.extent);
if (other.pins) {
other.pins->replace_pin(*this, other);
pins = other.pins;
other.pins = nullptr;
if (other.has_ref()) {
other.drop_ref();
acquire_ref();
}
}
}
btree_range_pin_t::~btree_range_pin_t()
{
LOG_PREFIX(btree_range_pin_t::~btree_range_pin_t);
ceph_assert(!pins == !is_linked());
ceph_assert(!ref);
if (pins) {
TRACE("removing {}", *this);
pins->remove_pin(*this, true);
}
extent = nullptr;
}
void btree_pin_set_t::replace_pin(btree_range_pin_t &to, btree_range_pin_t &from)
{
pins.replace_node(pins.iterator_to(from), to);
}
void btree_pin_set_t::remove_pin(btree_range_pin_t &pin, bool do_check_parent)
{
LOG_PREFIX(btree_pin_set_t::remove_pin);
TRACE("{}", pin);
ceph_assert(pin.is_linked());
ceph_assert(pin.pins);
ceph_assert(!pin.ref);
pins.erase(pin);
pin.pins = nullptr;
if (do_check_parent) {
check_parent(pin);
}
}
btree_range_pin_t *btree_pin_set_t::maybe_get_parent(
const lba_node_meta_t &meta)
{
auto cmeta = meta;
cmeta.depth++;
auto iter = pins.upper_bound(cmeta, btree_range_pin_t::meta_cmp_t());
if (iter == pins.begin()) {
return nullptr;
} else {
--iter;
if (iter->range.is_parent_of(meta)) {
return &*iter;
} else {
return nullptr;
}
}
}
const btree_range_pin_t *btree_pin_set_t::maybe_get_first_child(
const lba_node_meta_t &meta) const
{
if (meta.depth == 0) {
return nullptr;
}
auto cmeta = meta;
cmeta.depth--;
auto iter = pins.lower_bound(cmeta, btree_range_pin_t::meta_cmp_t());
if (iter == pins.end()) {
return nullptr;
} else if (meta.is_parent_of(iter->range)) {
return &*iter;
} else {
return nullptr;
}
}
void btree_pin_set_t::release_if_no_children(btree_range_pin_t &pin)
{
ceph_assert(pin.is_linked());
if (maybe_get_first_child(pin.range) == nullptr) {
pin.drop_ref();
}
}
void btree_pin_set_t::add_pin(btree_range_pin_t &pin)
{
LOG_PREFIX(btree_pin_set_t::add_pin);
ceph_assert(!pin.is_linked());
ceph_assert(!pin.pins);
ceph_assert(!pin.ref);
auto [prev, inserted] = pins.insert(pin);
if (!inserted) {
ERROR("unable to add {} ({}), found {} ({})",
pin,
*(pin.extent),
*prev,
*(prev->extent));
ceph_assert(0 == "impossible");
return;
}
pin.pins = this;
if (!pin.is_root()) {
auto *parent = maybe_get_parent(pin.range);
ceph_assert(parent);
if (!parent->has_ref()) {
TRACE("acquiring parent {}", static_cast<void*>(parent));
parent->acquire_ref();
} else {
TRACE("parent has ref {}", static_cast<void*>(parent));
}
}
if (maybe_get_first_child(pin.range) != nullptr) {
TRACE("acquiring self {}", pin);
pin.acquire_ref();
}
}
void btree_pin_set_t::retire(btree_range_pin_t &pin)
{
pin.drop_ref();
remove_pin(pin, false);
}
void btree_pin_set_t::check_parent(btree_range_pin_t &pin)
{
LOG_PREFIX(btree_pin_set_t::check_parent);
auto parent = maybe_get_parent(pin.range);
if (parent) {
TRACE("releasing parent {}", *parent);
release_if_no_children(*parent);
}
}
}


@@ -1,292 +0,0 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#pragma once
#include <boost/intrusive/set.hpp>
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/seastore_types.h"
namespace crimson::os::seastore::lba_manager::btree {
class LBANode;
using LBANodeRef = TCachedExtentRef<LBANode>;
struct lba_node_meta_t {
laddr_t begin = 0;
laddr_t end = 0;
depth_t depth = 0;
bool is_parent_of(const lba_node_meta_t &other) const {
return (depth == other.depth + 1) &&
(begin <= other.begin) &&
(end > other.begin);
}
std::pair<lba_node_meta_t, lba_node_meta_t> split_into(laddr_t pivot) const {
return std::make_pair(
lba_node_meta_t{begin, pivot, depth},
lba_node_meta_t{pivot, end, depth});
}
static lba_node_meta_t merge_from(
const lba_node_meta_t &lhs, const lba_node_meta_t &rhs) {
ceph_assert(lhs.depth == rhs.depth);
return lba_node_meta_t{lhs.begin, rhs.end, lhs.depth};
}
static std::pair<lba_node_meta_t, lba_node_meta_t>
rebalance(const lba_node_meta_t &lhs, const lba_node_meta_t &rhs, laddr_t pivot) {
ceph_assert(lhs.depth == rhs.depth);
return std::make_pair(
lba_node_meta_t{lhs.begin, pivot, lhs.depth},
lba_node_meta_t{pivot, rhs.end, lhs.depth});
}
bool is_root() const {
return begin == 0 && end == L_ADDR_MAX;
}
};
inline std::ostream &operator<<(
std::ostream &lhs,
const lba_node_meta_t &rhs)
{
return lhs << "btree_node_meta_t("
<< "begin=" << rhs.begin
<< ", end=" << rhs.end
<< ", depth=" << rhs.depth
<< ")";
}
/**
* btree_range_pin_t
*
* Element tracked by btree_pin_set_t below. Encapsulates the intrusive_set
* hook, the lba_node_meta_t representing the lba range covered by a node,
* and extent and ref members intended to hold a reference when the extent
* should be pinned.
*/
class btree_pin_set_t;
class btree_range_pin_t : public boost::intrusive::set_base_hook<> {
friend class btree_pin_set_t;
lba_node_meta_t range;
btree_pin_set_t *pins = nullptr;
// We need to be able to remember extent without holding a reference,
// but we can do it more compactly -- TODO
CachedExtent *extent = nullptr;
CachedExtentRef ref;
using index_t = boost::intrusive::set<btree_range_pin_t>;
static auto get_tuple(const lba_node_meta_t &meta) {
return std::make_tuple(-meta.depth, meta.begin);
}
void acquire_ref() {
ref = CachedExtentRef(extent);
}
void drop_ref() {
ref.reset();
}
public:
btree_range_pin_t() = default;
btree_range_pin_t(CachedExtent *extent)
: extent(extent) {}
btree_range_pin_t(const btree_range_pin_t &rhs, CachedExtent *extent)
: range(rhs.range), extent(extent) {}
bool has_ref() const {
return !!ref;
}
bool is_root() const {
return range.is_root();
}
void set_range(const lba_node_meta_t &nrange) {
range = nrange;
}
void set_extent(CachedExtent *nextent) {
ceph_assert(!extent);
extent = nextent;
}
CachedExtent &get_extent() {
assert(extent);
return *extent;
}
bool has_ref() {
return !!ref;
}
void take_pin(btree_range_pin_t &other);
friend bool operator<(
const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
return get_tuple(lhs.range) < get_tuple(rhs.range);
}
friend bool operator>(
const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
return get_tuple(lhs.range) > get_tuple(rhs.range);
}
friend bool operator==(
const btree_range_pin_t &lhs, const btree_range_pin_t &rhs) {
return get_tuple(lhs.range) == get_tuple(rhs.range);
}
struct meta_cmp_t {
bool operator()(
const btree_range_pin_t &lhs, const lba_node_meta_t &rhs) const {
return get_tuple(lhs.range) < get_tuple(rhs);
}
bool operator()(
const lba_node_meta_t &lhs, const btree_range_pin_t &rhs) const {
return get_tuple(lhs) < get_tuple(rhs.range);
}
};
friend std::ostream &operator<<(
std::ostream &lhs,
const btree_range_pin_t &rhs) {
return lhs << "btree_range_pin_t("
<< "begin=" << rhs.range.begin
<< ", end=" << rhs.range.end
<< ", depth=" << rhs.range.depth
<< ", extent=" << rhs.extent
<< ")";
}
friend class BtreeLBAPin;
~btree_range_pin_t();
};
/**
* btree_pin_set_t
*
* Ensures that for every cached node, all parent LBANodes required
* to map it are present in cache. Relocating these nodes can
* therefore be done without further reads or cache space.
*
* Contains a btree_range_pin_t for every clean or dirty LBANode
* or LogicalCachedExtent instance in cache at any point in time.
* For any LBANode, the contained btree_range_pin_t will hold
* a reference to that node pinning it in cache as long as that
* node has children in the set. This invariant can be violated
* only by calling retire_extent and is repaired by calling
* check_parent synchronously after adding any new extents.
*/
class btree_pin_set_t {
friend class btree_range_pin_t;
using pins_t = btree_range_pin_t::index_t;
pins_t pins;
/// Removes pin from set optionally checking whether parent has other children
void remove_pin(btree_range_pin_t &pin, bool check_parent);
void replace_pin(btree_range_pin_t &to, btree_range_pin_t &from);
/// Returns parent pin if exists
btree_range_pin_t *maybe_get_parent(const lba_node_meta_t &pin);
/// Returns earliest child pin if exist
const btree_range_pin_t *maybe_get_first_child(const lba_node_meta_t &pin) const;
/// Releases pin if it has no children
void release_if_no_children(btree_range_pin_t &pin);
public:
/// Adds pin to set, assumes set is consistent
void add_pin(btree_range_pin_t &pin);
/**
* retire/check_parent
*
* See BtreeLBAManager::complete_transaction.
* retire removes the specified pin from the set, but does not
* check parents. After any new extents are added to the set,
* the caller is required to call check_parent to restore the
* invariant.
*/
void retire(btree_range_pin_t &pin);
void check_parent(btree_range_pin_t &pin);
template <typename F>
void scan(F &&f) {
for (auto &i : pins) {
std::invoke(f, i);
}
}
~btree_pin_set_t() {
ceph_assert(pins.empty());
}
};
class BtreeLBAPin : public LBAPin {
friend class BtreeLBAManager;
friend class LBABtree;
/**
* parent
*
* populated until link_extent is called to ensure cache residence
* until add_pin is called.
*/
CachedExtentRef parent;
paddr_t paddr;
btree_range_pin_t pin;
public:
BtreeLBAPin() = default;
BtreeLBAPin(
CachedExtentRef parent,
paddr_t paddr,
lba_node_meta_t &&meta)
: parent(parent), paddr(paddr) {
pin.set_range(std::move(meta));
}
void link_extent(LogicalCachedExtent *ref) final {
pin.set_extent(ref);
}
extent_len_t get_length() const final {
ceph_assert(pin.range.end > pin.range.begin);
return pin.range.end - pin.range.begin;
}
paddr_t get_paddr() const final {
return paddr;
}
laddr_t get_laddr() const final {
return pin.range.begin;
}
LBAPinRef duplicate() const final {
auto ret = std::unique_ptr<BtreeLBAPin>(new BtreeLBAPin);
ret->pin.set_range(pin.range);
ret->paddr = paddr;
ret->parent = parent;
return ret;
}
void take_pin(LBAPin &opin) final {
pin.take_pin(static_cast<BtreeLBAPin&>(opin).pin);
}
bool has_been_invalidated() const final {
return parent->has_been_invalidated();
}
};
}

File diff suppressed because it is too large


@@ -1,702 +0,0 @@
// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
#pragma once
#include <boost/container/static_vector.hpp>
#include <sys/mman.h>
#include <memory>
#include <string.h>
#include "crimson/os/seastore/lba_manager.h"
#include "crimson/os/seastore/logging.h"
#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
namespace crimson::os::seastore::lba_manager::btree {
class LBABtree {
static constexpr size_t MAX_DEPTH = 16;
public:
using base_iertr = LBAManager::base_iertr;
class iterator;
using iterator_fut = base_iertr::future<iterator>;
using mapped_space_visitor_t = LBAManager::scan_mapped_space_func_t;
class iterator {
public:
iterator(const iterator &rhs) noexcept :
internal(rhs.internal), leaf(rhs.leaf) {}
iterator(iterator &&rhs) noexcept :
internal(std::move(rhs.internal)), leaf(std::move(rhs.leaf)) {}
iterator &operator=(const iterator &) = default;
iterator &operator=(iterator &&) = default;
iterator_fut next(
op_context_t c,
mapped_space_visitor_t *visit=nullptr) const;
iterator_fut prev(op_context_t c) const;
void assert_valid() const {
assert(leaf.node);
assert(leaf.pos <= leaf.node->get_size());
for (auto &i: internal) {
(void)i;
assert(i.node);
assert(i.pos < i.node->get_size());
}
}
depth_t get_depth() const {
return internal.size() + 1;
}
auto &get_internal(depth_t depth) {
assert(depth > 1);
assert((depth - 2) < internal.size());
return internal[depth - 2];
}
const auto &get_internal(depth_t depth) const {
assert(depth > 1);
assert((depth - 2) < internal.size());
return internal[depth - 2];
}
laddr_t get_key() const {
assert(!is_end());
return leaf.node->iter_idx(leaf.pos).get_key();
}
lba_map_val_t get_val() const {
assert(!is_end());
auto ret = leaf.node->iter_idx(leaf.pos).get_val();
ret.paddr = ret.paddr.maybe_relative_to(leaf.node->get_paddr());
return ret;
}
bool is_end() const {
// external methods may only resolve at a boundary if at end
return at_boundary();
}
bool is_begin() const {
for (auto &i: internal) {
if (i.pos != 0)
return false;
}
return leaf.pos == 0;
}
LBAPinRef get_pin() const {
assert(!is_end());
auto val = get_val();
auto key = get_key();
return std::make_unique<BtreeLBAPin>(
leaf.node,
val.paddr,
lba_node_meta_t{ key, key + val.len, 0 });
}
private:
iterator() noexcept {}
iterator(depth_t depth) noexcept : internal(depth - 1) {}
friend class LBABtree;
static constexpr uint16_t INVALID = std::numeric_limits<uint16_t>::max();
template <typename NodeType>
struct node_position_t {
typename NodeType::Ref node;
uint16_t pos = INVALID;
void reset() {
*this = node_position_t{};
}
auto get_iter() {
assert(pos != INVALID);
assert(pos < node->get_size());
return node->iter_idx(pos);
}
};
boost::container::static_vector<
node_position_t<LBAInternalNode>, MAX_DEPTH> internal;
node_position_t<LBALeafNode> leaf;
bool at_boundary() const {
assert(leaf.pos <= leaf.node->get_size());
return leaf.pos == leaf.node->get_size();
}
using handle_boundary_ertr = base_iertr;
using handle_boundary_ret = handle_boundary_ertr::future<>;
handle_boundary_ret handle_boundary(
op_context_t c,
mapped_space_visitor_t *visitor);
depth_t check_split() const {
if (!leaf.node->at_max_capacity()) {
return 0;
}
for (depth_t split_from = 1; split_from < get_depth(); ++split_from) {
if (!get_internal(split_from + 1).node->at_max_capacity())
return split_from;
}
return get_depth();
}
depth_t check_merge() const {
if (!leaf.node->below_min_capacity()) {
return 0;
}
for (depth_t merge_from = 1; merge_from < get_depth(); ++merge_from) {
if (!get_internal(merge_from + 1).node->below_min_capacity())
return merge_from;
}
return get_depth();
}
};
LBABtree(lba_root_t root) : root(root) {}
bool is_root_dirty() const {
return root_dirty;
}
lba_root_t get_root_undirty() {
ceph_assert(root_dirty);
root_dirty = false;
return root;
}
/// mkfs
using mkfs_ret = lba_root_t;
static mkfs_ret mkfs(op_context_t c);
/**
* lower_bound
*
* @param c [in] context
* @param addr [in] addr
* @return least iterator >= key
*/
iterator_fut lower_bound(
op_context_t c,
laddr_t addr,
mapped_space_visitor_t *visit=nullptr) const;
/**
* upper_bound
*
* @param c [in] context
* @param addr [in] addr
* @return least iterator > key
*/
iterator_fut upper_bound(
op_context_t c,
laddr_t addr
) const {
return lower_bound(
c, addr
).si_then([c, addr](auto iter) {
if (!iter.is_end() && iter.get_key() == addr) {
return iter.next(c);
} else {
return iterator_fut(
interruptible::ready_future_marker{},
iter);
}
});
}
/**
* upper_bound_right
*
* @param c [in] context
* @param addr [in] addr
* @return least iterator i s.t. i.get_key() + i.get_val().len > key
*/
iterator_fut upper_bound_right(
op_context_t c,
laddr_t addr) const
{
return lower_bound(
c, addr
).si_then([c, addr](auto iter) {
if (iter.is_begin()) {
return iterator_fut(
interruptible::ready_future_marker{},
iter);
} else {
return iter.prev(
c
).si_then([iter, addr](auto prev) {
if ((prev.get_key() + prev.get_val().len) > addr) {
return iterator_fut(
interruptible::ready_future_marker{},
prev);
} else {
return iterator_fut(
interruptible::ready_future_marker{},
iter);
}
});
}
});
}
iterator_fut begin(op_context_t c) const {
return lower_bound(c, 0);
}
iterator_fut end(op_context_t c) const {
return upper_bound(c, L_ADDR_MAX);
}
using iterate_repeat_ret_inner = base_iertr::future<
seastar::stop_iteration>;
template <typename F>
static base_iertr::future<> iterate_repeat(
op_context_t c,
iterator_fut &&iter_fut,
F &&f,
mapped_space_visitor_t *visitor=nullptr) {
return std::move(
iter_fut
).si_then([c, visitor, f=std::forward<F>(f)](auto iter) {
return seastar::do_with(
iter,
std::move(f),
[c, visitor](auto &pos, auto &f) {
return trans_intr::repeat(
[c, visitor, &f, &pos] {
return f(
pos
).si_then([c, visitor, &pos](auto done) {
if (done == seastar::stop_iteration::yes) {
return iterate_repeat_ret_inner(
interruptible::ready_future_marker{},
seastar::stop_iteration::yes);
} else {
ceph_assert(!pos.is_end());
return pos.next(
c, visitor
).si_then([&pos](auto next) {
pos = next;
return iterate_repeat_ret_inner(
interruptible::ready_future_marker{},
seastar::stop_iteration::no);
});
}
});
});
});
});
}
/**
* insert
*
* Inserts val at laddr with iter as a hint. If an element at laddr
* already exists, returns an iterator to that element unchanged and false.
*
* Invalidates all outstanding iterators for this tree on this transaction.
*
* @param c [in] op context
* @param iter [in] hint; insertion is constant if immediately prior to iter
* @param laddr [in] addr at which to insert
* @param val [in] val to insert
* @return pair<iter, bool> where iter points to element at addr, bool true
* iff element at laddr did not exist.
*/
using insert_iertr = base_iertr;
using insert_ret = insert_iertr::future<std::pair<iterator, bool>>;
insert_ret insert(
op_context_t c,
iterator iter,
laddr_t laddr,
lba_map_val_t val
);
insert_ret insert(
op_context_t c,
laddr_t laddr,
lba_map_val_t val) {
return lower_bound(
c, laddr
).si_then([this, c, laddr, val](auto iter) {
return insert(c, iter, laddr, val);
});
}
/**
* update
*
* Invalidates all outstanding iterators for this tree on this transaction.
*
* @param c [in] op context
* @param iter [in] iterator to element to update, must not be end
* @param val [in] val with which to update
* @return iterator to newly updated element
*/
using update_iertr = base_iertr;
using update_ret = update_iertr::future<iterator>;
update_ret update(
op_context_t c,
iterator iter,
lba_map_val_t val);
/**
* remove
*
* Invalidates all outstanding iterators for this tree on this transaction.
*
* @param c [in] op context
* @param iter [in] iterator to element to remove, must not be end
*/
using remove_iertr = base_iertr;
using remove_ret = remove_iertr::future<>;
remove_ret remove(
op_context_t c,
iterator iter);
/**
* init_cached_extent
*
* Checks whether e is live (reachable from lba tree) and drops or initializes
* accordingly.
*
* Returns whether e is live.
*/
using init_cached_extent_iertr = base_iertr;
using init_cached_extent_ret = init_cached_extent_iertr::future<bool>;
init_cached_extent_ret init_cached_extent(op_context_t c, CachedExtentRef e);
/// get_leaf_if_live: get leaf node at laddr/addr if still live
using get_leaf_if_live_iertr = base_iertr;
using get_leaf_if_live_ret = get_leaf_if_live_iertr::future<CachedExtentRef>;
get_leaf_if_live_ret get_leaf_if_live(
op_context_t c,
paddr_t addr,
laddr_t laddr,
seastore_off_t len);
/// get_internal_if_live: get internal node at laddr/addr if still live
using get_internal_if_live_iertr = base_iertr;
using get_internal_if_live_ret = get_internal_if_live_iertr::future<CachedExtentRef>;
get_internal_if_live_ret get_internal_if_live(
op_context_t c,
paddr_t addr,
laddr_t laddr,
seastore_off_t len);
/**
* rewrite_lba_extent
*
* Rewrites a fresh copy of extent into transaction and updates internal
* references.
*/
using rewrite_lba_extent_iertr = base_iertr;
using rewrite_lba_extent_ret = rewrite_lba_extent_iertr::future<>;
rewrite_lba_extent_ret rewrite_lba_extent(op_context_t c, CachedExtentRef e);
private:
lba_root_t root;
bool root_dirty = false;
using get_internal_node_iertr = base_iertr;
using get_internal_node_ret = get_internal_node_iertr::future<LBAInternalNodeRef>;
static get_internal_node_ret get_internal_node(
op_context_t c,
depth_t depth,
paddr_t offset,
laddr_t begin,
laddr_t end);
using get_leaf_node_iertr = base_iertr;
using get_leaf_node_ret = get_leaf_node_iertr::future<LBALeafNodeRef>;
static get_leaf_node_ret get_leaf_node(
op_context_t c,
paddr_t offset,
laddr_t begin,
laddr_t end);
using lookup_root_iertr = base_iertr;
using lookup_root_ret = lookup_root_iertr::future<>;
lookup_root_ret lookup_root(
op_context_t c,
iterator &iter,
mapped_space_visitor_t *visitor) const {
if (root.get_depth() > 1) {
return get_internal_node(
c,
root.get_depth(),
root.get_location(),
0,
L_ADDR_MAX
).si_then([this, visitor, &iter](LBAInternalNodeRef root_node) {
iter.get_internal(root.get_depth()).node = root_node;
if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length());
return lookup_root_iertr::now();
});
} else {
return get_leaf_node(
c,
root.get_location(),
0,
L_ADDR_MAX
).si_then([visitor, &iter](LBALeafNodeRef root_node) {
iter.leaf.node = root_node;
if (visitor) (*visitor)(root_node->get_paddr(), root_node->get_length());
return lookup_root_iertr::now();
});
}
}
using lookup_internal_level_iertr = base_iertr;
using lookup_internal_level_ret = lookup_internal_level_iertr::future<>;
template <typename F>
static lookup_internal_level_ret lookup_internal_level(
op_context_t c,
depth_t depth,
iterator &iter,
F &f,
mapped_space_visitor_t *visitor
) {
assert(depth > 1);
auto &parent_entry = iter.get_internal(depth + 1);
auto parent = parent_entry.node;
auto node_iter = parent->iter_idx(parent_entry.pos);
auto next_iter = node_iter + 1;
auto begin = node_iter->get_key();
auto end = next_iter == parent->end()
? parent->get_node_meta().end
: next_iter->get_key();
return get_internal_node(
c,
depth,
node_iter->get_val().maybe_relative_to(parent->get_paddr()),
begin,
end
).si_then([depth, visitor, &iter, &f](LBAInternalNodeRef node) {
auto &entry = iter.get_internal(depth);
entry.node = node;
auto node_iter = f(*node);
assert(node_iter != node->end());
entry.pos = node_iter->get_offset();
if (visitor) (*visitor)(node->get_paddr(), node->get_length());
return seastar::now();
});
}
using lookup_leaf_iertr = base_iertr;
using lookup_leaf_ret = lookup_leaf_iertr::future<>;
template <typename F>
static lookup_internal_level_ret lookup_leaf(
op_context_t c,
iterator &iter,
F &f,
mapped_space_visitor_t *visitor
) {
auto &parent_entry = iter.get_internal(2);
auto parent = parent_entry.node;
assert(parent);
auto node_iter = parent->iter_idx(parent_entry.pos);
auto next_iter = node_iter + 1;
auto begin = node_iter->get_key();
auto end = next_iter == parent->end()
? parent->get_node_meta().end
: next_iter->get_key();
return get_leaf_node(
c,
node_iter->get_val().maybe_relative_to(parent->get_paddr()),
begin,
end
).si_then([visitor, &iter, &f](LBALeafNodeRef node) {
iter.leaf.node = node;
auto node_iter = f(*node);
iter.leaf.pos = node_iter->get_offset();
if (visitor) (*visitor)(node->get_paddr(), node->get_length());
return seastar::now();
});
}
/**
* lookup_depth_range
*
* Performs node lookups on depths [from, to) using li and ll to
* specify the target at each level. Note, this may leave the iterator
* at_boundary(); call handle_boundary() prior to returning out
* of LBABtree.
*/
using lookup_depth_range_iertr = base_iertr;
using lookup_depth_range_ret = lookup_depth_range_iertr::future<>;
template <typename LI, typename LL>
static lookup_depth_range_ret lookup_depth_range(
op_context_t c, ///< [in] context
iterator &iter, ///< [in,out] iterator to populate
depth_t from, ///< [in] from inclusive
depth_t to, ///< [in] to exclusive, (to <= from, to == from is a noop)
LI &li, ///< [in] internal->iterator
LL &ll, ///< [in] leaf->iterator
mapped_space_visitor_t *visitor ///< [in] mapped space visitor
) {
LOG_PREFIX(LBATree::lookup_depth_range);
SUBDEBUGT(seastore_lba_details, "{} -> {}", c.trans, from, to);
return seastar::do_with(
from,
[c, to, visitor, &iter, &li, &ll](auto &d) {
return trans_intr::repeat(
[c, to, visitor, &iter, &li, &ll, &d] {
if (d > to) {
return [&] {
if (d > 1) {
return lookup_internal_level(
c,
d,
iter,
li,
visitor);
} else {
assert(d == 1);
return lookup_leaf(
c,
iter,
ll,
visitor);
}
}().si_then([&d] {
--d;
return lookup_depth_range_iertr::make_ready_future<
seastar::stop_iteration
>(seastar::stop_iteration::no);
});
} else {
return lookup_depth_range_iertr::make_ready_future<
seastar::stop_iteration
>(seastar::stop_iteration::yes);
}
});
});
}
using lookup_iertr = base_iertr;
using lookup_ret = lookup_iertr::future<iterator>;
template <typename LI, typename LL>
lookup_ret lookup(
op_context_t c,
LI &&lookup_internal,
LL &&lookup_leaf,
mapped_space_visitor_t *visitor
) const {
LOG_PREFIX(LBATree::lookup);
return seastar::do_with(
iterator{root.get_depth()},
std::forward<LI>(lookup_internal),
std::forward<LL>(lookup_leaf),
[FNAME, this, visitor, c](auto &iter, auto &li, auto &ll) {
return lookup_root(
c, iter, visitor
).si_then([FNAME, this, visitor, c, &iter, &li, &ll] {
if (iter.get_depth() > 1) {
auto &root_entry = *(iter.internal.rbegin());
root_entry.pos = li(*(root_entry.node)).get_offset();
} else {
auto &root_entry = iter.leaf;
auto riter = ll(*(root_entry.node));
root_entry.pos = riter->get_offset();
}
SUBDEBUGT(seastore_lba_details, "got root, depth {}", c.trans, root.get_depth());
return lookup_depth_range(
c,
iter,
root.get_depth() - 1,
0,
li,
ll,
visitor
).si_then([c, visitor, &iter] {
if (iter.at_boundary()) {
return iter.handle_boundary(c, visitor);
} else {
return lookup_iertr::now();
}
});
}).si_then([&iter] {
return std::move(iter);
});
});
}
/**
 * find_insertion
 *
 * Prepare iter for insertion. iter should begin pointing at
 * the valid insertion point (lower_bound(laddr)).
 *
 * Upon completion, iter will point at the position at which laddr
 * should be inserted. iter may, upon completion, point at the end of
 * a leaf other than the last leaf if that's the correct insertion
 * point. (See the insertion sketch following this class.)
 */
using find_insertion_iertr = base_iertr;
using find_insertion_ret = find_insertion_iertr::future<>;
static find_insertion_ret find_insertion(
op_context_t c,
laddr_t laddr,
iterator &iter);
/**
* handle_split
*
* Split nodes in iter as needed for insertion. First, scan iter from leaf
* to find first non-full level. Then, split from there towards leaf.
*
* Upon completion, iter will point at the newly split insertion point. As
* with find_insertion, iter's leaf pointer may be end without iter being
* end.
*/
using handle_split_iertr = base_iertr;
using handle_split_ret = handle_split_iertr::future<>;
handle_split_ret handle_split(
op_context_t c,
iterator &iter);
using handle_merge_iertr = base_iertr;
using handle_merge_ret = handle_merge_iertr::future<>;
handle_merge_ret handle_merge(
op_context_t c,
iterator &iter);
using update_internal_mapping_iertr = base_iertr;
using update_internal_mapping_ret = update_internal_mapping_iertr::future<>;
update_internal_mapping_ret update_internal_mapping(
op_context_t c,
depth_t depth,
laddr_t laddr,
paddr_t old_addr,
paddr_t new_addr);
template <typename T>
using node_position_t = iterator::node_position_t<T>;
template <typename NodeType>
friend base_iertr::future<typename NodeType::Ref> get_node(
op_context_t c,
depth_t depth,
paddr_t addr,
laddr_t begin,
laddr_t end);
template <typename NodeType>
friend handle_merge_ret merge_level(
op_context_t c,
depth_t depth,
node_position_t<LBAInternalNode> &parent_pos,
node_position_t<NodeType> &pos);
};
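How callers drive the lookup machinery above: lookup() takes two functors, LI to choose the child position within each internal node and LL to choose the final position within the leaf. A minimal member-style sketch of a point query expressed this way follows; the member name lower_bound and the layout's upper_bound()/lower_bound() helpers are assumptions, not shown in this diff.

lookup_ret lower_bound(op_context_t c, laddr_t addr) const {
  return lookup(
    c,
    [addr](const LBAInternalNode &int_node) {
      // descend into the last child whose key is <= addr
      return int_node.upper_bound(addr) - 1;
    },
    [addr](const LBALeafNode &leaf_node) {
      // final leaf position: first entry with key >= addr
      return leaf_node.lower_bound(addr);
    },
    nullptr /* no mapped_space_visitor_t */);
}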
}
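For orientation, a hedged member-style sketch of the insertion flow that find_insertion and handle_split above are declared for: position the iterator, split any full nodes, then journal the mutation against a writeable leaf. The simplified insert_ret, the duplicate_for_write()/cast<>() calls, and the exact chaining are assumptions, not quotes from this commit.

using insert_iertr = base_iertr;
using insert_ret = insert_iertr::future<>;  // simplified return type
insert_ret insert(op_context_t c, iterator iter,
                  laddr_t laddr, lba_map_val_t val) {
  return seastar::do_with(
    std::move(iter),                   // keep the iterator alive across
    [this, c, laddr, val](auto &it) {  // the asynchronous chain
      return find_insertion(c, laddr, it
      ).si_then([this, c, &it] {
        // split full nodes from the first non-full level down
        return handle_split(c, it);
      }).si_then([c, laddr, val, &it] {
        if (!it.leaf.node->is_pending()) {
          // mutate a writeable copy so the delta is journaled
          it.leaf.node = c.cache.duplicate_for_write(
            c.trans, it.leaf.node)->cast<LBALeafNode>();
        }
        it.leaf.node->insert(
          LBALeafNode::const_iterator(it.leaf.node.get(), it.leaf.pos),
          laddr, val);
      });
    });
}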

View File

@ -27,24 +27,6 @@ std::ostream& operator<<(std::ostream& out, const lba_map_val_t& v)
<< ")";
}
std::ostream &LBAInternalNode::print_detail(std::ostream &out) const
{
return out << ", size=" << get_size()
<< ", meta=" << get_meta();
}
void LBAInternalNode::resolve_relative_addrs(paddr_t base)
{
LOG_PREFIX(LBAInternalNode::resolve_relative_addrs);
for (auto i: *this) {
if (i->get_val().is_relative()) {
auto updated = base.add_relative(i->get_val());
TRACE("{} -> {}", i->get_val(), updated);
i->set_val(updated);
}
}
}
std::ostream &LBALeafNode::print_detail(std::ostream &out) const
{
return out << ", size=" << get_size()

View File

@ -16,18 +16,15 @@
#include "crimson/os/seastore/seastore_types.h"
#include "crimson/os/seastore/cache.h"
#include "crimson/os/seastore/cached_extent.h"
#include "crimson/os/seastore/lba_manager/btree/lba_btree_node.h"
#include "crimson/os/seastore/lba_manager/btree/btree_range_pin.h"
#include "crimson/os/seastore/btree/btree_range_pin.h"
#include "crimson/os/seastore/btree/fixed_kv_btree.h"
#include "crimson/os/seastore/btree/fixed_kv_node.h"
namespace crimson::os::seastore::lba_manager::btree {
using base_iertr = LBAManager::base_iertr;
struct op_context_t {
Cache &cache;
Transaction &trans;
btree_pin_set_t *pins = nullptr;
};
using LBANode = FixedKVNode<laddr_t>;
/**
* lba_map_val_t
@ -57,71 +54,11 @@ WRITE_EQ_OPERATORS_4(
std::ostream& operator<<(std::ostream& out, const lba_map_val_t&);
class BtreeLBAPin;
using BtreeLBAPinRef = std::unique_ptr<BtreeLBAPin>;
constexpr size_t LBA_BLOCK_SIZE = 4096;
/**
* lba_node_meta_le_t
*
* On disk layout for lba_node_meta_t
*/
struct lba_node_meta_le_t {
laddr_le_t begin = laddr_le_t(0);
laddr_le_t end = laddr_le_t(0);
depth_le_t depth = init_depth_le(0);
using lba_node_meta_t = fixed_kv_node_meta_t<laddr_t>;
lba_node_meta_le_t() = default;
lba_node_meta_le_t(const lba_node_meta_le_t &) = default;
explicit lba_node_meta_le_t(const lba_node_meta_t &val)
: begin(ceph_le64(val.begin)),
end(ceph_le64(val.end)),
depth(init_depth_le(val.depth)) {}
operator lba_node_meta_t() const {
return lba_node_meta_t{ begin, end, depth };
}
};
/**
* LBANode
*
* Base class enabling recursive lookup between internal and leaf nodes.
*/
struct LBANode : CachedExtent {
using LBANodeRef = TCachedExtentRef<LBANode>;
btree_range_pin_t pin;
LBANode(ceph::bufferptr &&ptr) : CachedExtent(std::move(ptr)), pin(this) {}
LBANode(const LBANode &rhs)
: CachedExtent(rhs), pin(rhs.pin, this) {}
virtual lba_node_meta_t get_node_meta() const = 0;
virtual ~LBANode() = default;
void on_delta_write(paddr_t record_block_offset) final {
// All in-memory relative addrs are necessarily record-relative
assert(get_prior_instance());
pin.take_pin(get_prior_instance()->cast<LBANode>()->pin);
resolve_relative_addrs(record_block_offset);
}
void on_initial_write() final {
// All in-memory relative addrs are necessarily block-relative
resolve_relative_addrs(get_paddr());
}
void on_clean_read() final {
// From initial write of block, relative addrs are necessarily block-relative
resolve_relative_addrs(get_paddr());
}
virtual void resolve_relative_addrs(paddr_t base) = 0;
};
using LBANodeRef = LBANode::LBANodeRef;
using lba_node_meta_le_t = fixed_kv_node_meta_le_t<laddr_le_t>;
/**
* LBAInternalNode
@ -142,197 +79,22 @@ using LBANodeRef = LBANode::LBANodeRef;
*/
constexpr size_t INTERNAL_NODE_CAPACITY = 254;
struct LBAInternalNode
: LBANode,
common::FixedKVNodeLayout<
: FixedKVInternalNode<
INTERNAL_NODE_CAPACITY,
lba_node_meta_t, lba_node_meta_le_t,
laddr_t, laddr_le_t,
paddr_t, paddr_le_t> {
LBA_BLOCK_SIZE,
LBAInternalNode> {
using Ref = TCachedExtentRef<LBAInternalNode>;
using internal_iterator_t = const_iterator;
template <typename... T>
LBAInternalNode(T&&... t) :
LBANode(std::forward<T>(t)...),
FixedKVNodeLayout(get_bptr().c_str()) {}
FixedKVInternalNode(std::forward<T>(t)...) {}
static constexpr extent_types_t TYPE = extent_types_t::LADDR_INTERNAL;
lba_node_meta_t get_node_meta() const { return get_meta(); }
CachedExtentRef duplicate_for_write() final {
assert(delta_buffer.empty());
return CachedExtentRef(new LBAInternalNode(*this));
};
delta_buffer_t delta_buffer;
delta_buffer_t *maybe_get_delta_buffer() {
return is_mutation_pending() ? &delta_buffer : nullptr;
}
void update(
const_iterator iter,
paddr_t addr) {
return journal_update(
iter,
maybe_generate_relative(addr),
maybe_get_delta_buffer());
}
void insert(
const_iterator iter,
laddr_t pivot,
paddr_t addr) {
return journal_insert(
iter,
pivot,
maybe_generate_relative(addr),
maybe_get_delta_buffer());
}
void remove(const_iterator iter) {
return journal_remove(
iter,
maybe_get_delta_buffer());
}
void replace(
const_iterator iter,
laddr_t pivot,
paddr_t addr) {
return journal_replace(
iter,
pivot,
maybe_generate_relative(addr),
maybe_get_delta_buffer());
}
std::tuple<Ref, Ref, laddr_t>
make_split_children(op_context_t c) {
auto left = c.cache.alloc_new_extent<LBAInternalNode>(
c.trans, LBA_BLOCK_SIZE);
auto right = c.cache.alloc_new_extent<LBAInternalNode>(
c.trans, LBA_BLOCK_SIZE);
auto pivot = split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
return std::make_tuple(
left,
right,
pivot);
}
Ref make_full_merge(
op_context_t c,
Ref &right) {
auto replacement = c.cache.alloc_new_extent<LBAInternalNode>(
c.trans, LBA_BLOCK_SIZE);
replacement->merge_from(*this, *right->cast<LBAInternalNode>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
}
std::tuple<Ref, Ref, laddr_t>
make_balanced(
op_context_t c,
Ref &_right,
bool prefer_left) {
ceph_assert(_right->get_type() == get_type());
auto &right = *_right->cast<LBAInternalNode>();
auto replacement_left = c.cache.alloc_new_extent<LBAInternalNode>(
c.trans, LBA_BLOCK_SIZE);
auto replacement_right = c.cache.alloc_new_extent<LBAInternalNode>(
c.trans, LBA_BLOCK_SIZE);
auto pivot = balance_into_new_nodes(
*this,
right,
prefer_left,
*replacement_left,
*replacement_right);
replacement_left->pin.set_range(replacement_left->get_meta());
replacement_right->pin.set_range(replacement_right->get_meta());
return std::make_tuple(
replacement_left,
replacement_right,
pivot);
}
/**
* Internal relative addresses on read or in memory prior to commit
* are either record or block relative depending on whether this
* physical node satisfies is_initial_pending() or just is_pending().
*
* User passes appropriate base depending on lifecycle and
* resolve_relative_addrs fixes up relative internal references
* based on base.
*/
void resolve_relative_addrs(paddr_t base);
void node_resolve_vals(iterator from, iterator to) const final {
if (is_initial_pending()) {
for (auto i = from; i != to; ++i) {
if (i->get_val().is_relative()) {
assert(i->get_val().is_block_relative());
i->set_val(get_paddr().add_relative(i->get_val()));
}
}
}
}
void node_unresolve_vals(iterator from, iterator to) const final {
if (is_initial_pending()) {
for (auto i = from; i != to; ++i) {
if (i->get_val().is_relative()) {
assert(i->get_val().is_record_relative());
i->set_val(i->get_val() - get_paddr());
}
}
}
}
extent_types_t get_type() const final {
return TYPE;
}
std::ostream &print_detail(std::ostream &out) const final;
ceph::bufferlist get_delta() final {
ceph::buffer::ptr bptr(delta_buffer.get_bytes());
delta_buffer.copy_out(bptr.c_str(), bptr.length());
ceph::bufferlist bl;
bl.push_back(bptr);
return bl;
}
void apply_delta_and_adjust_crc(
paddr_t base, const ceph::bufferlist &_bl) final {
assert(_bl.length());
ceph::bufferlist bl = _bl;
bl.rebuild();
delta_buffer_t buffer;
buffer.copy_in(bl.front().c_str(), bl.front().length());
buffer.replay(*this);
set_last_committed_crc(get_crc32c());
resolve_relative_addrs(base);
}
constexpr static size_t get_min_capacity() {
return (get_capacity() - 1) / 2;
}
bool at_max_capacity() const {
assert(get_size() <= get_capacity());
return get_size() == get_capacity();
}
bool at_min_capacity() const {
assert(get_size() >= (get_min_capacity() - 1));
return get_size() <= get_min_capacity();
}
bool below_min_capacity() const {
assert(get_size() >= (get_min_capacity() - 1));
return get_size() < get_min_capacity();
}
};
using LBAInternalNodeRef = LBAInternalNode::Ref;
@ -380,36 +142,23 @@ struct lba_map_val_le_t {
};
struct LBALeafNode
: LBANode,
common::FixedKVNodeLayout<
: FixedKVLeafNode<
LEAF_NODE_CAPACITY,
lba_node_meta_t, lba_node_meta_le_t,
laddr_t, laddr_le_t,
lba_map_val_t, lba_map_val_le_t> {
lba_map_val_t, lba_map_val_le_t,
LBA_BLOCK_SIZE,
LBALeafNode> {
using Ref = TCachedExtentRef<LBALeafNode>;
using internal_iterator_t = const_iterator;
template <typename... T>
LBALeafNode(T&&... t) :
LBANode(std::forward<T>(t)...),
FixedKVNodeLayout(get_bptr().c_str()) {}
FixedKVLeafNode(std::forward<T>(t)...) {}
static constexpr extent_types_t TYPE = extent_types_t::LADDR_LEAF;
lba_node_meta_t get_node_meta() const { return get_meta(); }
CachedExtentRef duplicate_for_write() final {
assert(delta_buffer.empty());
return CachedExtentRef(new LBALeafNode(*this));
};
delta_buffer_t delta_buffer;
delta_buffer_t *maybe_get_delta_buffer() {
return is_mutation_pending() ? &delta_buffer : nullptr;
}
void update(
const_iterator iter,
lba_map_val_t val) {
lba_map_val_t val) final {
val.paddr = maybe_generate_relative(val.paddr);
return journal_update(
iter,
@ -417,10 +166,10 @@ struct LBALeafNode
maybe_get_delta_buffer());
}
auto insert(
const_iterator insert(
const_iterator iter,
laddr_t addr,
lba_map_val_t val) {
lba_map_val_t val) final {
val.paddr = maybe_generate_relative(val.paddr);
journal_insert(
iter,
@ -430,65 +179,12 @@ struct LBALeafNode
return iter;
}
void remove(const_iterator iter) {
void remove(const_iterator iter) final {
return journal_remove(
iter,
maybe_get_delta_buffer());
}
std::tuple<Ref, Ref, laddr_t>
make_split_children(op_context_t c) {
auto left = c.cache.alloc_new_extent<LBALeafNode>(
c.trans, LBA_BLOCK_SIZE);
auto right = c.cache.alloc_new_extent<LBALeafNode>(
c.trans, LBA_BLOCK_SIZE);
auto pivot = split_into(*left, *right);
left->pin.set_range(left->get_meta());
right->pin.set_range(right->get_meta());
return std::make_tuple(
left,
right,
pivot);
}
Ref make_full_merge(
op_context_t c,
Ref &right) {
auto replacement = c.cache.alloc_new_extent<LBALeafNode>(
c.trans, LBA_BLOCK_SIZE);
replacement->merge_from(*this, *right->cast<LBALeafNode>());
replacement->pin.set_range(replacement->get_meta());
return replacement;
}
std::tuple<Ref, Ref, laddr_t>
make_balanced(
op_context_t c,
Ref &_right,
bool prefer_left) {
ceph_assert(_right->get_type() == get_type());
auto &right = *_right->cast<LBALeafNode>();
auto replacement_left = c.cache.alloc_new_extent<LBALeafNode>(
c.trans, LBA_BLOCK_SIZE);
auto replacement_right = c.cache.alloc_new_extent<LBALeafNode>(
c.trans, LBA_BLOCK_SIZE);
auto pivot = balance_into_new_nodes(
*this,
right,
prefer_left,
*replacement_left,
*replacement_right);
replacement_left->pin.set_range(replacement_left->get_meta());
replacement_right->pin.set_range(replacement_right->get_meta());
return std::make_tuple(
replacement_left,
replacement_right,
pivot);
}
// See LBAInternalNode, same concept
void resolve_relative_addrs(paddr_t base);
void node_resolve_vals(iterator from, iterator to) const final {
@ -517,50 +213,11 @@ struct LBALeafNode
}
}
ceph::bufferlist get_delta() final {
ceph::buffer::ptr bptr(delta_buffer.get_bytes());
delta_buffer.copy_out(bptr.c_str(), bptr.length());
ceph::bufferlist bl;
bl.push_back(bptr);
return bl;
}
void apply_delta_and_adjust_crc(
paddr_t base, const ceph::bufferlist &_bl) final {
assert(_bl.length());
ceph::bufferlist bl = _bl;
bl.rebuild();
delta_buffer_t buffer;
buffer.copy_in(bl.front().c_str(), bl.front().length());
buffer.replay(*this);
set_last_committed_crc(get_crc32c());
resolve_relative_addrs(base);
}
extent_types_t get_type() const final {
return TYPE;
}
std::ostream &print_detail(std::ostream &out) const final;
constexpr static size_t get_min_capacity() {
return (get_capacity() - 1) / 2;
}
bool at_max_capacity() const {
assert(get_size() <= get_capacity());
return get_size() == get_capacity();
}
bool at_min_capacity() const {
assert(get_size() >= (get_min_capacity() - 1));
return get_size() <= get_min_capacity();
}
bool below_min_capacity() const {
assert(get_size() >= (get_min_capacity() - 1));
return get_size() < get_min_capacity();
}
};
using LBALeafNodeRef = TCachedExtentRef<LBALeafNode>;
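The two hunks above re-parent LBAInternalNode and LBALeafNode onto the extracted CRTP bases, so the split/merge/balance boilerplate removed here now lives in fixed_kv_node.h. A hypothetical second tree would instantiate the same base; everything named Example* below is invented for illustration, and a real tree would register its own extent_types_t value rather than reusing LADDR_LEAF.

constexpr size_t EXAMPLE_LEAF_CAPACITY = 144;
struct ExampleLeafNode
  : FixedKVLeafNode<
      EXAMPLE_LEAF_CAPACITY,
      laddr_t, laddr_le_t,               // key type and its on-disk layout
      example_val_t, example_val_le_t,   // value type and its on-disk layout
      LBA_BLOCK_SIZE,
      ExampleLeafNode> {                 // CRTP: base splits/merges Derived
  using Ref = TCachedExtentRef<ExampleLeafNode>;
  template <typename... T>
  ExampleLeafNode(T&&... t) : FixedKVLeafNode(std::forward<T>(t)...) {}
  static constexpr extent_types_t TYPE =
    extent_types_t::LADDR_LEAF;          // placeholder; a real tree adds its own
};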

View File

@ -69,10 +69,10 @@ ObjectDataHandler::write_ret do_removals(
LOG_PREFIX(object_data_handler.cc::do_removals);
DEBUGT("decreasing ref: {}",
ctx.t,
pin->get_laddr());
pin->get_key());
return ctx.tm.dec_ref(
ctx.t,
pin->get_laddr()
pin->get_key()
).si_then(
[](auto){},
ObjectDataHandler::write_iertr::pass_further{},
@ -129,14 +129,14 @@ ObjectDataHandler::write_ret do_insertions(
region.len
).si_then([FNAME, ctx, &region](auto pin) {
ceph_assert(pin->get_length() == region.len);
if (pin->get_laddr() != region.addr) {
if (pin->get_key() != region.addr) {
ERRORT(
"inconsistent laddr: pin: {} region {}",
ctx.t,
pin->get_laddr(),
pin->get_key(),
region.addr);
}
ceph_assert(pin->get_laddr() == region.addr);
ceph_assert(pin->get_key() == region.addr);
return ObjectDataHandler::write_iertr::now();
});
}
@ -156,7 +156,7 @@ using split_ret_bare = std::pair<
using split_ret = get_iertr::future<split_ret_bare>;
split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
{
const auto pin_offset = pin->get_laddr();
const auto pin_offset = pin->get_key();
assert_aligned(pin_offset);
ceph_assert(offset >= pin_offset);
if (offset == pin_offset) {
@ -181,7 +181,7 @@ split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
);
} else {
// Data, return up to offset to prepend
auto to_prepend = offset - pin->get_laddr();
auto to_prepend = offset - pin->get_key();
return read_pin(ctx, pin->duplicate()
).si_then([to_prepend](auto extent) {
return get_iertr::make_ready_future<split_ret_bare>(
@ -194,8 +194,8 @@ split_ret split_pin_left(context_t ctx, LBAPinRef &pin, laddr_t offset)
/// Reverse of split_pin_left
split_ret split_pin_right(context_t ctx, LBAPinRef &pin, laddr_t end)
{
const auto pin_begin = pin->get_laddr();
const auto pin_end = pin->get_laddr() + pin->get_length();
const auto pin_begin = pin->get_key();
const auto pin_end = pin->get_key() + pin->get_length();
assert_aligned(pin_end);
ceph_assert(pin_end >= end);
if (end == pin_end) {
@ -273,7 +273,7 @@ ObjectDataHandler::write_ret ObjectDataHandler::prepare_data_reservation(
).si_then([max_object_size=max_object_size, &object_data](auto pin) {
ceph_assert(pin->get_length() == max_object_size);
object_data.update_reserved(
pin->get_laddr(),
pin->get_key(),
pin->get_length());
return write_iertr::now();
});
@ -302,17 +302,17 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
_pins.swap(pins);
ceph_assert(pins.size());
auto &pin = *pins.front();
ceph_assert(pin.get_laddr() >= object_data.get_reserved_data_base());
ceph_assert(pin.get_key() >= object_data.get_reserved_data_base());
ceph_assert(
pin.get_laddr() <= object_data.get_reserved_data_base() + size);
auto pin_offset = pin.get_laddr() -
pin.get_key() <= object_data.get_reserved_data_base() + size);
auto pin_offset = pin.get_key() -
object_data.get_reserved_data_base();
if ((pin.get_laddr() == (object_data.get_reserved_data_base() + size)) ||
if ((pin.get_key() == (object_data.get_reserved_data_base() + size)) ||
(pin.get_paddr().is_zero())) {
/* First pin is exactly at the boundary or is a zero pin. Either way,
* remove all pins and add a single zero pin to the end. */
to_write.emplace_back(
pin.get_laddr(),
pin.get_key(),
object_data.get_reserved_data_len() - pin_offset);
return clear_iertr::now();
} else {
@ -332,7 +332,7 @@ ObjectDataHandler::clear_ret ObjectDataHandler::trim_data_reservation(
));
bl.append_zero(p2roundup(size, ctx.tm.get_block_size()) - size);
to_write.emplace_back(
pin.get_laddr(),
pin.get_key(),
bl);
to_write.emplace_back(
object_data.get_reserved_data_base() +
@ -387,9 +387,9 @@ ObjectDataHandler::write_ret ObjectDataHandler::overwrite(
offset,
bl.length());
ceph_assert(pins.size() >= 1);
auto pin_begin = pins.front()->get_laddr();
auto pin_begin = pins.front()->get_key();
ceph_assert(pin_begin <= offset);
auto pin_end = pins.back()->get_laddr() + pins.back()->get_length();
auto pin_end = pins.back()->get_key() + pins.back()->get_length();
ceph_assert(pin_end >= (offset + bl.length()));
return split_pin_left(
@ -500,7 +500,7 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
).si_then([ctx, loffset, len, &ret](auto _pins) {
// offset~len falls within reserved region and len > 0
ceph_assert(_pins.size() >= 1);
ceph_assert((*_pins.begin())->get_laddr() <= loffset);
ceph_assert((*_pins.begin())->get_key() <= loffset);
return seastar::do_with(
std::move(_pins),
loffset,
@ -511,9 +511,9 @@ ObjectDataHandler::read_ret ObjectDataHandler::read(
-> read_iertr::future<> {
ceph_assert(current <= (loffset + len));
ceph_assert(
(loffset + len) > pin->get_laddr());
(loffset + len) > pin->get_key());
laddr_t end = std::min(
pin->get_laddr() + pin->get_length(),
pin->get_key() + pin->get_length(),
loffset + len);
if (pin->get_paddr().is_zero()) {
ceph_assert(end > current); // See LBAManager::get_mappings
@ -583,12 +583,12 @@ ObjectDataHandler::fiemap_ret ObjectDataHandler::fiemap(
len
).si_then([loffset, len, &object_data, &ret](auto &&pins) {
ceph_assert(pins.size() >= 1);
ceph_assert((*pins.begin())->get_laddr() <= loffset);
ceph_assert((*pins.begin())->get_key() <= loffset);
for (auto &&i: pins) {
if (!(i->get_paddr().is_zero())) {
auto ret_left = std::max(i->get_laddr(), loffset);
auto ret_left = std::max(i->get_key(), loffset);
auto ret_right = std::min(
i->get_laddr() + i->get_length(),
i->get_key() + i->get_length(),
loffset + len);
assert(ret_right > ret_left);
ret.emplace(

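The fiemap hunk above clamps each pin's logical extent [get_key(), get_key() + get_length()) to the queried window. Pulled out as a standalone helper, the arithmetic looks roughly like this; the helper name is invented:

#include <algorithm>
#include <optional>
#include <utility>

std::optional<std::pair<laddr_t, laddr_t>> clamp_pin_to_query(
  laddr_t key, extent_len_t len,      // pin: [key, key + len)
  laddr_t off, extent_len_t qlen)     // query window: [off, off + qlen)
{
  auto left = std::max(key, off);
  auto right = std::min(key + len, off + qlen);
  if (right <= left)
    return std::nullopt;              // pin falls outside the window
  return std::make_pair(left, right);
}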
View File

@ -712,6 +712,8 @@ struct __attribute((packed)) paddr_le_t {
ceph_le64 dev_addr =
ceph_le64(P_ADDR_NULL.dev_addr);
using orig_type = paddr_t;
paddr_le_t() = default;
paddr_le_t(const paddr_t &addr) : dev_addr(ceph_le64(addr.dev_addr)) {}
@ -800,6 +802,8 @@ constexpr laddr_t L_ADDR_LBAT = L_ADDR_MAX - 2;
struct __attribute((packed)) laddr_le_t {
ceph_le64 laddr = ceph_le64(L_ADDR_NULL);
using orig_type = laddr_t;
laddr_le_t() = default;
laddr_le_t(const laddr_le_t &) = default;
explicit laddr_le_t(const laddr_t &addr)
@ -1081,22 +1085,22 @@ public:
};
/**
* lba_root_t
* phy_tree_root_t
*/
class __attribute__((packed)) lba_root_t {
class __attribute__((packed)) phy_tree_root_t {
paddr_le_t root_addr;
depth_le_t depth = init_depth_le(0);
public:
lba_root_t() = default;
phy_tree_root_t() = default;
lba_root_t(paddr_t addr, depth_t depth)
phy_tree_root_t(paddr_t addr, depth_t depth)
: root_addr(addr), depth(init_depth_le(depth)) {}
lba_root_t(const lba_root_t &o) = default;
lba_root_t(lba_root_t &&o) = default;
lba_root_t &operator=(const lba_root_t &o) = default;
lba_root_t &operator=(lba_root_t &&o) = default;
phy_tree_root_t(const phy_tree_root_t &o) = default;
phy_tree_root_t(phy_tree_root_t &&o) = default;
phy_tree_root_t &operator=(const phy_tree_root_t &o) = default;
phy_tree_root_t &operator=(phy_tree_root_t &&o) = default;
paddr_t get_location() const {
return root_addr;
@ -1188,6 +1192,7 @@ public:
}
};
using lba_root_t = phy_tree_root_t;
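With the rename, any on-disk tree root travels as a phy_tree_root_t, while the alias keeps existing LBA call sites compiling. A minimal usage sketch follows; get_depth() is assumed to mirror get_location(), as the LBABtree code earlier in this commit calls both.

inline void phy_tree_root_example() {
  phy_tree_root_t root{P_ADDR_NULL, 1};  // P_ADDR_NULL stands in for a
                                         // real root address; depth 1
                                         // means the root is a leaf
  paddr_t where = root.get_location();
  depth_t depth = root.get_depth();      // assumed accessor
  lba_root_t legacy = root;              // alias: same type, old name
  (void)where; (void)depth; (void)legacy;
}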
/**
* root_t

View File

@ -481,14 +481,14 @@ TransactionManager::get_extent_if_live_ret TransactionManager::get_extent_if_liv
return lba_manager->get_mapping(
t,
laddr).si_then([=, &t] (LBAPinRef pin) -> inner_ret {
ceph_assert(pin->get_laddr() == laddr);
ceph_assert(pin->get_key() == laddr);
if (pin->get_paddr() == addr) {
if (pin->get_length() != (extent_len_t)len) {
ERRORT(
"Invalid pin {}~{} {} found for "
"extent {} {}~{} {}",
t,
pin->get_laddr(),
pin->get_key(),
pin->get_length(),
pin->get_paddr(),
type,

View File

@ -141,7 +141,7 @@ struct lba_btree_test : btree_test_base {
std::map<laddr_t, lba_map_val_t> check;
auto get_op_context(Transaction &t) {
return op_context_t{*cache, t};
return op_context_t<laddr_t>{*cache, t};
}
LBAManager::mkfs_ret test_structure_setup(Transaction &t) final {
@ -376,11 +376,11 @@ struct btree_lba_manager_test : btree_test_base {
}).unsafe_get0();
logger().debug("alloc'd: {}", *ret);
EXPECT_EQ(len, ret->get_length());
auto [b, e] = get_overlap(t, ret->get_laddr(), len);
auto [b, e] = get_overlap(t, ret->get_key(), len);
EXPECT_EQ(b, e);
t.mappings.emplace(
std::make_pair(
ret->get_laddr(),
ret->get_key(),
test_extent_t{
ret->get_paddr(),
ret->get_length(),
@ -474,7 +474,7 @@ struct btree_lba_manager_test : btree_test_base {
EXPECT_EQ(ret_list.size(), 1);
auto &ret = *ret_list.begin();
EXPECT_EQ(i.second.addr, ret->get_paddr());
EXPECT_EQ(laddr, ret->get_laddr());
EXPECT_EQ(laddr, ret->get_key());
EXPECT_EQ(len, ret->get_length());
auto ret_pin = with_trans_intr(
@ -484,7 +484,7 @@ struct btree_lba_manager_test : btree_test_base {
t, laddr);
}).unsafe_get0();
EXPECT_EQ(i.second.addr, ret_pin->get_paddr());
EXPECT_EQ(laddr, ret_pin->get_laddr());
EXPECT_EQ(laddr, ret_pin->get_key());
EXPECT_EQ(len, ret_pin->get_length());
}
with_trans_intr(
@ -554,8 +554,8 @@ TEST_F(btree_lba_manager_test, force_split_merge)
check_mappings(t);
check_mappings();
}
incref_mapping(t, ret->get_laddr());
decref_mapping(t, ret->get_laddr());
incref_mapping(t, ret->get_key());
decref_mapping(t, ret->get_key());
}
logger().debug("submitting transaction");
submit_test_transaction(std::move(t));
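The fixture change above (op_context_t<laddr_t>) shows that the context type is now templated on the tree's key bound. A plausible shape, sketched from the pre-extraction struct shown earlier in this diff and the templated btree_pin_set_t; not a quote from the commit:

template <typename bound_t>
struct op_context_t {
  Cache &cache;
  Transaction &trans;
  btree_pin_set_t<bound_t> *pins = nullptr;
};

// e.g., as the fixture constructs it:
//   auto ctx = op_context_t<laddr_t>{*cache, t};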